From 07b01efe5d24aaca65ad99dc3f97a88ca3b49fce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cezary=20Skrzy=C5=84ski?= Date: Wed, 14 Dec 2022 19:28:21 +0100 Subject: [PATCH 001/496] Use correct header guards --- cmake/Kokkos_Version_Info.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cmake/Kokkos_Version_Info.hpp b/cmake/Kokkos_Version_Info.hpp index ba605a301d..831247115e 100644 --- a/cmake/Kokkos_Version_Info.hpp +++ b/cmake/Kokkos_Version_Info.hpp @@ -14,8 +14,8 @@ // //@HEADER -#ifndef GIT_VERSION_H -#define GIT_VERSION_H +#ifndef KOKKOS_GIT_VERSION_INFO_H +#define KOKKOS_GIT_VERSION_INFO_H #include From 2e09341003b0c6116145c9c93ff86ff61244dac8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cezary=20Skrzy=C5=84ski?= Date: Fri, 16 Dec 2022 20:31:22 +0100 Subject: [PATCH 002/496] Use separate .yml file for benchmarking --- .github/workflows/performance-benchmark.yml | 66 +++++++++++++++++++++ 1 file changed, 66 insertions(+) create mode 100644 .github/workflows/performance-benchmark.yml diff --git a/.github/workflows/performance-benchmark.yml b/.github/workflows/performance-benchmark.yml new file mode 100644 index 0000000000..cef26b3f02 --- /dev/null +++ b/.github/workflows/performance-benchmark.yml @@ -0,0 +1,66 @@ +name: github-benchmarks +on: + push: + branches: + - develop + - performance-results-visualization + +jobs: + CI: + continue-on-error: true + strategy: + matrix: + distro: ['ubuntu:latest'] + cxx: ['g++', 'clang++'] + backend: ['OPENMP'] + runs-on: ubuntu-latest + container: + image: ghcr.io/kokkos/ci-containers/${{ matrix.distro }} + # see https://github.com/actions/virtual-environments/issues/3812 + options: --security-opt seccomp=unconfined + env: + BUILD_ID: ${{ matrix.distro }}-${{ matrix.cxx }}-${{ matrix.backend }} + steps: + - name: Checkout code + uses: actions/checkout@v3 + - uses: actions/cache@v3 + with: + path: ~/.ccache + key: kokkos-${{ matrix.distro }}-${{ matrix.cxx }}-${{ matrix.openmp }}-${github.ref}-${{ github.sha 
}} + restore-keys: kokkos-${{ matrix.distro }}-${{ matrix.cxx }}-${{ matrix.openmp }}-${{github.ref}} + - name: Configure Kokkos + run: | + cmake -B builddir \ + -DCMAKE_INSTALL_PREFIX=/usr \ + -DKokkos_ARCH_NATIVE=ON \ + -DKokkos_ENABLE_HWLOC=ON \ + -DKokkos_ENABLE_${{ matrix.backend }}=ON \ + -DKokkos_ENABLE_TESTS=ON \ + -DKokkos_ENABLE_BENCHMARKS=ON \ + -DKokkos_ENABLE_EXAMPLES=ON \ + -DKokkos_ENABLE_DEPRECATED_CODE_4=ON \ + -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \ + -DCMAKE_CXX_COMPILER=${{ matrix.cxx }} \ + -DCMAKE_BUILD_TYPE=Release + - name: Build + run: | + ccache -z + cmake --build builddir --parallel 2 + ccache -s + - name: Tests + working-directory: builddir + run: ctest --output-on-failure + - name: Gather benchmark results + run: | + mkdir ${{ env.BUILD_ID }} + find builddir/core/perf_test/ -name "*.json" -exec mv {} ${{ env.BUILD_ID }}/ \; + - name: Push benchmark results + uses: dmnemec/copy_file_to_another_repo_action@main + env: + API_TOKEN_GITHUB: ${{ secrets.API_TOKEN_GITHUB }} + with: + source_file: ${{ env.BUILD_ID }} + destination_repo: 'kokkos/kokkos-benchmark-results' + destination_branch: 'master' + user_email: 'kokkos@users.noreply.github.com' + user_name: 'Kokkos Developers' From 92906bf38fb082f46e1444c8423ddf05dea8d282 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cezary=20Skrzy=C5=84ski?= Date: Tue, 20 Dec 2022 16:41:06 +0100 Subject: [PATCH 003/496] Remove security options --- .github/workflows/performance-benchmark.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/workflows/performance-benchmark.yml b/.github/workflows/performance-benchmark.yml index cef26b3f02..4326b9f7f1 100644 --- a/.github/workflows/performance-benchmark.yml +++ b/.github/workflows/performance-benchmark.yml @@ -16,8 +16,6 @@ jobs: runs-on: ubuntu-latest container: image: ghcr.io/kokkos/ci-containers/${{ matrix.distro }} - # see https://github.com/actions/virtual-environments/issues/3812 - options: --security-opt seccomp=unconfined env: BUILD_ID: ${{ 
matrix.distro }}-${{ matrix.cxx }}-${{ matrix.backend }} steps: From 67a92d3451df6f5fe73b9acb2ac7c5d646e7d5a4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cezary=20Skrzy=C5=84ski?= Date: Tue, 20 Dec 2022 17:15:25 +0100 Subject: [PATCH 004/496] Do not build tests and examples --- .github/workflows/performance-benchmark.yml | 2 - core/CMakeLists.txt | 21 ++- core/perf_test/CMakeLists.txt | 147 ++++++++++---------- 3 files changed, 88 insertions(+), 82 deletions(-) diff --git a/.github/workflows/performance-benchmark.yml b/.github/workflows/performance-benchmark.yml index 4326b9f7f1..312f836673 100644 --- a/.github/workflows/performance-benchmark.yml +++ b/.github/workflows/performance-benchmark.yml @@ -33,9 +33,7 @@ jobs: -DKokkos_ARCH_NATIVE=ON \ -DKokkos_ENABLE_HWLOC=ON \ -DKokkos_ENABLE_${{ matrix.backend }}=ON \ - -DKokkos_ENABLE_TESTS=ON \ -DKokkos_ENABLE_BENCHMARKS=ON \ - -DKokkos_ENABLE_EXAMPLES=ON \ -DKokkos_ENABLE_DEPRECATED_CODE_4=ON \ -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \ -DCMAKE_CXX_COMPILER=${{ matrix.cxx }} \ diff --git a/core/CMakeLists.txt b/core/CMakeLists.txt index 5d9fde56d2..b78eb05e26 100644 --- a/core/CMakeLists.txt +++ b/core/CMakeLists.txt @@ -6,10 +6,23 @@ IF (NOT Kokkos_INSTALL_TESTING) ADD_SUBDIRECTORY(src) ENDIF() +FUNCTION(KOKKOS_ADD_BENCHMARK_DIRECTORY DIR_NAME) + IF(NOT Kokkos_ENABLE_BENCHMARKS) + RETURN() + ENDIF() + + IF(KOKKOS_HAS_TRILINOS) + message( + STATUS + "Benchmarks are not supported when building as part of Trilinos" + ) + RETURN() + ENDIF() + + ADD_SUBDIRECTORY(${DIR_NAME}) +ENDFUNCTION() + KOKKOS_ADD_TEST_DIRECTORIES(unit_test) -IF (NOT KOKKOS_HAS_TRILINOS) - # We are using the githash etc in here, which does not work correct in Trilinos - KOKKOS_ADD_TEST_DIRECTORIES(perf_test) -ENDIF() +KOKKOS_ADD_BENCHMARK_DIRECTORY(perf_test) KOKKOS_SUBPACKAGE_POSTPROCESS() diff --git a/core/perf_test/CMakeLists.txt b/core/perf_test/CMakeLists.txt index 814102975f..04b29cb2a4 100644 --- a/core/perf_test/CMakeLists.txt +++ 
b/core/perf_test/CMakeLists.txt @@ -15,102 +15,97 @@ IF ((KOKKOS_ENABLE_OPENMPTARGET OR KOKKOS_ENABLE_OPENACC) AND KOKKOS_CXX_COMPILE RETURN() ENDIF() +IF(KOKKOS_ENABLE_TESTS) -SET(SOURCES - PerfTestMain.cpp - PerfTestGramSchmidt.cpp - PerfTestHexGrad.cpp - PerfTest_CustomReduction.cpp - PerfTest_ExecSpacePartitioning.cpp - PerfTest_ViewAllocate.cpp - PerfTest_ViewFill_123.cpp - PerfTest_ViewFill_45.cpp - PerfTest_ViewFill_6.cpp - PerfTest_ViewFill_7.cpp - PerfTest_ViewFill_8.cpp - PerfTest_ViewResize_123.cpp - PerfTest_ViewResize_45.cpp - PerfTest_ViewResize_6.cpp - PerfTest_ViewResize_7.cpp - PerfTest_ViewResize_8.cpp - ) - -IF(Kokkos_ENABLE_OPENMPTARGET) -# FIXME OPENMPTARGET requires TeamPolicy Reductions and Custom Reduction - LIST(REMOVE_ITEM SOURCES + SET(SOURCES + PerfTestMain.cpp PerfTestGramSchmidt.cpp + PerfTestHexGrad.cpp PerfTest_CustomReduction.cpp PerfTest_ExecSpacePartitioning.cpp - ) -ENDIF() - -IF(KOKKOS_ENABLE_CUDA OR KOKKOS_ENABLE_HIP OR KOKKOS_ENABLE_SYCL) - KOKKOS_ADD_EXECUTABLE ( - PerformanceTest_SharedSpace - SOURCES test_sharedSpace.cpp - ) -ENDIF() - -# Per #374, we always want to build this test, but we only want to run -# it as a PERFORMANCE test. That's why we separate building the test -# from running the test. 
+ PerfTest_ViewAllocate.cpp + PerfTest_ViewFill_123.cpp + PerfTest_ViewFill_45.cpp + PerfTest_ViewFill_6.cpp + PerfTest_ViewFill_7.cpp + PerfTest_ViewFill_8.cpp + PerfTest_ViewResize_123.cpp + PerfTest_ViewResize_45.cpp + PerfTest_ViewResize_6.cpp + PerfTest_ViewResize_7.cpp + PerfTest_ViewResize_8.cpp + ) -#leave these as basic includes for now -#I don't need anything transitive -KOKKOS_INCLUDE_DIRECTORIES("${CMAKE_CURRENT_SOURCE_DIR}/../../algorithms/src") -KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) -KOKKOS_INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR}) + IF(Kokkos_ENABLE_OPENMPTARGET) + # FIXME OPENMPTARGET requires TeamPolicy Reductions and Custom Reduction + LIST(REMOVE_ITEM SOURCES + PerfTestGramSchmidt.cpp + PerfTest_CustomReduction.cpp + PerfTest_ExecSpacePartitioning.cpp + ) + ENDIF() -# This test currently times out for MSVC -IF(NOT KOKKOS_CXX_COMPILER_ID STREQUAL "MSVC") - KOKKOS_ADD_EXECUTABLE_AND_TEST( - PerfTestExec - SOURCES ${SOURCES} - CATEGORIES PERFORMANCE - ) -ENDIF() + IF(KOKKOS_ENABLE_CUDA OR KOKKOS_ENABLE_HIP OR KOKKOS_ENABLE_SYCL) + KOKKOS_ADD_EXECUTABLE ( + PerformanceTest_SharedSpace + SOURCES test_sharedSpace.cpp + ) + ENDIF() -KOKKOS_ADD_EXECUTABLE_AND_TEST( - PerformanceTest_Atomic - SOURCES test_atomic.cpp - CATEGORIES PERFORMANCE -) + # Per #374, we always want to build this test, but we only want to run + # it as a PERFORMANCE test. That's why we separate building the test + # from running the test. 
+ + #leave these as basic includes for now + #I don't need anything transitive + KOKKOS_INCLUDE_DIRECTORIES("${CMAKE_CURRENT_SOURCE_DIR}/../../algorithms/src") + KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) + KOKKOS_INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR}) + + # This test currently times out for MSVC + IF(NOT KOKKOS_CXX_COMPILER_ID STREQUAL "MSVC") + KOKKOS_ADD_EXECUTABLE_AND_TEST( + PerfTestExec + SOURCES ${SOURCES} + CATEGORIES PERFORMANCE + ) + ENDIF() -IF(NOT KOKKOS_ENABLE_CUDA OR KOKKOS_ENABLE_CUDA_LAMBDA) KOKKOS_ADD_EXECUTABLE_AND_TEST( - PerformanceTest_Atomic_MinMax - SOURCES test_atomic_minmax_simple.cpp + PerformanceTest_Atomic + SOURCES test_atomic.cpp CATEGORIES PERFORMANCE ) -ENDIF() -# FIXME_NVHPC -IF(NOT KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) -KOKKOS_ADD_EXECUTABLE_AND_TEST( - PerformanceTest_Mempool - SOURCES test_mempool.cpp - CATEGORIES PERFORMANCE -) -ENDIF() + IF(NOT KOKKOS_ENABLE_CUDA OR KOKKOS_ENABLE_CUDA_LAMBDA) + KOKKOS_ADD_EXECUTABLE_AND_TEST( + PerformanceTest_Atomic_MinMax + SOURCES test_atomic_minmax_simple.cpp + CATEGORIES PERFORMANCE + ) + ENDIF() -IF(NOT Kokkos_ENABLE_OPENMPTARGET) -# FIXME OPENMPTARGET needs tasking + # FIXME_NVHPC + IF(NOT KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) KOKKOS_ADD_EXECUTABLE_AND_TEST( - PerformanceTest_TaskDag - SOURCES test_taskdag.cpp + PerformanceTest_Mempool + SOURCES test_mempool.cpp CATEGORIES PERFORMANCE ) -ENDIF() - + ENDIF() -IF(NOT Kokkos_ENABLE_BENCHMARKS) - RETURN() -ENDIF() + IF(NOT Kokkos_ENABLE_OPENMPTARGET) + # FIXME OPENMPTARGET needs tasking + KOKKOS_ADD_EXECUTABLE_AND_TEST( + PerformanceTest_TaskDag + SOURCES test_taskdag.cpp + CATEGORIES PERFORMANCE + ) + ENDIF() -IF (KOKKOS_HAS_TRILINOS) - message(FATAL_ERROR "Benchmarks are not supported when building as part of Trilinos") ENDIF() +# Find or download google/benchmark library find_package(benchmark QUIET) IF(benchmark_FOUND) MESSAGE(STATUS "Using google benchmark found in 
${benchmark_DIR}") From 176ae8bbbfea2921fe8ef4a64cc2d8b55545e547 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cezary=20Skrzy=C5=84ski?= Date: Tue, 20 Dec 2022 19:11:33 +0100 Subject: [PATCH 005/496] Use double quotes instead of include --- core/perf_test/BenchmarkMain.cpp | 2 +- core/perf_test/PerfTest_ViewCopy_Raw.cpp | 2 +- core/perf_test/PerfTest_ViewCopy_a123.cpp | 2 +- core/perf_test/PerfTest_ViewCopy_a45.cpp | 2 +- core/perf_test/PerfTest_ViewCopy_a6.cpp | 2 +- core/perf_test/PerfTest_ViewCopy_a7.cpp | 2 +- core/perf_test/PerfTest_ViewCopy_a8.cpp | 2 +- core/perf_test/PerfTest_ViewCopy_b123.cpp | 2 +- core/perf_test/PerfTest_ViewCopy_b45.cpp | 2 +- core/perf_test/PerfTest_ViewCopy_b6.cpp | 2 +- core/perf_test/PerfTest_ViewCopy_b7.cpp | 2 +- core/perf_test/PerfTest_ViewCopy_b8.cpp | 2 +- core/perf_test/PerfTest_ViewCopy_c123.cpp | 2 +- core/perf_test/PerfTest_ViewCopy_c45.cpp | 2 +- core/perf_test/PerfTest_ViewCopy_c6.cpp | 2 +- core/perf_test/PerfTest_ViewCopy_c7.cpp | 2 +- core/perf_test/PerfTest_ViewCopy_c8.cpp | 2 +- core/perf_test/PerfTest_ViewCopy_d123.cpp | 2 +- core/perf_test/PerfTest_ViewCopy_d45.cpp | 2 +- core/perf_test/PerfTest_ViewCopy_d6.cpp | 2 +- core/perf_test/PerfTest_ViewCopy_d7.cpp | 2 +- core/perf_test/PerfTest_ViewCopy_d8.cpp | 2 +- 22 files changed, 22 insertions(+), 22 deletions(-) diff --git a/core/perf_test/BenchmarkMain.cpp b/core/perf_test/BenchmarkMain.cpp index bba2bca36d..c28eb80e1b 100644 --- a/core/perf_test/BenchmarkMain.cpp +++ b/core/perf_test/BenchmarkMain.cpp @@ -16,7 +16,7 @@ #include -#include +#include "Benchmark_Context.hpp" #include int main(int argc, char** argv) { diff --git a/core/perf_test/PerfTest_ViewCopy_Raw.cpp b/core/perf_test/PerfTest_ViewCopy_Raw.cpp index 976f800487..67a8d7e555 100644 --- a/core/perf_test/PerfTest_ViewCopy_Raw.cpp +++ b/core/perf_test/PerfTest_ViewCopy_Raw.cpp @@ -14,7 +14,7 @@ // //@HEADER -#include +#include "PerfTest_ViewCopy.hpp" namespace Test { diff --git 
a/core/perf_test/PerfTest_ViewCopy_a123.cpp b/core/perf_test/PerfTest_ViewCopy_a123.cpp index 3fc1c2480c..db33d11809 100644 --- a/core/perf_test/PerfTest_ViewCopy_a123.cpp +++ b/core/perf_test/PerfTest_ViewCopy_a123.cpp @@ -14,7 +14,7 @@ // //@HEADER -#include +#include "PerfTest_ViewCopy.hpp" namespace Test { diff --git a/core/perf_test/PerfTest_ViewCopy_a45.cpp b/core/perf_test/PerfTest_ViewCopy_a45.cpp index 542f5534be..3200602566 100644 --- a/core/perf_test/PerfTest_ViewCopy_a45.cpp +++ b/core/perf_test/PerfTest_ViewCopy_a45.cpp @@ -14,7 +14,7 @@ // //@HEADER -#include +#include "PerfTest_ViewCopy.hpp" namespace Test { diff --git a/core/perf_test/PerfTest_ViewCopy_a6.cpp b/core/perf_test/PerfTest_ViewCopy_a6.cpp index 782628072d..0855299aad 100644 --- a/core/perf_test/PerfTest_ViewCopy_a6.cpp +++ b/core/perf_test/PerfTest_ViewCopy_a6.cpp @@ -14,7 +14,7 @@ // //@HEADER -#include +#include "PerfTest_ViewCopy.hpp" namespace Test { diff --git a/core/perf_test/PerfTest_ViewCopy_a7.cpp b/core/perf_test/PerfTest_ViewCopy_a7.cpp index 000c8b401c..36577ef2ef 100644 --- a/core/perf_test/PerfTest_ViewCopy_a7.cpp +++ b/core/perf_test/PerfTest_ViewCopy_a7.cpp @@ -14,7 +14,7 @@ // //@HEADER -#include +#include "PerfTest_ViewCopy.hpp" namespace Test { diff --git a/core/perf_test/PerfTest_ViewCopy_a8.cpp b/core/perf_test/PerfTest_ViewCopy_a8.cpp index f7d7c6040a..c449d684f1 100644 --- a/core/perf_test/PerfTest_ViewCopy_a8.cpp +++ b/core/perf_test/PerfTest_ViewCopy_a8.cpp @@ -14,7 +14,7 @@ // //@HEADER -#include +#include "PerfTest_ViewCopy.hpp" namespace Test { diff --git a/core/perf_test/PerfTest_ViewCopy_b123.cpp b/core/perf_test/PerfTest_ViewCopy_b123.cpp index 7820e89973..8675f427d7 100644 --- a/core/perf_test/PerfTest_ViewCopy_b123.cpp +++ b/core/perf_test/PerfTest_ViewCopy_b123.cpp @@ -14,7 +14,7 @@ // //@HEADER -#include +#include "PerfTest_ViewCopy.hpp" namespace Test { diff --git a/core/perf_test/PerfTest_ViewCopy_b45.cpp b/core/perf_test/PerfTest_ViewCopy_b45.cpp 
index 58f58314b0..93522fcf0d 100644 --- a/core/perf_test/PerfTest_ViewCopy_b45.cpp +++ b/core/perf_test/PerfTest_ViewCopy_b45.cpp @@ -14,7 +14,7 @@ // //@HEADER -#include +#include "PerfTest_ViewCopy.hpp" namespace Test { diff --git a/core/perf_test/PerfTest_ViewCopy_b6.cpp b/core/perf_test/PerfTest_ViewCopy_b6.cpp index 920bf1222f..be95c7cab3 100644 --- a/core/perf_test/PerfTest_ViewCopy_b6.cpp +++ b/core/perf_test/PerfTest_ViewCopy_b6.cpp @@ -14,7 +14,7 @@ // //@HEADER -#include +#include "PerfTest_ViewCopy.hpp" namespace Test { diff --git a/core/perf_test/PerfTest_ViewCopy_b7.cpp b/core/perf_test/PerfTest_ViewCopy_b7.cpp index 11b7a0d0d1..f8eee75ce7 100644 --- a/core/perf_test/PerfTest_ViewCopy_b7.cpp +++ b/core/perf_test/PerfTest_ViewCopy_b7.cpp @@ -14,7 +14,7 @@ // //@HEADER -#include +#include "PerfTest_ViewCopy.hpp" namespace Test { diff --git a/core/perf_test/PerfTest_ViewCopy_b8.cpp b/core/perf_test/PerfTest_ViewCopy_b8.cpp index 56a3d8d4c3..01dda2a33f 100644 --- a/core/perf_test/PerfTest_ViewCopy_b8.cpp +++ b/core/perf_test/PerfTest_ViewCopy_b8.cpp @@ -14,7 +14,7 @@ // //@HEADER -#include +#include "PerfTest_ViewCopy.hpp" namespace Test { diff --git a/core/perf_test/PerfTest_ViewCopy_c123.cpp b/core/perf_test/PerfTest_ViewCopy_c123.cpp index 375f7c8985..25e8747474 100644 --- a/core/perf_test/PerfTest_ViewCopy_c123.cpp +++ b/core/perf_test/PerfTest_ViewCopy_c123.cpp @@ -14,7 +14,7 @@ // //@HEADER -#include +#include "PerfTest_ViewCopy.hpp" namespace Test { diff --git a/core/perf_test/PerfTest_ViewCopy_c45.cpp b/core/perf_test/PerfTest_ViewCopy_c45.cpp index 3625631617..b1f4a7b577 100644 --- a/core/perf_test/PerfTest_ViewCopy_c45.cpp +++ b/core/perf_test/PerfTest_ViewCopy_c45.cpp @@ -14,7 +14,7 @@ // //@HEADER -#include +#include "PerfTest_ViewCopy.hpp" namespace Test { diff --git a/core/perf_test/PerfTest_ViewCopy_c6.cpp b/core/perf_test/PerfTest_ViewCopy_c6.cpp index bcb70b6764..8120664792 100644 --- a/core/perf_test/PerfTest_ViewCopy_c6.cpp +++ 
b/core/perf_test/PerfTest_ViewCopy_c6.cpp @@ -14,7 +14,7 @@ // //@HEADER -#include +#include "PerfTest_ViewCopy.hpp" namespace Test { diff --git a/core/perf_test/PerfTest_ViewCopy_c7.cpp b/core/perf_test/PerfTest_ViewCopy_c7.cpp index 055d0e344c..cee9f5bd01 100644 --- a/core/perf_test/PerfTest_ViewCopy_c7.cpp +++ b/core/perf_test/PerfTest_ViewCopy_c7.cpp @@ -14,7 +14,7 @@ // //@HEADER -#include +#include "PerfTest_ViewCopy.hpp" namespace Test { diff --git a/core/perf_test/PerfTest_ViewCopy_c8.cpp b/core/perf_test/PerfTest_ViewCopy_c8.cpp index 1e5342ef52..6f204a4222 100644 --- a/core/perf_test/PerfTest_ViewCopy_c8.cpp +++ b/core/perf_test/PerfTest_ViewCopy_c8.cpp @@ -14,7 +14,7 @@ // //@HEADER -#include +#include "PerfTest_ViewCopy.hpp" namespace Test { diff --git a/core/perf_test/PerfTest_ViewCopy_d123.cpp b/core/perf_test/PerfTest_ViewCopy_d123.cpp index d61e01f9f6..6d72bea490 100644 --- a/core/perf_test/PerfTest_ViewCopy_d123.cpp +++ b/core/perf_test/PerfTest_ViewCopy_d123.cpp @@ -14,7 +14,7 @@ // //@HEADER -#include +#include "PerfTest_ViewCopy.hpp" namespace Test { diff --git a/core/perf_test/PerfTest_ViewCopy_d45.cpp b/core/perf_test/PerfTest_ViewCopy_d45.cpp index 385d5b48ae..1a407cd648 100644 --- a/core/perf_test/PerfTest_ViewCopy_d45.cpp +++ b/core/perf_test/PerfTest_ViewCopy_d45.cpp @@ -14,7 +14,7 @@ // //@HEADER -#include +#include "PerfTest_ViewCopy.hpp" namespace Test { diff --git a/core/perf_test/PerfTest_ViewCopy_d6.cpp b/core/perf_test/PerfTest_ViewCopy_d6.cpp index 0ae16012d6..27b1a816fc 100644 --- a/core/perf_test/PerfTest_ViewCopy_d6.cpp +++ b/core/perf_test/PerfTest_ViewCopy_d6.cpp @@ -14,7 +14,7 @@ // //@HEADER -#include +#include "PerfTest_ViewCopy.hpp" namespace Test { diff --git a/core/perf_test/PerfTest_ViewCopy_d7.cpp b/core/perf_test/PerfTest_ViewCopy_d7.cpp index 4ebbb6359b..17d4bf2077 100644 --- a/core/perf_test/PerfTest_ViewCopy_d7.cpp +++ b/core/perf_test/PerfTest_ViewCopy_d7.cpp @@ -14,7 +14,7 @@ // //@HEADER -#include +#include 
"PerfTest_ViewCopy.hpp" namespace Test { diff --git a/core/perf_test/PerfTest_ViewCopy_d8.cpp b/core/perf_test/PerfTest_ViewCopy_d8.cpp index 3a888b6155..7bd0263228 100644 --- a/core/perf_test/PerfTest_ViewCopy_d8.cpp +++ b/core/perf_test/PerfTest_ViewCopy_d8.cpp @@ -14,7 +14,7 @@ // //@HEADER -#include +#include "PerfTest_ViewCopy.hpp" namespace Test { From 3a1769b6bc76c6b4172e12459ec62750f80359d1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cezary=20Skrzy=C5=84ski?= Date: Tue, 20 Dec 2022 19:33:16 +0100 Subject: [PATCH 006/496] Build on pull request --- .github/workflows/performance-benchmark.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/performance-benchmark.yml b/.github/workflows/performance-benchmark.yml index 312f836673..16a724b620 100644 --- a/.github/workflows/performance-benchmark.yml +++ b/.github/workflows/performance-benchmark.yml @@ -3,7 +3,7 @@ on: push: branches: - develop - - performance-results-visualization + pull_request: jobs: CI: @@ -51,6 +51,7 @@ jobs: mkdir ${{ env.BUILD_ID }} find builddir/core/perf_test/ -name "*.json" -exec mv {} ${{ env.BUILD_ID }}/ \; - name: Push benchmark results + if: ${{ github.ref == 'refs/heads/develop' }} uses: dmnemec/copy_file_to_another_repo_action@main env: API_TOKEN_GITHUB: ${{ secrets.API_TOKEN_GITHUB }} From 327aac579f87361b265801c70a990d42f570af92 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cezary=20Skrzy=C5=84ski?= Date: Wed, 21 Dec 2022 18:23:29 +0100 Subject: [PATCH 007/496] Add comment for PerformanceTest_* executables --- core/perf_test/CMakeLists.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/core/perf_test/CMakeLists.txt b/core/perf_test/CMakeLists.txt index 04b29cb2a4..7832f1fd9f 100644 --- a/core/perf_test/CMakeLists.txt +++ b/core/perf_test/CMakeLists.txt @@ -15,6 +15,8 @@ IF ((KOKKOS_ENABLE_OPENMPTARGET OR KOKKOS_ENABLE_OPENACC) AND KOKKOS_CXX_COMPILE RETURN() ENDIF() +# all PerformanceTest_* executables are part of regular tests +# TODO: finish 
converting these into benchmarks (in progress) IF(KOKKOS_ENABLE_TESTS) SET(SOURCES From 604dc86c0f81321a6ca4965121dcb8a284167036 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cezary=20Skrzy=C5=84ski?= Date: Wed, 21 Dec 2022 18:25:03 +0100 Subject: [PATCH 008/496] Remove commented out code --- core/perf_test/CMakeLists.txt | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/core/perf_test/CMakeLists.txt b/core/perf_test/CMakeLists.txt index 7832f1fd9f..5df5e389ee 100644 --- a/core/perf_test/CMakeLists.txt +++ b/core/perf_test/CMakeLists.txt @@ -1,14 +1,3 @@ - -#INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) -#INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR}) - - -# warning: PerfTest_CustomReduction.cpp uses -# ../../algorithms/src/Kokkos_Random.hpp -# we'll just allow it to be included, but note -# that in TriBITS KokkosAlgorithms can be disabled... -#INCLUDE_DIRECTORIES("${CMAKE_CURRENT_SOURCE_DIR}/../../algorithms/src") - # FIXME_OPENMPTARGET - the NVIDIA HPC compiler nvc++ in the OpenMPTarget backend does not pass the perf_tests. # FIXME_OPENACC - temporarily disabled due to unimplemented features IF ((KOKKOS_ENABLE_OPENMPTARGET OR KOKKOS_ENABLE_OPENACC) AND KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) @@ -60,6 +49,10 @@ IF(KOKKOS_ENABLE_TESTS) #leave these as basic includes for now #I don't need anything transitive + # warning: PerfTest_CustomReduction.cpp uses + # ../../algorithms/src/Kokkos_Random.hpp + # we'll just allow it to be included, but note + # that in TriBITS KokkosAlgorithms can be disabled... 
KOKKOS_INCLUDE_DIRECTORIES("${CMAKE_CURRENT_SOURCE_DIR}/../../algorithms/src") KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) KOKKOS_INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR}) From c6fae3fd586378180cb5233fd6e7ca69fac552ce Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Fri, 23 Dec 2022 17:42:26 -0500 Subject: [PATCH 009/496] Move definitions of `OpenACCIterate{Left,Right}` and `OpenACCMDRange{Begin,End,Tile}` --- core/src/OpenACC/Kokkos_OpenACC_MDRangePolicy.hpp | 15 +++++++++++++++ .../Kokkos_OpenACC_ParallelFor_MDRange.hpp | 11 ----------- 2 files changed, 15 insertions(+), 11 deletions(-) diff --git a/core/src/OpenACC/Kokkos_OpenACC_MDRangePolicy.hpp b/core/src/OpenACC/Kokkos_OpenACC_MDRangePolicy.hpp index 4525f37a61..9c58dd6fa6 100644 --- a/core/src/OpenACC/Kokkos_OpenACC_MDRangePolicy.hpp +++ b/core/src/OpenACC/Kokkos_OpenACC_MDRangePolicy.hpp @@ -42,4 +42,19 @@ struct ThreadAndVectorNestLevel; +using OpenACCIterateRight = std::integral_constant; +template +using OpenACCMDRangeBegin = decltype(MDRangePolicy>::m_lower); +template +using OpenACCMDRangeEnd = decltype(MDRangePolicy>::m_upper); +template +using OpenACCMDRangeTile = decltype(MDRangePolicy>::m_tile); + +} // namespace Kokkos::Experimental::Impl + #endif diff --git a/core/src/OpenACC/Kokkos_OpenACC_ParallelFor_MDRange.hpp b/core/src/OpenACC/Kokkos_OpenACC_ParallelFor_MDRange.hpp index ac219527c6..a55a18bc24 100644 --- a/core/src/OpenACC/Kokkos_OpenACC_ParallelFor_MDRange.hpp +++ b/core/src/OpenACC/Kokkos_OpenACC_ParallelFor_MDRange.hpp @@ -24,17 +24,6 @@ namespace Kokkos::Experimental::Impl { -struct OpenACCCollapse {}; -struct OpenACCTile {}; -using OpenACCIterateLeft = std::integral_constant; -using OpenACCIterateRight = std::integral_constant; -template -using OpenACCMDRangeBegin = decltype(MDRangePolicy>::m_lower); -template -using OpenACCMDRangeEnd = decltype(MDRangePolicy>::m_upper); -template -using OpenACCMDRangeTile = 
decltype(MDRangePolicy>::m_tile); - template void OpenACCParallelForMDRangePolicy(OpenACCCollapse, OpenACCIterateLeft, Functor const& functor, From 901862190b1fef29bb9eb039d3164b2c2cc9938c Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Fri, 23 Dec 2022 17:43:08 -0500 Subject: [PATCH 010/496] Initial implementation of MDRange parallel_reduce --- .../Kokkos_OpenACC_ParallelReduce_MDRange.hpp | 458 ++++++++++++++++++ core/src/decl/Kokkos_Declare_OPENACC.hpp | 1 + 2 files changed, 459 insertions(+) create mode 100644 core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_MDRange.hpp diff --git a/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_MDRange.hpp b/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_MDRange.hpp new file mode 100644 index 0000000000..31f37420a2 --- /dev/null +++ b/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_MDRange.hpp @@ -0,0 +1,458 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_OPENACC_PARALLEL_REDUCE_MDRANGE_HPP +#define KOKKOS_OPENACC_PARALLEL_REDUCE_MDRANGE_HPP + +#include +#include +#include +#include +#include + +namespace Kokkos::Experimental::Impl { + +// primary template: catch-all non-implemented custom reducers +template > +struct OpenACCParallelReduceMDRangeHelper { + OpenACCParallelReduceMDRangeHelper(Functor const&, Reducer const&, + Policy const&) { + static_assert(!Kokkos::Impl::always_true::value, + "not implemented"); + } +}; +} // namespace Kokkos::Experimental::Impl + +template +class Kokkos::Impl::ParallelReduce, + ReducerType, Kokkos::Experimental::OpenACC> { + using Policy = MDRangePolicy; + + using ReducerConditional = + if_c, Functor, ReducerType>; + using ReducerTypeFwd = typename ReducerConditional::type; + using Analysis = + FunctorAnalysis; + + using Pointer = typename Analysis::pointer_type; + using ValueType = typename Analysis::value_type; + + Functor m_functor; + Policy m_policy; + ReducerType m_reducer; + Pointer m_result_ptr; + + public: + ParallelReduce(Functor const& functor, Policy const& policy, + ReducerType const& reducer) + : m_functor(functor), + m_policy(policy), + m_reducer(reducer), + m_result_ptr(reducer.view().data()) {} + + template + ParallelReduce( + const Functor& functor, const Policy& policy, const ViewType& result, + std::enable_if_t::value, void*> = nullptr) + : m_functor(functor), + m_policy(policy), + m_reducer(InvalidType()), + m_result_ptr(result.data()) {} + + void execute() const { + static_assert(1 < Policy::rank && Policy::rank < 7); + static_assert(Policy::inner_direction == Iterate::Left || + Policy::inner_direction == Iterate::Right); + constexpr int rank = Policy::rank; + for (int i = 0; i < rank; ++i) { + if (m_policy.m_lower[i] >= m_policy.m_upper[i]) { + return; + } + } + + ValueType val; + typename Analysis::Reducer final_reducer( + &ReducerConditional::select(m_functor, 
m_reducer)); + final_reducer.init(&val); + + Kokkos::Experimental::Impl::OpenACCParallelReduceMDRangeHelper( + Kokkos::Experimental::Impl::FunctorAdapter(m_functor), + std::conditional_t, ReducerType, + Sum>(val), + m_policy); + + *m_result_ptr = val; + } +}; + +#define KOKKOS_IMPL_OPENACC_PARALLEL_REDUCE_DISPATCH_ITERATE(REDUCER, \ + OPERATOR) \ + namespace Kokkos::Experimental::Impl { \ + template \ + void OpenACCParallelReduce##REDUCER(OpenACCIterateLeft, ValueType& aval, \ + Functor const& afunctor, \ + OpenACCMDRangeBegin<2> const& begin, \ + OpenACCMDRangeEnd<2> const& end, \ + int async_arg) { \ + auto val = aval; \ + auto const functor(afunctor); \ + int begin1 = begin[1]; \ + int end1 = end[1]; \ + int begin0 = begin[0]; \ + int end0 = end[0]; \ + /* clang-format off */ \ + KOKKOS_IMPL_ACC_PRAGMA(parallel loop gang vector collapse(2) reduction(OPERATOR:val) copyin(functor) async(async_arg)) \ + /* clang-format on */ \ + for (auto i1 = begin1; i1 < end1; ++i1) { \ + for (auto i0 = begin0; i0 < end0; ++i0) { \ + functor(i0, i1, val); \ + } \ + } \ + aval = val; \ + } \ + \ + template \ + void OpenACCParallelReduce##REDUCER(OpenACCIterateRight, ValueType& aval, \ + Functor const& afunctor, \ + OpenACCMDRangeBegin<2> const& begin, \ + OpenACCMDRangeEnd<2> const& end, \ + int async_arg) { \ + auto val = aval; \ + auto const functor(afunctor); \ + int begin0 = begin[0]; \ + int end0 = end[0]; \ + int begin1 = begin[1]; \ + int end1 = end[1]; \ + /* clang-format off */ \ + KOKKOS_IMPL_ACC_PGRAMA(parallel loop gang vector collapse(2) reduction(OPERATOR:val) copyin(functor) async(async_arg)) \ + /* clang-format on */ \ + for (auto i0 = begin0; i0 < end0; ++i0) { \ + for (auto i1 = begin1; i1 < end1; ++i1) { \ + functor(i0, i1, val); \ + } \ + } \ + aval = val; \ + } \ + \ + template \ + void OpenACCParallelReduce##REDUCER(OpenACCIterateLeft, ValueType& aval, \ + Functor const& afunctor, \ + OpenACCMDRangeBegin<3> const& begin, \ + OpenACCMDRangeEnd<3> const& end, \ 
+ int async_arg) { \ + auto val = aval; \ + auto const functor(afunctor); \ + int begin2 = begin[2]; \ + int end2 = end[2]; \ + int begin1 = begin[1]; \ + int end1 = end[1]; \ + int begin0 = begin[0]; \ + int end0 = end[0]; \ + /* clang-format off */ \ + KOKKOS_IMPL_ACC_PRAGMA(parallel loop gang vector collapse(3) reduction( \ + OPERATOR \ + : val) copyin(functor) async(async_arg)) \ + /* clang-format on */ \ + for (auto i2 = begin2; i2 < end2; ++i2) { \ + for (auto i1 = begin1; i1 < end1; ++i1) { \ + for (auto i0 = begin0; i0 < end0; ++i0) { \ + functor(i0, i1, i2, val); \ + } \ + } \ + } \ + aval = val; \ + } \ + \ + template \ + void OpenACCParallelReduce##REDUCER(OpenACCIterateRight, ValueType& aval, \ + Functor const& afunctor, \ + OpenACCMDRangeBegin<3> const& begin, \ + OpenACCMDRangeEnd<3> const& end, \ + int async_arg) { \ + auto val = aval; \ + auto const functor(afunctor); \ + int begin0 = begin[0]; \ + int end0 = end[0]; \ + int begin1 = begin[1]; \ + int end1 = end[1]; \ + int begin2 = begin[2]; \ + int end2 = end[2]; \ + /* clang-format off */ \ + KOKKOS_IMPL_ACC_PGRAMA(parallel loop gang vector collapse(3) reduction( \ + OPERATOR \ + : val) copyin(functor) async(async_arg)) \ + /* clang-format on */ \ + for (auto i0 = begin0; i0 < end0; ++i0) { \ + for (auto i1 = begin1; i1 < end1; ++i1) { \ + for (auto i2 = begin2; i2 < end2; ++i2) { \ + functor(i0, i1, i2, val); \ + } \ + } \ + } \ + aval = val; \ + } \ + \ + template \ + void OpenACCParallelReduce##REDUCER(OpenACCIterateLeft, ValueType& aval, \ + Functor const& afunctor, \ + OpenACCMDRangeBegin<4> const& begin, \ + OpenACCMDRangeEnd<4> const& end, \ + int async_arg) { \ + auto val = aval; \ + auto const functor(afunctor); \ + int begin3 = begin[3]; \ + int end3 = end[3]; \ + int begin2 = begin[2]; \ + int end2 = end[2]; \ + int begin1 = begin[1]; \ + int end1 = end[1]; \ + int begin0 = begin[0]; \ + int end0 = end[0]; \ + /* clang-format off */ \ + KOKKOS_IMPL_ACC_PRAGMA(parallel loop gang vector 
collapse(4) reduction(OPERATOR:val) copyin(functor) async(async_arg)) \ + /* clang-format on */ \ + for (auto i3 = begin3; i3 < end3; ++i3) { \ + for (auto i2 = begin2; i2 < end2; ++i2) { \ + for (auto i1 = begin1; i1 < end1; ++i1) { \ + for (auto i0 = begin0; i0 < end0; ++i0) { \ + functor(i0, i1, i2, i3, val); \ + } \ + } \ + } \ + } \ + aval = val; \ + } \ + \ + template \ + void OpenACCParallelReduce##REDUCER(OpenACCIterateRight, ValueType& aval, \ + Functor const& afunctor, \ + OpenACCMDRangeBegin<4> const& begin, \ + OpenACCMDRangeEnd<4> const& end, \ + int async_arg) { \ + auto val = aval; \ + auto const functor(afunctor); \ + int begin0 = begin[0]; \ + int end0 = end[0]; \ + int begin1 = begin[1]; \ + int end1 = end[1]; \ + int begin2 = begin[2]; \ + int end2 = end[2]; \ + int begin3 = begin[3]; \ + int end3 = end[3]; \ + /* clang-format off */ \ + KOKKOS_IMPL_ACC_PGRAMA(parallel loop gang vector collapse(4) reduction(OPERATOR:val) copyin(functor) async(async_arg)) \ + /* clang-format on */ \ + for (auto i0 = begin0; i0 < end0; ++i0) { \ + for (auto i1 = begin1; i1 < end1; ++i1) { \ + for (auto i2 = begin2; i2 < end2; ++i2) { \ + for (auto i3 = begin3; i3 < end3; ++i3) { \ + functor(i0, i1, i2, i3, val); \ + } \ + } \ + } \ + } \ + aval = val; \ + } \ + \ + template \ + void OpenACCParallelReduce##REDUCER(OpenACCIterateLeft, ValueType& aval, \ + Functor const& afunctor, \ + OpenACCMDRangeBegin<5> const& begin, \ + OpenACCMDRangeEnd<5> const& end, \ + int async_arg) { \ + auto val = aval; \ + auto const functor(afunctor); \ + int begin4 = begin[4]; \ + int end4 = end[4]; \ + int begin3 = begin[3]; \ + int end3 = end[3]; \ + int begin2 = begin[2]; \ + int end2 = end[2]; \ + int begin1 = begin[1]; \ + int end1 = end[1]; \ + int begin0 = begin[0]; \ + int end0 = end[0]; \ + /* clang-format off */ \ + KOKKOS_IMPL_ACC_PRAGMA(parallel loop gang vector collapse(5) reduction(OPERATOR:val) copyin(functor) async(async_arg)) \ + /* clang-format on */ \ + for (auto i4 = 
begin4; i4 < end4; ++i4) { \ + for (auto i3 = begin3; i3 < end3; ++i3) { \ + for (auto i2 = begin2; i2 < end2; ++i2) { \ + for (auto i1 = begin1; i1 < end1; ++i1) { \ + for (auto i0 = begin0; i0 < end0; ++i0) { \ + functor(i0, i1, i2, i3, i4, val); \ + } \ + } \ + } \ + } \ + } \ + aval = val; \ + } \ + \ + template \ + void OpenACCParallelReduce##REDUCER(OpenACCIterateRight, ValueType& aval, \ + Functor const& afunctor, \ + OpenACCMDRangeBegin<5> const& begin, \ + OpenACCMDRangeEnd<5> const& end, \ + int async_arg) { \ + auto val = aval; \ + auto const functor(afunctor); \ + int begin0 = begin[0]; \ + int end0 = end[0]; \ + int begin1 = begin[1]; \ + int end1 = end[1]; \ + int begin2 = begin[2]; \ + int end2 = end[2]; \ + int begin3 = begin[3]; \ + int end3 = end[3]; \ + int begin4 = begin[4]; \ + int end4 = end[4]; \ + /* clang-format off */ \ + KOKKOS_IMPL_ACC_PGRAMA(parallel loop gang vector collapse(5) reduction(OPERATOR:val) copyin(functor) async(async_arg)) \ + /* clang-format on */ \ + for (auto i0 = begin0; i0 < end0; ++i0) { \ + for (auto i1 = begin1; i1 < end1; ++i1) { \ + for (auto i2 = begin2; i2 < end2; ++i2) { \ + for (auto i3 = begin3; i3 < end3; ++i3) { \ + for (auto i4 = begin4; i4 < end4; ++i4) { \ + functor(i0, i1, i2, i3, i4, val); \ + } \ + } \ + } \ + } \ + } \ + aval = val; \ + } \ + \ + template \ + void OpenACCParallelReduce##REDUCER(OpenACCIterateLeft, ValueType& aval, \ + Functor const& afunctor, \ + OpenACCMDRangeBegin<6> const& begin, \ + OpenACCMDRangeEnd<6> const& end, \ + int async_arg) { \ + auto val = aval; \ + auto const functor(afunctor); \ + int begin5 = begin[5]; \ + int end5 = end[5]; \ + int begin4 = begin[4]; \ + int end4 = end[4]; \ + int begin3 = begin[3]; \ + int end3 = end[3]; \ + int begin2 = begin[2]; \ + int end2 = end[2]; \ + int begin1 = begin[1]; \ + int end1 = end[1]; \ + int begin0 = begin[0]; \ + int end0 = end[0]; \ + /* clang-format off */ \ + KOKKOS_IMPL_ACC_PRAGMA(parallel loop gang vector collapse(6) 
reduction(OPERATOR:val) copyin(functor) async(async_arg)) \ + /* clang-format on */ \ + for (auto i5 = begin5; i5 < end5; ++i5) { \ + for (auto i4 = begin4; i4 < end4; ++i4) { \ + for (auto i3 = begin3; i3 < end3; ++i3) { \ + for (auto i2 = begin2; i2 < end2; ++i2) { \ + for (auto i1 = begin1; i1 < end1; ++i1) { \ + for (auto i0 = begin0; i0 < end0; ++i0) { \ + functor(i0, i1, i2, i3, i4, i5, val); \ + } \ + } \ + } \ + } \ + } \ + } \ + aval = val; \ + } \ + \ + template \ + void OpenACCParallelReduce##REDUCER(OpenACCIterateRight, ValueType& aval, \ + Functor const& afunctor, \ + OpenACCMDRangeBegin<6> const& begin, \ + OpenACCMDRangeEnd<6> const& end, \ + int async_arg) { \ + auto val = aval; \ + auto const functor(afunctor); \ + int begin0 = begin[0]; \ + int end0 = end[0]; \ + int begin1 = begin[1]; \ + int end1 = end[1]; \ + int begin2 = begin[2]; \ + int end2 = end[2]; \ + int begin3 = begin[3]; \ + int end3 = end[3]; \ + int begin4 = begin[4]; \ + int end4 = end[4]; \ + int begin5 = begin[5]; \ + int end5 = end[5]; \ + /* clang-format off */ \ + KOKKOS_IMPL_ACC_PGRAMA(parallel loop gang vector collapse(6) reduction(OPERATOR:val) copyin(functor) async(async_arg)) \ + /* clang-format on */ \ + for (auto i0 = begin0; i0 < end0; ++i0) { \ + for (auto i1 = begin1; i1 < end1; ++i1) { \ + for (auto i2 = begin2; i2 < end2; ++i2) { \ + for (auto i3 = begin3; i3 < end3; ++i3) { \ + for (auto i4 = begin4; i4 < end4; ++i4) { \ + for (auto i5 = begin5; i5 < end5; ++i5) { \ + functor(i0, i1, i2, i3, i4, i5, val); \ + } \ + } \ + } \ + } \ + } \ + } \ + aval = val; \ + } \ + } // namespace Kokkos::Experimental::Impl + +#define KOKKOS_IMPL_OPENACC_PARALLEL_REDUCE_MDRANGE_HELPER(REDUCER, OPERATOR) \ + KOKKOS_IMPL_OPENACC_PARALLEL_REDUCE_DISPATCH_ITERATE(REDUCER, OPERATOR) \ + template \ + struct Kokkos::Experimental::Impl::OpenACCParallelReduceMDRangeHelper< \ + Functor, Kokkos::REDUCER, \ + Kokkos::MDRangePolicy, true> { \ + using Policy = MDRangePolicy; \ + using Reducer = 
REDUCER; \ + using ValueType = typename Reducer::value_type; \ + \ + OpenACCParallelReduceMDRangeHelper(Functor const& functor, \ + Reducer const& reducer, \ + Policy const& policy) { \ + ValueType val; \ + reducer.init(val); \ + \ + int const async_arg = policy.space().acc_async_queue(); \ + \ + OpenACCParallelReduce##REDUCER( \ + std::integral_constant(), val, \ + functor, policy.m_lower, policy.m_upper, async_arg); \ + \ + reducer.reference() = val; \ + } \ + } + +KOKKOS_IMPL_OPENACC_PARALLEL_REDUCE_MDRANGE_HELPER(Sum, +); +KOKKOS_IMPL_OPENACC_PARALLEL_REDUCE_MDRANGE_HELPER(Prod, *); +KOKKOS_IMPL_OPENACC_PARALLEL_REDUCE_MDRANGE_HELPER(Min, min); +KOKKOS_IMPL_OPENACC_PARALLEL_REDUCE_MDRANGE_HELPER(Max, max); +KOKKOS_IMPL_OPENACC_PARALLEL_REDUCE_MDRANGE_HELPER(LAnd, &&); +KOKKOS_IMPL_OPENACC_PARALLEL_REDUCE_MDRANGE_HELPER(LOr, ||); +KOKKOS_IMPL_OPENACC_PARALLEL_REDUCE_MDRANGE_HELPER(BAnd, &); +KOKKOS_IMPL_OPENACC_PARALLEL_REDUCE_MDRANGE_HELPER(BOr, |); + +#undef KOKKOS_IMPL_OPENACC_PARALLEL_REDUCE_MDRANGE_HELPER +#undef KOKKOS_IMPL_OPENACC_PARALLEL_REDUCE_DISPATCH_ITERATE + +#endif diff --git a/core/src/decl/Kokkos_Declare_OPENACC.hpp b/core/src/decl/Kokkos_Declare_OPENACC.hpp index 137286c741..40c29104bf 100644 --- a/core/src/decl/Kokkos_Declare_OPENACC.hpp +++ b/core/src/decl/Kokkos_Declare_OPENACC.hpp @@ -26,6 +26,7 @@ #include #include #include +#include #include #endif From 9fbd78a01ee0c0602dfd0c05f1e9a4d52a643f90 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cezary=20Skrzy=C5=84ski?= Date: Wed, 28 Dec 2022 17:38:26 +0100 Subject: [PATCH 011/496] Configure `ccache` correctly --- .github/workflows/continuous-integration-workflow.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/continuous-integration-workflow.yml b/.github/workflows/continuous-integration-workflow.yml index 55b8817948..6049953e55 100644 --- a/.github/workflows/continuous-integration-workflow.yml +++ b/.github/workflows/continuous-integration-workflow.yml @@ -93,6 +93,7 @@ 
jobs: run: | cmake -B builddir \ -DCMAKE_INSTALL_PREFIX=/usr \ + -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \ ${{ matrix.clang-tidy }} \ -Ddesul_ROOT=/usr/desul-install/ \ -DKokkos_ARCH_NATIVE=ON \ From d17945316fed4e0158112ba4d76d7f79ef7950c1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cezary=20Skrzy=C5=84ski?= Date: Wed, 28 Dec 2022 17:39:24 +0100 Subject: [PATCH 012/496] Use correct branch for destination repo --- .github/workflows/performance-benchmark.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/performance-benchmark.yml b/.github/workflows/performance-benchmark.yml index 16a724b620..4d98da3111 100644 --- a/.github/workflows/performance-benchmark.yml +++ b/.github/workflows/performance-benchmark.yml @@ -58,6 +58,6 @@ jobs: with: source_file: ${{ env.BUILD_ID }} destination_repo: 'kokkos/kokkos-benchmark-results' - destination_branch: 'master' + destination_branch: 'main' user_email: 'kokkos@users.noreply.github.com' user_name: 'Kokkos Developers' From 9fd7187b5baea30610ef00d9a43cc0698727a2cd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cezary=20Skrzy=C5=84ski?= Date: Wed, 28 Dec 2022 17:40:25 +0100 Subject: [PATCH 013/496] Use correct GitHub access token --- .github/workflows/performance-benchmark.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/performance-benchmark.yml b/.github/workflows/performance-benchmark.yml index 4d98da3111..6fb345d576 100644 --- a/.github/workflows/performance-benchmark.yml +++ b/.github/workflows/performance-benchmark.yml @@ -54,7 +54,7 @@ jobs: if: ${{ github.ref == 'refs/heads/develop' }} uses: dmnemec/copy_file_to_another_repo_action@main env: - API_TOKEN_GITHUB: ${{ secrets.API_TOKEN_GITHUB }} + API_TOKEN_GITHUB: ${{ secrets.DALG24_PUSH_BENCHMARK_RESULTS }} with: source_file: ${{ env.BUILD_ID }} destination_repo: 'kokkos/kokkos-benchmark-results' From 64d9b44a1470ac6907b63346a58b05533a63d95a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cezary=20Skrzy=C5=84ski?= 
Date: Wed, 28 Dec 2022 17:51:50 +0100 Subject: [PATCH 014/496] Use maximum available level of build parallelism --- .github/workflows/performance-benchmark.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/performance-benchmark.yml b/.github/workflows/performance-benchmark.yml index 6fb345d576..6313efeb29 100644 --- a/.github/workflows/performance-benchmark.yml +++ b/.github/workflows/performance-benchmark.yml @@ -41,7 +41,8 @@ jobs: - name: Build run: | ccache -z - cmake --build builddir --parallel 2 + NUM_CPU=$(grep -c processor /proc/cpuinfo) + cmake --build builddir --parallel ${NUM_CPU} ccache -s - name: Tests working-directory: builddir From 1134a1fb57b9dd5c302886b5e2b4d9e47dc8cce9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cezary=20Skrzy=C5=84ski?= Date: Wed, 28 Dec 2022 17:53:39 +0100 Subject: [PATCH 015/496] Simplify Kokkos configuration - do not enable deprecated code - do not configure CMAKE_INSTALL_PREFIX --- .github/workflows/performance-benchmark.yml | 3 --- 1 file changed, 3 deletions(-) diff --git a/.github/workflows/performance-benchmark.yml b/.github/workflows/performance-benchmark.yml index 6313efeb29..07bbc6419a 100644 --- a/.github/workflows/performance-benchmark.yml +++ b/.github/workflows/performance-benchmark.yml @@ -29,13 +29,10 @@ jobs: - name: Configure Kokkos run: | cmake -B builddir \ - -DCMAKE_INSTALL_PREFIX=/usr \ -DKokkos_ARCH_NATIVE=ON \ -DKokkos_ENABLE_HWLOC=ON \ -DKokkos_ENABLE_${{ matrix.backend }}=ON \ -DKokkos_ENABLE_BENCHMARKS=ON \ - -DKokkos_ENABLE_DEPRECATED_CODE_4=ON \ - -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \ -DCMAKE_CXX_COMPILER=${{ matrix.cxx }} \ -DCMAKE_BUILD_TYPE=Release - name: Build From ef7fd60146567721453abaea0201879180db12b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cezary=20Skrzy=C5=84ski?= Date: Wed, 28 Dec 2022 18:41:54 +0100 Subject: [PATCH 016/496] Configure `ccache` for benchmark builds --- .github/workflows/performance-benchmark.yml | 1 + 1 file changed, 1 
insertion(+) diff --git a/.github/workflows/performance-benchmark.yml b/.github/workflows/performance-benchmark.yml index 07bbc6419a..d65891ab64 100644 --- a/.github/workflows/performance-benchmark.yml +++ b/.github/workflows/performance-benchmark.yml @@ -29,6 +29,7 @@ jobs: - name: Configure Kokkos run: | cmake -B builddir \ + -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \ -DKokkos_ARCH_NATIVE=ON \ -DKokkos_ENABLE_HWLOC=ON \ -DKokkos_ENABLE_${{ matrix.backend }}=ON \ From 7e651ca70ba3c9df2c89da71e54a8fd5c9124309 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cezary=20Skrzy=C5=84ski?= Date: Wed, 28 Dec 2022 23:05:34 +0100 Subject: [PATCH 017/496] Group similar options together --- .github/workflows/continuous-integration-workflow.yml | 2 +- .github/workflows/performance-benchmark.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/continuous-integration-workflow.yml b/.github/workflows/continuous-integration-workflow.yml index 6049953e55..7f88532209 100644 --- a/.github/workflows/continuous-integration-workflow.yml +++ b/.github/workflows/continuous-integration-workflow.yml @@ -93,7 +93,6 @@ jobs: run: | cmake -B builddir \ -DCMAKE_INSTALL_PREFIX=/usr \ - -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \ ${{ matrix.clang-tidy }} \ -Ddesul_ROOT=/usr/desul-install/ \ -DKokkos_ARCH_NATIVE=ON \ @@ -106,6 +105,7 @@ jobs: -DKokkos_ENABLE_DEPRECATED_CODE_4=ON \ -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \ -DCMAKE_CXX_COMPILER=${{ matrix.cxx }} \ + -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \ -DCMAKE_BUILD_TYPE=${{ matrix.cmake_build_type }} - name: Build run: | diff --git a/.github/workflows/performance-benchmark.yml b/.github/workflows/performance-benchmark.yml index d65891ab64..6d7a0faa6b 100644 --- a/.github/workflows/performance-benchmark.yml +++ b/.github/workflows/performance-benchmark.yml @@ -29,12 +29,12 @@ jobs: - name: Configure Kokkos run: | cmake -B builddir \ - -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \ -DKokkos_ARCH_NATIVE=ON \ -DKokkos_ENABLE_HWLOC=ON \ 
-DKokkos_ENABLE_${{ matrix.backend }}=ON \ -DKokkos_ENABLE_BENCHMARKS=ON \ -DCMAKE_CXX_COMPILER=${{ matrix.cxx }} \ + -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \ -DCMAKE_BUILD_TYPE=Release - name: Build run: | From a23580e8dd218fd2da89dd0f4938519c1d1255ff Mon Sep 17 00:00:00 2001 From: Seyong Lee Date: Wed, 4 Jan 2023 13:56:54 -0500 Subject: [PATCH 018/496] Temporarily disable unsupported reduction tests in core/unit_test/incremental/Test14_MDRangeReduce.hpp for the OpenACC backend. --- .../unit_test/incremental/Test14_MDRangeReduce.hpp | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/core/unit_test/incremental/Test14_MDRangeReduce.hpp b/core/unit_test/incremental/Test14_MDRangeReduce.hpp index a1f307c0c8..deffe88313 100644 --- a/core/unit_test/incremental/Test14_MDRangeReduce.hpp +++ b/core/unit_test/incremental/Test14_MDRangeReduce.hpp @@ -93,6 +93,8 @@ struct TestMDRangeReduce { }, d_result); +// FIXME_OPENACC: scalar reduction variable on the device is not yet supported. +#if !defined(KOKKOS_ENABLE_OPENACC) // Parallel reduce on a view. Kokkos::parallel_reduce( mdPolicy_2D, @@ -100,16 +102,23 @@ struct TestMDRangeReduce { update_value += d_data(i, j); }, d_resultView); +#endif // Check correctness. ASSERT_EQ(h_result, d_result); +// FIXME_OPENACC: scalar reduction variable on the device is not yet supported. +#if !defined(KOKKOS_ENABLE_OPENACC) // Copy view back to host. value_type view_result = 0.0; Kokkos::deep_copy(view_result, d_resultView); ASSERT_EQ(h_result, view_result); +#endif } +// FIXME_OPENACC: custom reductions are not yet supported in the +// OpenACC backend. +#if !defined(KOKKOS_ENABLE_OPENACC) // Custom Reduction void reduce_custom() { Complex_View_1D d_data("complex array", N); @@ -136,6 +145,7 @@ struct TestMDRangeReduce { ASSERT_EQ(result._re, sum * 0.5); ASSERT_EQ(result._im, -sum * 0.5); } +#endif }; // Reductions tests for MDRange policy and customized reduction. 
@@ -144,9 +154,13 @@ TEST(TEST_CATEGORY, incr_14_MDrangeReduce) { test.reduce_MDRange(); // FIXME_OPENMPTARGET: custom reductions are not yet supported in the // OpenMPTarget backend. +// FIXME_OPENACC: custom reductions are not yet supported in the +// OpenACC backend. #if !defined(KOKKOS_ENABLE_OPENMPTARGET) +#if !defined(KOKKOS_ENABLE_OPENACC) test.reduce_custom(); #endif +#endif } } // namespace Test From d34c751369e1cf5445162c6c51f3fee93a5ab4de Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Fri, 6 Jan 2023 08:00:16 -0500 Subject: [PATCH 019/496] Drop pre CUDA 11 macro guards in occupancy calculation --- .../Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp b/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp index 75a270cea3..401594e303 100644 --- a/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp +++ b/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp @@ -51,24 +51,7 @@ inline int cuda_max_active_blocks_per_sm(cudaDeviceProp const& properties, : max_blocks_regs); // Limits due to blocks/SM -#if CUDA_VERSION >= 11000 int const max_blocks_per_sm = properties.maxBlocksPerMultiProcessor; -#else - int const max_blocks_per_sm = [&properties]() { - switch (properties.major) { - case 3: return 16; - case 5: - case 6: return 32; - case 7: { - int isTuring = properties.minor == 5; - return (isTuring) ? 
16 : 32; - } - default: - throw_runtime_exception("Unknown device in cuda block size deduction"); - return 0; - } - }(); -#endif // Overall occupancy in blocks return std::min({max_blocks_regs, max_blocks_shmem, max_blocks_per_sm}); From 1fd8589618704ed836418976c07a7788c8016558 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Fri, 6 Jan 2023 08:00:53 -0500 Subject: [PATCH 020/496] Drop now unsused `get_shmem_per_sm_prefer_l1` function --- .../Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp | 35 ------------------- 1 file changed, 35 deletions(-) diff --git a/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp b/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp index 401594e303..cacf014c1d 100644 --- a/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp +++ b/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp @@ -162,41 +162,6 @@ int cuda_get_opt_block_size(const CudaInternal* cuda_instance, LaunchBounds{}); } -// NOTE these number can be obtained several ways: -// * One option is to download the CUDA Occupancy Calculator spreadsheet, select -// "Compute Capability" first and check what is the smallest "Shared Memory -// Size Config" that is available. The "Shared Memory Per Multiprocessor" in -// bytes is then to be found below in the summary. 
-// * Another option would be to look for the information in the "Tuning -// Guide(s)" of the CUDA Toolkit Documentation for each GPU architecture, in -// the "Shared Memory" section (more tedious) -inline size_t get_shmem_per_sm_prefer_l1(cudaDeviceProp const& properties) { - int const compute_capability = properties.major * 10 + properties.minor; - return [compute_capability]() { - switch (compute_capability) { - case 30: - case 32: - case 35: return 16; - case 37: return 80; - case 50: - case 53: - case 60: - case 62: return 64; - case 52: - case 61: return 96; - case 70: - case 80: - case 86: - case 90: return 8; - case 75: return 32; - default: - Kokkos::Impl::throw_runtime_exception( - "Unknown device in cuda block size deduction"); - } - return 0; - }() * 1024; -} - } // namespace Impl } // namespace Kokkos From d4bd01277658239aab3f671067981d26042ae873 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Fri, 6 Jan 2023 08:25:57 -0500 Subject: [PATCH 021/496] Revert "Drop pre CUDA 11 macro guards in occupancy calculation" This reverts commit d34c751369e1cf5445162c6c51f3fee93a5ab4de. --- .../Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp b/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp index cacf014c1d..a471cd380c 100644 --- a/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp +++ b/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp @@ -51,7 +51,24 @@ inline int cuda_max_active_blocks_per_sm(cudaDeviceProp const& properties, : max_blocks_regs); // Limits due to blocks/SM +#if CUDA_VERSION >= 11000 int const max_blocks_per_sm = properties.maxBlocksPerMultiProcessor; +#else + int const max_blocks_per_sm = [&properties]() { + switch (properties.major) { + case 3: return 16; + case 5: + case 6: return 32; + case 7: { + int isTuring = properties.minor == 5; + return (isTuring) ? 
16 : 32; + } + default: + throw_runtime_exception("Unknown device in cuda block size deduction"); + return 0; + } + }(); +#endif // Overall occupancy in blocks return std::min({max_blocks_regs, max_blocks_shmem, max_blocks_per_sm}); From 1f4468bdac5277a2b86ddab15577172de0a7b7f9 Mon Sep 17 00:00:00 2001 From: Todd Kordenbrock Date: Thu, 5 Jan 2023 17:28:11 -0600 Subject: [PATCH 022/496] fix src/dst Properties in deep_copy(DynamicView,View) In deep_copy(DynamicView, View), fix the dst Properties (DP) and the src Properties (SP) that were swapped in the using statements defining dst_type and src_type. Add a reproducer unit test. --- containers/src/Kokkos_DynamicView.hpp | 4 +- containers/unit_tests/TestDynamicView.hpp | 71 +++++++++++++++++++++++ 2 files changed, 73 insertions(+), 2 deletions(-) diff --git a/containers/src/Kokkos_DynamicView.hpp b/containers/src/Kokkos_DynamicView.hpp index 9d4398c6b8..f8636b6212 100644 --- a/containers/src/Kokkos_DynamicView.hpp +++ b/containers/src/Kokkos_DynamicView.hpp @@ -878,8 +878,8 @@ inline void deep_copy(const View& dst, template inline void deep_copy(const Kokkos::Experimental::DynamicView& dst, const View& src) { - using dst_type = Kokkos::Experimental::DynamicView; - using src_type = View; + using dst_type = Kokkos::Experimental::DynamicView; + using src_type = View; using dst_execution_space = typename ViewTraits::execution_space; using src_memory_space = typename ViewTraits::memory_space; diff --git a/containers/unit_tests/TestDynamicView.hpp b/containers/unit_tests/TestDynamicView.hpp index 73da7684d1..a71b66de8c 100644 --- a/containers/unit_tests/TestDynamicView.hpp +++ b/containers/unit_tests/TestDynamicView.hpp @@ -212,6 +212,77 @@ struct TestDynamicView { ASSERT_EQ(new_result_sum, (value_type)(da_resize * (da_resize - 1) / 2)); #endif } // end scope + + // Test: Reproducer to demonstrate compile-time error of deep_copy + // of DynamicView to/from on-host View. 
+ // Case 4: + { + using device_view_type = Kokkos::View; + using host_view_type = typename Kokkos::View::HostMirror; + + view_type device_dynamic_view("on-device DynamicView", 1024, arg_total_size); + device_view_type device_view("on-device View", arg_total_size); + host_view_type host_view("on-host View", arg_total_size); + + // Use parallel_for to populate device_dynamic_view and verify values +#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) + Kokkos::parallel_for( + Kokkos::RangePolicy(0, arg_total_size), + KOKKOS_LAMBDA(const int i) { device_dynamic_view(i) = Scalar(i); }); + + value_type result_sum = 0.0; + Kokkos::parallel_reduce( + Kokkos::RangePolicy(0, arg_total_size), + KOKKOS_LAMBDA(const int i, value_type& partial_sum) { + partial_sum += (value_type)device_dynamic_view(i); + }, + result_sum); + + ASSERT_EQ(result_sum, (value_type)(arg_total_size * (arg_total_size - 1) / 2)); +#endif + + // Use an on-device View as intermediate to deep_copy the + // device_dynamic_view to host, zero out the device_dynamic_view, + // deep_copy from host back to the device_dynamic_view and verify + Kokkos::deep_copy(device_view, device_dynamic_view); + Kokkos::deep_copy(host_view, device_view); + Kokkos::deep_copy(device_view, host_view); +#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) + Kokkos::parallel_for( + Kokkos::RangePolicy(0, arg_total_size), + KOKKOS_LAMBDA(const int i) { device_dynamic_view(i) = Scalar(0); }); +#endif + Kokkos::deep_copy(device_dynamic_view, device_view); +#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) + value_type new_result_sum = 0.0; + Kokkos::parallel_reduce( + Kokkos::RangePolicy(0, arg_total_size), + KOKKOS_LAMBDA(const int i, value_type& partial_sum) { + partial_sum += (value_type)device_dynamic_view(i); + }, + new_result_sum); + + ASSERT_EQ(new_result_sum, (value_type)(arg_total_size * (arg_total_size - 1) / 2)); +#endif + + // Try to deep_copy device_dynamic_view directly to/from host. 
+ // host-to-device currently fails to compile because DP and SP are + // swapped in the deep_copy implementation. + // Once that's fixed, both deep_copy's will fail at runtime because the + // destination execution space cannot access the source memory space. + try { + Kokkos::deep_copy(host_view, device_dynamic_view); + } catch (std::runtime_error const &error) { + std::string msg = error.what(); + std::cerr << "Copy from on-device DynamicView to on-host View failed:\n" << msg << std::endl; + } + try { + Kokkos::deep_copy(device_dynamic_view, host_view); + } catch (std::runtime_error const &error) { + std::string msg = error.what(); + std::cerr << "Copy from on-host View to on-device DynamicView failed:\n" << msg << std::endl; + } + } } }; From 67dff628b5ec815d10fbb7b23f00e2594fa4c4ca Mon Sep 17 00:00:00 2001 From: Todd Kordenbrock Date: Fri, 6 Jan 2023 12:19:24 -0600 Subject: [PATCH 023/496] fix broken DynamicView test case #4 - resize the DynamicView to the current test size - fix verification calculation --- containers/unit_tests/TestDynamicView.hpp | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/containers/unit_tests/TestDynamicView.hpp b/containers/unit_tests/TestDynamicView.hpp index a71b66de8c..b94272ddec 100644 --- a/containers/unit_tests/TestDynamicView.hpp +++ b/containers/unit_tests/TestDynamicView.hpp @@ -224,21 +224,24 @@ struct TestDynamicView { device_view_type device_view("on-device View", arg_total_size); host_view_type host_view("on-host View", arg_total_size); + unsigned da_size = arg_total_size / 8; + device_dynamic_view.resize_serial(da_size); + // Use parallel_for to populate device_dynamic_view and verify values #if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) Kokkos::parallel_for( - Kokkos::RangePolicy(0, arg_total_size), + Kokkos::RangePolicy(0, da_size), KOKKOS_LAMBDA(const int i) { device_dynamic_view(i) = Scalar(i); }); value_type result_sum = 0.0; Kokkos::parallel_reduce( - Kokkos::RangePolicy(0, 
arg_total_size), + Kokkos::RangePolicy(0, da_size), KOKKOS_LAMBDA(const int i, value_type& partial_sum) { partial_sum += (value_type)device_dynamic_view(i); }, result_sum); - ASSERT_EQ(result_sum, (value_type)(arg_total_size * (arg_total_size - 1) / 2)); + ASSERT_EQ(result_sum, (value_type)(da_size * (da_size - 1) / 2)); #endif // Use an on-device View as intermediate to deep_copy the @@ -249,20 +252,20 @@ struct TestDynamicView { Kokkos::deep_copy(device_view, host_view); #if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) Kokkos::parallel_for( - Kokkos::RangePolicy(0, arg_total_size), + Kokkos::RangePolicy(0, da_size), KOKKOS_LAMBDA(const int i) { device_dynamic_view(i) = Scalar(0); }); #endif Kokkos::deep_copy(device_dynamic_view, device_view); #if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) value_type new_result_sum = 0.0; Kokkos::parallel_reduce( - Kokkos::RangePolicy(0, arg_total_size), + Kokkos::RangePolicy(0, da_size), KOKKOS_LAMBDA(const int i, value_type& partial_sum) { partial_sum += (value_type)device_dynamic_view(i); }, new_result_sum); - ASSERT_EQ(new_result_sum, (value_type)(arg_total_size * (arg_total_size - 1) / 2)); + ASSERT_EQ(new_result_sum, (value_type)(da_size * (da_size - 1) / 2)); #endif // Try to deep_copy device_dynamic_view directly to/from host. From 6aa7bf6183bc027802f473e940d62e5c47d07867 Mon Sep 17 00:00:00 2001 From: Christian Trott Date: Fri, 6 Jan 2023 16:28:36 -0700 Subject: [PATCH 024/496] Remove KOKKOS_CXX_STANDARD mentioning from BUILD.md --- BUILD.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/BUILD.md b/BUILD.md index d482435db3..b0d603e6db 100644 --- a/BUILD.md +++ b/BUILD.md @@ -52,6 +52,10 @@ There are numerous device backends, options, and architecture-specific optimizat ```` which activates the OpenMP backend. All of the options controlling device backends, options, architectures, and third-party libraries (TPLs) are given below. 
+Kokkos requires as a minimum C++17, however C++20 and C++23 are supported depending on the compiler. + +The latest minimum compiler versions can be found in `cmake/kokkos_compiler_id.cmake`. + ## Known Issues ### Cray @@ -186,10 +190,6 @@ Options can be enabled by specifying `-DKokkos_ENABLE_X`. * Whether to enable test suite * BOOL Default: OFF -## Other Options -* Kokkos_CXX_STANDARD - * The C++ standard for Kokkos to use: c++14, c++17, c++20, or c++23. This should be given in CMake style as 14, 17, 20, or 23. - * STRING Default: 14 ## Third-party Libraries (TPLs) The following options control enabling TPLs: From edfb1e3ab56d09b212e670327e19d4761aa6e94e Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Fri, 6 Jan 2023 15:42:22 -0700 Subject: [PATCH 025/496] Fix -Werror with intel/19 Mark "alignment" variable as [[maybe_unused]] in get_shmem_common --- core/src/Kokkos_ScratchSpace.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/core/src/Kokkos_ScratchSpace.hpp b/core/src/Kokkos_ScratchSpace.hpp index cd0618f96f..f6e7ccec49 100644 --- a/core/src/Kokkos_ScratchSpace.hpp +++ b/core/src/Kokkos_ScratchSpace.hpp @@ -95,9 +95,9 @@ class ScratchMemorySpace { private: template - KOKKOS_INLINE_FUNCTION void* get_shmem_common(const IntType& size, - const ptrdiff_t alignment, - int level = -1) const { + KOKKOS_INLINE_FUNCTION void* get_shmem_common( + const IntType& size, [[maybe_unused]] const ptrdiff_t alignment, + int level = -1) const { if (level == -1) level = m_default_level; auto& m_iter = (level == 0) ? m_iter_L0 : m_iter_L1; auto& m_end = (level == 0) ? 
m_end_L0 : m_end_L1; From d2e574ce3358ba99e41c2aea8f6e8b318c96d4a5 Mon Sep 17 00:00:00 2001 From: Christian Trott Date: Sun, 8 Jan 2023 20:09:18 -0700 Subject: [PATCH 026/496] Apply clang-format --- containers/unit_tests/TestDynamicView.hpp | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/containers/unit_tests/TestDynamicView.hpp b/containers/unit_tests/TestDynamicView.hpp index b94272ddec..cd1511276f 100644 --- a/containers/unit_tests/TestDynamicView.hpp +++ b/containers/unit_tests/TestDynamicView.hpp @@ -220,9 +220,10 @@ struct TestDynamicView { using device_view_type = Kokkos::View; using host_view_type = typename Kokkos::View::HostMirror; - view_type device_dynamic_view("on-device DynamicView", 1024, arg_total_size); + view_type device_dynamic_view("on-device DynamicView", 1024, + arg_total_size); device_view_type device_view("on-device View", arg_total_size); - host_view_type host_view("on-host View", arg_total_size); + host_view_type host_view("on-host View", arg_total_size); unsigned da_size = arg_total_size / 8; device_dynamic_view.resize_serial(da_size); @@ -275,15 +276,17 @@ struct TestDynamicView { // destination execution space cannot access the source memory space. 
try { Kokkos::deep_copy(host_view, device_dynamic_view); - } catch (std::runtime_error const &error) { + } catch (std::runtime_error const& error) { std::string msg = error.what(); - std::cerr << "Copy from on-device DynamicView to on-host View failed:\n" << msg << std::endl; + std::cerr << "Copy from on-device DynamicView to on-host View failed:\n" + << msg << std::endl; } try { Kokkos::deep_copy(device_dynamic_view, host_view); - } catch (std::runtime_error const &error) { + } catch (std::runtime_error const& error) { std::string msg = error.what(); - std::cerr << "Copy from on-host View to on-device DynamicView failed:\n" << msg << std::endl; + std::cerr << "Copy from on-host View to on-device DynamicView failed:\n" + << msg << std::endl; } } } From 5d136ccda22dd102769f0fe14cb0a12703142f44 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Fri, 16 Dec 2022 09:54:01 -0500 Subject: [PATCH 027/496] Static asserts for reducers --- core/src/Kokkos_Parallel_Reduce.hpp | 41 +++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/core/src/Kokkos_Parallel_Reduce.hpp b/core/src/Kokkos_Parallel_Reduce.hpp index d44bd89a9b..68a2155bbd 100644 --- a/core/src/Kokkos_Parallel_Reduce.hpp +++ b/core/src/Kokkos_Parallel_Reduce.hpp @@ -37,6 +37,7 @@ struct Sum { // Required using reducer = Sum; using value_type = std::remove_cv_t; + static_assert(!std::is_pointer_v && !std::is_array_v); using result_view_type = Kokkos::View; @@ -81,6 +82,7 @@ struct Prod { // Required using reducer = Prod; using value_type = std::remove_cv_t; + static_assert(!std::is_pointer_v && !std::is_array_v); using result_view_type = Kokkos::View; @@ -125,6 +127,7 @@ struct Min { // Required using reducer = Min; using value_type = std::remove_cv_t; + static_assert(!std::is_pointer_v && !std::is_array_v); using result_view_type = Kokkos::View; @@ -171,6 +174,7 @@ struct Max { // Required using reducer = Max; using value_type = std::remove_cv_t; + static_assert(!std::is_pointer_v && 
!std::is_array_v); using result_view_type = Kokkos::View; @@ -218,6 +222,7 @@ struct LAnd { // Required using reducer = LAnd; using value_type = std::remove_cv_t; + static_assert(!std::is_pointer_v && !std::is_array_v); using result_view_type = Kokkos::View; @@ -263,6 +268,7 @@ struct LOr { // Required using reducer = LOr; using value_type = std::remove_cv_t; + static_assert(!std::is_pointer_v && !std::is_array_v); using result_view_type = Kokkos::View; @@ -309,6 +315,7 @@ struct BAnd { // Required using reducer = BAnd; using value_type = std::remove_cv_t; + static_assert(!std::is_pointer_v && !std::is_array_v); using result_view_type = Kokkos::View; @@ -355,6 +362,7 @@ struct BOr { // Required using reducer = BOr; using value_type = std::remove_cv_t; + static_assert(!std::is_pointer_v && !std::is_array_v); using result_view_type = Kokkos::View; @@ -412,6 +420,9 @@ struct MinLoc { private: using scalar_type = std::remove_cv_t; using index_type = std::remove_cv_t; + static_assert(!std::is_pointer_v && + !std::is_array_v); + static_assert(std::is_integral_v); public: // Required @@ -465,6 +476,9 @@ struct MaxLoc { private: using scalar_type = std::remove_cv_t; using index_type = std::remove_cv_t; + static_assert(!std::is_pointer_v && + !std::is_array_v); + static_assert(std::is_integral_v); public: // Required @@ -528,6 +542,8 @@ template struct MinMax { private: using scalar_type = std::remove_cv_t; + static_assert(!std::is_pointer_v && + !std::is_array_v); public: // Required @@ -599,6 +615,9 @@ struct MinMaxLoc { private: using scalar_type = std::remove_cv_t; using index_type = std::remove_cv_t; + static_assert(!std::is_pointer_v && + !std::is_array_v); + static_assert(std::is_integral_v); public: // Required @@ -668,6 +687,9 @@ struct MaxFirstLoc { private: using scalar_type = std::remove_cv_t; using index_type = std::remove_cv_t; + static_assert(!std::is_pointer_v && + !std::is_array_v); + static_assert(std::is_integral_v); public: // Required @@ -729,6 +751,9 
@@ struct MaxFirstLocCustomComparator { private: using scalar_type = std::remove_cv_t; using index_type = std::remove_cv_t; + static_assert(!std::is_pointer_v && + !std::is_array_v); + static_assert(std::is_integral_v); public: // Required @@ -795,6 +820,9 @@ struct MinFirstLoc { private: using scalar_type = std::remove_cv_t; using index_type = std::remove_cv_t; + static_assert(!std::is_pointer_v && + !std::is_array_v); + static_assert(std::is_integral_v); public: // Required @@ -856,6 +884,9 @@ struct MinFirstLocCustomComparator { private: using scalar_type = std::remove_cv_t; using index_type = std::remove_cv_t; + static_assert(!std::is_pointer_v && + !std::is_array_v); + static_assert(std::is_integral_v); public: // Required @@ -922,6 +953,9 @@ struct MinMaxFirstLastLoc { private: using scalar_type = std::remove_cv_t; using index_type = std::remove_cv_t; + static_assert(!std::is_pointer_v && + !std::is_array_v); + static_assert(std::is_integral_v); public: // Required @@ -994,6 +1028,9 @@ struct MinMaxFirstLastLocCustomComparator { private: using scalar_type = std::remove_cv_t; using index_type = std::remove_cv_t; + static_assert(!std::is_pointer_v && + !std::is_array_v); + static_assert(std::is_integral_v); public: // Required @@ -1078,6 +1115,7 @@ template struct FirstLoc { private: using index_type = std::remove_cv_t; + static_assert(std::is_integral_v); public: // Required @@ -1141,6 +1179,7 @@ template struct LastLoc { private: using index_type = std::remove_cv_t; + static_assert(std::is_integral_v); public: // Required @@ -1207,6 +1246,7 @@ template struct StdIsPartitioned { private: using index_type = std::remove_cv_t; + static_assert(std::is_integral_v); public: // Required @@ -1278,6 +1318,7 @@ template struct StdPartitionPoint { private: using index_type = std::remove_cv_t; + static_assert(std::is_integral_v); public: // Required From 48e86920eccd9d91cbc86f5c32fbea2fe8c5b076 Mon Sep 17 00:00:00 2001 From: Bruno Turcksin Date: Wed, 4 Jan 2023 16:54:08 
+0000 Subject: [PATCH 028/496] Move Kokkos_OpenMP.hpp to OpenMP/Kokkos_OpenMP.hpp --- core/src/{ => OpenMP}/Kokkos_OpenMP.hpp | 0 core/src/OpenMP/Kokkos_OpenMP_Instance.hpp | 2 +- core/src/OpenMP/Kokkos_OpenMP_Task.hpp | 2 +- core/src/OpenMP/Kokkos_OpenMP_WorkGraphPolicy.hpp | 2 +- core/src/decl/Kokkos_Declare_OPENMP.hpp | 2 +- core/src/impl/Kokkos_Default_Graph_Impl.hpp | 2 +- 6 files changed, 5 insertions(+), 5 deletions(-) rename core/src/{ => OpenMP}/Kokkos_OpenMP.hpp (100%) diff --git a/core/src/Kokkos_OpenMP.hpp b/core/src/OpenMP/Kokkos_OpenMP.hpp similarity index 100% rename from core/src/Kokkos_OpenMP.hpp rename to core/src/OpenMP/Kokkos_OpenMP.hpp diff --git a/core/src/OpenMP/Kokkos_OpenMP_Instance.hpp b/core/src/OpenMP/Kokkos_OpenMP_Instance.hpp index e2d52a141a..8f91636299 100644 --- a/core/src/OpenMP/Kokkos_OpenMP_Instance.hpp +++ b/core/src/OpenMP/Kokkos_OpenMP_Instance.hpp @@ -24,7 +24,7 @@ "You enabled Kokkos OpenMP support without enabling OpenMP in the compiler!" #endif -#include +#include #include #include diff --git a/core/src/OpenMP/Kokkos_OpenMP_Task.hpp b/core/src/OpenMP/Kokkos_OpenMP_Task.hpp index d6fd45ae9a..ff7e16c384 100644 --- a/core/src/OpenMP/Kokkos_OpenMP_Task.hpp +++ b/core/src/OpenMP/Kokkos_OpenMP_Task.hpp @@ -23,7 +23,7 @@ #include #include -#include +#include #include #include diff --git a/core/src/OpenMP/Kokkos_OpenMP_WorkGraphPolicy.hpp b/core/src/OpenMP/Kokkos_OpenMP_WorkGraphPolicy.hpp index 6cc52815de..8ad9f176d7 100644 --- a/core/src/OpenMP/Kokkos_OpenMP_WorkGraphPolicy.hpp +++ b/core/src/OpenMP/Kokkos_OpenMP_WorkGraphPolicy.hpp @@ -17,7 +17,7 @@ #ifndef KOKKOS_OPENMP_WORKGRAPHPOLICY_HPP #define KOKKOS_OPENMP_WORKGRAPHPOLICY_HPP -#include +#include namespace Kokkos { namespace Impl { diff --git a/core/src/decl/Kokkos_Declare_OPENMP.hpp b/core/src/decl/Kokkos_Declare_OPENMP.hpp index 1e1314145d..35d0948216 100644 --- a/core/src/decl/Kokkos_Declare_OPENMP.hpp +++ b/core/src/decl/Kokkos_Declare_OPENMP.hpp @@ -18,7 +18,7 @@ 
#define KOKKOS_DECLARE_OPENMP_HPP #if defined(KOKKOS_ENABLE_OPENMP) -#include +#include #include #endif diff --git a/core/src/impl/Kokkos_Default_Graph_Impl.hpp b/core/src/impl/Kokkos_Default_Graph_Impl.hpp index d65b448f1f..4c133f69f6 100644 --- a/core/src/impl/Kokkos_Default_Graph_Impl.hpp +++ b/core/src/impl/Kokkos_Default_Graph_Impl.hpp @@ -24,7 +24,7 @@ #include #include -#include +#include // FIXME @graph other backends? #include From f92270b5eb0e3344e983c7190d55ba754443d20e Mon Sep 17 00:00:00 2001 From: Bruno Turcksin Date: Wed, 4 Jan 2023 17:01:56 +0000 Subject: [PATCH 029/496] Move part of Kokkos_OpenMP_Instance.cpp into Kokkos_OpenMP.cpp --- core/src/OpenMP/Kokkos_OpenMP.cpp | 89 ++++++++++++++++++++++ core/src/OpenMP/Kokkos_OpenMP_Instance.cpp | 64 ---------------- 2 files changed, 89 insertions(+), 64 deletions(-) create mode 100644 core/src/OpenMP/Kokkos_OpenMP.cpp diff --git a/core/src/OpenMP/Kokkos_OpenMP.cpp b/core/src/OpenMP/Kokkos_OpenMP.cpp new file mode 100644 index 0000000000..583fd7209a --- /dev/null +++ b/core/src/OpenMP/Kokkos_OpenMP.cpp @@ -0,0 +1,89 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE +#define KOKKOS_IMPL_PUBLIC_INCLUDE +#endif + +#include +#include + +#include + +namespace Kokkos { + +OpenMP::OpenMP() + : m_space_instance(&Impl::OpenMPInternal::singleton(), + [](Impl::OpenMPInternal *) {}) { + Impl::OpenMPInternal::singleton().verify_is_initialized( + "OpenMP instance constructor"); +} + +OpenMP::OpenMP(int pool_size) + : m_space_instance(new Impl::OpenMPInternal(pool_size), + [](Impl::OpenMPInternal *ptr) { + ptr->finalize(); + delete ptr; + }) { + Impl::OpenMPInternal::singleton().verify_is_initialized( + "OpenMP instance constructor"); +} + +int OpenMP::impl_get_current_max_threads() noexcept { + return Impl::OpenMPInternal::get_current_max_threads(); +} + +void OpenMP::impl_initialize(InitializationSettings const &settings) { + Impl::OpenMPInternal::singleton().initialize( + settings.has_num_threads() ? settings.get_num_threads() : -1); +} + +void OpenMP::impl_finalize() { Impl::OpenMPInternal::singleton().finalize(); } + +void OpenMP::print_configuration(std::ostream &os, bool /*verbose*/) const { + os << "Host Parallel Execution Space:\n"; + os << " KOKKOS_ENABLE_OPENMP: yes\n"; + + os << "OpenMP Atomics:\n"; + os << " KOKKOS_ENABLE_OPENMP_ATOMICS: "; +#ifdef KOKKOS_ENABLE_OPENMP_ATOMICS + os << "yes\n"; +#else + os << "no\n"; +#endif + + os << "\nOpenMP Runtime Configuration:\n"; + + m_space_instance->print_configuration(os); +} + +int OpenMP::concurrency(OpenMP const &instance) { + return impl_thread_pool_size(instance); +} + +void OpenMP::fence(const std::string &name) const { + Kokkos::Tools::Experimental::Impl::profile_fence_event( + name, Kokkos::Tools::Experimental::Impl::DirectFenceIDHandle{1}, []() {}); +} + +namespace Impl { + +int g_openmp_space_factory_initialized = + initialize_space_factory("050_OpenMP"); + +} // namespace Impl + +} // namespace Kokkos diff --git a/core/src/OpenMP/Kokkos_OpenMP_Instance.cpp 
b/core/src/OpenMP/Kokkos_OpenMP_Instance.cpp index b1ccac5156..e1434a4275 100644 --- a/core/src/OpenMP/Kokkos_OpenMP_Instance.cpp +++ b/core/src/OpenMP/Kokkos_OpenMP_Instance.cpp @@ -384,68 +384,4 @@ bool OpenMPInternal::verify_is_initialized(const char *const label) const { return m_initialized; } } // namespace Impl - -//---------------------------------------------------------------------------- - -OpenMP::OpenMP() - : m_space_instance(&Impl::OpenMPInternal::singleton(), - [](Impl::OpenMPInternal *) {}) { - Impl::OpenMPInternal::singleton().verify_is_initialized( - "OpenMP instance constructor"); -} - -OpenMP::OpenMP(int pool_size) - : m_space_instance(new Impl::OpenMPInternal(pool_size), - [](Impl::OpenMPInternal *ptr) { - ptr->finalize(); - delete ptr; - }) { - Impl::OpenMPInternal::singleton().verify_is_initialized( - "OpenMP instance constructor"); -} - -int OpenMP::impl_get_current_max_threads() noexcept { - return Impl::OpenMPInternal::get_current_max_threads(); -} - -void OpenMP::impl_initialize(InitializationSettings const &settings) { - Impl::OpenMPInternal::singleton().initialize( - settings.has_num_threads() ? 
settings.get_num_threads() : -1); -} - -void OpenMP::impl_finalize() { Impl::OpenMPInternal::singleton().finalize(); } - -void OpenMP::print_configuration(std::ostream &os, bool /*verbose*/) const { - os << "Host Parallel Execution Space:\n"; - os << " KOKKOS_ENABLE_OPENMP: yes\n"; - - os << "OpenMP Atomics:\n"; - os << " KOKKOS_ENABLE_OPENMP_ATOMICS: "; -#ifdef KOKKOS_ENABLE_OPENMP_ATOMICS - os << "yes\n"; -#else - os << "no\n"; -#endif - - os << "\nOpenMP Runtime Configuration:\n"; - - m_space_instance->print_configuration(os); -} - -int OpenMP::concurrency(OpenMP const &instance) { - return impl_thread_pool_size(instance); -} - -void OpenMP::fence(const std::string &name) const { - Kokkos::Tools::Experimental::Impl::profile_fence_event( - name, Kokkos::Tools::Experimental::Impl::DirectFenceIDHandle{1}, []() {}); -} - -namespace Impl { - -int g_openmp_space_factory_initialized = - initialize_space_factory("050_OpenMP"); - -} // namespace Impl - } // namespace Kokkos From 2f7e94a8bfdc6e3a325d8029cb8f03a72230b957 Mon Sep 17 00:00:00 2001 From: Bruno Turcksin Date: Wed, 4 Jan 2023 18:46:12 +0000 Subject: [PATCH 030/496] Move OpenMP functions out of Kokkos_OpenMP_Instance.hpp --- core/src/OpenMP/Kokkos_OpenMP.cpp | 33 ++++++ core/src/OpenMP/Kokkos_OpenMP.hpp | 87 +++++++++++++++- core/src/OpenMP/Kokkos_OpenMP_Instance.hpp | 111 --------------------- 3 files changed, 117 insertions(+), 114 deletions(-) diff --git a/core/src/OpenMP/Kokkos_OpenMP.cpp b/core/src/OpenMP/Kokkos_OpenMP.cpp index 583fd7209a..a35541257a 100644 --- a/core/src/OpenMP/Kokkos_OpenMP.cpp +++ b/core/src/OpenMP/Kokkos_OpenMP.cpp @@ -79,6 +79,39 @@ void OpenMP::fence(const std::string &name) const { name, Kokkos::Tools::Experimental::Impl::DirectFenceIDHandle{1}, []() {}); } +bool OpenMP::impl_is_initialized() noexcept { + return Impl::OpenMPInternal::singleton().is_initialized(); +} + +bool OpenMP::in_parallel(OpenMP const &exec_space) noexcept { +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 + return ( + 
(exec_space.impl_internal_space_instance()->m_level < omp_get_level()) && + (!Impl::t_openmp_instance || + Impl::t_openmp_instance->m_level < omp_get_level())); +#else + return exec_space.impl_internal_space_instance()->m_level < omp_get_level(); +#endif +} + +int OpenMP::impl_thread_pool_size(OpenMP const &exec_space) noexcept { +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 + return OpenMP::in_parallel(exec_space) + ? omp_get_num_threads() + : (Impl::t_openmp_instance + ? Impl::t_openmp_instance->m_pool_size + : exec_space.impl_internal_space_instance()->m_pool_size); +#else + return OpenMP::in_parallel(exec_space) + ? omp_get_num_threads() + : exec_space.impl_internal_space_instance()->m_pool_size; +#endif +} + +int OpenMP::impl_max_hardware_threads() noexcept { + return Impl::g_openmp_hardware_max_threads; +} + namespace Impl { int g_openmp_space_factory_initialized = diff --git a/core/src/OpenMP/Kokkos_OpenMP.hpp b/core/src/OpenMP/Kokkos_OpenMP.hpp index bbe008afd9..ded3e0b5cb 100644 --- a/core/src/OpenMP/Kokkos_OpenMP.hpp +++ b/core/src/OpenMP/Kokkos_OpenMP.hpp @@ -43,6 +43,8 @@ static_assert(false, #include #include +#include + #include /*--------------------------------------------------------------------------*/ @@ -81,7 +83,7 @@ class OpenMP { void print_configuration(std::ostream& os, bool verbose = false) const; /// \brief is the instance running a parallel algorithm - inline static bool in_parallel(OpenMP const& = OpenMP()) noexcept; + static bool in_parallel(OpenMP const& = OpenMP()) noexcept; /// \brief Wait until all dispatched functors complete on the given instance /// @@ -120,7 +122,7 @@ class OpenMP { /// \brief Free any resources being consumed by the default execution space static void impl_finalize(); - inline static int impl_thread_pool_size(OpenMP const& = OpenMP()) noexcept; + static int impl_thread_pool_size(OpenMP const& = OpenMP()) noexcept; /** \brief The rank of the executing thread in this thread pool */ inline static int 
impl_thread_pool_rank() noexcept; @@ -128,7 +130,7 @@ class OpenMP { inline static int impl_thread_pool_size(int depth, OpenMP const& = OpenMP()); // use UniqueToken - inline static int impl_max_hardware_threads() noexcept; + static int impl_max_hardware_threads() noexcept; // use UniqueToken KOKKOS_INLINE_FUNCTION @@ -154,6 +156,85 @@ class OpenMP { Kokkos::Impl::HostSharedPtr m_space_instance; }; +inline int OpenMP::impl_thread_pool_rank() noexcept { + // FIXME_OPENMP Can we remove this when removing partition_master? It's only + // used in one partition_master test +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 + KOKKOS_IF_ON_HOST( + (return Impl::t_openmp_instance ? 0 : omp_get_thread_num();)) +#else + KOKKOS_IF_ON_HOST((return omp_get_thread_num();)) +#endif + + KOKKOS_IF_ON_DEVICE((return -1;)) +} + +inline void OpenMP::impl_static_fence(std::string const& name) { + Kokkos::Tools::Experimental::Impl::profile_fence_event( + name, + Kokkos::Tools::Experimental::SpecialSynchronizationCases:: + GlobalDeviceSynchronization, + []() {}); +} + +inline bool OpenMP::is_asynchronous(OpenMP const& /*instance*/) noexcept { + return false; +} + +inline int OpenMP::impl_thread_pool_size(int depth, OpenMP const& exec_space) { + return depth < 2 ? 
impl_thread_pool_size(exec_space) : 1; +} + +KOKKOS_INLINE_FUNCTION +int OpenMP::impl_hardware_thread_id() noexcept { + KOKKOS_IF_ON_HOST((return omp_get_thread_num();)) + + KOKKOS_IF_ON_DEVICE((return -1;)) +} + +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 +template +KOKKOS_DEPRECATED void OpenMP::partition_master(F const& f, int num_partitions, + int partition_size) { +#if _OPENMP >= 201511 + if (omp_get_max_active_levels() > 1) { +#else + if (omp_get_nested()) { +#endif + using Exec = Impl::OpenMPInternal; + + Exec* prev_instance = &Impl::OpenMPInternal::singleton(); + + Exec::validate_partition_impl(prev_instance->m_pool_size, num_partitions, + partition_size); + + OpenMP::memory_space space; + +#pragma omp parallel num_threads(num_partitions) + { + Exec thread_local_instance(partition_size); + Impl::t_openmp_instance = &thread_local_instance; + + size_t pool_reduce_bytes = 32 * partition_size; + size_t team_reduce_bytes = 32 * partition_size; + size_t team_shared_bytes = 1024 * partition_size; + size_t thread_local_bytes = 1024; + + thread_local_instance.resize_thread_data( + pool_reduce_bytes, team_reduce_bytes, team_shared_bytes, + thread_local_bytes); + + omp_set_num_threads(partition_size); + f(omp_get_thread_num(), omp_get_num_threads()); + Impl::t_openmp_instance = nullptr; + } + } else { + // nested openmp not enabled + f(0, 1); + } +} +#endif + namespace Tools { namespace Experimental { template <> diff --git a/core/src/OpenMP/Kokkos_OpenMP_Instance.hpp b/core/src/OpenMP/Kokkos_OpenMP_Instance.hpp index 8f91636299..a842e9cdae 100644 --- a/core/src/OpenMP/Kokkos_OpenMP_Instance.hpp +++ b/core/src/OpenMP/Kokkos_OpenMP_Instance.hpp @@ -117,102 +117,6 @@ class OpenMPInternal { }; } // namespace Impl -inline bool OpenMP::impl_is_initialized() noexcept { - return Impl::OpenMPInternal::singleton().is_initialized(); -} - -inline bool OpenMP::in_parallel(OpenMP const& exec_space) noexcept { -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 - return ( - 
(exec_space.impl_internal_space_instance()->m_level < omp_get_level()) && - (!Impl::t_openmp_instance || - Impl::t_openmp_instance->m_level < omp_get_level())); -#else - return exec_space.impl_internal_space_instance()->m_level < omp_get_level(); -#endif -} - -inline int OpenMP::impl_thread_pool_size(OpenMP const& exec_space) noexcept { -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 - return OpenMP::in_parallel(exec_space) - ? omp_get_num_threads() - : (Impl::t_openmp_instance - ? Impl::t_openmp_instance->m_pool_size - : exec_space.impl_internal_space_instance()->m_pool_size); -#else - return OpenMP::in_parallel(exec_space) - ? omp_get_num_threads() - : exec_space.impl_internal_space_instance()->m_pool_size; -#endif -} - -inline int OpenMP::impl_thread_pool_rank() noexcept { - // FIXME_OPENMP Can we remove this when removing partition_master? It's only - // used in one partition_master test -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 - KOKKOS_IF_ON_HOST( - (return Impl::t_openmp_instance ? 0 : omp_get_thread_num();)) -#else - KOKKOS_IF_ON_HOST((return omp_get_thread_num();)) -#endif - - KOKKOS_IF_ON_DEVICE((return -1;)) -} - -inline void OpenMP::impl_static_fence(std::string const& name) { - Kokkos::Tools::Experimental::Impl::profile_fence_event( - name, - Kokkos::Tools::Experimental::SpecialSynchronizationCases:: - GlobalDeviceSynchronization, - []() {}); -} - -inline bool OpenMP::is_asynchronous(OpenMP const& /*instance*/) noexcept { - return false; -} - -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 -template -KOKKOS_DEPRECATED void OpenMP::partition_master(F const& f, int num_partitions, - int partition_size) { -#if _OPENMP >= 201511 - if (omp_get_max_active_levels() > 1) { -#else - if (omp_get_nested()) { -#endif - using Exec = Impl::OpenMPInternal; - - Exec* prev_instance = &Impl::OpenMPInternal::singleton(); - - Exec::validate_partition_impl(prev_instance->m_pool_size, num_partitions, - partition_size); - - OpenMP::memory_space space; - -#pragma omp parallel 
num_threads(num_partitions) - { - Exec thread_local_instance(partition_size); - Impl::t_openmp_instance = &thread_local_instance; - - size_t pool_reduce_bytes = 32 * partition_size; - size_t team_reduce_bytes = 32 * partition_size; - size_t team_shared_bytes = 1024 * partition_size; - size_t thread_local_bytes = 1024; - - thread_local_instance.resize_thread_data( - pool_reduce_bytes, team_reduce_bytes, team_shared_bytes, - thread_local_bytes); - - omp_set_num_threads(partition_size); - f(omp_get_thread_num(), omp_get_num_threads()); - Impl::t_openmp_instance = nullptr; - } - } else { - // nested openmp not enabled - f(0, 1); - } -} -#endif namespace Experimental { @@ -340,21 +244,6 @@ class UniqueToken { } // namespace Experimental -inline int OpenMP::impl_thread_pool_size(int depth, OpenMP const& exec_space) { - return depth < 2 ? impl_thread_pool_size(exec_space) : 1; -} - -KOKKOS_INLINE_FUNCTION -int OpenMP::impl_hardware_thread_id() noexcept { - KOKKOS_IF_ON_HOST((return omp_get_thread_num();)) - - KOKKOS_IF_ON_DEVICE((return -1;)) -} - -inline int OpenMP::impl_max_hardware_threads() noexcept { - return Impl::g_openmp_hardware_max_threads; -} - namespace Experimental { namespace Impl { // Partitioning an Execution Space: expects space and integer arguments for From fbfa01e0aeb96079a0aed2530aed7617f6ed2179 Mon Sep 17 00:00:00 2001 From: Bruno Turcksin Date: Wed, 4 Jan 2023 21:08:22 +0000 Subject: [PATCH 031/496] Move OpenMP UniqueToken to its own file --- core/src/OpenMP/Kokkos_OpenMP_Instance.hpp | 102 -------------- core/src/OpenMP/Kokkos_OpenMP_UniqueToken.hpp | 125 ++++++++++++++++++ core/src/decl/Kokkos_Declare_OPENMP.hpp | 1 + 3 files changed, 126 insertions(+), 102 deletions(-) create mode 100644 core/src/OpenMP/Kokkos_OpenMP_UniqueToken.hpp diff --git a/core/src/OpenMP/Kokkos_OpenMP_Instance.hpp b/core/src/OpenMP/Kokkos_OpenMP_Instance.hpp index a842e9cdae..08fc66a1a2 100644 --- a/core/src/OpenMP/Kokkos_OpenMP_Instance.hpp +++ 
b/core/src/OpenMP/Kokkos_OpenMP_Instance.hpp @@ -31,7 +31,6 @@ #include -#include #include #include @@ -141,107 +140,6 @@ class MasterLock { }; #endif -template <> -class UniqueToken { - private: - using buffer_type = Kokkos::View; - int m_count; - buffer_type m_buffer_view; - uint32_t volatile* m_buffer; - - public: - using execution_space = OpenMP; - using size_type = int; - - /// \brief create object size for concurrency on the given instance - /// - /// This object should not be shared between instances - UniqueToken(execution_space const& = execution_space()) noexcept - : m_count(::Kokkos::OpenMP::impl_thread_pool_size()), - m_buffer_view(buffer_type()), - m_buffer(nullptr) {} - - UniqueToken(size_type max_size, execution_space const& = execution_space()) - : m_count(max_size), - m_buffer_view("UniqueToken::m_buffer_view", - ::Kokkos::Impl::concurrent_bitset::buffer_bound(m_count)), - m_buffer(m_buffer_view.data()) {} - - /// \brief upper bound for acquired values, i.e. 0 <= value < size() - KOKKOS_INLINE_FUNCTION - int size() const noexcept { - KOKKOS_IF_ON_HOST((return m_count;)) - - KOKKOS_IF_ON_DEVICE((return 0;)) - } - - /// \brief acquire value such that 0 <= value < size() - KOKKOS_INLINE_FUNCTION - int acquire() const noexcept { - KOKKOS_IF_ON_HOST( - (if (m_count >= ::Kokkos::OpenMP::impl_thread_pool_size()) return :: - Kokkos::OpenMP::impl_thread_pool_rank(); - const ::Kokkos::pair result = - ::Kokkos::Impl::concurrent_bitset::acquire_bounded( - m_buffer, m_count, ::Kokkos::Impl::clock_tic() % m_count); - - if (result.first < 0) { - ::Kokkos::abort( - "UniqueToken failure to acquire tokens, no tokens " - "available"); - } - - return result.first;)) - - KOKKOS_IF_ON_DEVICE((return 0;)) - } - - /// \brief release a value acquired by generate - KOKKOS_INLINE_FUNCTION - void release(int i) const noexcept { - KOKKOS_IF_ON_HOST( - (if (m_count < ::Kokkos::OpenMP::impl_thread_pool_size()) { - ::Kokkos::Impl::concurrent_bitset::release(m_buffer, i); - })) - 
- KOKKOS_IF_ON_DEVICE(((void)i;)) - } -}; - -template <> -class UniqueToken { - public: - using execution_space = OpenMP; - using size_type = int; - - /// \brief create object size for concurrency on the given instance - /// - /// This object should not be shared between instances - UniqueToken(execution_space const& = execution_space()) noexcept {} - - /// \brief upper bound for acquired values, i.e. 0 <= value < size() - KOKKOS_INLINE_FUNCTION - int size() const noexcept { - KOKKOS_IF_ON_HOST((return Kokkos::Impl::g_openmp_hardware_max_threads;)) - - KOKKOS_IF_ON_DEVICE((return 0;)) - } - - /// \brief acquire value such that 0 <= value < size() - // FIXME this is wrong when using nested parallelism. In that case multiple - // threads have the same thread ID. - KOKKOS_INLINE_FUNCTION - int acquire() const noexcept { - KOKKOS_IF_ON_HOST((return omp_get_thread_num();)) - - KOKKOS_IF_ON_DEVICE((return 0;)) - } - - /// \brief release a value acquired by generate - KOKKOS_INLINE_FUNCTION - void release(int) const noexcept {} -}; - } // namespace Experimental namespace Experimental { diff --git a/core/src/OpenMP/Kokkos_OpenMP_UniqueToken.hpp b/core/src/OpenMP/Kokkos_OpenMP_UniqueToken.hpp new file mode 100644 index 0000000000..0f195aa06d --- /dev/null +++ b/core/src/OpenMP/Kokkos_OpenMP_UniqueToken.hpp @@ -0,0 +1,125 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_OPENMP_UNIQUE_TOKEN_HPP +#define KOKKOS_OPENMP_UNIQUE_TOKEN_HPP + +#include + +namespace Kokkos::Experimental { +template <> +class UniqueToken { + private: + using buffer_type = Kokkos::View; + int m_count; + buffer_type m_buffer_view; + uint32_t volatile* m_buffer; + + public: + using execution_space = OpenMP; + using size_type = int; + + /// \brief create object size for concurrency on the given instance + /// + /// This object should not be shared between instances + UniqueToken(execution_space const& = execution_space()) noexcept + : m_count(::Kokkos::OpenMP::impl_thread_pool_size()), + m_buffer_view(buffer_type()), + m_buffer(nullptr) {} + + UniqueToken(size_type max_size, execution_space const& = execution_space()) + : m_count(max_size), + m_buffer_view("UniqueToken::m_buffer_view", + ::Kokkos::Impl::concurrent_bitset::buffer_bound(m_count)), + m_buffer(m_buffer_view.data()) {} + + /// \brief upper bound for acquired values, i.e. 
0 <= value < size() + KOKKOS_INLINE_FUNCTION + int size() const noexcept { + KOKKOS_IF_ON_HOST((return m_count;)) + + KOKKOS_IF_ON_DEVICE((return 0;)) + } + + /// \brief acquire value such that 0 <= value < size() + KOKKOS_INLINE_FUNCTION + int acquire() const noexcept { + KOKKOS_IF_ON_HOST( + (if (m_count >= ::Kokkos::OpenMP::impl_thread_pool_size()) return :: + Kokkos::OpenMP::impl_thread_pool_rank(); + const ::Kokkos::pair result = + ::Kokkos::Impl::concurrent_bitset::acquire_bounded( + m_buffer, m_count, ::Kokkos::Impl::clock_tic() % m_count); + + if (result.first < 0) { + ::Kokkos::abort( + "UniqueToken failure to acquire tokens, no tokens " + "available"); + } + + return result.first;)) + + KOKKOS_IF_ON_DEVICE((return 0;)) + } + + /// \brief release a value acquired by generate + KOKKOS_INLINE_FUNCTION + void release(int i) const noexcept { + KOKKOS_IF_ON_HOST( + (if (m_count < ::Kokkos::OpenMP::impl_thread_pool_size()) { + ::Kokkos::Impl::concurrent_bitset::release(m_buffer, i); + })) + + KOKKOS_IF_ON_DEVICE(((void)i;)) + } +}; + +template <> +class UniqueToken { + public: + using execution_space = OpenMP; + using size_type = int; + + /// \brief create object size for concurrency on the given instance + /// + /// This object should not be shared between instances + UniqueToken(execution_space const& = execution_space()) noexcept {} + + /// \brief upper bound for acquired values, i.e. 0 <= value < size() + KOKKOS_INLINE_FUNCTION + int size() const noexcept { + KOKKOS_IF_ON_HOST((return Kokkos::Impl::g_openmp_hardware_max_threads;)) + + KOKKOS_IF_ON_DEVICE((return 0;)) + } + + /// \brief acquire value such that 0 <= value < size() + // FIXME this is wrong when using nested parallelism. In that case multiple + // threads have the same thread ID. 
+ KOKKOS_INLINE_FUNCTION + int acquire() const noexcept { + KOKKOS_IF_ON_HOST((return omp_get_thread_num();)) + + KOKKOS_IF_ON_DEVICE((return 0;)) + } + + /// \brief release a value acquired by generate + KOKKOS_INLINE_FUNCTION + void release(int) const noexcept {} +}; +} // namespace Kokkos::Experimental + +#endif diff --git a/core/src/decl/Kokkos_Declare_OPENMP.hpp b/core/src/decl/Kokkos_Declare_OPENMP.hpp index 35d0948216..ef9d27108e 100644 --- a/core/src/decl/Kokkos_Declare_OPENMP.hpp +++ b/core/src/decl/Kokkos_Declare_OPENMP.hpp @@ -20,6 +20,7 @@ #if defined(KOKKOS_ENABLE_OPENMP) #include #include +#include #endif #endif From 74a7988bf3217d8fbc22abcde5bd7fd820678f49 Mon Sep 17 00:00:00 2001 From: Seyong Lee Date: Mon, 9 Jan 2023 16:59:48 -0500 Subject: [PATCH 032/496] Minor bug fixes on CMake and Make configurations for the OpenACC backend. --- Makefile.kokkos | 4 ++-- core/src/Kokkos_Macros.hpp | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/Makefile.kokkos b/Makefile.kokkos index 5cc69bf054..815dd01a39 100644 --- a/Makefile.kokkos +++ b/Makefile.kokkos @@ -1230,8 +1230,8 @@ ifneq ($(KOKKOS_INTERNAL_NEW_CONFIG), 0) tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_DeclareBackend.hpp") endif ifeq ($(KOKKOS_INTERNAL_USE_OPENACC), 1) - tmp := $(call kokkos_append_config_header,"\#include ","KokkosCore_Config_FwdBackend.hpp") - tmp := $(call kokkos_append_config_header,"\#include ","KokkosCore_Config_DeclareBackend.hpp") + tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_FwdBackend.hpp") + tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_DeclareBackend.hpp") endif ifeq ($(KOKKOS_INTERNAL_USE_THREADS), 1) tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_FwdBackend.hpp") diff --git a/core/src/Kokkos_Macros.hpp b/core/src/Kokkos_Macros.hpp index e8067dc5a5..f251c2dab9 100644 --- a/core/src/Kokkos_Macros.hpp +++ b/core/src/Kokkos_Macros.hpp 
@@ -522,6 +522,7 @@ static constexpr bool kokkos_omp_on_host() { return false; } KOKKOS_IMPL_STRIP_PARENS(CODE) \ } #else +#include // FIXME_OPENACC acc_on_device is a non-constexpr function #define KOKKOS_IF_ON_DEVICE(CODE) \ if constexpr (acc_on_device(acc_device_not_host)) { \ From cf04bb5259c110a26e4d2accd62aa36da933026d Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Mon, 9 Jan 2023 18:48:20 -0700 Subject: [PATCH 033/496] [ci skip] update test_all_sandia update sems-archive-env to sems-env on sogpu, sems machines add new compilers, drop unsupported compilers on various machines delete unused machine (kokkos-dev, white) --- scripts/testing_scripts/test_all_sandia | 245 ++++++------------------ 1 file changed, 61 insertions(+), 184 deletions(-) diff --git a/scripts/testing_scripts/test_all_sandia b/scripts/testing_scripts/test_all_sandia index 40c30ba7f4..a3dc9ffd34 100755 --- a/scripts/testing_scripts/test_all_sandia +++ b/scripts/testing_scripts/test_all_sandia @@ -105,10 +105,6 @@ if [[ "$HOSTNAME" == caraway* ]]; then MACHINE=caraway fi -if [[ "$HOSTNAME" == kokkos-dev\.sandia\.gov* ]]; then - MACHINE=kokkos-dev -fi - if [[ "$HOSTNAME" == sogpu01* ]]; then MACHINE=sogpu fi @@ -272,18 +268,12 @@ fi # if [ "$MACHINE" = "sems" ]; then - source /projects/sems/modulefiles/utils/sems-archive-modules-init.sh - - # On unnamed sems machines, assume more restricted rhel7 environment - # On rhel7 sems machines gcc/7.3.0, clang/4.0.1, and intel/16.0.3 are missing - # Remove kokkkos-env module use + module purge + MODULE_ENVIRONMENT="sh /projects/sems/modulefiles/utils/sems-v2-modules-init.sh" + eval "$MODULE_ENVIRONMENT" - module load sems-archive-cmake/3.17.1 - BASE_MODULE_LIST="sems-archive-env,sems-archive-cmake/3.17.1,sems-archive-/" - OLDINTEL_BASE_MODULE_LIST="sems-archive-env,sems-archive-cmake/3.17.1,sems-archive-gcc/6.4.0,sems-archive-/" - INTEL_BASE_MODULE_LIST="sems-archive-env,sems-archive-cmake/3.17.1,sems-archive-gcc/7.2.0,sems-archive-/" - 
CLANG_BASE_MODULE_LIST="sems-archive-env,sems-archive-cmake/3.17.1,sems-archive-gcc/9.2.0,sems-archive-/" - CUDA9_MODULE_LIST="sems-archive-env,sems-archive-cmake/3.17.1,sems-archive-gcc/7.2.0,sems-archive-/" + module load sems-cmake sems-git + BASE_MODULE_LIST="sems-cmake,sems-/" SKIP_HWLOC=True # No sems hwloc module @@ -291,130 +281,61 @@ if [ "$MACHINE" = "sems" ]; then ARCH_FLAG="" fi - if [ "$SPOT_CHECK" = "True" ]; then - # Format: (compiler module-list build-list exe-name warning-flag) - COMPILERS=("gcc/5.3.0 $BASE_MODULE_LIST "OpenMP" g++ $GCC_WARNING_FLAGS" - "gcc/7.2.0 $BASE_MODULE_LIST "Serial" g++ $GCC_WARNING_FLAGS" - "intel/17.0.1 $INTEL_BASE_MODULE_LIST "OpenMP" icpc $INTEL_WARNING_FLAGS" - "cuda/9.2 $CUDA9_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" - ) - else - # Format: (compiler module-list build-list exe-name warning-flag) - COMPILERS=("gcc/5.3.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "gcc/6.1.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "gcc/6.4.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "gcc/7.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "gcc/7.3.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "gcc/8.3.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "gcc/9.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "clang/5.0.1 $CLANG_BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" - "clang/7.0.1 $CLANG_BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" - "clang/9.0.0 $CLANG_BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" - "clang/10.0.0 $CLANG_BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" - "intel/17.0.1 $OLDINTEL_BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" - "intel/18.0.5 $OLDINTEL_BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" - "intel/19.0.5 $INTEL_BASE_MODULE_LIST $INTEL_BUILD_LIST icpc 
$INTEL_WARNING_FLAGS" - "cuda/9.2 $CUDA9_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" - ) - fi + # Format: (compiler module-list build-list exe-name warning-flag) + COMPILERS=("gcc/8.3.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "gcc/10.1.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "clang/11.0.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" + "clang/14.0.2 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" + "intel/19.0.5 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" + "intel/19.1.2 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" + "intel/2021.3 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" + ) + elif [ "$MACHINE" = "sogpu" ]; then - source /projects/sems/modulefiles/utils/sems-archive-modules-init.sh + MODULE_ENVIRONMENT="sh /projects/sems/modulefiles/utils/sems-v2-modules-init.sh" + eval "$MODULE_ENVIRONMENT" - module load sems-archive-cmake/3.17.1 sems-archive-git - BASE_MODULE_LIST="sems-archive-env,sems-archive-cmake/3.17.1,sems-archive-/" - OLDINTEL_BASE_MODULE_LIST="sems-archive-env,sems-archive-cmake/3.17.1,sems-archive-gcc/6.4.0,sems-archive-/" - INTEL_BASE_MODULE_LIST="sems-archive-env,sems-archive-cmake/3.17.1,sems-archive-gcc/7.2.0,sems-archive-/" - CLANG_BASE_MODULE_LIST="sems-archive-env,sems-archive-cmake/3.17.1,sems-archive-gcc/9.2.0,sems-archive-/" - CUDA_MODULE_LIST="sems-archive-env,sems-archive-cmake/3.17.1,sems-archive-gcc/7.2.0,sems-archive-/" - CUDA11_MODULE_LIST="sems-archive-env,sems-archive-cmake/3.17.1,sems-archive-gcc/8.3.0,sems-archive-/" + module load sems-cmake sems-git + BASE_MODULE_LIST="sems-cmake,sems-/" + CUDA11_MODULE_LIST="sems-cmake,sems-gcc/8.3.0,sems-/" SKIP_HWLOC=True # No sems hwloc module + echo "." 
+ module list if [ -z "$ARCH_FLAG" ]; then ARCH_FLAG="--arch=Volta70" fi - # Format: (compiler module-list build-list exe-name warning-flag) - COMPILERS=("gcc/5.3.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "gcc/6.1.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "gcc/6.4.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "gcc/7.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "gcc/7.3.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "gcc/8.3.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "gcc/9.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "clang/5.0.1 $CLANG_BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" - "clang/7.0.1 $CLANG_BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" - "clang/9.0.0 $CLANG_BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" - "clang/10.0.0 $CLANG_BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" - "intel/17.0.1 $OLDINTEL_BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" - "intel/18.0.5 $OLDINTEL_BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" - "intel/19.0.5 $INTEL_BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" - "cuda/10.1 $CUDA_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" - "cuda/11.1 $CUDA11_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" - ) -elif [ "$MACHINE" = "kokkos-dev" ]; then - source /projects/sems/modulefiles/utils/sems-archive-modules-init.sh - - module load sems-archive-cmake/3.17.1 - BASE_MODULE_LIST="sems-archive-env,sems-archive-cmake/3.17.1,sems-archive-/" - OLDINTEL_BASE_MODULE_LIST="sems-archive-env,sems-archive-cmake/3.17.1,sems-archive-gcc/6.4.0,sems-archive-/" - INTEL_BASE_MODULE_LIST="sems-archive-env,sems-archive-cmake/3.17.1,sems-archive-gcc/7.2.0,sems-archive-/" - 
CLANG_BASE_MODULE_LIST="sems-archive-env,sems-archive-cmake/3.17.1,sems-archive-gcc/9.2.0,sems-archive-/" - CUDA9_MODULE_LIST="sems-archive-env,sems-archive-cmake/3.17.1,sems-archive-gcc/6.1.0,sems-archive-/" - CUDA10_MODULE_LIST="sems-archive-env,sems-archive-cmake/3.17.1,sems-archive-gcc/7.2.0,sems-archive-/" - CUDA11_MODULE_LIST="sems-archive-env,sems-archive-cmake/3.17.1,sems-archive-gcc/9.2.0,sems-archive-/" - CLANG7_MODULE_LIST="sems-archive-env,sems-archive-cmake/3.17.1,sems-archive-gcc/7.2.0,sems-archive-cuda/9.2,sems-archive-/" - SKIP_HWLOC=True - - if [ -z "$ARCH_FLAG" ]; then - ARCH_FLAG="--arch=Kepler35" - fi + echo "." + # Format: (compiler module-list build-list exe-name warning-flag) + COMPILERS=("gcc/8.3.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "gcc/10.1.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "clang/11.0.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" + "clang/14.0.2 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" + "intel/19.0.5 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" + "intel/19.1.2 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" + "intel/2021.3 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" + "cuda/11.1.0 $CUDA11_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" + "cuda/11.4.2 $CUDA11_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" + ) - if [ "$SPOT_CHECK" = "True" ]; then - # Format: (compiler module-list build-list exe-name warning-flag) - COMPILERS=("gcc/5.3.0 $BASE_MODULE_LIST "OpenMP" g++ $GCC_WARNING_FLAGS" - "gcc/7.3.0 $BASE_MODULE_LIST "Serial" g++ $GCC_WARNING_FLAGS" - "intel/17.0.1 $OLDINTEL_BASE_MODULE_LIST "OpenMP" icpc $INTEL_WARNING_FLAGS" - "intel/18.0.5 $OLDINTEL_BASE_MODULE_LIST "Serial" icpc $INTEL_WARNING_FLAGS" - "intel/19.0.5 $INTEL_BASE_MODULE_LIST "Pthread_Serial" icpc $INTEL_WARNING_FLAGS" - "clang/5.0.1 $BASE_MODULE_LIST 
"Pthread_Serial" clang++ $CLANG_WARNING_FLAGS" - "clang/7.0.1 $CLANG7_MODULE_LIST "Cuda_OpenMP" clang++ $CLANG_WARNING_FLAGS" - "cuda/9.2 $CUDA9_MODULE_LIST "Cuda_OpenMP" $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" - ) - else - # Format: (compiler module-list build-list exe-name warning-flag) - COMPILERS=("gcc/5.3.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "gcc/6.1.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "gcc/7.3.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "gcc/8.3.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "gcc/9.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "intel/17.0.1 $OLDINTEL_BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" - "intel/18.0.5 $OLDINTEL_BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" - "intel/19.0.5 $INTEL_BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" - "clang/5.0.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" - "clang/7.0.1 $CLANG7_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" - "clang/9.0.0 $CLANG_BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" - "clang/10.0.0 $CLANG_BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" - "cuda/10.1 $CUDA10_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" - "cuda/11.1 $CUDA11_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" - "cuda/9.2 $CUDA9_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" - ) - fi + echo "." 
+ echo "COMPILER VAR : $COMPILERS" elif [ "$MACHINE" = "weaver" ]; then source /etc/profile.d/modules.sh SKIP_HWLOC=True - # For rhel7W queue - BASE_MODULE_LIST="cmake/3.19.3,/" - - # For rhel8 queue - # Cuda/11 modules available only on the rhel8 queue (rhel8 OS) - RHEL8_BASE_MODULE_LIST="cmake/3.21.2,/" - RHEL8_CUDA11_MODULE_LIST="cmake/3.21.2,/" + # Cuda/11 modules available only on the dev queue (rhel8 OS); gcc/8.3.1 loaded by default + CUDA11_MODULE_LIST="cmake/3.21.2,/" # Don't do pthread with Power - GCC_IBM_BUILD_LIST="OpenMP,Serial,OpenMP_Serial" + GCC_BUILD_LIST="OpenMP,Serial,OpenMP_Serial" + + # Format: (compiler module-list build-list exe-name warning-flag) + COMPILERS=("cuda/11.2.2 $CUDA11_MODULE_LIST $CUDA_IBM_BUILD_LIST ${KOKKOS_PATH}/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" + ) # Format: (compiler module-list build-list exe-name warning-flag) COMPILERS=("gcc/9.3.0 $BASE_MODULE_LIST $GCC_IBM_BUILD_LIST g++ $GCC_WARNING_FLAGS" @@ -432,9 +353,7 @@ elif [ "$MACHINE" = "voltrino" ]; then BASE_MODULE_LIST="PrgEnv-intel,craype-mic-knl,cmake/3.16.2,slurm/20.11.4a,/,gcc/9.3.0" # Format: (compiler module-list build-list exe-name warning-flag) - COMPILERS=("intel/17.0.4 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" - "intel/18.0.5 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" - "intel/19.0.4 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" + COMPILERS=("intel/19.0.4 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" ) if [ -z "$ARCH_FLAG" ]; then @@ -448,8 +367,7 @@ elif [ "$MACHINE" = "mayer" ]; then BASE_MODULE_LIST="cmake/3.17.1,/" # Format: (compiler module-list build-list exe-name warning-flag) - COMPILERS=("gnu7/7.2.0 $BASE_MODULE_LIST $ARM_GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "gnu9/9.3.0 $BASE_MODULE_LIST $ARM_GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + COMPILERS=("gnu9/9.3.0 $BASE_MODULE_LIST $ARM_GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" "arm/20.1 $BASE_MODULE_LIST $ARM_GCC_BUILD_LIST armclang++ 
$CLANG_WARNING_FLAGS") if [ -z "$ARCH_FLAG" ]; then @@ -460,15 +378,12 @@ elif [ "$MACHINE" = "caraway" ]; then SKIP_HWLOC=True BASE_MODULE_LIST="cmake/3.19.3,/" - # Cuda11 usage available on the V100 queue - CUDA11_MODULE_LIST="cmake/3.22.2,/,gcc/8.2.0" HIPCLANG_BUILD_LIST="Hip_Serial,Hip_OpenMP" HIPCLANG_WARNING_FLAGS="-Werror -Wno-unused-command-line-argument -DNDEBUG" # Format: (compiler module-list build-list exe-name warning-flag) COMPILERS=("rocm/5.2.0 $BASE_MODULE_LIST $HIPCLANG_BUILD_LIST hipcc $HIPCLANG_WARNING_FLAGS" - "cuda/11.4 $CUDA11_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" "gcc/8.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" "gcc/9.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" "gcc/10.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" @@ -491,13 +406,6 @@ elif [ "$MACHINE" = "blake" ]; then BASE_MODULE_LIST_ONEAPI="cmake/3.19.3,/oneAPI/base-toolkit/" ONEAPI_WARNING_FLAGS="" - if [ "$SPOT_CHECK" = "True" ]; then - - # Format: (compiler module-list build-list exe-name warning-flag) - COMPILERS=("intel/18.1.163 $BASE_MODULE_LIST_INTEL $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" - "gcc/7.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - ) - else COMPILERS=("intel/19.5.281 $BASE_MODULE_LIST_INTEL $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" "intel/2021.1.1 $BASE_MODULE_LIST_ONEAPI $INTEL_BUILD_LIST icpx $ONEAPI_WARNING_FLAGS" "gcc/8.1.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" @@ -506,77 +414,46 @@ elif [ "$MACHINE" = "blake" ]; then "gcc/10.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" ) - fi if [ -z "$ARCH_FLAG" ]; then ARCH_FLAG="--arch=SKX" fi elif [ "$MACHINE" = "kokkos-dev-2" ]; then + module purge source /projects/sems/modulefiles/utils/sems-archive-modules-init.sh module use /home/projects/x86-64/modulefiles/local - module purge module load sems-archive-env module load sems-archive-git - module load sems-archive-tex 
module load sems-archive-cmake/3.17.1 module load sems-archive-gdb SKIP_HWLOC=True BASE_MODULE_LIST="sems-archive-env,sems-archive-cmake/3.17.1,sems-archive-/" - OLDINTEL_BASE_MODULE_LIST="sems-archive-env,sems-archive-cmake/3.17.1,sems-archive-gcc/6.4.0,sems-archive-/" - INTEL_BASE_MODULE_LIST="sems-archive-env,sems-archive-cmake/3.17.1,sems-archive-gcc/7.2.0,sems-archive-/" + INTEL_BASE_MODULE_LIST="sems-archive-env,sems-archive-cmake/3.17.1,sems-archive-gcc/8.3.0,sems-archive-/" CLANG_BASE_MODULE_LIST="sems-archive-env,sems-archive-cmake/3.17.1,sems-archive-gcc/9.2.0,sems-archive-/" - CLANG8_MODULE_LIST="sems-archive-env,sems-archive-cmake/3.17.1,sems-archive-gcc/7.2.0,cuda/10.0,/" GCC91_MODULE_LIST="sems-archive-env,sems-archive-cmake/3.17.1,/" - NVCC9_MODULE_LIST="sems-archive-env,sems-archive-cmake/3.17.1,sems-archive-gcc/5.3.0,sems-archive-/" - NVCC_MODULE_LIST="sems-archive-env,sems-archive-cmake/3.17.1,sems-archive-gcc/7.3.0,/" NVCC_SEMSMODULE_LIST="sems-archive-env,sems-archive-cmake/3.17.1,sems-archive-gcc/7.3.0,sems-archive-/" - NVCC11_MODULE_LIST="sems-archive-env,sems-archive-cmake/3.17.1,sems-archive-gcc/9.2.0,/" + NVCC_MODULE_LIST="sems-archive-env,sems-archive-cmake/3.17.1,sems-archive-gcc/9.2.0,/" - BUILD_LIST_CUDA_NVCC="Cuda_Serial,Cuda_Pthread" - BUILD_LIST_CUDA_CLANG="Cuda_Serial,Cuda_OpenMP" BUILD_LIST_CLANG="Serial,Pthread,OpenMP" - if [ "$SPOT_CHECK" = "True" ]; then - # Format: (compiler module-list build-list exe-name warning-flag) - COMPILERS=("gcc/7.3.0 $BASE_MODULE_LIST "OpenMP,Pthread" g++ $GCC_WARNING_FLAGS" - "gcc/8.3.0 $BASE_MODULE_LIST "OpenMP" g++ $GCC_WARNING_FLAGS" - "gcc/9.1 $GCC91_MODULE_LIST "OpenMP,Serial" g++ $GCC_WARNING_FLAGS" - "intel/18.0.5 $OLDINTEL_BASE_MODULE_LIST "OpenMP" icpc $INTEL_WARNING_FLAGS" - "clang/8.0 $CLANG8_MODULE_LIST "Cuda_OpenMP,Pthread_Serial" clang++ $CLANG_WARNING_FLAGS" - "cuda/10.1 $NVCC_MODULE_LIST "Cuda_OpenMP" $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" - ) - else - # Format: (compiler 
module-list build-list exe-name warning-flag) - COMPILERS=("cuda/10.0 $NVCC_MODULE_LIST $BUILD_LIST_CUDA_NVCC $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" - "cuda/10.1 $NVCC_SEMSMODULE_LIST $BUILD_LIST_CUDA_NVCC $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" - "cuda/11.0 $NVCC11_MODULE_LIST $BUILD_LIST_CUDA_NVCC $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" - "cuda/11.1 $NVCC_SEMSMODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" - "cuda/11.2 $NVCC11_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" - "cuda/9.2 $NVCC9_MODULE_LIST $BUILD_LIST_CUDA_NVCC $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" - "clang/8.0 $CLANG8_MODULE_LIST $BUILD_LIST_CUDA_CLANG clang++ $CUDA_WARNING_FLAGS" - "clang/8.0 $CLANG8_MODULE_LIST $BUILD_LIST_CLANG clang++ $CLANG_WARNING_FLAGS" - "gcc/5.3.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "gcc/6.1.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "gcc/7.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "gcc/7.3.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "gcc/8.3.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "gcc/9.1 $GCC91_MODULE_LIST "$GCC_BUILD_LIST" g++ $GCC_WARNING_FLAGS" - "gcc/9.2.0 $BASE_MODULE_LIST "$GCC_BUILD_LIST" g++ $GCC_WARNING_FLAGS" - "intel/17.0.1 $OLDINTEL_BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" - "intel/18.0.5 $OLDINTEL_BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" - "intel/19.0.5 $INTEL_BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" - "clang/5.0.1 $CLANG_BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" - "clang/7.0.1 $CLANG_BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" - "clang/9.0.0 $CLANG_BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" - "clang/10.0.0 $CLANG_BASE_MODULE_LIST $BUILD_LIST_CLANG clang++ $CLANG_WARNING_FLAGS" - ) - fi + # Format: (compiler 
module-list build-list exe-name warning-flag) + COMPILERS=("cuda/11.1 $NVCC_SEMSMODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" + "cuda/11.2 $NVCC_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" + "cuda/11.7 $NVCC_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" + "cuda/12.0 $NVCC_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" + "gcc/8.3.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "gcc/9.1 $GCC91_MODULE_LIST "$GCC_BUILD_LIST" g++ $GCC_WARNING_FLAGS" + "gcc/9.2.0 $BASE_MODULE_LIST "$GCC_BUILD_LIST" g++ $GCC_WARNING_FLAGS" + "intel/19.0.5 $INTEL_BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" + "clang/9.0.0 $CLANG_BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" + "clang/10.0.0 $CLANG_BASE_MODULE_LIST $BUILD_LIST_CLANG clang++ $CLANG_WARNING_FLAGS" + ) if [ -z "$ARCH_FLAG" ]; then - ARCH_FLAG="--arch=SNB,Volta70" + ARCH_FLAG="--arch=Volta70" fi else From 761ffda657411912ea8a8fc4620a88bed0676126 Mon Sep 17 00:00:00 2001 From: Christian Trott Date: Tue, 10 Jan 2023 15:34:24 -0700 Subject: [PATCH 034/496] Fix HIP Global Launch with HSA_XNACK=1 Looks like there is a race condition on use of the driver argument. 
--- core/src/HIP/Kokkos_HIP_Instance.cpp | 22 ++++++++++++++++++++++ core/src/HIP/Kokkos_HIP_Instance.hpp | 2 ++ core/src/HIP/Kokkos_HIP_KernelLaunch.hpp | 10 ++++++---- 3 files changed, 30 insertions(+), 4 deletions(-) diff --git a/core/src/HIP/Kokkos_HIP_Instance.cpp b/core/src/HIP/Kokkos_HIP_Instance.cpp index 67d650ad49..1664ac4a32 100644 --- a/core/src/HIP/Kokkos_HIP_Instance.cpp +++ b/core/src/HIP/Kokkos_HIP_Instance.cpp @@ -258,6 +258,28 @@ Kokkos::HIP::size_type *HIPInternal::scratch_functor( return m_scratchFunctor; } +Kokkos::HIP::size_type *HIPInternal::scratch_functor_host( + const std::size_t size) const { + if (verify_is_initialized("scratch_functor_host") && m_scratchFunctorSize < size) { + m_scratchFunctorSize = size; + + using Record = Kokkos::Impl::SharedAllocationRecord; + + if (m_scratchFunctorHost) + Record::decrement(Record::get_record(m_scratchFunctorHost)); + + Record *const r = + Record::allocate(Kokkos::HIPHostPinnedSpace(), "Kokkos::InternalScratchFunctorHost", + m_scratchFunctorSize); + + Record::increment(r); + + m_scratchFunctorHost = reinterpret_cast(r->data()); + } + + return m_scratchFunctor; +} + int HIPInternal::acquire_team_scratch_space() { int current_team_scratch = 0; int zero = 0; diff --git a/core/src/HIP/Kokkos_HIP_Instance.hpp b/core/src/HIP/Kokkos_HIP_Instance.hpp index 7fcd499cfb..6530fa960c 100644 --- a/core/src/HIP/Kokkos_HIP_Instance.hpp +++ b/core/src/HIP/Kokkos_HIP_Instance.hpp @@ -90,6 +90,7 @@ class HIPInternal { size_type *m_scratchSpace = nullptr; size_type *m_scratchFlags = nullptr; mutable size_type *m_scratchFunctor = nullptr; + mutable size_type *m_scratchFunctorHost = nullptr; hipStream_t m_stream = nullptr; uint32_t m_instance_id = @@ -136,6 +137,7 @@ class HIPInternal { size_type *scratch_space(const std::size_t size); size_type *scratch_flags(const std::size_t size); size_type *scratch_functor(const std::size_t size) const; + size_type *scratch_functor_host(const std::size_t size) const; uint32_t 
impl_get_instance_id() const noexcept; int acquire_team_scratch_space(); // Resizing of team level 1 scratch diff --git a/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp b/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp index 0a3e6b108a..0821972f54 100644 --- a/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp +++ b/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp @@ -379,10 +379,12 @@ struct HIPParallelLaunchKernelInvoker( hip_instance->scratch_functor(sizeof(DriverType))); - - hipMemcpyAsync(driver_ptr, &driver, sizeof(DriverType), hipMemcpyDefault, - hip_instance->m_stream); - + DriverType *driver_ptr_host = reinterpret_cast( + hip_instance->scratch_functor_host(sizeof(DriverType))); + KOKKOS_IMPL_HIP_SAFE_CALL(hipStreamSynchronize(hip_instance->m_stream)); + std::memcpy(driver_ptr_host,&driver,sizeof(DriverType)); + KOKKOS_IMPL_HIP_SAFE_CALL(hipMemcpyAsync(driver_ptr, driver_ptr_host, sizeof(DriverType), hipMemcpyDefault, + hip_instance->m_stream)); (base_t::get_kernel_func())<<m_stream>>>( driver_ptr); } From 00ab7630d30bd8b2213f044e49595c7ac7b43af6 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Wed, 11 Jan 2023 07:45:46 -0500 Subject: [PATCH 035/496] Fixup forgot to add new OpenMP source file in Makefile --- Makefile.targets | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Makefile.targets b/Makefile.targets index 32b1fab261..185c7067bd 100644 --- a/Makefile.targets +++ b/Makefile.targets @@ -89,6 +89,8 @@ Kokkos_ThreadsExec.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Threads/Kokk endif ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1) +Kokkos_OpenMP.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/OpenMP/Kokkos_OpenMP.cpp + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/OpenMP/Kokkos_OpenMP.cpp Kokkos_OpenMP_Instance.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/OpenMP/Kokkos_OpenMP_Instance.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/OpenMP/Kokkos_OpenMP_Instance.cpp Kokkos_OpenMP_Task.o: 
$(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/OpenMP/Kokkos_OpenMP_Task.cpp From cf4358ee9484c446d979b956ecdd2b381b925e99 Mon Sep 17 00:00:00 2001 From: Bruno Turcksin Date: Wed, 11 Jan 2023 09:47:14 -0500 Subject: [PATCH 036/496] Add more comments --- core/src/HIP/Kokkos_HIP_KernelLaunch.hpp | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp b/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp index 0821972f54..c92d630d70 100644 --- a/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp +++ b/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp @@ -377,14 +377,20 @@ struct HIPParallelLaunchKernelInvoker( hip_instance->scratch_functor(sizeof(DriverType))); DriverType *driver_ptr_host = reinterpret_cast( hip_instance->scratch_functor_host(sizeof(DriverType))); KOKKOS_IMPL_HIP_SAFE_CALL(hipStreamSynchronize(hip_instance->m_stream)); - std::memcpy(driver_ptr_host,&driver,sizeof(DriverType)); - KOKKOS_IMPL_HIP_SAFE_CALL(hipMemcpyAsync(driver_ptr, driver_ptr_host, sizeof(DriverType), hipMemcpyDefault, - hip_instance->m_stream)); + std::memcpy(driver_ptr_host, &driver, sizeof(DriverType)); + KOKKOS_IMPL_HIP_SAFE_CALL( + hipMemcpyAsync(driver_ptr, driver_ptr_host, sizeof(DriverType), + hipMemcpyDefault, hip_instance->m_stream)); (base_t::get_kernel_func())<<m_stream>>>( driver_ptr); } From cde661d709ea9226287a8c51480afd3ce0a73ebe Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Wed, 11 Jan 2023 14:48:45 -0500 Subject: [PATCH 037/496] Update Kokkos version on develop --- CMakeLists.txt | 4 ++-- Makefile.kokkos | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 8ad0500358..39fa5bbe96 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -137,8 +137,8 @@ ELSEIF (NOT CMAKE_SIZEOF_VOID_P EQUAL 8) ENDIF() -set(Kokkos_VERSION_MAJOR 3) -set(Kokkos_VERSION_MINOR 7) +set(Kokkos_VERSION_MAJOR 4) +set(Kokkos_VERSION_MINOR 0) set(Kokkos_VERSION_PATCH 99) set(Kokkos_VERSION 
"${Kokkos_VERSION_MAJOR}.${Kokkos_VERSION_MINOR}.${Kokkos_VERSION_PATCH}") math(EXPR KOKKOS_VERSION "${Kokkos_VERSION_MAJOR} * 10000 + ${Kokkos_VERSION_MINOR} * 100 + ${Kokkos_VERSION_PATCH}") diff --git a/Makefile.kokkos b/Makefile.kokkos index 815dd01a39..d0fc1f7e3b 100644 --- a/Makefile.kokkos +++ b/Makefile.kokkos @@ -1,7 +1,7 @@ # Default settings common options. -KOKKOS_VERSION_MAJOR = 3 -KOKKOS_VERSION_MINOR = 7 +KOKKOS_VERSION_MAJOR = 4 +KOKKOS_VERSION_MINOR = 0 KOKKOS_VERSION_PATCH = 99 KOKKOS_VERSION = $(shell echo $(KOKKOS_VERSION_MAJOR)*10000+$(KOKKOS_VERSION_MINOR)*100+$(KOKKOS_VERSION_PATCH) | bc) From 487deee1c471cce9323b22fd71c479100b5414fc Mon Sep 17 00:00:00 2001 From: Bruno Turcksin Date: Wed, 11 Jan 2023 16:29:33 -0500 Subject: [PATCH 038/496] Apply clang-format --- core/src/HIP/Kokkos_HIP_Instance.cpp | 12 +++++++----- core/src/HIP/Kokkos_HIP_Instance.hpp | 6 +++--- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/core/src/HIP/Kokkos_HIP_Instance.cpp b/core/src/HIP/Kokkos_HIP_Instance.cpp index 1664ac4a32..0d9a76ccb7 100644 --- a/core/src/HIP/Kokkos_HIP_Instance.cpp +++ b/core/src/HIP/Kokkos_HIP_Instance.cpp @@ -260,17 +260,19 @@ Kokkos::HIP::size_type *HIPInternal::scratch_functor( Kokkos::HIP::size_type *HIPInternal::scratch_functor_host( const std::size_t size) const { - if (verify_is_initialized("scratch_functor_host") && m_scratchFunctorSize < size) { + if (verify_is_initialized("scratch_functor_host") && + m_scratchFunctorSize < size) { m_scratchFunctorSize = size; - using Record = Kokkos::Impl::SharedAllocationRecord; + using Record = + Kokkos::Impl::SharedAllocationRecord; if (m_scratchFunctorHost) Record::decrement(Record::get_record(m_scratchFunctorHost)); - Record *const r = - Record::allocate(Kokkos::HIPHostPinnedSpace(), "Kokkos::InternalScratchFunctorHost", - m_scratchFunctorSize); + Record *const r = Record::allocate(Kokkos::HIPHostPinnedSpace(), + "Kokkos::InternalScratchFunctorHost", + m_scratchFunctorSize); 
Record::increment(r); diff --git a/core/src/HIP/Kokkos_HIP_Instance.hpp b/core/src/HIP/Kokkos_HIP_Instance.hpp index 6530fa960c..597c829142 100644 --- a/core/src/HIP/Kokkos_HIP_Instance.hpp +++ b/core/src/HIP/Kokkos_HIP_Instance.hpp @@ -87,9 +87,9 @@ class HIPInternal { std::size_t m_scratchFlagsCount = 0; mutable std::size_t m_scratchFunctorSize = 0; - size_type *m_scratchSpace = nullptr; - size_type *m_scratchFlags = nullptr; - mutable size_type *m_scratchFunctor = nullptr; + size_type *m_scratchSpace = nullptr; + size_type *m_scratchFlags = nullptr; + mutable size_type *m_scratchFunctor = nullptr; mutable size_type *m_scratchFunctorHost = nullptr; hipStream_t m_stream = nullptr; From 296de1291fec4f51a681ce384650f260c831f134 Mon Sep 17 00:00:00 2001 From: Bruno Turcksin Date: Wed, 11 Jan 2023 16:51:10 -0500 Subject: [PATCH 039/496] Return host functor instead of device one Co-authored-by: Damien L-G --- core/src/HIP/Kokkos_HIP_Instance.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/src/HIP/Kokkos_HIP_Instance.cpp b/core/src/HIP/Kokkos_HIP_Instance.cpp index 0d9a76ccb7..865a936854 100644 --- a/core/src/HIP/Kokkos_HIP_Instance.cpp +++ b/core/src/HIP/Kokkos_HIP_Instance.cpp @@ -279,7 +279,7 @@ Kokkos::HIP::size_type *HIPInternal::scratch_functor_host( m_scratchFunctorHost = reinterpret_cast(r->data()); } - return m_scratchFunctor; + return m_scratchFunctorHost; } int HIPInternal::acquire_team_scratch_space() { From 46aae0f142105b518c4d05621768c628d1e038c2 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Wed, 11 Jan 2023 21:47:06 -0500 Subject: [PATCH 040/496] Desul atomics fixup detect use of SYCL --- tpls/desul/include/desul/atomics/Macros.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tpls/desul/include/desul/atomics/Macros.hpp b/tpls/desul/include/desul/atomics/Macros.hpp index 61e14783d5..61baccf8e3 100644 --- a/tpls/desul/include/desul/atomics/Macros.hpp +++ b/tpls/desul/include/desul/atomics/Macros.hpp @@ -39,7 
+39,7 @@ SPDX-License-Identifier: (BSD-3-Clause) #define DESUL_HAVE_HIP_ATOMICS #endif -#ifdef __SYCL_DEVICE_ONLY__ +#ifdef SYCL_LANGUAGE_VERSION #define DESUL_HAVE_SYCL_ATOMICS #endif From 0e3848f0b4dc09c2c6718b9d9e9bdef5d6f65d9c Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Wed, 11 Jan 2023 22:55:42 -0500 Subject: [PATCH 041/496] Desul atomics: drop unnecessary macro guard that checks for__CUDA_ARCH__ in compare exchange --- tpls/desul/include/desul/atomics/Compare_Exchange_CUDA.hpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/tpls/desul/include/desul/atomics/Compare_Exchange_CUDA.hpp b/tpls/desul/include/desul/atomics/Compare_Exchange_CUDA.hpp index 986c5b6f07..456d6b011e 100644 --- a/tpls/desul/include/desul/atomics/Compare_Exchange_CUDA.hpp +++ b/tpls/desul/include/desul/atomics/Compare_Exchange_CUDA.hpp @@ -138,7 +138,6 @@ __device__ std::enable_if_t device_atomic_e // SeqCst is not directly supported by PTX, need the additional fences: -#if defined(__CUDA_ARCH__) || !defined(__NVCC__) namespace desul { namespace Impl { template @@ -240,6 +239,5 @@ device_atomic_exchange(T* const dest, T value, MemoryOrder, MemoryScope scope) { } } // namespace Impl } // namespace desul -#endif #endif From 0986a3a86327ad3f4113f0b55a253005648c4234 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Wed, 11 Jan 2023 22:56:03 -0500 Subject: [PATCH 042/496] Desul atomics: drop unnecessary macro guard that checks for__CUDA_ARCH__ in PTX assembly code --- tpls/desul/include/desul/atomics/cuda/CUDA_asm.hpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/tpls/desul/include/desul/atomics/cuda/CUDA_asm.hpp b/tpls/desul/include/desul/atomics/cuda/CUDA_asm.hpp index 13a733dc61..96e0dfa269 100644 --- a/tpls/desul/include/desul/atomics/cuda/CUDA_asm.hpp +++ b/tpls/desul/include/desul/atomics/cuda/CUDA_asm.hpp @@ -1,7 +1,6 @@ #include namespace desul { namespace Impl { -#if defined(__CUDA_ARCH__) || (defined(__clang__) && !defined(__NVCC__)) // Choose the variant of atomics we are 
using later #if !defined(DESUL_IMPL_ATOMIC_CUDA_PTX_PREDICATE) && \ !defined(DESUL_IMPL_ATOMIC_CUDA_PTX_ISGLOBAL) @@ -14,6 +13,5 @@ namespace Impl { #endif #include -#endif } // namespace Impl } // namespace desul From 49b00de7817741243eb39bfd206b1c8b314998b3 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Thu, 12 Jan 2023 09:15:22 -0500 Subject: [PATCH 043/496] CMake: change package COMPATIBILITY mode {SameMajorVersion -> AnyNewerVersion} --- cmake/kokkos_install.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/kokkos_install.cmake b/cmake/kokkos_install.cmake index ff66d015fb..c65c2af52b 100644 --- a/cmake/kokkos_install.cmake +++ b/cmake/kokkos_install.cmake @@ -19,7 +19,7 @@ IF (NOT KOKKOS_HAS_TRILINOS AND NOT Kokkos_INSTALL_TESTING) WRITE_BASIC_PACKAGE_VERSION_FILE("${Kokkos_BINARY_DIR}/KokkosConfigVersion.cmake" VERSION "${Kokkos_VERSION}" - COMPATIBILITY SameMajorVersion) + COMPATIBILITY AnyNewerVersion) # Install the KokkosConfig*.cmake files install(FILES From cd0b63125928e7e299f466bad6786381adce5f27 Mon Sep 17 00:00:00 2001 From: Bruno Turcksin Date: Thu, 12 Jan 2023 11:17:37 -0500 Subject: [PATCH 044/496] Encapsulate staging inside scratch_functor --- core/src/HIP/Kokkos_HIP_Instance.cpp | 50 ++---------------------- core/src/HIP/Kokkos_HIP_Instance.hpp | 49 ++++++++++++++++++++++- core/src/HIP/Kokkos_HIP_KernelLaunch.hpp | 16 +------- 3 files changed, 52 insertions(+), 63 deletions(-) diff --git a/core/src/HIP/Kokkos_HIP_Instance.cpp b/core/src/HIP/Kokkos_HIP_Instance.cpp index 865a936854..5bf3c868d3 100644 --- a/core/src/HIP/Kokkos_HIP_Instance.cpp +++ b/core/src/HIP/Kokkos_HIP_Instance.cpp @@ -236,52 +236,6 @@ Kokkos::HIP::size_type *HIPInternal::scratch_flags(const std::size_t size) { return m_scratchFlags; } -Kokkos::HIP::size_type *HIPInternal::scratch_functor( - const std::size_t size) const { - if (verify_is_initialized("scratch_functor") && m_scratchFunctorSize < size) { - m_scratchFunctorSize = size; - - using 
Record = Kokkos::Impl::SharedAllocationRecord; - - if (m_scratchFunctor) - Record::decrement(Record::get_record(m_scratchFunctor)); - - Record *const r = - Record::allocate(Kokkos::HIPSpace(), "Kokkos::InternalScratchFunctor", - m_scratchFunctorSize); - - Record::increment(r); - - m_scratchFunctor = reinterpret_cast(r->data()); - } - - return m_scratchFunctor; -} - -Kokkos::HIP::size_type *HIPInternal::scratch_functor_host( - const std::size_t size) const { - if (verify_is_initialized("scratch_functor_host") && - m_scratchFunctorSize < size) { - m_scratchFunctorSize = size; - - using Record = - Kokkos::Impl::SharedAllocationRecord; - - if (m_scratchFunctorHost) - Record::decrement(Record::get_record(m_scratchFunctorHost)); - - Record *const r = Record::allocate(Kokkos::HIPHostPinnedSpace(), - "Kokkos::InternalScratchFunctorHost", - m_scratchFunctorSize); - - Record::increment(r); - - m_scratchFunctorHost = reinterpret_cast(r->data()); - } - - return m_scratchFunctorHost; -} - int HIPInternal::acquire_team_scratch_space() { int current_team_scratch = 0; int zero = 0; @@ -342,8 +296,10 @@ void HIPInternal::finalize() { RecordHIP::decrement(RecordHIP::get_record(m_scratchFlags)); RecordHIP::decrement(RecordHIP::get_record(m_scratchSpace)); - if (m_scratchFunctorSize > 0) + if (m_scratchFunctorSize > 0) { RecordHIP::decrement(RecordHIP::get_record(m_scratchFunctor)); + RecordHIP::decrement(RecordHIP::get_record(m_scratchFunctorHost)); + } } for (int i = 0; i < m_n_team_scratch; ++i) { diff --git a/core/src/HIP/Kokkos_HIP_Instance.hpp b/core/src/HIP/Kokkos_HIP_Instance.hpp index 597c829142..505a63b5b3 100644 --- a/core/src/HIP/Kokkos_HIP_Instance.hpp +++ b/core/src/HIP/Kokkos_HIP_Instance.hpp @@ -19,6 +19,7 @@ #ifndef KOKKOS_HIP_INSTANCE_HPP #define KOKKOS_HIP_INSTANCE_HPP +#include #include #include @@ -136,8 +137,8 @@ class HIPInternal { // Resizing of reduction related scratch spaces size_type *scratch_space(const std::size_t size); size_type *scratch_flags(const 
std::size_t size); - size_type *scratch_functor(const std::size_t size) const; - size_type *scratch_functor_host(const std::size_t size) const; + template + size_type *scratch_functor(DriverType const &driver) const; uint32_t impl_get_instance_id() const noexcept; int acquire_team_scratch_space(); // Resizing of team level 1 scratch @@ -146,6 +147,50 @@ class HIPInternal { void release_team_scratch_space(int scratch_pool_id); }; +template +Kokkos::HIP::size_type *HIPInternal::scratch_functor( + DriverType const &driver) const { + std::size_t size = sizeof(DriverType); + if (verify_is_initialized("scratch_functor") && m_scratchFunctorSize < size) { + m_scratchFunctorSize = size; + + using Record = Kokkos::Impl::SharedAllocationRecord; + using RecordHost = + Kokkos::Impl::SharedAllocationRecord; + + if (m_scratchFunctor) { + Record::decrement(Record::get_record(m_scratchFunctor)); + RecordHost::decrement(RecordHost::get_record(m_scratchFunctorHost)); + } + + Record *const r = + Record::allocate(Kokkos::HIPSpace(), "Kokkos::InternalScratchFunctor", + m_scratchFunctorSize); + RecordHost *const r_host = RecordHost::allocate( + Kokkos::HIPHostPinnedSpace(), "Kokkos::InternalScratchFunctorHost", + m_scratchFunctorSize); + + Record::increment(r); + RecordHost::increment(r_host); + + m_scratchFunctor = reinterpret_cast(r->data()); + m_scratchFunctorHost = reinterpret_cast(r_host->data()); + } + + // When using HSA_XNACK=1, it is necessary to copy the driver to the host to + // ensure that the driver is not destroyed before the computation is done. + // Without this fix, all the atomic tests fail. It is not obvious that this + // problem is limited to HSA_XNACK=1 even if all the tests pass when + // HSA_XNACK=0. That's why we always copy the driver. 
+ KOKKOS_IMPL_HIP_SAFE_CALL(hipStreamSynchronize(m_stream)); + std::memcpy(m_scratchFunctorHost, &driver, size); + KOKKOS_IMPL_HIP_SAFE_CALL(hipMemcpyAsync(m_scratchFunctor, + m_scratchFunctorHost, size, + hipMemcpyDefault, m_stream)); + + return m_scratchFunctor; +} + } // namespace Impl namespace Experimental { diff --git a/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp b/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp index c92d630d70..f4f221e276 100644 --- a/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp +++ b/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp @@ -377,20 +377,8 @@ struct HIPParallelLaunchKernelInvoker( - hip_instance->scratch_functor(sizeof(DriverType))); - DriverType *driver_ptr_host = reinterpret_cast( - hip_instance->scratch_functor_host(sizeof(DriverType))); - KOKKOS_IMPL_HIP_SAFE_CALL(hipStreamSynchronize(hip_instance->m_stream)); - std::memcpy(driver_ptr_host, &driver, sizeof(DriverType)); - KOKKOS_IMPL_HIP_SAFE_CALL( - hipMemcpyAsync(driver_ptr, driver_ptr_host, sizeof(DriverType), - hipMemcpyDefault, hip_instance->m_stream)); + DriverType *driver_ptr = + reinterpret_cast(hip_instance->scratch_functor(driver)); (base_t::get_kernel_func())<<m_stream>>>( driver_ptr); } From 1f68ab49e9631dbeb707415566b0efba1163882c Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Thu, 12 Jan 2023 13:14:03 -0500 Subject: [PATCH 045/496] Desul atomics cleanup enable GCC or MSVC atomics --- tpls/desul/include/desul/atomics/Macros.hpp | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) diff --git a/tpls/desul/include/desul/atomics/Macros.hpp b/tpls/desul/include/desul/atomics/Macros.hpp index 61e14783d5..8b57e50042 100644 --- a/tpls/desul/include/desul/atomics/Macros.hpp +++ b/tpls/desul/include/desul/atomics/Macros.hpp @@ -11,26 +11,16 @@ SPDX-License-Identifier: (BSD-3-Clause) // Macros -#if (!defined(__CUDA_ARCH__) || !defined(__NVCC__)) && \ - (!defined(__HIP_DEVICE_COMPILE) || !defined(__HIP_PLATFORM_HCC__)) && \ - !defined(__SYCL_DEVICE_ONLY__) && 
!defined(DESUL_HAVE_OPENMP_ATOMICS) -#define DESUL_IMPL_HAVE_GCC_OR_MSVC_ATOMICS -#endif - -// ONLY use GNUC atomics if not compiling for the device -// and we didn't explicitly say to use OpenMP atomics -#if defined(__GNUC__) && defined(DESUL_IMPL_HAVE_GCC_OR_MSVC_ATOMICS) +// ONLY use GNUC atomics if not explicitly say to use OpenMP atomics +#if !defined(DESUL_HAVE_OPENMP_ATOMICS) && defined(__GNUC__) #define DESUL_HAVE_GCC_ATOMICS #endif -// Equivalent to above: if we are compiling for the device we -// need to use CUDA/HIP/SYCL atomics instead of MSVC atomics -#if defined(_MSC_VER) && defined(DESUL_IMPL_HAVE_GCC_OR_MSVC_ATOMICS) +// Equivalent to above for MSVC atomics +#if !defined(DESUL_HAVE_OPENMP_ATOMICS) && defined(_MSC_VER) #define DESUL_HAVE_MSVC_ATOMICS #endif -#undef DESUL_IMPL_HAVE_GCC_OR_MSVC_ATOMICS - #ifdef __CUDACC__ #define DESUL_HAVE_CUDA_ATOMICS #endif From 33d7fce6c29cf0f172b6847b0d65beabd448b332 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Thu, 12 Jan 2023 16:33:16 -0500 Subject: [PATCH 046/496] Fix compiling with OpenMP and Kokkos_ENABLE_DEPRECATED_CODE_3 --- core/src/OpenMP/Kokkos_OpenMP.hpp | 50 +++------------------- core/src/OpenMP/Kokkos_OpenMP_Instance.hpp | 49 ++++++++++++++++++--- 2 files changed, 50 insertions(+), 49 deletions(-) diff --git a/core/src/OpenMP/Kokkos_OpenMP.hpp b/core/src/OpenMP/Kokkos_OpenMP.hpp index ded3e0b5cb..897554f8f2 100644 --- a/core/src/OpenMP/Kokkos_OpenMP.hpp +++ b/core/src/OpenMP/Kokkos_OpenMP.hpp @@ -53,7 +53,12 @@ namespace Kokkos { namespace Impl { class OpenMPInternal; -} + +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 +// FIXME_OPENMP we can remove this after we remove partition_master +inline thread_local OpenMPInternal* t_openmp_instance = nullptr; +#endif +} // namespace Impl /// \class OpenMP /// \brief Kokkos device for multicore processors in the host memory space. 
@@ -192,49 +197,6 @@ int OpenMP::impl_hardware_thread_id() noexcept { KOKKOS_IF_ON_DEVICE((return -1;)) } -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 -template -KOKKOS_DEPRECATED void OpenMP::partition_master(F const& f, int num_partitions, - int partition_size) { -#if _OPENMP >= 201511 - if (omp_get_max_active_levels() > 1) { -#else - if (omp_get_nested()) { -#endif - using Exec = Impl::OpenMPInternal; - - Exec* prev_instance = &Impl::OpenMPInternal::singleton(); - - Exec::validate_partition_impl(prev_instance->m_pool_size, num_partitions, - partition_size); - - OpenMP::memory_space space; - -#pragma omp parallel num_threads(num_partitions) - { - Exec thread_local_instance(partition_size); - Impl::t_openmp_instance = &thread_local_instance; - - size_t pool_reduce_bytes = 32 * partition_size; - size_t team_reduce_bytes = 32 * partition_size; - size_t team_shared_bytes = 1024 * partition_size; - size_t thread_local_bytes = 1024; - - thread_local_instance.resize_thread_data( - pool_reduce_bytes, team_reduce_bytes, team_shared_bytes, - thread_local_bytes); - - omp_set_num_threads(partition_size); - f(omp_get_thread_num(), omp_get_num_threads()); - Impl::t_openmp_instance = nullptr; - } - } else { - // nested openmp not enabled - f(0, 1); - } -} -#endif - namespace Tools { namespace Experimental { template <> diff --git a/core/src/OpenMP/Kokkos_OpenMP_Instance.hpp b/core/src/OpenMP/Kokkos_OpenMP_Instance.hpp index 08fc66a1a2..e6a6087331 100644 --- a/core/src/OpenMP/Kokkos_OpenMP_Instance.hpp +++ b/core/src/OpenMP/Kokkos_OpenMP_Instance.hpp @@ -47,11 +47,6 @@ class OpenMPInternal; inline int g_openmp_hardware_max_threads = 1; -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 -// FIXME_OPENMP we can remove this after we remove partition_master -inline thread_local OpenMPInternal* t_openmp_instance = nullptr; -#endif - struct OpenMPTraits { static int constexpr MAX_THREAD_COUNT = 512; }; @@ -195,6 +190,50 @@ std::vector partition_space(OpenMP const& main_instance, return 
Impl::create_OpenMP_instances(main_instance, weights); } } // namespace Experimental + +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 +template +KOKKOS_DEPRECATED void OpenMP::partition_master(F const& f, int num_partitions, + int partition_size) { +#if _OPENMP >= 201511 + if (omp_get_max_active_levels() > 1) { +#else + if (omp_get_nested()) { +#endif + using Exec = Impl::OpenMPInternal; + + Exec* prev_instance = &Impl::OpenMPInternal::singleton(); + + Exec::validate_partition_impl(prev_instance->m_pool_size, num_partitions, + partition_size); + + OpenMP::memory_space space; + +#pragma omp parallel num_threads(num_partitions) + { + Exec thread_local_instance(partition_size); + Impl::t_openmp_instance = &thread_local_instance; + + size_t pool_reduce_bytes = 32 * partition_size; + size_t team_reduce_bytes = 32 * partition_size; + size_t team_shared_bytes = 1024 * partition_size; + size_t thread_local_bytes = 1024; + + thread_local_instance.resize_thread_data( + pool_reduce_bytes, team_reduce_bytes, team_shared_bytes, + thread_local_bytes); + + omp_set_num_threads(partition_size); + f(omp_get_thread_num(), omp_get_num_threads()); + Impl::t_openmp_instance = nullptr; + } + } else { + // nested openmp not enabled + f(0, 1); + } +} +#endif + } // namespace Kokkos #endif From e5e87423d8330c6990ba7cba91dc3478145f95ee Mon Sep 17 00:00:00 2001 From: Dong Hun Lee Date: Wed, 11 Jan 2023 17:50:16 -0700 Subject: [PATCH 047/496] Added missing enable_ifs to hpx team parallel_reduce --- core/src/Kokkos_HPX.hpp | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/core/src/Kokkos_HPX.hpp b/core/src/Kokkos_HPX.hpp index 085d829919..7b9d4f219f 100644 --- a/core/src/Kokkos_HPX.hpp +++ b/core/src/Kokkos_HPX.hpp @@ -2177,7 +2177,8 @@ KOKKOS_INLINE_FUNCTION void parallel_for( * The range i=0..N-1 is mapped to all threads of the the calling thread team * and a summation of val is performed and put into result. 
*/ -template +template ::value>> KOKKOS_INLINE_FUNCTION void parallel_reduce( const Impl::TeamThreadRangeBoundariesStruct &loop_boundaries, @@ -2214,7 +2215,8 @@ KOKKOS_INLINE_FUNCTION void parallel_for( * The range i=0..N-1 is mapped to all vector lanes of the the calling thread * and a summation of val is performed and put into result. */ -template +template ::value>> KOKKOS_INLINE_FUNCTION void parallel_reduce( const Impl::ThreadVectorRangeBoundariesStruct &loop_boundaries, @@ -2229,7 +2231,8 @@ KOKKOS_INLINE_FUNCTION void parallel_reduce( } } -template +template ::value>> KOKKOS_INLINE_FUNCTION void parallel_reduce( const Impl::TeamThreadRangeBoundariesStruct &loop_boundaries, @@ -2241,7 +2244,8 @@ KOKKOS_INLINE_FUNCTION void parallel_reduce( } } -template +template ::value>> KOKKOS_INLINE_FUNCTION void parallel_reduce( const Impl::ThreadVectorRangeBoundariesStruct &loop_boundaries, From a9c997cea68ff016c3a28d2ab1bddf67f9bf4d48 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Thu, 12 Jan 2023 23:03:51 +0000 Subject: [PATCH 048/496] Fix ScratchSpace pointer comparison for SYCL --- core/src/Kokkos_ScratchSpace.hpp | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/core/src/Kokkos_ScratchSpace.hpp b/core/src/Kokkos_ScratchSpace.hpp index f6e7ccec49..82d0b5aee4 100644 --- a/core/src/Kokkos_ScratchSpace.hpp +++ b/core/src/Kokkos_ScratchSpace.hpp @@ -100,7 +100,6 @@ class ScratchMemorySpace { int level = -1) const { if (level == -1) level = m_default_level; auto& m_iter = (level == 0) ? m_iter_L0 : m_iter_L1; - auto& m_end = (level == 0) ? 
m_end_L0 : m_end_L1; auto m_iter_old = m_iter; if constexpr (alignment_requested) { const ptrdiff_t missalign = size_t(m_iter) % alignment; @@ -113,7 +112,13 @@ class ScratchMemorySpace { void* tmp = m_iter + m_offset * size; ptrdiff_t increment = size * m_multiplier; - if (increment > m_end - m_iter) { + // Cast to uintptr_t to avoid problems with pointer arithmetic using SYCL + const auto end_iter = + reinterpret_cast((level == 0) ? m_end_L0 : m_end_L1); + auto current_iter = reinterpret_cast(m_iter); + auto capacity = end_iter - current_iter; + + if (increment > capacity) { // Request did overflow: return nullptr and reset m_iter m_iter = m_iter_old; tmp = nullptr; @@ -124,7 +129,7 @@ class ScratchMemorySpace { KOKKOS_IMPL_DO_NOT_USE_PRINTF( "ScratchMemorySpace<...>::get_shmem: Failed to allocate " "%ld byte(s); remaining capacity is %ld byte(s)\n", - long(size), long(m_end - m_iter)); + long(size), long(capacity)); #endif // KOKKOS_ENABLE_DEBUG } else { m_iter += increment; From 45acff308dc3e23766a23a70dcb1a2a4eb738e0b Mon Sep 17 00:00:00 2001 From: Bruno Turcksin Date: Thu, 12 Jan 2023 19:25:08 -0500 Subject: [PATCH 049/496] Fix reviewers' comments --- core/src/HIP/Kokkos_HIP_Instance.cpp | 42 +++++++++++++++++++ core/src/HIP/Kokkos_HIP_Instance.hpp | 52 ++---------------------- core/src/HIP/Kokkos_HIP_KernelLaunch.hpp | 5 ++- 3 files changed, 49 insertions(+), 50 deletions(-) diff --git a/core/src/HIP/Kokkos_HIP_Instance.cpp b/core/src/HIP/Kokkos_HIP_Instance.cpp index 5bf3c868d3..28c9c1cb6a 100644 --- a/core/src/HIP/Kokkos_HIP_Instance.cpp +++ b/core/src/HIP/Kokkos_HIP_Instance.cpp @@ -236,6 +236,48 @@ Kokkos::HIP::size_type *HIPInternal::scratch_flags(const std::size_t size) { return m_scratchFlags; } +Kokkos::HIP::size_type *HIPInternal::stage_functor_for_execution( + void const *driver, std::size_t const size) const { + if (verify_is_initialized("scratch_functor") && m_scratchFunctorSize < size) { + m_scratchFunctorSize = size; + + using Record = 
Kokkos::Impl::SharedAllocationRecord; + using RecordHost = + Kokkos::Impl::SharedAllocationRecord; + + if (m_scratchFunctor) { + Record::decrement(Record::get_record(m_scratchFunctor)); + RecordHost::decrement(RecordHost::get_record(m_scratchFunctorHost)); + } + + Record *const r = + Record::allocate(Kokkos::HIPSpace(), "Kokkos::InternalScratchFunctor", + m_scratchFunctorSize); + RecordHost *const r_host = RecordHost::allocate( + Kokkos::HIPHostPinnedSpace(), "Kokkos::InternalScratchFunctorHost", + m_scratchFunctorSize); + + Record::increment(r); + RecordHost::increment(r_host); + + m_scratchFunctor = reinterpret_cast(r->data()); + m_scratchFunctorHost = reinterpret_cast(r_host->data()); + } + + // When using HSA_XNACK=1, it is necessary to copy the driver to the host to + // ensure that the driver is not destroyed before the computation is done. + // Without this fix, all the atomic tests fail. It is not obvious that this + // problem is limited to HSA_XNACK=1 even if all the tests pass when + // HSA_XNACK=0. That's why we always copy the driver. 
+ KOKKOS_IMPL_HIP_SAFE_CALL(hipStreamSynchronize(m_stream)); + std::memcpy(m_scratchFunctorHost, driver, size); + KOKKOS_IMPL_HIP_SAFE_CALL(hipMemcpyAsync(m_scratchFunctor, + m_scratchFunctorHost, size, + hipMemcpyDefault, m_stream)); + + return m_scratchFunctor; +} + int HIPInternal::acquire_team_scratch_space() { int current_team_scratch = 0; int zero = 0; diff --git a/core/src/HIP/Kokkos_HIP_Instance.hpp b/core/src/HIP/Kokkos_HIP_Instance.hpp index 505a63b5b3..ba4bab192f 100644 --- a/core/src/HIP/Kokkos_HIP_Instance.hpp +++ b/core/src/HIP/Kokkos_HIP_Instance.hpp @@ -135,10 +135,10 @@ class HIPInternal { HIPInternal() = default; // Resizing of reduction related scratch spaces - size_type *scratch_space(const std::size_t size); - size_type *scratch_flags(const std::size_t size); - template - size_type *scratch_functor(DriverType const &driver) const; + size_type *scratch_space(std::size_t const size); + size_type *scratch_flags(std::size_t const size); + size_type *stage_functor_for_execution(void const *driver, + std::size_t const size) const; uint32_t impl_get_instance_id() const noexcept; int acquire_team_scratch_space(); // Resizing of team level 1 scratch @@ -147,50 +147,6 @@ class HIPInternal { void release_team_scratch_space(int scratch_pool_id); }; -template -Kokkos::HIP::size_type *HIPInternal::scratch_functor( - DriverType const &driver) const { - std::size_t size = sizeof(DriverType); - if (verify_is_initialized("scratch_functor") && m_scratchFunctorSize < size) { - m_scratchFunctorSize = size; - - using Record = Kokkos::Impl::SharedAllocationRecord; - using RecordHost = - Kokkos::Impl::SharedAllocationRecord; - - if (m_scratchFunctor) { - Record::decrement(Record::get_record(m_scratchFunctor)); - RecordHost::decrement(RecordHost::get_record(m_scratchFunctorHost)); - } - - Record *const r = - Record::allocate(Kokkos::HIPSpace(), "Kokkos::InternalScratchFunctor", - m_scratchFunctorSize); - RecordHost *const r_host = RecordHost::allocate( - 
Kokkos::HIPHostPinnedSpace(), "Kokkos::InternalScratchFunctorHost", - m_scratchFunctorSize); - - Record::increment(r); - RecordHost::increment(r_host); - - m_scratchFunctor = reinterpret_cast(r->data()); - m_scratchFunctorHost = reinterpret_cast(r_host->data()); - } - - // When using HSA_XNACK=1, it is necessary to copy the driver to the host to - // ensure that the driver is not destroyed before the computation is done. - // Without this fix, all the atomic tests fail. It is not obvious that this - // problem is limited to HSA_XNACK=1 even if all the tests pass when - // HSA_XNACK=0. That's why we always copy the driver. - KOKKOS_IMPL_HIP_SAFE_CALL(hipStreamSynchronize(m_stream)); - std::memcpy(m_scratchFunctorHost, &driver, size); - KOKKOS_IMPL_HIP_SAFE_CALL(hipMemcpyAsync(m_scratchFunctor, - m_scratchFunctorHost, size, - hipMemcpyDefault, m_stream)); - - return m_scratchFunctor; -} - } // namespace Impl namespace Experimental { diff --git a/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp b/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp index f4f221e276..f654347552 100644 --- a/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp +++ b/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp @@ -377,8 +377,9 @@ struct HIPParallelLaunchKernelInvoker(hip_instance->scratch_functor(driver)); + DriverType *driver_ptr = reinterpret_cast( + hip_instance->stage_functor_for_execution( + reinterpret_cast(&driver), sizeof(DriverType))); (base_t::get_kernel_func())<<m_stream>>>( driver_ptr); } From 51aa90411b72aae25f4aba3c2f43145905829be3 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Fri, 13 Jan 2023 08:33:25 -0500 Subject: [PATCH 050/496] Desul atomics configure library based what the user enabled --- core/src/CMakeLists.txt | 19 ++++++++++ core/src/Kokkos_Atomics_Desul_Config.hpp | 4 -- tpls/desul/Config.hpp.cmake.in | 17 +++++++++ tpls/desul/include/desul/atomics/Macros.hpp | 42 ++++++++++++--------- 4 files changed, 61 insertions(+), 21 deletions(-) create mode 100644 tpls/desul/Config.hpp.cmake.in diff --git 
a/core/src/CMakeLists.txt b/core/src/CMakeLists.txt index 165c8b92b5..862c0c47dd 100644 --- a/core/src/CMakeLists.txt +++ b/core/src/CMakeLists.txt @@ -4,6 +4,22 @@ KOKKOS_INCLUDE_DIRECTORIES( ${KOKKOS_TOP_BUILD_DIR} ) IF (Kokkos_ENABLE_IMPL_DESUL_ATOMICS AND NOT desul_FOUND) + IF(KOKKOS_ENABLE_CUDA) + SET(DESUL_ATOMICS_ENABLE_CUDA ON) + ENDIF() + IF(KOKKOS_ENABLE_HIP) + SET(DESUL_ATOMICS_ENABLE_HIP ON) + ENDIF() + IF(KOKKOS_ENABLE_SYCL) + SET(DESUL_ATOMICS_ENABLE_SYCL ON) + ENDIF() + IF(KOKKOS_ENABLE_OPENMPTARGET) + SET(DESUL_ATOMICS_ENABLE_OPENMP ON) # not a typo Kokkos OpenMPTarget -> Desul OpenMP + ENDIF() + CONFIGURE_FILE( + ${CMAKE_CURRENT_SOURCE_DIR}/../../tpls/desul/Config.hpp.cmake.in + ${CMAKE_CURRENT_BINARY_DIR}/desul/atomics/Config.hpp + ) KOKKOS_INCLUDE_DIRECTORIES( ${CMAKE_CURRENT_SOURCE_DIR}/../../tpls/desul/include ) @@ -78,15 +94,18 @@ IF (Kokkos_ENABLE_IMPL_DESUL_ATOMICS AND NOT desul_FOUND) APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/../../tpls/desul/include/desul/*/*.hpp) APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/../../tpls/desul/include/desul/*/*/*.hpp) APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/../../tpls/desul/include/*/*/*.inc*) + APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_BINARY_DIR}/desul/*.hpp) INSTALL (DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/../../tpls/desul/include/desul" + "${CMAKE_CURRENT_BINARY_DIR}/desul" DESTINATION ${KOKKOS_HEADER_DIR} FILES_MATCHING PATTERN "*.inc" PATTERN "*.inc_*" PATTERN "*.hpp" ) + MESSAGE(STATUS "Using internal desul_atomics copy") ELSE() MESSAGE(STATUS "Using external desul_atomics install found at:") diff --git a/core/src/Kokkos_Atomics_Desul_Config.hpp b/core/src/Kokkos_Atomics_Desul_Config.hpp index e4b83b276e..4cf170f5f1 100644 --- a/core/src/Kokkos_Atomics_Desul_Config.hpp +++ b/core/src/Kokkos_Atomics_Desul_Config.hpp @@ -23,10 +23,6 @@ static_assert(false, #include -#ifdef KOKKOS_ENABLE_OPENMPTARGET -#define DESUL_HAVE_OPENMP_ATOMICS -#endif - 
#if defined(KOKKOS_ARCH_KEPLER) || defined(KOKKOS_ARCH_MAXWELL) #define DESUL_CUDA_ARCH_IS_PRE_PASCAL #endif diff --git a/tpls/desul/Config.hpp.cmake.in b/tpls/desul/Config.hpp.cmake.in new file mode 100644 index 0000000000..40ab5c1c6c --- /dev/null +++ b/tpls/desul/Config.hpp.cmake.in @@ -0,0 +1,17 @@ +/* +Copyright (c) 2019, Lawrence Livermore National Security, LLC +and DESUL project contributors. See the COPYRIGHT file for details. +Source: https://github.com/desul/desul + +SPDX-License-Identifier: (BSD-3-Clause) +*/ + +#ifndef DESUL_ATOMICS_CONFIG_HPP_ +#define DESUL_ATOMICS_CONFIG_HPP_ + +#cmakedefine DESUL_ATOMICS_ENABLE_CUDA +#cmakedefine DESUL_ATOMICS_ENABLE_HIP +#cmakedefine DESUL_ATOMICS_ENABLE_SYCL +#cmakedefine DESUL_ATOMICS_ENABLE_OPENMP + +#endif diff --git a/tpls/desul/include/desul/atomics/Macros.hpp b/tpls/desul/include/desul/atomics/Macros.hpp index 36d2985323..992fb9fa66 100644 --- a/tpls/desul/include/desul/atomics/Macros.hpp +++ b/tpls/desul/include/desul/atomics/Macros.hpp @@ -9,32 +9,39 @@ SPDX-License-Identifier: (BSD-3-Clause) #ifndef DESUL_ATOMICS_MACROS_HPP_ #define DESUL_ATOMICS_MACROS_HPP_ +#include + // Macros -// ONLY use GNUC atomics if not explicitly say to use OpenMP atomics -#if !defined(DESUL_HAVE_OPENMP_ATOMICS) && defined(__GNUC__) -#define DESUL_HAVE_GCC_ATOMICS +#if defined(DESUL_ATOMICS_ENABLE_CUDA) && defined(__CUDACC__) +#define DESUL_HAVE_CUDA_ATOMICS #endif -// Equivalent to above for MSVC atomics -#if !defined(DESUL_HAVE_OPENMP_ATOMICS) && defined(_MSC_VER) -#define DESUL_HAVE_MSVC_ATOMICS +#if defined(DESUL_ATOMICS_ENABLE_HIP) && defined(__HIPCC__) +#define DESUL_HAVE_HIP_ATOMICS #endif -#ifdef __CUDACC__ -#define DESUL_HAVE_CUDA_ATOMICS +#if defined(DESUL_ATOMICS_ENABLE_SYCL) && defined(SYCL_LANGUAGE_VERSION) +#define DESUL_HAVE_SYCL_ATOMICS #endif -#ifdef __HIPCC__ -#define DESUL_HAVE_HIP_ATOMICS +#if defined(DESUL_ATOMICS_ENABLE_OPENMP) +#define DESUL_HAVE_OPENMP_ATOMICS #endif -#ifdef SYCL_LANGUAGE_VERSION 
-#define DESUL_HAVE_SYCL_ATOMICS +// ONLY use GNUC atomics if not explicitly say to use OpenMP atomics +#if !defined(DESUL_HAVE_OPENMP_ATOMICS) && defined(__GNUC__) +#define DESUL_HAVE_GCC_ATOMICS +#endif + +// Equivalent to above for MSVC atomics +#if !defined(DESUL_HAVE_OPENMP_ATOMICS) && defined(_MSC_VER) +#define DESUL_HAVE_MSVC_ATOMICS #endif -#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) || \ - defined(__SYCL_DEVICE_ONLY__) +#if (defined(DESUL_ATOMICS_ENABLE_CUDA) && defined(__CUDA_ARCH__)) || \ + (defined(DESUL_ATOMICS_ENABLE_HIP) && defined(__HIP_DEVICE_COMPILE__)) || \ + (defined(DESUL_ATOMICS_ENABLE_SYCL) && defined(__SYCL_DEVICE_ONLY__)) #define DESUL_HAVE_GPU_LIKE_PROGRESS #endif @@ -62,7 +69,7 @@ SPDX-License-Identifier: (BSD-3-Clause) #define DESUL_IMPL_ESC_(...) DESUL_IMPL_VAN_##__VA_ARGS__ #define DESUL_IMPL_VAN_DESUL_IMPL_ISH -#if defined(__CUDACC__) && defined(__NVCOMPILER) +#if (defined(DESUL_ATOMICS_ENABLE_CUDA) && defined(__CUDACC__)) && defined(__NVCOMPILER) #include #define DESUL_IF_ON_DEVICE(CODE) NV_IF_TARGET(NV_IS_DEVICE, CODE) #define DESUL_IF_ON_HOST(CODE) NV_IF_TARGET(NV_IS_HOST, CODE) @@ -99,8 +106,9 @@ static constexpr bool desul_impl_omp_on_host() { return false; } #endif #if !defined(DESUL_IF_ON_HOST) && !defined(DESUL_IF_ON_DEVICE) -#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) || \ - defined(__SYCL_DEVICE_ONLY__) +#if (defined(DESUL_ATOMICS_ENABLE_CUDA) && defined(__CUDA_ARCH__)) || \ + (defined(DESUL_ATOMICS_ENABLE_HIP) && defined(__HIP_DEVICE_COMPILE__)) || \ + (defined(DESUL_ATOMICS_ENABLE_SYCL) && defined(__SYCL_DEVICE_ONLY__)) #define DESUL_IF_ON_DEVICE(CODE) \ { DESUL_IMPL_STRIP_PARENS(CODE) } #define DESUL_IF_ON_HOST(CODE) \ From 17581963dd7e4529b45d2fb264a329deab723348 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Fri, 13 Jan 2023 08:33:53 -0500 Subject: [PATCH 051/496] Generate file from the generated Makefiles --- Makefile.kokkos | 57 +++++++++++++++++++++++++++++++++++++++++++++++-- 1 
file changed, 55 insertions(+), 2 deletions(-) diff --git a/Makefile.kokkos b/Makefile.kokkos index d0fc1f7e3b..37feada262 100644 --- a/Makefile.kokkos +++ b/Makefile.kokkos @@ -1378,11 +1378,64 @@ endif KOKKOS_EXTRA_LIBS := ${KOKKOS_LIBS} KOKKOS_LIBS := -lkokkos ${KOKKOS_LIBS} +# Generating the header +DESUL_INTERNAL_CONFIG_TMP=Desul_Config.tmp +ifeq ($(KOKKOS_INTERNAL_DISABLE_DESUL_ATOMICS), 0) + DESUL_CONFIG_HEADER=desul/atomics/Config.hpp +else + DESUL_CONFIG_HEADER=NothingToSeeHereMoveAlong +endif +desul_append_header = $(shell echo $1 >> $(DESUL_INTERNAL_CONFIG_TMP)) +tmp := $(call desul_append_header, "// generated by on-demand build system by crtrott" > $(DESUL_INTERNAL_CONFIG_TMP)) +tmp := $(call desul_append_header, "$H""ifndef DESUL_ATOMICS_CONFIG_HPP_") +tmp := $(call desul_append_header, "$H""define DESUL_ATOMICS_CONFIG_HPP_") +tmp := $(call desul_append_header, "") +ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1) + tmp := $(call desul_append_header,"$H""define DESUL_ATOMICS_ENABLE_CUDA") +else + tmp := $(call desul_append_header,"/* $H""undef DESUL_ATOMICS_ENABLE_CUDA */") +endif + +ifeq ($(KOKKOS_INTERNAL_USE_HIP), 1) + tmp := $(call desul_append_header,"$H""define DESUL_ATOMICS_ENABLE_HIP") +else + tmp := $(call desul_append_header,"/* $H""undef DESUL_ATOMICS_ENABLE_HIP */") +endif + +ifeq ($(KOKKOS_INTERNAL_USE_SYCL), 1) + tmp := $(call desul_append_header,"$H""define DESUL_ATOMICS_ENABLE_SYCL") +else + tmp := $(call desul_append_header,"/* $H""undef DESUL_ATOMICS_ENABLE_SYCL */") +endif + +ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1) + tmp := $(call desul_append_header,"$H""define DESUL_ATOMICS_ENABLE_OPENMP") +else + tmp := $(call desul_append_header,"/* $H""undef DESUL_ATOMICS_ENABLE_OPENMP */") +endif +tmp := $(call desul_append_header, "") +tmp := $(call desul_append_header, "$H""endif") + +DESUL_INTERNAL_LS_CONFIG := $(shell ls $(DESUL_CONFIG_HEADER) 2>&1) + +ifeq ($(DESUL_INTERNAL_LS_CONFIG), $(DESUL_CONFIG_HEADER)) + KOKKOS_INTERNAL_NEW_CONFIG := 
$(strip $(shell diff $(DESUL_CONFIG_HEADER) $(DESUL_INTERNAL_CONFIG_TMP) | grep -c define)) +else + DESUL_INTERNAL_NEW_CONFIG := 1 +endif + +ifneq ($(DESUL_INTERNAL_NEW_CONFIG), 0) + tmp := $(shell mkdir -p desul/atomics) + tmp := $(shell cp $(DESUL_INTERNAL_CONFIG_TMP) $(DESUL_CONFIG_HEADER)) +endif + # Setting up dependencies. KokkosCore_config.h: -KOKKOS_CPP_DEPENDS := KokkosCore_config.h $(KOKKOS_HEADERS) +$(DESUL_CONFIG_HEADER): + +KOKKOS_CPP_DEPENDS := $(DESUL_CONFIG_HEADER) KokkosCore_config.h $(KOKKOS_HEADERS) KOKKOS_OBJ = $(KOKKOS_SRC:.cpp=.o) KOKKOS_OBJ_LINK = $(notdir $(KOKKOS_OBJ)) @@ -1390,7 +1443,7 @@ KOKKOS_OBJ_LINK = $(notdir $(KOKKOS_OBJ)) include $(KOKKOS_PATH)/Makefile.targets kokkos-clean: - rm -f $(KOKKOS_OBJ_LINK) KokkosCore_config.h KokkosCore_config.tmp libkokkos.a KokkosCore_Config_SetupBackend.hpp \ + rm -f $(KOKKOS_OBJ_LINK) $(DESUL_CONFIG_HEADER) $(DESUL_INTERNAL_CONFIG_TMP) KokkosCore_config.h KokkosCore_config.tmp libkokkos.a KokkosCore_Config_SetupBackend.hpp \ KokkosCore_Config_FwdBackend.hpp KokkosCore_Config_DeclareBackend.hpp KokkosCore_Config_DeclareBackend.tmp \ KokkosCore_Config_FwdBackend.tmp KokkosCore_Config_PostInclude.hpp KokkosCore_Config_PostInclude.tmp KokkosCore_Config_SetupBackend.tmp From 20abee9fd6ebe32be12d3b0d812c50cca0e937d2 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Fri, 13 Jan 2023 09:00:53 -0500 Subject: [PATCH 052/496] Let increment be of type uintptr_t fixing warning --- core/src/Kokkos_ScratchSpace.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/src/Kokkos_ScratchSpace.hpp b/core/src/Kokkos_ScratchSpace.hpp index 82d0b5aee4..a192b77f28 100644 --- a/core/src/Kokkos_ScratchSpace.hpp +++ b/core/src/Kokkos_ScratchSpace.hpp @@ -110,7 +110,7 @@ class ScratchMemorySpace { // Note: for team scratch m_offset is 0, since every // thread will get back the same shared pointer void* tmp = m_iter + m_offset * size; - ptrdiff_t increment = size * m_multiplier; + uintptr_t increment = size 
* m_multiplier; // Cast to uintptr_t to avoid problems with pointer arithmetic using SYCL const auto end_iter = From 97287f674abbe0d2a53e2730374a56bc07567b22 Mon Sep 17 00:00:00 2001 From: Bruno Turcksin Date: Fri, 13 Jan 2023 09:05:43 -0500 Subject: [PATCH 053/496] Remove unnecessary header Co-authored-by: Damien L-G --- core/src/HIP/Kokkos_HIP_Instance.hpp | 1 - 1 file changed, 1 deletion(-) diff --git a/core/src/HIP/Kokkos_HIP_Instance.hpp b/core/src/HIP/Kokkos_HIP_Instance.hpp index ba4bab192f..13eedc1055 100644 --- a/core/src/HIP/Kokkos_HIP_Instance.hpp +++ b/core/src/HIP/Kokkos_HIP_Instance.hpp @@ -19,7 +19,6 @@ #ifndef KOKKOS_HIP_INSTANCE_HPP #define KOKKOS_HIP_INSTANCE_HPP -#include #include #include From 7a3bfe039e37cb83ede331da04e22d98024c3275 Mon Sep 17 00:00:00 2001 From: Seyong Lee Date: Fri, 13 Jan 2023 10:30:56 -0500 Subject: [PATCH 054/496] Fix macro typo used in the OpenACC backend parallel_reduce(MDRange). (#5766) * Fix macro typo used in the OpenACC backend parallel_reduce(MDRange). - Change KOKKOS_IMPL_ACC_PGRAMA to KOKKOS_IMPL_ACC_PRAGMA. - Apply ClangFormat. 
* Format fixup Co-authored-by: Damien L-G --- .../OpenACC/Kokkos_OpenACC_ParallelReduce_MDRange.hpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_MDRange.hpp b/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_MDRange.hpp index 31f37420a2..c4b7b6bdec 100644 --- a/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_MDRange.hpp +++ b/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_MDRange.hpp @@ -138,7 +138,7 @@ class Kokkos::Impl::ParallelReduce, int begin1 = begin[1]; \ int end1 = end[1]; \ /* clang-format off */ \ - KOKKOS_IMPL_ACC_PGRAMA(parallel loop gang vector collapse(2) reduction(OPERATOR:val) copyin(functor) async(async_arg)) \ + KOKKOS_IMPL_ACC_PRAGMA(parallel loop gang vector collapse(2) reduction(OPERATOR:val) copyin(functor) async(async_arg)) \ /* clang-format on */ \ for (auto i0 = begin0; i0 < end0; ++i0) { \ for (auto i1 = begin1; i1 < end1; ++i1) { \ @@ -192,7 +192,7 @@ class Kokkos::Impl::ParallelReduce, int begin2 = begin[2]; \ int end2 = end[2]; \ /* clang-format off */ \ - KOKKOS_IMPL_ACC_PGRAMA(parallel loop gang vector collapse(3) reduction( \ + KOKKOS_IMPL_ACC_PRAGMA(parallel loop gang vector collapse(3) reduction( \ OPERATOR \ : val) copyin(functor) async(async_arg)) \ /* clang-format on */ \ @@ -254,7 +254,7 @@ class Kokkos::Impl::ParallelReduce, int begin3 = begin[3]; \ int end3 = end[3]; \ /* clang-format off */ \ - KOKKOS_IMPL_ACC_PGRAMA(parallel loop gang vector collapse(4) reduction(OPERATOR:val) copyin(functor) async(async_arg)) \ + KOKKOS_IMPL_ACC_PRAGMA(parallel loop gang vector collapse(4) reduction(OPERATOR:val) copyin(functor) async(async_arg)) \ /* clang-format on */ \ for (auto i0 = begin0; i0 < end0; ++i0) { \ for (auto i1 = begin1; i1 < end1; ++i1) { \ @@ -322,7 +322,7 @@ class Kokkos::Impl::ParallelReduce, int begin4 = begin[4]; \ int end4 = end[4]; \ /* clang-format off */ \ - KOKKOS_IMPL_ACC_PGRAMA(parallel loop gang vector collapse(5) 
reduction(OPERATOR:val) copyin(functor) async(async_arg)) \ + KOKKOS_IMPL_ACC_PRAGMA(parallel loop gang vector collapse(5) reduction(OPERATOR:val) copyin(functor) async(async_arg)) \ /* clang-format on */ \ for (auto i0 = begin0; i0 < end0; ++i0) { \ for (auto i1 = begin1; i1 < end1; ++i1) { \ @@ -398,7 +398,7 @@ class Kokkos::Impl::ParallelReduce, int begin5 = begin[5]; \ int end5 = end[5]; \ /* clang-format off */ \ - KOKKOS_IMPL_ACC_PGRAMA(parallel loop gang vector collapse(6) reduction(OPERATOR:val) copyin(functor) async(async_arg)) \ + KOKKOS_IMPL_ACC_PRAGMA(parallel loop gang vector collapse(6) reduction(OPERATOR:val) copyin(functor) async(async_arg)) \ /* clang-format on */ \ for (auto i0 = begin0; i0 < end0; ++i0) { \ for (auto i1 = begin1; i1 < end1; ++i1) { \ From 4829fb2e49f38d33071cbc335d1c3c1e4f8590a0 Mon Sep 17 00:00:00 2001 From: Bruno Turcksin Date: Fri, 13 Jan 2023 14:19:03 -0500 Subject: [PATCH 055/496] Add a mutex to protect scratchFunctor --- core/src/HIP/Kokkos_HIP_Instance.hpp | 1 + core/src/HIP/Kokkos_HIP_KernelLaunch.hpp | 2 ++ 2 files changed, 3 insertions(+) diff --git a/core/src/HIP/Kokkos_HIP_Instance.hpp b/core/src/HIP/Kokkos_HIP_Instance.hpp index 13eedc1055..06fab84b56 100644 --- a/core/src/HIP/Kokkos_HIP_Instance.hpp +++ b/core/src/HIP/Kokkos_HIP_Instance.hpp @@ -91,6 +91,7 @@ class HIPInternal { size_type *m_scratchFlags = nullptr; mutable size_type *m_scratchFunctor = nullptr; mutable size_type *m_scratchFunctorHost = nullptr; + inline static std::mutex scratchFunctorMutex; hipStream_t m_stream = nullptr; uint32_t m_instance_id = diff --git a/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp b/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp index f654347552..9630f027c6 100644 --- a/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp +++ b/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp @@ -377,6 +377,8 @@ struct HIPParallelLaunchKernelInvoker lock(HIPInternal::scratchFunctorMutex); DriverType *driver_ptr = reinterpret_cast( 
hip_instance->stage_functor_for_execution( reinterpret_cast(&driver), sizeof(DriverType))); From 0db3bd83b4cb0a2ec3613e336a5ab45f0addae6e Mon Sep 17 00:00:00 2001 From: Bruno Turcksin Date: Fri, 13 Jan 2023 14:33:08 -0500 Subject: [PATCH 056/496] Fix a typo Co-authored-by: Daniel Arndt --- core/src/HIP/Kokkos_HIP_KernelLaunch.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp b/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp index 9630f027c6..8bf5d7f394 100644 --- a/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp +++ b/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp @@ -377,7 +377,7 @@ struct HIPParallelLaunchKernelInvoker lock(HIPInternal::scratchFunctorMutex); DriverType *driver_ptr = reinterpret_cast( hip_instance->stage_functor_for_execution( From 910d43e45b04fb0e7d155233f7069cab0aeefc79 Mon Sep 17 00:00:00 2001 From: Rahulkumar Gayatri Date: Tue, 17 Jan 2023 11:28:36 -0800 Subject: [PATCH 057/496] OpenMP: Adding an ifdef around chunksize for static schedule for GCC compiler. --- core/src/OpenMP/Kokkos_OpenMP_Parallel.hpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/core/src/OpenMP/Kokkos_OpenMP_Parallel.hpp b/core/src/OpenMP/Kokkos_OpenMP_Parallel.hpp index 8e3e97b7a2..50d995a061 100644 --- a/core/src/OpenMP/Kokkos_OpenMP_Parallel.hpp +++ b/core/src/OpenMP/Kokkos_OpenMP_Parallel.hpp @@ -101,8 +101,15 @@ class ParallelFor, Kokkos::OpenMP> { std::enable_if_t::value> execute_parallel() const { +// Specifying an chunksize with GCC compiler leads to performance regression +// with static schedule. 
+#ifdef KOKKOS_COMPILER_GNU +#pragma omp parallel for schedule(static) \ + num_threads(m_instance->thread_pool_size()) +#else #pragma omp parallel for schedule(static KOKKOS_OPENMP_OPTIONAL_CHUNK_SIZE) \ num_threads(m_instance->thread_pool_size()) +#endif KOKKOS_PRAGMA_IVDEP_IF_ENABLED for (auto iwork = m_policy.begin(); iwork < m_policy.end(); ++iwork) { exec_work(m_functor, iwork); From 619ed2d26aaaf856495788b4ec113e59cab2ac82 Mon Sep 17 00:00:00 2001 From: Christoph Junghans Date: Wed, 18 Jan 2023 08:52:56 -0700 Subject: [PATCH 058/496] Fix build on Fedora rawhise uint32_t now gets defined in cstdint --- core/src/impl/Kokkos_MemoryPool.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/core/src/impl/Kokkos_MemoryPool.cpp b/core/src/impl/Kokkos_MemoryPool.cpp index fc5c355f3c..ec004a36da 100644 --- a/core/src/impl/Kokkos_MemoryPool.cpp +++ b/core/src/impl/Kokkos_MemoryPool.cpp @@ -22,6 +22,7 @@ #include #include +#include //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- From 7652228ed1b3e59336856bc1a1c576d4cda7a658 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cezary=20Skrzy=C5=84ski?= Date: Mon, 28 Nov 2022 13:41:46 +0100 Subject: [PATCH 059/496] Use `flang-new` for Fedora builds --- .github/workflows/continuous-integration-workflow.yml | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/.github/workflows/continuous-integration-workflow.yml b/.github/workflows/continuous-integration-workflow.yml index 7f88532209..c797c8b17b 100644 --- a/.github/workflows/continuous-integration-workflow.yml +++ b/.github/workflows/continuous-integration-workflow.yml @@ -76,12 +76,8 @@ jobs: - name: maybe_disable_death_tests if: ${{ matrix.distro == 'fedora:rawhide' }} run: echo "GTEST_FILTER=-*DeathTest*" >> $GITHUB_ENV -# Re-enable when latest is F37+ -# - name: maybe_use_flang -# if: ${{ matrix.cxx == 'clang++' && startsWith(matrix.distro,'fedora:') }} 
-# run: echo "FC=flang" >> $GITHUB_ENV - name: maybe_use_flang_new - if: ${{ matrix.cxx == 'clang++' && startsWith(matrix.distro,'fedora:rawhide') }} + if: ${{ matrix.cxx == 'clang++' && startsWith(matrix.distro,'fedora:') }} run: echo "FC=flang-new" >> $GITHUB_ENV - name: maybe_use_external_gtest if: ${{ matrix.distro == 'ubuntu:latest' }} From 4bd3e85884881934b6e5481e591c75b6d469a843 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cezary=20Skrzy=C5=84ski?= Date: Wed, 18 Jan 2023 19:49:50 +0100 Subject: [PATCH 060/496] Upgrade GitHub actions --- .github/workflows/continuous-integration-workflow-hpx.yml | 6 +++--- .github/workflows/continuous-integration-workflow.yml | 6 +++--- .github/workflows/osx.yml | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/.github/workflows/continuous-integration-workflow-hpx.yml b/.github/workflows/continuous-integration-workflow-hpx.yml index 35bb5bb2cb..ef316b014b 100644 --- a/.github/workflows/continuous-integration-workflow-hpx.yml +++ b/.github/workflows/continuous-integration-workflow-hpx.yml @@ -13,7 +13,7 @@ jobs: steps: - name: checkout code - uses: actions/checkout@v2.2.0 + uses: actions/checkout@v3 with: path: kokkos - name: setup hpx dependencies @@ -26,12 +26,12 @@ jobs: libboost-all-dev \ ninja-build - name: checkout hpx - uses: actions/checkout@v2.2.0 + uses: actions/checkout@v3 with: repository: STELLAR-GROUP/hpx ref: 1.7.1 path: hpx - - uses: actions/cache@v2 + - uses: actions/cache@v3 id: cache-hpx with: path: ./hpx/install diff --git a/.github/workflows/continuous-integration-workflow.yml b/.github/workflows/continuous-integration-workflow.yml index 7f88532209..f5d54fbf3a 100644 --- a/.github/workflows/continuous-integration-workflow.yml +++ b/.github/workflows/continuous-integration-workflow.yml @@ -52,7 +52,7 @@ jobs: options: --security-opt seccomp=unconfined steps: - name: Checkout desul - uses: actions/checkout@v2.2.0 + uses: actions/checkout@v3 with: repository: desul/desul ref: 
477da9c8f40f8db369c28dd3f93a67e376d8511b @@ -67,8 +67,8 @@ jobs: cmake -DDESUL_ENABLE_TESTS=OFF -DCMAKE_INSTALL_PREFIX=/usr/desul-install .. sudo cmake --build . --target install --parallel 2 - name: Checkout code - uses: actions/checkout@v2.2.0 - - uses: actions/cache@v2 + uses: actions/checkout@v3 + - uses: actions/cache@v3 with: path: ~/.ccache key: kokkos-${{ matrix.distro }}-${{ matrix.cxx }}-${{ matrix.cmake_build_type }}-${{ matrix.openmp }}-${github.ref}-${{ github.sha }} diff --git a/.github/workflows/osx.yml b/.github/workflows/osx.yml index dae8343f20..03fbcf37f6 100644 --- a/.github/workflows/osx.yml +++ b/.github/workflows/osx.yml @@ -24,7 +24,7 @@ jobs: cmake_build_type: "Release" steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - name: configure run: cmake -B build . From 20b609a9f512017819bfbe971aa2133a376c6443 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Thu, 19 Jan 2023 14:19:30 -0600 Subject: [PATCH 061/496] sprintf -> snprintf --- core/perf_test/test_atomic.cpp | 2 +- core/unit_test/TestDefaultDeviceTypeInit.hpp | 24 ++++++++++++-------- core/unit_test/TestSharedAlloc.hpp | 4 ++-- 3 files changed, 17 insertions(+), 13 deletions(-) diff --git a/core/perf_test/test_atomic.cpp b/core/perf_test/test_atomic.cpp index c20e5e9433..5f10afc45a 100644 --- a/core/perf_test/test_atomic.cpp +++ b/core/perf_test/test_atomic.cpp @@ -45,7 +45,7 @@ void textcolor(int attr, int fg, int bg) { char command[40]; /* Command is the control command to the terminal */ - sprintf(command, "%c[%d;%d;%dm", 0x1B, attr, fg + 30, bg + 40); + snprintf(command, 40, "%c[%d;%d;%dm", 0x1B, attr, fg + 30, bg + 40); printf("%s", command); } void textcolor_standard() { textcolor(RESET, BLACK, WHITE); } diff --git a/core/unit_test/TestDefaultDeviceTypeInit.hpp b/core/unit_test/TestDefaultDeviceTypeInit.hpp index c1ded4e43e..7ae73b14d3 100644 --- a/core/unit_test/TestDefaultDeviceTypeInit.hpp +++ b/core/unit_test/TestDefaultDeviceTypeInit.hpp @@ -41,9 +41,10 @@ 
char** init_kokkos_args(bool do_threads, bool do_numa, bool do_device, nargs = (do_threads ? 1 : 0) + (do_numa ? 1 : 0) + (do_device ? 1 : 0) + (do_other ? 4 : 0) + (do_tune ? 1 : 0); - char** args_kokkos = new char*[nargs]; + char** args_kokkos = new char*[nargs]; + const int max_args_size = 45; for (int i = 0; i < nargs; i++) { - args_kokkos[i] = new char[45]; + args_kokkos[i] = new char[max_args_size]; delete_these.insert(args_kokkos[i]); } @@ -84,7 +85,7 @@ char** init_kokkos_args(bool do_threads, bool do_numa, bool do_device, #endif init_args.num_threads = nthreads; - sprintf(args_kokkos[threads_idx], "--threads=%i", nthreads); + snprintf(args_kokkos[threads_idx], max_args_size, "--threads=%i", nthreads); } if (do_numa) { @@ -102,24 +103,27 @@ char** init_kokkos_args(bool do_threads, bool do_numa, bool do_device, #endif init_args.num_numa = numa; - sprintf(args_kokkos[numa_idx], "--numa=%i", numa); + snprintf(args_kokkos[numa_idx], max_args_size, "--numa=%i", numa); } if (do_device) { init_args.device_id = 0; - sprintf(args_kokkos[device_idx], "--device-id=%i", 0); + snprintf(args_kokkos[device_idx], max_args_size, "--device-id=%i", 0); } if (do_other) { - sprintf(args_kokkos[0], "--dummyarg=1"); - sprintf(args_kokkos[threads_idx + (do_threads ? 1 : 0)], "--dummy2arg"); - sprintf(args_kokkos[threads_idx + (do_threads ? 1 : 0) + 1], "dummy3arg"); - sprintf(args_kokkos[device_idx + (do_device ? 1 : 0)], "dummy4arg=1"); + snprintf(args_kokkos[0], max_args_size, "--dummyarg=1"); + snprintf(args_kokkos[threads_idx + (do_threads ? 1 : 0)], max_args_size, + "--dummy2arg"); + snprintf(args_kokkos[threads_idx + (do_threads ? 1 : 0) + 1], max_args_size, + "dummy3arg"); + snprintf(args_kokkos[device_idx + (do_device ? 
1 : 0)], max_args_size, + "dummy4arg=1"); } if (do_tune) { init_args.tune_internals = true; - sprintf(args_kokkos[tune_idx], "--kokkos-tune-internals"); + snprintf(args_kokkos[tune_idx], max_args_size, "--kokkos-tune-internals"); } return args_kokkos; diff --git a/core/unit_test/TestSharedAlloc.hpp b/core/unit_test/TestSharedAlloc.hpp index 986c03fbfc..c7b0f38023 100644 --- a/core/unit_test/TestSharedAlloc.hpp +++ b/core/unit_test/TestSharedAlloc.hpp @@ -63,7 +63,7 @@ void test_shared_alloc() { // Since always executed on host space, leave [=] Kokkos::parallel_for(range, [=](int i) { char name[64]; - sprintf(name, "test_%.2d", i); + snprintf(name, 64, "test_%.2d", i); r[i] = RecordMemS::allocate(s, name, size * (i + 1)); h[i] = Header::get_header(r[i]->data()); @@ -107,7 +107,7 @@ void test_shared_alloc() { Kokkos::parallel_for(range, [=](size_t i) { char name[64]; - sprintf(name, "test_%.2d", int(i)); + snprintf(name, 64, "test_%.2d", int(i)); RecordFull* rec = RecordFull::allocate(s, name, size * (i + 1)); From 7f08b95a899116d01ac73d8f13f1c1ceeb14a953 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Fri, 20 Jan 2023 15:35:53 -0600 Subject: [PATCH 062/496] Desul atomics cleanup remove unused Impl::eliminate_warning_for_lock_array() --- tpls/desul/include/desul/atomics/Lock_Array_CUDA.hpp | 1 - tpls/desul/include/desul/atomics/Lock_Array_HIP.hpp | 1 - 2 files changed, 2 deletions(-) diff --git a/tpls/desul/include/desul/atomics/Lock_Array_CUDA.hpp b/tpls/desul/include/desul/atomics/Lock_Array_CUDA.hpp index e0e4e129ac..6984ae34a7 100644 --- a/tpls/desul/include/desul/atomics/Lock_Array_CUDA.hpp +++ b/tpls/desul/include/desul/atomics/Lock_Array_CUDA.hpp @@ -128,7 +128,6 @@ namespace desul { namespace Impl { namespace { static int lock_array_copied = 0; -inline int eliminate_warning_for_lock_array() { return lock_array_copied; } } // namespace #ifdef __CUDACC_RDC__ diff --git a/tpls/desul/include/desul/atomics/Lock_Array_HIP.hpp 
b/tpls/desul/include/desul/atomics/Lock_Array_HIP.hpp index 1ab9544eb4..1ce7673225 100644 --- a/tpls/desul/include/desul/atomics/Lock_Array_HIP.hpp +++ b/tpls/desul/include/desul/atomics/Lock_Array_HIP.hpp @@ -131,7 +131,6 @@ namespace desul { namespace Impl { namespace { static int lock_array_copied = 0; -inline int eliminate_warning_for_lock_array() { return lock_array_copied; } } // namespace } // namespace Impl } // namespace desul From 9f09e2b1700a130743559d2bd0c300aab951d084 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Fri, 20 Jan 2023 15:36:46 -0600 Subject: [PATCH 063/496] Drop unused Kokkos::Impl::eliminate_warning_for_lock_array CUDA/HIP functions --- core/src/Cuda/Kokkos_Cuda_Locks.hpp | 1 - core/src/HIP/Kokkos_HIP_Locks.hpp | 1 - 2 files changed, 2 deletions(-) diff --git a/core/src/Cuda/Kokkos_Cuda_Locks.hpp b/core/src/Cuda/Kokkos_Cuda_Locks.hpp index 3916ae2c53..4c17997fe2 100644 --- a/core/src/Cuda/Kokkos_Cuda_Locks.hpp +++ b/core/src/Cuda/Kokkos_Cuda_Locks.hpp @@ -119,7 +119,6 @@ namespace Kokkos { namespace Impl { namespace { static int lock_array_copied = 0; -inline int eliminate_warning_for_lock_array() { return lock_array_copied; } } // namespace #ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE diff --git a/core/src/HIP/Kokkos_HIP_Locks.hpp b/core/src/HIP/Kokkos_HIP_Locks.hpp index fbed4afd3f..0d46d74074 100644 --- a/core/src/HIP/Kokkos_HIP_Locks.hpp +++ b/core/src/HIP/Kokkos_HIP_Locks.hpp @@ -111,7 +111,6 @@ namespace Kokkos { namespace Impl { namespace { static int lock_array_copied = 0; -inline int eliminate_warning_for_lock_array() { return lock_array_copied; } } // namespace } // namespace Impl } // namespace Kokkos From 7475b8929714760678d3455b132c827f47a0750a Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Sun, 22 Jan 2023 12:10:38 -0500 Subject: [PATCH 064/496] Remove dead OpenMP test source file --- core/unit_test/openmp/TestOpenMP.hpp | 80 ---------------------------- 1 file changed, 80 deletions(-) delete mode 100644 
core/unit_test/openmp/TestOpenMP.hpp diff --git a/core/unit_test/openmp/TestOpenMP.hpp b/core/unit_test/openmp/TestOpenMP.hpp deleted file mode 100644 index 3a974d517c..0000000000 --- a/core/unit_test/openmp/TestOpenMP.hpp +++ /dev/null @@ -1,80 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOS_TEST_OPENMP_HPP -#define KOKKOS_TEST_OPENMP_HPP - -#include - -#include - -#ifdef KOKKOS_LAMBDA -#undef KOKKOS_LAMBDA -#endif -#define KOKKOS_LAMBDA [=] - -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace Test { - -class openmp : public ::testing::Test { - protected: - static void SetUpTestCase() { - int threads_count = 0; -#pragma omp parallel - { -#pragma omp atomic - ++threads_count; - } - - if (threads_count > 3) { - threads_count /= 2; - } - - Kokkos::OpenMP::initialize(threads_count); - Kokkos::print_configuration(std::cout, true); - - srand(10231); - } - - static void TearDownTestCase() { Kokkos::OpenMP::finalize(); } -}; - -} // namespace Test - -#endif From aa7865e84be74f1c54f03eae90685acb1dd8aac8 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Sun, 22 Jan 2023 12:11:35 -0500 Subject: [PATCH 065/496] Remove unused OpenMPTarget test source file --- .../openmptarget/TestOpenMPTarget.hpp | 79 ------------------- 1 file changed, 79 deletions(-) delete mode 100644 
core/unit_test/openmptarget/TestOpenMPTarget.hpp diff --git a/core/unit_test/openmptarget/TestOpenMPTarget.hpp b/core/unit_test/openmptarget/TestOpenMPTarget.hpp deleted file mode 100644 index 6ae45620f2..0000000000 --- a/core/unit_test/openmptarget/TestOpenMPTarget.hpp +++ /dev/null @@ -1,79 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOS_TEST_OPENMPTARGET_HPP -#define KOKKOS_TEST_OPENMPTARGET_HPP - -#include - -#include - -#ifdef KOKKOS_LAMBDA -#undef KOKKOS_LAMBDA -#endif -#define KOKKOS_LAMBDA [=] - -#include - -//#include -//#include -//#include -//#include -//#include -//#include -#include -#include -//#include -//#include -//#include -//#include - -// TODO enable task scheduler tests for openmptarget -//#include - -//#include -//#include -//#include -#include -//#include -//#include - -namespace Test { - -class openmptarget : public ::testing::Test { - protected: - static void SetUpTestCase() { - const unsigned numa_count = Kokkos::hwloc::get_available_numa_count(); - const unsigned cores_per_numa = - Kokkos::hwloc::get_available_cores_per_numa(); - const unsigned openmptarget_per_core = - Kokkos::hwloc::get_available_openmptarget_per_core(); - - unsigned openmptarget_count = 0; - - openmptarget_count = std::max(1u, numa_count) * - std::max(2u, cores_per_numa * openmptarget_per_core); - - Kokkos::OpenMPTarget::initialize(openmptarget_count); - Kokkos::print_configuration(std::cout, true /* detailed */); - } - - static void TearDownTestCase() { 
Kokkos::OpenMPTarget::finalize(); } -}; - -} // namespace Test - -#endif From 73b4ca835c08e23eac67979962e9b836fbe4d6b9 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Fri, 13 Jan 2023 15:04:35 +0000 Subject: [PATCH 066/496] Prefer ASSERT_EQ over ASSERT_TRUE with == --- core/unit_test/TestStackTrace.hpp | 3 +-- core/unit_test/TestTeam.hpp | 4 ++-- core/unit_test/TestViewSubview.hpp | 12 ++++++------ core/unit_test/tools/TestEventCorrectness.hpp | 4 ++-- 4 files changed, 11 insertions(+), 12 deletions(-) diff --git a/core/unit_test/TestStackTrace.hpp b/core/unit_test/TestStackTrace.hpp index f5a0b95a02..378188c7e3 100644 --- a/core/unit_test/TestStackTrace.hpp +++ b/core/unit_test/TestStackTrace.hpp @@ -60,8 +60,7 @@ void test_stacktrace(bool bTerminate, bool bCustom = true) { if (bDynamic) { std::string foutput = sstream.str(); printf("demangled test_f1: %s \n", foutput.c_str()); - ASSERT_TRUE(std::string::npos != - foutput.find("Test::stacktrace_test_f1")); + ASSERT_NE(std::string::npos, foutput.find("Test::stacktrace_test_f1")); for (auto x : {"stacktrace_test_f0", "stacktrace_test_f2", "stacktrace_test_f3", "stacktrace_test_f4"}) { ASSERT_EQ(std::string::npos, foutput.find(x)); diff --git a/core/unit_test/TestTeam.hpp b/core/unit_test/TestTeam.hpp index 0f86f9f369..fc01e9caab 100644 --- a/core/unit_test/TestTeam.hpp +++ b/core/unit_test/TestTeam.hpp @@ -1582,7 +1582,7 @@ struct TestScratchAlignment { Kokkos::fence(); int minimal_scratch_allocation_failed = 0; Kokkos::deep_copy(minimal_scratch_allocation_failed, flag); - ASSERT_TRUE(minimal_scratch_allocation_failed == 0); + ASSERT_EQ(minimal_scratch_allocation_failed, 0); } // test alignment of successive allocations @@ -1650,7 +1650,7 @@ struct TestScratchAlignment { Kokkos::fence(); int raw_get_shmem_alignment_failed = 0; Kokkos::deep_copy(raw_get_shmem_alignment_failed, flag); - ASSERT_TRUE(raw_get_shmem_alignment_failed == 0); + ASSERT_EQ(raw_get_shmem_alignment_failed, 0); } }; diff --git 
a/core/unit_test/TestViewSubview.hpp b/core/unit_test/TestViewSubview.hpp index edd5c8036f..21086eb3d7 100644 --- a/core/unit_test/TestViewSubview.hpp +++ b/core/unit_test/TestViewSubview.hpp @@ -474,8 +474,8 @@ void test_left_1(bool use_constr) { for (int i1 = 0; i1 < (int)sx4.extent(1); ++i1) for (int i2 = 0; i2 < (int)sx4.extent(2); ++i2) for (int i3 = 0; i3 < (int)sx4.extent(3); ++i3) { - ASSERT_TRUE(&sx4(i0, i1, i2, i3) == - &x8(0, 0 + i0, 1, 1 + i1, 1, 0 + i2, 2, 2 + i3)); + ASSERT_EQ(&sx4(i0, i1, i2, i3), + &x8(0, 0 + i0, 1, 1 + i1, 1, 0 + i2, 2, 2 + i3)); } } } @@ -546,8 +546,8 @@ void test_left_2() { for (int i1 = 0; i1 < (int)sx4.extent(1); ++i1) for (int i2 = 0; i2 < (int)sx4.extent(2); ++i2) for (int i3 = 0; i3 < (int)sx4.extent(3); ++i3) { - ASSERT_TRUE(&sx4(i0, i1, i2, i3) == - &x4(1 + i0, 1 + i1, 0 + i2, 2 + i3)); + ASSERT_EQ(&sx4(i0, i1, i2, i3), + &x4(1 + i0, 1 + i1, 0 + i2, 2 + i3)); } } } @@ -756,8 +756,8 @@ void test_right_1(bool use_constr) { for (int i1 = 0; i1 < (int)sx4.extent(1); ++i1) for (int i2 = 0; i2 < (int)sx4.extent(2); ++i2) for (int i3 = 0; i3 < (int)sx4.extent(3); ++i3) { - ASSERT_TRUE(&sx4(i0, i1, i2, i3) == - &x8(0, 0 + i0, 1, 1 + i1, 1, 0 + i2, 2, 2 + i3)); + ASSERT_EQ(&sx4(i0, i1, i2, i3), + &x8(0, 0 + i0, 1, 1 + i1, 1, 0 + i2, 2, 2 + i3)); } } } diff --git a/core/unit_test/tools/TestEventCorrectness.hpp b/core/unit_test/tools/TestEventCorrectness.hpp index 4081604113..ec8cc45a38 100644 --- a/core/unit_test/tools/TestEventCorrectness.hpp +++ b/core/unit_test/tools/TestEventCorrectness.hpp @@ -197,7 +197,7 @@ TEST(kokkosp, test_multiple_default_instances) { ex1.fence("named_instance_fence_one"); ex2.fence("named_instance_fence_two"); }); - ASSERT_TRUE(found_payloads[0].dev_id == found_payloads[1].dev_id); + ASSERT_EQ(found_payloads[0].dev_id, found_payloads[1].dev_id); }); } @@ -716,7 +716,7 @@ TEST(kokkosp, get_events) { }); for (const auto& ptr : event_vector) { auto ptr_as_begin = std::dynamic_pointer_cast(ptr); - 
ASSERT_TRUE(ptr_as_begin == nullptr); + ASSERT_EQ(ptr_as_begin, nullptr); } } } // namespace Test From 24ef794e945e35e5a682ad88f3f5b3c0f0bc55b2 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Sun, 22 Jan 2023 16:17:23 -0500 Subject: [PATCH 067/496] Fixup warning in Jenkins CI build with GNU generated makefile --- .jenkins | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.jenkins b/.jenkins index 1775a57d3b..b9ece2dce2 100644 --- a/.jenkins +++ b/.jenkins @@ -343,7 +343,7 @@ pipeline { --with-cuda \ --with-cuda-options=enable_lambda \ --arch=Volta70 \ - .. && \ + && \ make test -j8''' } post { From de26b23c5a0971eb158734c5c7a68ff69873c16e Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Mon, 23 Jan 2023 08:13:31 -0500 Subject: [PATCH 068/496] Add missing ReductionIdentity specialization --- core/src/Kokkos_ReductionIdentity.hpp | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/core/src/Kokkos_ReductionIdentity.hpp b/core/src/Kokkos_ReductionIdentity.hpp index 4b37d73a54..a6d6eb3232 100644 --- a/core/src/Kokkos_ReductionIdentity.hpp +++ b/core/src/Kokkos_ReductionIdentity.hpp @@ -53,6 +53,30 @@ type { static_assert( false, "Missing specialization of Kokkos::reduction_identity for custom land reduction type"); return T(); } };*/ +template <> +struct reduction_identity { + KOKKOS_FORCEINLINE_FUNCTION constexpr static char sum() { + return static_cast(0); + } + KOKKOS_FORCEINLINE_FUNCTION constexpr static char prod() { + return static_cast(1); + } + KOKKOS_FORCEINLINE_FUNCTION constexpr static char max() { return CHAR_MIN; } + KOKKOS_FORCEINLINE_FUNCTION constexpr static char min() { return CHAR_MAX; } + KOKKOS_FORCEINLINE_FUNCTION constexpr static char bor() { + return static_cast(0x0); + } + KOKKOS_FORCEINLINE_FUNCTION constexpr static char band() { + return ~static_cast(0x0); + } + KOKKOS_FORCEINLINE_FUNCTION constexpr static char lor() { + return static_cast(0); + } + KOKKOS_FORCEINLINE_FUNCTION constexpr static char land() 
{ + return static_cast(1); + } +}; + template <> struct reduction_identity { KOKKOS_FORCEINLINE_FUNCTION constexpr static signed char sum() { From e8c08e2c0cf37786a65574b030679cceb7658108 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Thu, 12 Jan 2023 23:03:51 +0000 Subject: [PATCH 069/496] Fix sycl.scratch_align test --- core/unit_test/TestTeam.hpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/core/unit_test/TestTeam.hpp b/core/unit_test/TestTeam.hpp index fc01e9caab..402441b725 100644 --- a/core/unit_test/TestTeam.hpp +++ b/core/unit_test/TestTeam.hpp @@ -1642,9 +1642,12 @@ struct TestScratchAlignment { // and since scratch_ptr3 is then already aligned it difference // should match that if ((scratch_ptr3 - scratch_ptr2) != 32) flag() = 1; + // check actually alignment of ptrs is as requested - if (((scratch_ptr1 % 4) != 0) || ((scratch_ptr2 % 8) != 0) || - ((scratch_ptr3 % 4) != 0)) + // cast to int here to avoid failure with icpx in mixed integer type + // comparison + if ((int(scratch_ptr1 % 4) != 0) || (int(scratch_ptr2 % 8) != 0) || + (int(scratch_ptr3 % 4) != 0)) flag() = 1; }); Kokkos::fence(); From 2e6c2387f5d2ea1348e3cae63f2e18b32bf39fa5 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Fri, 20 Jan 2023 15:13:06 -0500 Subject: [PATCH 070/496] Drop KOKKOS_IMPL_WORKAROUND_INTEL_LLVM_DEFAULT_FLOATING_POINT_MODEL --- .../continuous-integration-workflow.yml | 4 +++ core/unit_test/TestMathematicalFunctions.hpp | 33 +++---------------- 2 files changed, 9 insertions(+), 28 deletions(-) diff --git a/.github/workflows/continuous-integration-workflow.yml b/.github/workflows/continuous-integration-workflow.yml index 30238d701d..3198b1d30b 100644 --- a/.github/workflows/continuous-integration-workflow.yml +++ b/.github/workflows/continuous-integration-workflow.yml @@ -12,6 +12,7 @@ jobs: matrix: distro: ['fedora:latest', 'fedora:rawhide', 'ubuntu:latest'] cxx: ['g++', 'clang++'] + cxx_extra_flags: [''] cmake_build_type: ['Release', 
'Debug'] backend: ['OPENMP'] clang-tidy: [''] @@ -28,11 +29,13 @@ jobs: clang-tidy: '' - distro: 'fedora:intel' cxx: 'icpx' + cxx_extra_flags: '-fp-model=precise' cmake_build_type: 'Release' backend: 'OPENMP' clang-tidy: '' - distro: 'fedora:intel' cxx: 'icpx' + cxx_extra_flags: '-fp-model=precise' cmake_build_type: 'Debug' backend: 'OPENMP' clang-tidy: '' @@ -100,6 +103,7 @@ jobs: -DKokkos_ENABLE_EXAMPLES=ON \ -DKokkos_ENABLE_DEPRECATED_CODE_4=ON \ -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \ + -DCMAKE_CXX_FLAGS="${{ matrix.cxx_extra_flags }}" \ -DCMAKE_CXX_COMPILER=${{ matrix.cxx }} \ -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \ -DCMAKE_BUILD_TYPE=${{ matrix.cmake_build_type }} diff --git a/core/unit_test/TestMathematicalFunctions.hpp b/core/unit_test/TestMathematicalFunctions.hpp index b198006cbb..eda880d1de 100644 --- a/core/unit_test/TestMathematicalFunctions.hpp +++ b/core/unit_test/TestMathematicalFunctions.hpp @@ -30,12 +30,6 @@ #define MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS #endif -// WORKAROUND icpx changing default FP model when optimization level is >= 1 -// using -fp-model=precise works too -#if defined(__INTEL_LLVM_COMPILER) -#define KOKKOS_IMPL_WORKAROUND_INTEL_LLVM_DEFAULT_FLOATING_POINT_MODEL -#endif - // clang-format off template struct math_unary_function_return_type; @@ -1060,11 +1054,7 @@ struct TestAbsoluteValueFunction { // special values using Kokkos::isinf; using Kokkos::isnan; - if (abs(-0.) != 0. -#ifndef KOKKOS_IMPL_WORKAROUND_INTEL_LLVM_DEFAULT_FLOATING_POINT_MODEL - || !isinf(abs(-INFINITY)) || !isnan(abs(-NAN)) -#endif - ) { + if (abs(-0.) != 0. 
|| !isinf(abs(-INFINITY)) || !isnan(abs(-NAN))) { ++e; KOKKOS_IMPL_DO_NOT_USE_PRINTF( "failed abs(floating_point) special values\n"); @@ -1101,44 +1091,31 @@ struct TestIsNaN { ++e; KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed isnan(integral)\n"); } - if (isnan(2.f) -#ifndef KOKKOS_IMPL_WORKAROUND_INTEL_LLVM_DEFAULT_FLOATING_POINT_MODEL - || !isnan(quiet_NaN::value) || + if (isnan(2.f) || !isnan(quiet_NaN::value) || !isnan(signaling_NaN::value) -#endif ) { ++e; KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed isnan(float)\n"); } if (isnan(3.) -#ifndef KOKKOS_IMPL_WORKAROUND_INTEL_LLVM_DEFAULT_FLOATING_POINT_MODEL #ifndef KOKKOS_COMPILER_NVHPC // FIXME_NVHPC || !isnan(quiet_NaN::value) || !isnan(signaling_NaN::value) -#endif #endif ) { ++e; KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed isnan(double)\n"); } #ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS - if (isnan(4.l) -#ifndef KOKKOS_IMPL_WORKAROUND_INTEL_LLVM_DEFAULT_FLOATING_POINT_MODEL - || !isnan(quiet_NaN::value) || - !isnan(signaling_NaN::value) -#endif - ) { + if (isnan(4.l) || !isnan(quiet_NaN::value) || + !isnan(signaling_NaN::value)) { ++e; KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed isnan(long double)\n"); } #endif // special values - if (isnan(INFINITY) -#ifndef KOKKOS_IMPL_WORKAROUND_INTEL_LLVM_DEFAULT_FLOATING_POINT_MODEL - || !isnan(NAN) -#endif - ) { + if (isnan(INFINITY) || !isnan(NAN)) { ++e; KOKKOS_IMPL_DO_NOT_USE_PRINTF( "failed isnan(floating_point) special values\n"); From f9d95058a73e8d62777769c1066f1d697efde8c7 Mon Sep 17 00:00:00 2001 From: Bruno Turcksin Date: Wed, 11 Jan 2023 16:24:33 -0500 Subject: [PATCH 071/496] Add parameter to force using GlobaLMemory launch mechanism using HIP --- core/src/HIP/Kokkos_HIP_KernelLaunch.hpp | 22 +++++++++++++--------- core/src/Kokkos_Concepts.hpp | 3 +++ 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp b/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp index 8bf5d7f394..3887f2e587 100644 --- 
a/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp +++ b/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp @@ -127,6 +127,8 @@ struct DeduceHIPLaunchMechanism { light_weight = Kokkos::Experimental::WorkItemProperty::HintLightWeight; static constexpr Kokkos::Experimental::WorkItemProperty::HintHeavyWeight_t heavy_weight = Kokkos::Experimental::WorkItemProperty::HintHeavyWeight; + static constexpr Kokkos::Experimental::WorkItemProperty::ImplForceGlobal_t + force_global = Kokkos::Experimental::WorkItemProperty::ImplForceGlobal; static constexpr typename DriverType::Policy::work_item_property property = typename DriverType::Policy::work_item_property(); @@ -160,15 +162,17 @@ struct DeduceHIPLaunchMechanism { // Kal(); constexpr static const ImplWorkItemProperty<8> HintIrregular = ImplWorkItemProperty<8>(); + constexpr static const ImplWorkItemProperty<16> ImplForceGlobal = + ImplWorkItemProperty<16>(); using None_t = ImplWorkItemProperty<0>; using HintLightWeight_t = ImplWorkItemProperty<1>; using HintHeavyWeight_t = ImplWorkItemProperty<2>; using HintRegular_t = ImplWorkItemProperty<4>; using HintIrregular_t = ImplWorkItemProperty<8>; + using ImplForceGlobal_t = ImplWorkItemProperty<16>; }; template From cf67ab4084640d783a94de94afd15f4dc17a12e9 Mon Sep 17 00:00:00 2001 From: Bruno Turcksin Date: Wed, 11 Jan 2023 21:31:06 -0500 Subject: [PATCH 072/496] Force GlobalMemory launch for some Bessel tests when using ROCm 5.4 --- .../TestMathematicalSpecialFunctions.hpp | 51 ++++++++++++++----- 1 file changed, 38 insertions(+), 13 deletions(-) diff --git a/core/unit_test/TestMathematicalSpecialFunctions.hpp b/core/unit_test/TestMathematicalSpecialFunctions.hpp index 1b294f26ed..693c6a8889 100644 --- a/core/unit_test/TestMathematicalSpecialFunctions.hpp +++ b/core/unit_test/TestMathematicalSpecialFunctions.hpp @@ -505,7 +505,12 @@ struct TestComplexBesselJ0Y0Function { Kokkos::deep_copy(d_z, h_z); // Call Bessel functions - Kokkos::parallel_for(Kokkos::RangePolicy(0, N), *this); +#if 
(HIP_VERSION_MAJOR == 5) && (HIP_VERSION_MINOR == 4) + using Property = Kokkos::Experimental::WorkItemProperty::ImplForceGlobal_t; +#else + using Property = Kokkos::Experimental::WorkItemProperty::None_t; +#endif + Kokkos::parallel_for(Kokkos::RangePolicy(0, N), *this); Kokkos::fence(); Kokkos::deep_copy(h_cbj0, d_cbj0); @@ -634,8 +639,8 @@ struct TestComplexBesselJ0Y0Function { Kokkos::deep_copy(d_z_large, h_z_large); - Kokkos::parallel_for(Kokkos::RangePolicy(0, 1), - *this); + Kokkos::parallel_for( + Kokkos::RangePolicy(0, 1), *this); Kokkos::fence(); Kokkos::deep_copy(h_cbj0_large, d_cbj0_large); @@ -795,7 +800,12 @@ struct TestComplexBesselJ1Y1Function { Kokkos::deep_copy(d_z, h_z); // Call Bessel functions - Kokkos::parallel_for(Kokkos::RangePolicy(0, N), *this); +#if (HIP_VERSION_MAJOR == 5) && (HIP_VERSION_MINOR == 4) + using Property = Kokkos::Experimental::WorkItemProperty::ImplForceGlobal_t; +#else + using Property = Kokkos::Experimental::WorkItemProperty::None_t; +#endif + Kokkos::parallel_for(Kokkos::RangePolicy(0, N), *this); Kokkos::fence(); Kokkos::deep_copy(h_cbj1, d_cbj1); @@ -924,8 +934,8 @@ struct TestComplexBesselJ1Y1Function { Kokkos::deep_copy(d_z_large, h_z_large); - Kokkos::parallel_for(Kokkos::RangePolicy(0, 1), - *this); + Kokkos::parallel_for( + Kokkos::RangePolicy(0, 1), *this); Kokkos::fence(); Kokkos::deep_copy(h_cbj1_large, d_cbj1_large); @@ -1083,7 +1093,12 @@ struct TestComplexBesselI0K0Function { Kokkos::deep_copy(d_z, h_z); // Call Bessel functions - Kokkos::parallel_for(Kokkos::RangePolicy(0, N), *this); +#if (HIP_VERSION_MAJOR == 5) && (HIP_VERSION_MINOR == 4) + using Property = Kokkos::Experimental::WorkItemProperty::ImplForceGlobal_t; +#else + using Property = Kokkos::Experimental::WorkItemProperty::None_t; +#endif + Kokkos::parallel_for(Kokkos::RangePolicy(0, N), *this); Kokkos::fence(); Kokkos::deep_copy(h_cbi0, d_cbi0); @@ -1205,8 +1220,8 @@ struct TestComplexBesselI0K0Function { Kokkos::deep_copy(d_z_large, h_z_large); - 
Kokkos::parallel_for(Kokkos::RangePolicy(0, 1), - *this); + Kokkos::parallel_for( + Kokkos::RangePolicy(0, 1), *this); Kokkos::fence(); Kokkos::deep_copy(h_cbi0_large, d_cbi0_large); @@ -1318,7 +1333,12 @@ struct TestComplexBesselI1K1Function { Kokkos::deep_copy(d_z, h_z); // Call Bessel functions - Kokkos::parallel_for(Kokkos::RangePolicy(0, N), *this); +#if (HIP_VERSION_MAJOR == 5) && (HIP_VERSION_MINOR == 4) + using Property = Kokkos::Experimental::WorkItemProperty::ImplForceGlobal_t; +#else + using Property = Kokkos::Experimental::WorkItemProperty::None_t; +#endif + Kokkos::parallel_for(Kokkos::RangePolicy(0, N), *this); Kokkos::fence(); Kokkos::deep_copy(h_cbi1, d_cbi1); @@ -1440,8 +1460,8 @@ struct TestComplexBesselI1K1Function { Kokkos::deep_copy(d_z_large, h_z_large); - Kokkos::parallel_for(Kokkos::RangePolicy(0, 1), - *this); + Kokkos::parallel_for( + Kokkos::RangePolicy(0, 1), *this); Kokkos::fence(); Kokkos::deep_copy(h_cbi1_large, d_cbi1_large); @@ -1549,7 +1569,12 @@ struct TestComplexBesselH1Function { Kokkos::deep_copy(d_z, h_z); // Call Hankel functions - Kokkos::parallel_for(Kokkos::RangePolicy(0, N), *this); +#if (HIP_VERSION_MAJOR == 5) && (HIP_VERSION_MINOR == 4) + using Property = Kokkos::Experimental::WorkItemProperty::ImplForceGlobal_t; +#else + using Property = Kokkos::Experimental::WorkItemProperty::None_t; +#endif + Kokkos::parallel_for(Kokkos::RangePolicy(0, N), *this); Kokkos::fence(); Kokkos::deep_copy(h_ch10, d_ch10); From d7aa278a4876b129dd5a8a4a24b221d4aec06ab6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cezary=20Skrzy=C5=84ski?= Date: Mon, 23 Jan 2023 15:40:21 +0100 Subject: [PATCH 073/496] Remove obsolete container configuration --- .github/workflows/continuous-integration-workflow.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/workflows/continuous-integration-workflow.yml b/.github/workflows/continuous-integration-workflow.yml index 30238d701d..656b537123 100644 --- 
a/.github/workflows/continuous-integration-workflow.yml +++ b/.github/workflows/continuous-integration-workflow.yml @@ -48,8 +48,6 @@ jobs: runs-on: ubuntu-latest container: image: ghcr.io/kokkos/ci-containers/${{ matrix.distro }} - # see https://github.com/actions/virtual-environments/issues/3812 - options: --security-opt seccomp=unconfined steps: - name: Checkout desul uses: actions/checkout@v3 From 29020350e7cb728e080895152eeb2ff76b3c9fb8 Mon Sep 17 00:00:00 2001 From: Bruno Turcksin Date: Wed, 11 Jan 2023 21:46:18 -0500 Subject: [PATCH 074/496] Fix tests when using ROCm 5.3 --- core/unit_test/TestViewAPI.hpp | 2 ++ core/unit_test/TestViewSubview.hpp | 26 +++++++++++++++++++------- 2 files changed, 21 insertions(+), 7 deletions(-) diff --git a/core/unit_test/TestViewAPI.hpp b/core/unit_test/TestViewAPI.hpp index 767f6e5e49..f97515ad3f 100644 --- a/core/unit_test/TestViewAPI.hpp +++ b/core/unit_test/TestViewAPI.hpp @@ -1542,6 +1542,7 @@ class TestViewAPI { } static void run_test_error() { +#if !((HIP_VERSION_MAJOR == 5) && (HIP_VERSION_MINOR == 3)) #ifdef KOKKOS_ENABLE_OPENMPTARGET if (std::is_same::value) @@ -1586,6 +1587,7 @@ class TestViewAPI { } #endif } +#endif } }; diff --git a/core/unit_test/TestViewSubview.hpp b/core/unit_test/TestViewSubview.hpp index 21086eb3d7..ff412c7550 100644 --- a/core/unit_test/TestViewSubview.hpp +++ b/core/unit_test/TestViewSubview.hpp @@ -136,8 +136,9 @@ struct fill_2D { template void test_auto_1d() { - using mv_type = Kokkos::View; - using size_type = typename mv_type::size_type; + using mv_type = Kokkos::View; + using execution_space = typename Space::execution_space; + using size_type = typename mv_type::size_type; const double ZERO = 0.0; const double ONE = 1.0; @@ -150,7 +151,13 @@ void test_auto_1d() { typename mv_type::HostMirror X_h = Kokkos::create_mirror_view(X); fill_2D f1(X, ONE); - Kokkos::parallel_for(X.extent(0), f1); +#if (HIP_VERSION_MAJOR == 5) && (HIP_VERSION_MINOR == 3) + using Property = 
Kokkos::Experimental::WorkItemProperty::ImplForceGlobal_t; +#else + using Property = Kokkos::Experimental::WorkItemProperty::None_t; +#endif + Kokkos::parallel_for( + Kokkos::RangePolicy(0, X.extent(0)), f1); Kokkos::fence(); Kokkos::deep_copy(X_h, X); for (size_type j = 0; j < numCols; ++j) { @@ -160,7 +167,8 @@ void test_auto_1d() { } fill_2D f2(X, 0.0); - Kokkos::parallel_for(X.extent(0), f2); + Kokkos::parallel_for( + Kokkos::RangePolicy(0, X.extent(0)), f2); Kokkos::fence(); Kokkos::deep_copy(X_h, X); for (size_type j = 0; j < numCols; ++j) { @@ -170,7 +178,8 @@ void test_auto_1d() { } fill_2D f3(X, TWO); - Kokkos::parallel_for(X.extent(0), f3); + Kokkos::parallel_for( + Kokkos::RangePolicy(0, X.extent(0)), f3); Kokkos::fence(); Kokkos::deep_copy(X_h, X); for (size_type j = 0; j < numCols; ++j) { @@ -183,7 +192,8 @@ void test_auto_1d() { auto X_j = Kokkos::subview(X, Kokkos::ALL, j); fill_1D f4(X_j, ZERO); - Kokkos::parallel_for(X_j.extent(0), f4); + Kokkos::parallel_for( + Kokkos::RangePolicy(0, X_j.extent(0)), f4); Kokkos::fence(); Kokkos::deep_copy(X_h, X); for (size_type i = 0; i < numRows; ++i) { @@ -193,7 +203,9 @@ void test_auto_1d() { for (size_type jj = 0; jj < numCols; ++jj) { auto X_jj = Kokkos::subview(X, Kokkos::ALL, jj); fill_1D f5(X_jj, ONE); - Kokkos::parallel_for(X_jj.extent(0), f5); + Kokkos::parallel_for( + Kokkos::RangePolicy(0, X_jj.extent(0)), + f5); Kokkos::fence(); Kokkos::deep_copy(X_h, X); for (size_type i = 0; i < numRows; ++i) { From 0f8b7ca3acf9cb49c5410fef41b9af74a00666a5 Mon Sep 17 00:00:00 2001 From: Bruno Turcksin Date: Mon, 23 Jan 2023 10:20:16 -0500 Subject: [PATCH 075/496] Skip test and add comment explaining why --- core/unit_test/TestViewAPI.hpp | 2 -- core/unit_test/TestViewAPI_d.hpp | 3 +++ 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/core/unit_test/TestViewAPI.hpp b/core/unit_test/TestViewAPI.hpp index f97515ad3f..767f6e5e49 100644 --- a/core/unit_test/TestViewAPI.hpp +++ 
b/core/unit_test/TestViewAPI.hpp @@ -1542,7 +1542,6 @@ class TestViewAPI { } static void run_test_error() { -#if !((HIP_VERSION_MAJOR == 5) && (HIP_VERSION_MINOR == 3)) #ifdef KOKKOS_ENABLE_OPENMPTARGET if (std::is_same::value) @@ -1587,7 +1586,6 @@ class TestViewAPI { } #endif } -#endif } }; diff --git a/core/unit_test/TestViewAPI_d.hpp b/core/unit_test/TestViewAPI_d.hpp index f65ce06dad..da61a2873a 100644 --- a/core/unit_test/TestViewAPI_d.hpp +++ b/core/unit_test/TestViewAPI_d.hpp @@ -26,8 +26,11 @@ TEST(TEST_CATEGORY, view_api_d) { TestViewAPI::run_test_view_operator_c(); } +// ROCm 5.3 segfaults when trying to allocate too much memory +#if !((HIP_VERSION_MAJOR == 5) && (HIP_VERSION_MINOR == 3)) TEST(TEST_CATEGORY, view_allocation_error) { TestViewAPI::run_test_error(); } +#endif } // namespace Test From b99fb31e3fceb1c84ba67b0ba6fcd9293fa19aea Mon Sep 17 00:00:00 2001 From: Bruno Turcksin Date: Mon, 23 Jan 2023 12:53:02 -0500 Subject: [PATCH 076/496] Use GTEST_SKIP to skip test --- core/unit_test/TestViewAPI_d.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/core/unit_test/TestViewAPI_d.hpp b/core/unit_test/TestViewAPI_d.hpp index da61a2873a..e4751771a0 100644 --- a/core/unit_test/TestViewAPI_d.hpp +++ b/core/unit_test/TestViewAPI_d.hpp @@ -26,11 +26,11 @@ TEST(TEST_CATEGORY, view_api_d) { TestViewAPI::run_test_view_operator_c(); } -// ROCm 5.3 segfaults when trying to allocate too much memory -#if !((HIP_VERSION_MAJOR == 5) && (HIP_VERSION_MINOR == 3)) TEST(TEST_CATEGORY, view_allocation_error) { +#if ((HIP_VERSION_MAJOR == 5) && (HIP_VERSION_MINOR == 3)) + GTEST_SKIP() << "ROCm 5.3 segfaults when trying to allocate too much memor"; +#endif TestViewAPI::run_test_error(); } -#endif } // namespace Test From 478f087b2cf4a1cb29f901c3c135677d7d07baa1 Mon Sep 17 00:00:00 2001 From: Bruno Turcksin Date: Mon, 23 Jan 2023 13:01:21 -0500 Subject: [PATCH 077/496] Fix typo Co-authored-by: Damien L-G --- core/unit_test/TestViewAPI_d.hpp | 2 +- 1 
file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/unit_test/TestViewAPI_d.hpp b/core/unit_test/TestViewAPI_d.hpp index e4751771a0..08d21f5449 100644 --- a/core/unit_test/TestViewAPI_d.hpp +++ b/core/unit_test/TestViewAPI_d.hpp @@ -28,7 +28,7 @@ TEST(TEST_CATEGORY, view_api_d) { TEST(TEST_CATEGORY, view_allocation_error) { #if ((HIP_VERSION_MAJOR == 5) && (HIP_VERSION_MINOR == 3)) - GTEST_SKIP() << "ROCm 5.3 segfaults when trying to allocate too much memor"; + GTEST_SKIP() << "ROCm 5.3 segfaults when trying to allocate too much memory"; #endif TestViewAPI::run_test_error(); } From 5b3b6e7e144c05c6c34aed8b36455cee4861c70e Mon Sep 17 00:00:00 2001 From: Bruno Turcksin Date: Mon, 23 Jan 2023 15:37:29 -0500 Subject: [PATCH 078/496] Rename ImplForceGlobal to ImplForceGlobalLaunch --- core/src/HIP/Kokkos_HIP_KernelLaunch.hpp | 7 ++++--- core/src/Kokkos_Concepts.hpp | 14 +++++++------- .../TestMathematicalSpecialFunctions.hpp | 15 ++++++++++----- core/unit_test/TestViewSubview.hpp | 3 ++- 4 files changed, 23 insertions(+), 16 deletions(-) diff --git a/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp b/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp index 3887f2e587..e2fe5a6d83 100644 --- a/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp +++ b/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp @@ -127,8 +127,9 @@ struct DeduceHIPLaunchMechanism { light_weight = Kokkos::Experimental::WorkItemProperty::HintLightWeight; static constexpr Kokkos::Experimental::WorkItemProperty::HintHeavyWeight_t heavy_weight = Kokkos::Experimental::WorkItemProperty::HintHeavyWeight; - static constexpr Kokkos::Experimental::WorkItemProperty::ImplForceGlobal_t - force_global = Kokkos::Experimental::WorkItemProperty::ImplForceGlobal; + static constexpr Kokkos::Experimental::WorkItemProperty:: + ImplForceGlobalLaunch_t force_global_launch = + Kokkos::Experimental::WorkItemProperty::ImplForceGlobalLaunch; static constexpr typename DriverType::Policy::work_item_property property = typename 
DriverType::Policy::work_item_property(); @@ -162,7 +163,7 @@ struct DeduceHIPLaunchMechanism { // Kal(); constexpr static const ImplWorkItemProperty<8> HintIrregular = ImplWorkItemProperty<8>(); - constexpr static const ImplWorkItemProperty<16> ImplForceGlobal = + constexpr static const ImplWorkItemProperty<16> ImplForceGlobalLaunch = ImplWorkItemProperty<16>(); - using None_t = ImplWorkItemProperty<0>; - using HintLightWeight_t = ImplWorkItemProperty<1>; - using HintHeavyWeight_t = ImplWorkItemProperty<2>; - using HintRegular_t = ImplWorkItemProperty<4>; - using HintIrregular_t = ImplWorkItemProperty<8>; - using ImplForceGlobal_t = ImplWorkItemProperty<16>; + using None_t = ImplWorkItemProperty<0>; + using HintLightWeight_t = ImplWorkItemProperty<1>; + using HintHeavyWeight_t = ImplWorkItemProperty<2>; + using HintRegular_t = ImplWorkItemProperty<4>; + using HintIrregular_t = ImplWorkItemProperty<8>; + using ImplForceGlobalLaunch_t = ImplWorkItemProperty<16>; }; template diff --git a/core/unit_test/TestMathematicalSpecialFunctions.hpp b/core/unit_test/TestMathematicalSpecialFunctions.hpp index 693c6a8889..071056ce34 100644 --- a/core/unit_test/TestMathematicalSpecialFunctions.hpp +++ b/core/unit_test/TestMathematicalSpecialFunctions.hpp @@ -506,7 +506,8 @@ struct TestComplexBesselJ0Y0Function { // Call Bessel functions #if (HIP_VERSION_MAJOR == 5) && (HIP_VERSION_MINOR == 4) - using Property = Kokkos::Experimental::WorkItemProperty::ImplForceGlobal_t; + using Property = + Kokkos::Experimental::WorkItemProperty::ImplForceGlobalLaunchLaunch_t; #else using Property = Kokkos::Experimental::WorkItemProperty::None_t; #endif @@ -801,7 +802,8 @@ struct TestComplexBesselJ1Y1Function { // Call Bessel functions #if (HIP_VERSION_MAJOR == 5) && (HIP_VERSION_MINOR == 4) - using Property = Kokkos::Experimental::WorkItemProperty::ImplForceGlobal_t; + using Property = + Kokkos::Experimental::WorkItemProperty::ImplForceGlobalLaunch_t; #else using Property = 
Kokkos::Experimental::WorkItemProperty::None_t; #endif @@ -1094,7 +1096,8 @@ struct TestComplexBesselI0K0Function { // Call Bessel functions #if (HIP_VERSION_MAJOR == 5) && (HIP_VERSION_MINOR == 4) - using Property = Kokkos::Experimental::WorkItemProperty::ImplForceGlobal_t; + using Property = + Kokkos::Experimental::WorkItemProperty::ImplForceGlobalLaunch_t; #else using Property = Kokkos::Experimental::WorkItemProperty::None_t; #endif @@ -1334,7 +1337,8 @@ struct TestComplexBesselI1K1Function { // Call Bessel functions #if (HIP_VERSION_MAJOR == 5) && (HIP_VERSION_MINOR == 4) - using Property = Kokkos::Experimental::WorkItemProperty::ImplForceGlobal_t; + using Property = + Kokkos::Experimental::WorkItemProperty::ImplForceGlobalLaunch_t; #else using Property = Kokkos::Experimental::WorkItemProperty::None_t; #endif @@ -1570,7 +1574,8 @@ struct TestComplexBesselH1Function { // Call Hankel functions #if (HIP_VERSION_MAJOR == 5) && (HIP_VERSION_MINOR == 4) - using Property = Kokkos::Experimental::WorkItemProperty::ImplForceGlobal_t; + using Property = + Kokkos::Experimental::WorkItemProperty::ImplForceGlobalLaunch_t; #else using Property = Kokkos::Experimental::WorkItemProperty::None_t; #endif diff --git a/core/unit_test/TestViewSubview.hpp b/core/unit_test/TestViewSubview.hpp index ff412c7550..f1cf9e4bab 100644 --- a/core/unit_test/TestViewSubview.hpp +++ b/core/unit_test/TestViewSubview.hpp @@ -152,7 +152,8 @@ void test_auto_1d() { fill_2D f1(X, ONE); #if (HIP_VERSION_MAJOR == 5) && (HIP_VERSION_MINOR == 3) - using Property = Kokkos::Experimental::WorkItemProperty::ImplForceGlobal_t; + using Property = + Kokkos::Experimental::WorkItemProperty::ImplForceGlobalLaunch_t; #else using Property = Kokkos::Experimental::WorkItemProperty::None_t; #endif From e4b3c8269be8d37a8de0feed6ec81cbcb932da61 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Mon, 23 Jan 2023 16:33:56 -0500 Subject: [PATCH 079/496] SYCL: Add support for arbitrary size atomics --- 
cmake/KokkosCore_config.h.in | 1 + cmake/kokkos_arch.cmake | 29 ++++ core/src/SYCL/Kokkos_SYCL_Instance.cpp | 16 +- core/unit_test/CMakeLists.txt | 5 - .../TestAtomicOperations_complexdouble.hpp | 4 + core/unit_test/TestAtomics.hpp | 8 +- .../include/desul/atomics/Adapt_SYCL.hpp | 10 ++ .../desul/atomics/Compare_Exchange_SYCL.hpp | 59 ++++++- .../include/desul/atomics/Lock_Array.hpp | 3 + .../include/desul/atomics/Lock_Array_SYCL.hpp | 147 ++++++++++++++++++ .../desul/atomics/Lock_Based_Fetch_Op.hpp | 2 +- .../atomics/Lock_Based_Fetch_Op_SYCL.hpp | 94 +++++++++++ tpls/desul/src/Lock_Array_SYCL.cpp | 78 ++++++++++ 13 files changed, 440 insertions(+), 16 deletions(-) create mode 100644 tpls/desul/include/desul/atomics/Lock_Array_SYCL.hpp create mode 100644 tpls/desul/include/desul/atomics/Lock_Based_Fetch_Op_SYCL.hpp create mode 100644 tpls/desul/src/Lock_Array_SYCL.cpp diff --git a/cmake/KokkosCore_config.h.in b/cmake/KokkosCore_config.h.in index 40b341d989..863b800a86 100644 --- a/cmake/KokkosCore_config.h.in +++ b/cmake/KokkosCore_config.h.in @@ -26,6 +26,7 @@ #cmakedefine KOKKOS_ENABLE_MEMKIND #cmakedefine KOKKOS_ENABLE_LIBRT #cmakedefine KOKKOS_ENABLE_SYCL +#cmakedefine KOKKOS_SYCL_DEVICE_GLOBAL_SUPPORTED /* General Settings */ #cmakedefine KOKKOS_ENABLE_CXX17 diff --git a/cmake/kokkos_arch.cmake b/cmake/kokkos_arch.cmake index b051f8e3bd..6e754bd903 100644 --- a/cmake/kokkos_arch.cmake +++ b/cmake/kokkos_arch.cmake @@ -523,6 +523,35 @@ IF (KOKKOS_ENABLE_SYCL) ) ENDIF() +# Check support for device_global variables +# FIXME_SYCL Once the feature test macro SYCL_EXT_ONEAPI_DEVICE_GLOBAL is +# available, use that instead. 
+IF(KOKKOS_ENABLE_SYCL) + INCLUDE(CheckCXXSourceCompiles) + STRING(REPLACE ";" " " CMAKE_REQUIRED_FLAGS "${KOKKOS_COMPILE_OPTIONS}") + CHECK_CXX_SOURCE_COMPILES(" + #include + using namespace sycl::ext::oneapi::experimental; + using namespace sycl; + + SYCL_EXTERNAL device_global Foo; + + void bar(queue q) { + q.single_task([=] { + Foo = 42; + }); + } + + int main(){ return 0; } + " + KOKKOS_SYCL_DEVICE_GLOBAL_SUPPORTED) + + IF(KOKKOS_SYCL_DEVICE_GLOBAL_SUPPORTED) + COMPILER_SPECIFIC_FLAGS( + DEFAULT -fsycl-device-code-split=off -DDESUL_SYCL_DEVICE_GLOBAL_SUPPORTED + ) + ENDIF() +ENDIF() SET(CUDA_ARCH_ALREADY_SPECIFIED "") FUNCTION(CHECK_CUDA_ARCH ARCH FLAG) diff --git a/core/src/SYCL/Kokkos_SYCL_Instance.cpp b/core/src/SYCL/Kokkos_SYCL_Instance.cpp index 0e1738d6ac..ef3cd4a2b4 100644 --- a/core/src/SYCL/Kokkos_SYCL_Instance.cpp +++ b/core/src/SYCL/Kokkos_SYCL_Instance.cpp @@ -127,6 +127,14 @@ void SYCLInternal::initialize(const sycl::queue& q) { Kokkos::Impl::throw_runtime_exception(msg.str()); } +#ifdef KOKKOS_SYCL_DEVICE_GLOBAL_SUPPORTED + // Init the array for used for arbitrarily sized atomics + if (this == &singleton()) { + desul::Impl::init_lock_arrays(); + desul::Impl::init_lock_arrays_sycl(*m_queue); + } +#endif + m_team_scratch_current_size = 0; m_team_scratch_ptr = nullptr; } @@ -160,7 +168,13 @@ void SYCLInternal::finalize() { // The global_unique_token_locks array is static and should only be // deallocated once by the defualt instance - if (this == &singleton()) Impl::sycl_global_unique_token_locks(true); + if (this == &singleton()) { + Impl::sycl_global_unique_token_locks(true); +#ifdef KOKKOS_SYCL_DEVICE_GLOBAL_SUPPORTED + desul::Impl::finalize_lock_arrays(); + desul::Impl::finalize_lock_arrays_sycl(*m_queue); +#endif + } using RecordSYCL = Kokkos::Impl::SharedAllocationRecord; if (nullptr != m_scratchSpace) diff --git a/core/unit_test/CMakeLists.txt b/core/unit_test/CMakeLists.txt index 8019e5f3bb..e32bf09fd4 100644 --- 
a/core/unit_test/CMakeLists.txt +++ b/core/unit_test/CMakeLists.txt @@ -742,11 +742,6 @@ if(Kokkos_ENABLE_HIP) endif() if(Kokkos_ENABLE_SYCL) - list(REMOVE_ITEM SYCL_SOURCES1A - # FIXME_SYCL atomic_fetch_oper for large types to be implemented - ${CMAKE_CURRENT_BINARY_DIR}/sycl/TestSYCL_AtomicOperations_complexdouble.cpp - ) - list(REMOVE_ITEM SYCL_SOURCES2A ${CMAKE_CURRENT_BINARY_DIR}/sycl/TestSYCL_WorkGraph.cpp ) diff --git a/core/unit_test/TestAtomicOperations_complexdouble.hpp b/core/unit_test/TestAtomicOperations_complexdouble.hpp index 9f55aa947b..1b3f52e2cc 100644 --- a/core/unit_test/TestAtomicOperations_complexdouble.hpp +++ b/core/unit_test/TestAtomicOperations_complexdouble.hpp @@ -18,6 +18,10 @@ namespace Test { TEST(TEST_CATEGORY, atomic_operations_complexdouble) { +#if defined(KOKKOS_ENABLE_SYCL) && !defined(KOKKOS_SYCL_DEVICE_GLOBAL_SUPPORTED) + if (std::is_same_v) + GTEST_SKIP() << "skipping since device_global variables are not available"; +#endif const int start = 1; // Avoid zero for division. const int end = 11; for (int i = start; i < end; ++i) { diff --git a/core/unit_test/TestAtomics.hpp b/core/unit_test/TestAtomics.hpp index e5866bb89b..479a698b52 100644 --- a/core/unit_test/TestAtomics.hpp +++ b/core/unit_test/TestAtomics.hpp @@ -510,8 +510,11 @@ TEST(TEST_CATEGORY, atomics) { ASSERT_TRUE( (TestAtomic::Loop, TEST_EXECSPACE>(100, 3))); -// FIXME_SYCL atomics for large types to be implemented -#ifndef KOKKOS_ENABLE_SYCL +// FIXME_SYCL Replace macro by SYCL_EXT_ONEAPI_DEVICE_GLOBAL or remove +// condition alltogether when possible. 
+#if defined(KOKKOS_ENABLE_SYCL) && !defined(KOKKOS_SYCL_DEVICE_GLOBAL_SUPPORTED) + if (std::is_same_v) return; +#endif ASSERT_TRUE( (TestAtomic::Loop, TEST_EXECSPACE>(1, 1))); ASSERT_TRUE( @@ -536,7 +539,6 @@ TEST(TEST_CATEGORY, atomics) { (TestAtomic::Loop, TEST_EXECSPACE>(100, 3))); #endif #endif -#endif } } // namespace Test diff --git a/tpls/desul/include/desul/atomics/Adapt_SYCL.hpp b/tpls/desul/include/desul/atomics/Adapt_SYCL.hpp index c8449d495d..c9bd503679 100644 --- a/tpls/desul/include/desul/atomics/Adapt_SYCL.hpp +++ b/tpls/desul/include/desul/atomics/Adapt_SYCL.hpp @@ -111,6 +111,16 @@ using sycl_atomic_ref = sycl::atomic_ref; #endif +// FIXME_SYCL Use SYCL_EXT_ONEAPI_DEVICE_GLOBAL when available instead +#ifdef DESUL_SYCL_DEVICE_GLOBAL_SUPPORTED +// FIXME_SYCL The compiler forces us to use device_image_scope. Drop this when possible. +template +using sycl_device_global = sycl::ext::oneapi::experimental::device_global< + T, + decltype(sycl::ext::oneapi::experimental::properties( + sycl::ext::oneapi::experimental::device_image_scope))>; +#endif + } // namespace Impl } // namespace desul diff --git a/tpls/desul/include/desul/atomics/Compare_Exchange_SYCL.hpp b/tpls/desul/include/desul/atomics/Compare_Exchange_SYCL.hpp index 34e36bc4e4..43b4fb56f9 100644 --- a/tpls/desul/include/desul/atomics/Compare_Exchange_SYCL.hpp +++ b/tpls/desul/include/desul/atomics/Compare_Exchange_SYCL.hpp @@ -11,6 +11,7 @@ SPDX-License-Identifier: (BSD-3-Clause) #include #include +#include #include // FIXME_SYCL SYCL2020 dictates that is the header to include @@ -78,16 +79,62 @@ std::enable_if_t device_atomic_exchange(T* const dest, template std::enable_if_t<(sizeof(T) != 8) && (sizeof(T) != 4), T> device_atomic_compare_exchange( - T* const /*dest*/, T compare, T /*value*/, MemoryOrder, MemoryScope) { - assert(false); // FIXME_SYCL not implemented - return compare; + T* const dest, T compare, T value, MemoryOrder, MemoryScope scope) { + // This is a way to avoid deadlock in a 
subgroup + T return_val; + int done = 0; + auto sg = sycl::ext::oneapi::experimental::this_sub_group(); + using sycl::ext::oneapi::group_ballot; + using sycl::ext::oneapi::sub_group_mask; + sub_group_mask active = group_ballot(sg, 1); + sub_group_mask done_active = group_ballot(sg, 0); + while (active != done_active) { + if (!done) { + if (lock_address_sycl((void*)dest, scope)) { + if (std::is_same::value) + atomic_thread_fence(MemoryOrderRelease(), scope); + atomic_thread_fence(MemoryOrderAcquire(), scope); + return_val = *dest; + if (return_val == compare) { + *dest = value; + device_atomic_thread_fence(MemoryOrderRelease(), scope); + } + unlock_address_sycl((void*)dest, scope); + done = 1; + } + } + done_active = group_ballot(sg, done); + } + return return_val; } template std::enable_if_t<(sizeof(T) != 8) && (sizeof(T) != 4), T> device_atomic_exchange( - T* const /*dest*/, T value, MemoryOrder, MemoryScope) { - assert(false); // FIXME_SYCL not implemented - return value; + T* const dest, T value, MemoryOrder, MemoryScope scope) { + // This is a way to avoid deadlock in a subgroup + T return_val; + int done = 0; + auto sg = sycl::ext::oneapi::experimental::this_sub_group(); + using sycl::ext::oneapi::group_ballot; + using sycl::ext::oneapi::sub_group_mask; + sub_group_mask active = group_ballot(sg, 1); + sub_group_mask done_active = group_ballot(sg, 0); + while (active != done_active) { + if (!done) { + if (lock_address_sycl((void*)dest, scope)) { + if (std::is_same::value) + atomic_thread_fence(MemoryOrderRelease(), scope); + device_atomic_thread_fence(MemoryOrderAcquire(), scope); + return_val = *dest; + *dest = value; + device_atomic_thread_fence(MemoryOrderRelease(), scope); + unlock_address_sycl((void*)dest, scope); + done = 1; + } + } + done_active = group_ballot(sg, done); + } + return return_val; } } // namespace Impl diff --git a/tpls/desul/include/desul/atomics/Lock_Array.hpp b/tpls/desul/include/desul/atomics/Lock_Array.hpp index a5af4c48c2..92f7ac3891 
100644 --- a/tpls/desul/include/desul/atomics/Lock_Array.hpp +++ b/tpls/desul/include/desul/atomics/Lock_Array.hpp @@ -17,6 +17,9 @@ SPDX-License-Identifier: (BSD-3-Clause) #ifdef DESUL_HAVE_HIP_ATOMICS #include #endif +#ifdef DESUL_HAVE_SYCL_ATOMICS +#include +#endif namespace desul { namespace Impl { diff --git a/tpls/desul/include/desul/atomics/Lock_Array_SYCL.hpp b/tpls/desul/include/desul/atomics/Lock_Array_SYCL.hpp new file mode 100644 index 0000000000..8f42c6b37e --- /dev/null +++ b/tpls/desul/include/desul/atomics/Lock_Array_SYCL.hpp @@ -0,0 +1,147 @@ +/* +Copyright (c) 2019, Lawrence Livermore National Security, LLC +and DESUL project contributors. See the COPYRIGHT file for details. +Source: https://github.com/desul/desul + +SPDX-License-Identifier: (BSD-3-Clause) +*/ + +#ifndef DESUL_ATOMICS_LOCK_ARRAY_SYCL_HPP_ +#define DESUL_ATOMICS_LOCK_ARRAY_SYCL_HPP_ + +#include + +#include "desul/atomics/Adapt_SYCL.hpp" +#include "desul/atomics/Common.hpp" +#include "desul/atomics/Macros.hpp" + +// FIXME_SYCL +#if __has_include() +#include +#else +#include +#endif + +namespace desul { +namespace Impl { + +// FIXME_SYCL Use SYCL_EXT_ONEAPI_DEVICE_GLOBAL when available instead +#ifdef DESUL_SYCL_DEVICE_GLOBAL_SUPPORTED + +/** + * \brief This global variable in Host space is the central definition of these + * arrays. + */ +extern int32_t* SYCL_SPACE_ATOMIC_LOCKS_DEVICE_h; +extern int32_t* SYCL_SPACE_ATOMIC_LOCKS_NODE_h; + +/// \brief After this call, the lock arrays used in [un]lock_address_sycl +/// are initialized and ready to be used. +/// +/// This call is idempotent. +/// The function is templated to make it a weak symbol to deal with Kokkos/RAJA +/// snapshotted version while also linking against pure Desul +template +void init_lock_arrays_sycl(sycl::queue q); + +/// \brief After this call, the lock arrays used in [un]lock_address_sycl +/// are freed and can't be used anymore. +/// +/// This call is idempotent. 
+/// The function is templated to make it a weak symbol to deal with Kokkos/RAJA +/// snapshotted version while also linking against pure Desul +template +void finalize_lock_arrays_sycl(sycl::queue q); + +/** + * \brief This global variable in SYCL space is what kernels use to get access + * to the lock arrays. + * + * There is only one single instance of this global variable for the entire + * executable, whose definition will be in Kokkos_SYCL_Locks.cpp (and whose + * declaration here must be extern). This one instance will be initialized + * by initialize_host_sycl_lock_arrays and need not be modified afterwards. + */ +SYCL_EXTERNAL extern sycl_device_global SYCL_SPACE_ATOMIC_LOCKS_DEVICE; + +SYCL_EXTERNAL extern sycl_device_global SYCL_SPACE_ATOMIC_LOCKS_NODE; + +#define SYCL_SPACE_ATOMIC_MASK 0x1FFFF + +/// \brief Acquire a lock for the address +/// +/// This function tries to acquire the lock for the hash value derived +/// from the provided ptr. If the lock is successfully acquired the +/// function returns true. Otherwise it returns false. +inline bool lock_address_sycl(void* ptr, MemoryScopeDevice) { + size_t offset = size_t(ptr); + offset = offset >> 2; + offset = offset & SYCL_SPACE_ATOMIC_MASK; + sycl::atomic_ref + lock_device_ref(SYCL_SPACE_ATOMIC_LOCKS_DEVICE[offset]); + return (0 == lock_device_ref.exchange(1)); +} + +inline bool lock_address_sycl(void* ptr, MemoryScopeNode) { + size_t offset = size_t(ptr); + offset = offset >> 2; + offset = offset & SYCL_SPACE_ATOMIC_MASK; + sycl::atomic_ref + lock_node_ref(SYCL_SPACE_ATOMIC_LOCKS_NODE[offset]); + return (0 == lock_node_ref.exchange(1)); +} + +/** + * \brief Release lock for the address + * + * This function releases the lock for the hash value derived from the provided + * ptr. This function should only be called after previously successfully + * acquiring a lock with lock_address. 
+ */ +inline void unlock_address_sycl(void* ptr, MemoryScopeDevice) { + size_t offset = size_t(ptr); + offset = offset >> 2; + offset = offset & SYCL_SPACE_ATOMIC_MASK; + sycl::atomic_ref + lock_device_ref(SYCL_SPACE_ATOMIC_LOCKS_DEVICE[offset]); + lock_device_ref.exchange(0); +} + +inline void unlock_address_sycl(void* ptr, MemoryScopeNode) { + size_t offset = size_t(ptr); + offset = offset >> 2; + offset = offset & SYCL_SPACE_ATOMIC_MASK; + sycl::atomic_ref + lock_node_ref(SYCL_SPACE_ATOMIC_LOCKS_NODE[offset]); + lock_node_ref.exchange(0); +} +#else +inline bool lock_address_sycl(void*, MemoryScopeDevice) { + assert(false); + return true; +} + +inline bool lock_address_sycl(void*, MemoryScopeNode) { + assert(false); + return true; +} + +inline void unlock_address_sycl(void*, MemoryScopeDevice) { assert(false); } + +inline void unlock_address_sycl(void*, MemoryScopeNode) { assert(false); } +#endif +} // namespace Impl +} // namespace desul +#endif diff --git a/tpls/desul/include/desul/atomics/Lock_Based_Fetch_Op.hpp b/tpls/desul/include/desul/atomics/Lock_Based_Fetch_Op.hpp index e7c36673e2..cb97f4a906 100644 --- a/tpls/desul/include/desul/atomics/Lock_Based_Fetch_Op.hpp +++ b/tpls/desul/include/desul/atomics/Lock_Based_Fetch_Op.hpp @@ -18,7 +18,7 @@ SPDX-License-Identifier: (BSD-3-Clause) #include #endif #ifdef DESUL_HAVE_SYCL_ATOMICS -#include +#include #endif #include diff --git a/tpls/desul/include/desul/atomics/Lock_Based_Fetch_Op_SYCL.hpp b/tpls/desul/include/desul/atomics/Lock_Based_Fetch_Op_SYCL.hpp new file mode 100644 index 0000000000..8774a6e96e --- /dev/null +++ b/tpls/desul/include/desul/atomics/Lock_Based_Fetch_Op_SYCL.hpp @@ -0,0 +1,94 @@ +/* +Copyright (c) 2019, Lawrence Livermore National Security, LLC +and DESUL project contributors. See the COPYRIGHT file for details. 
+Source: https://github.com/desul/desul + +SPDX-License-Identifier: (BSD-3-Clause) +*/ + +#ifndef DESUL_ATOMICS_LOCK_BASED_FETCH_OP_SYCL_HPP_ +#define DESUL_ATOMICS_LOCK_BASED_FETCH_OP_SYCL_HPP_ + +#include +#include +#include +#include + +namespace desul { +namespace Impl { + +template = 0> +T device_atomic_fetch_oper(const Oper& op, + T* const dest, + dont_deduce_this_parameter_t val, + MemoryOrder /*order*/, + MemoryScope scope) { + // This is a way to avoid deadlock in a subgroup + T return_val; + int done = 0; + auto sg = sycl::ext::oneapi::experimental::this_sub_group(); + using sycl::ext::oneapi::group_ballot; + using sycl::ext::oneapi::sub_group_mask; + sub_group_mask active = group_ballot(sg, 1); + sub_group_mask done_active = group_ballot(sg, 0); + while (active != done_active) { + if (!done) { + if (lock_address_sycl((void*)dest, scope)) { + device_atomic_thread_fence(MemoryOrderAcquire(), scope); + return_val = *dest; + *dest = op.apply(return_val, val); + device_atomic_thread_fence(MemoryOrderRelease(), scope); + unlock_address_sycl((void*)dest, scope); + done = 1; + } + } + done_active = group_ballot(sg, done); + } + return return_val; +} + +template = 0> +T device_atomic_oper_fetch(const Oper& op, + T* const dest, + dont_deduce_this_parameter_t val, + MemoryOrder /*order*/, + MemoryScope scope) { + // This is a way to avoid deadlock in a subgroup + T return_val; + int done = 0; + auto sg = sycl::ext::oneapi::experimental::this_sub_group(); + using sycl::ext::oneapi::group_ballot; + using sycl::ext::oneapi::sub_group_mask; + sub_group_mask active = group_ballot(sg, 1); + sub_group_mask done_active = group_ballot(sg, 0); + while (active != done_active) { + if (!done) { + if (lock_address_sycl((void*)dest, scope)) { + device_atomic_thread_fence(MemoryOrderAcquire(), scope); + return_val = op.apply(*dest, val); + *dest = return_val; + device_atomic_thread_fence(MemoryOrderRelease(), scope); + unlock_address_sycl((void*)dest, scope); + done = 1; + } + } + 
done_active = group_ballot(sg, done); + } + return return_val; +} +} // namespace Impl +} // namespace desul + +#endif diff --git a/tpls/desul/src/Lock_Array_SYCL.cpp b/tpls/desul/src/Lock_Array_SYCL.cpp new file mode 100644 index 0000000000..6bc9a890a8 --- /dev/null +++ b/tpls/desul/src/Lock_Array_SYCL.cpp @@ -0,0 +1,78 @@ +/* +Copyright (c) 2019, Lawrence Livermore National Security, LLC +and DESUL project contributors. See the COPYRIGHT file for details. +Source: https://github.com/desul/desul + +SPDX-License-Identifier: (BSD-3-Clause) +*/ + +// FIXME_SYCL Use SYCL_EXT_ONEAPI_DEVICE_GLOBAL when available instead +#ifdef DESUL_SYCL_DEVICE_GLOBAL_SUPPORTED + +#include +#include + +namespace desul { +namespace Impl { +SYCL_EXTERNAL +sycl_device_global SYCL_SPACE_ATOMIC_LOCKS_DEVICE; +SYCL_EXTERNAL +sycl_device_global SYCL_SPACE_ATOMIC_LOCKS_NODE; +} // namespace Impl +} // namespace desul + +namespace desul { +namespace Impl { + +int32_t* SYCL_SPACE_ATOMIC_LOCKS_DEVICE_h = nullptr; +int32_t* SYCL_SPACE_ATOMIC_LOCKS_NODE_h = nullptr; + +template +void init_lock_arrays_sycl(sycl::queue q) { + if (SYCL_SPACE_ATOMIC_LOCKS_DEVICE_h != nullptr) return; + + SYCL_SPACE_ATOMIC_LOCKS_DEVICE_h = + sycl::malloc_device(SYCL_SPACE_ATOMIC_MASK + 1, q); + SYCL_SPACE_ATOMIC_LOCKS_NODE_h = + sycl::malloc_host(SYCL_SPACE_ATOMIC_MASK + 1, q); + + // FIXME_SYCL Once supported, the following should be replaced by + // q.memcpy(SYCL_SPACE_ATOMIC_LOCKS_DEVICE, + // &SYCL_SPACE_ATOMIC_LOCKS_DEVICE_h, + // sizeof(int32_t*)); + // q.memcpy(SYCL_SPACE_ATOMIC_LOCKS_NODE, + // &SYCL_SPACE_ATOMIC_LOCKS_NODE_h, + // sizeof(int32_t*)); + auto device_ptr = SYCL_SPACE_ATOMIC_LOCKS_DEVICE_h; + auto node_ptr = SYCL_SPACE_ATOMIC_LOCKS_NODE_h; + q.single_task([=] { + SYCL_SPACE_ATOMIC_LOCKS_DEVICE.get() = device_ptr; + SYCL_SPACE_ATOMIC_LOCKS_NODE.get() = node_ptr; + }); + + q.memset(SYCL_SPACE_ATOMIC_LOCKS_DEVICE_h, + 0, + sizeof(int32_t) * (SYCL_SPACE_ATOMIC_MASK + 1)); + 
q.memset(SYCL_SPACE_ATOMIC_LOCKS_NODE_h, + 0, + sizeof(int32_t) * (SYCL_SPACE_ATOMIC_MASK + 1)); + + q.wait_and_throw(); +} + +template +void finalize_lock_arrays_sycl(sycl::queue q) { + if (SYCL_SPACE_ATOMIC_LOCKS_DEVICE_h == nullptr) return; + + sycl::free(SYCL_SPACE_ATOMIC_LOCKS_DEVICE_h, q); + sycl::free(SYCL_SPACE_ATOMIC_LOCKS_NODE_h, q); + SYCL_SPACE_ATOMIC_LOCKS_DEVICE_h = nullptr; + SYCL_SPACE_ATOMIC_LOCKS_NODE_h = nullptr; +} + +template void init_lock_arrays_sycl(sycl::queue); +template void finalize_lock_arrays_sycl(sycl::queue); + +} // namespace Impl +} // namespace desul +#endif From 53cc29764415a73732e66a730501f01b3756cd2a Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Mon, 23 Jan 2023 17:34:23 -0500 Subject: [PATCH 080/496] Improve comments in TestTeam.hpp --- core/unit_test/TestTeam.hpp | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/core/unit_test/TestTeam.hpp b/core/unit_test/TestTeam.hpp index fc01e9caab..3222ac8d91 100644 --- a/core/unit_test/TestTeam.hpp +++ b/core/unit_test/TestTeam.hpp @@ -1615,10 +1615,9 @@ struct TestScratchAlignment { flag() = 1; // Now request aligned memory such that the allocation after - // for scratch_ptr2 would be unaligned if it doesn't pad - // correct. - // Depending on whether scratch_ptr3 is 4 or 8 byte aligned - // we need to request different amount of memory. + // scratch_ptr2 would be unaligned if it doesn't pad correctly. + // Depending on scratch_ptr3 being 4 or 8 byte aligned + // we need to request a different amount of memory. if ((scratch_ptr3 + 12) % 8 == 4) scratch_ptr1 = reinterpret_cast( team.team_shmem().get_shmem_aligned(24, 4)); @@ -1631,16 +1630,14 @@ struct TestScratchAlignment { scratch_ptr3 = reinterpret_cast( team.team_shmem().get_shmem_aligned(8, 4)); - // note the difference between scratch_ptr2 and scratch_ptr1 - // is 4 bytes larger than what we requested in either of the - // two cases. 
+ // The difference between scratch_ptr2 and scratch_ptr1 should be 4 + // bytes larger than what we requested in either case. if (((scratch_ptr2 - scratch_ptr1) != 28) && ((scratch_ptr2 - scratch_ptr1) != 16)) flag() = 1; - // check that there wasn't unnneccessary padding happening - // i.e. scratch_ptr2 was allocated with a 32 byte request - // and since scratch_ptr3 is then already aligned it difference - // should match that + // Check that there wasn't unneccessary padding happening. Since + // scratch_ptr2 was allocated with a 32 byte request and scratch_ptr3 + // is then already aligned, its difference should match 32 bytes. if ((scratch_ptr3 - scratch_ptr2) != 32) flag() = 1; // check actually alignment of ptrs is as requested if (((scratch_ptr1 % 4) != 0) || ((scratch_ptr2 % 8) != 0) || From aa0f81e358eb9025fd04a55de552a1babb28d38b Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Tue, 24 Jan 2023 09:01:23 -0500 Subject: [PATCH 081/496] Replace HIP_LOCK_ARRAYS macros by functions (#5770) * Replace HIP_LOCK_ARRAYS macros by functions * Use HIP_SYMBOL again * Desul indentation * Remove fence --- core/src/HIP/Kokkos_HIP_Instance.cpp | 8 --- core/src/HIP/Kokkos_HIP_KernelLaunch.hpp | 2 +- core/src/HIP/Kokkos_HIP_Locks.cpp | 8 ++- core/src/HIP/Kokkos_HIP_Locks.hpp | 54 ++++++++++--------- .../include/desul/atomics/Lock_Array.hpp | 2 +- .../include/desul/atomics/Lock_Array_HIP.hpp | 49 +++++++++-------- tpls/desul/src/Lock_Array_HIP.cpp | 4 +- 7 files changed, 62 insertions(+), 65 deletions(-) diff --git a/core/src/HIP/Kokkos_HIP_Instance.cpp b/core/src/HIP/Kokkos_HIP_Instance.cpp index 28c9c1cb6a..c66cee0c28 100644 --- a/core/src/HIP/Kokkos_HIP_Instance.cpp +++ b/core/src/HIP/Kokkos_HIP_Instance.cpp @@ -397,14 +397,6 @@ Kokkos::HIP::size_type *hip_internal_scratch_flags(const HIP &instance, namespace Kokkos { namespace Impl { -void hip_device_synchronize(const std::string &name) { - Kokkos::Tools::Experimental::Impl::profile_fence_event( - name, - 
Kokkos::Tools::Experimental::SpecialSynchronizationCases:: - GlobalDeviceSynchronization, - [&]() { KOKKOS_IMPL_HIP_SAFE_CALL(hipDeviceSynchronize()); }); -} - void hip_internal_error_throw(hipError_t e, const char *name, const char *file, const int line) { std::ostringstream out; diff --git a/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp b/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp index e2fe5a6d83..11975fc25b 100644 --- a/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp +++ b/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp @@ -462,7 +462,7 @@ struct HIPParallelLaunch< "HIPParallelLaunch FAILED: shared memory request is too large"); } - KOKKOS_ENSURE_HIP_LOCK_ARRAYS_ON_DEVICE(); + ensure_hip_lock_arrays_on_device(); // Invoke the driver function on the device base_t::invoke_kernel(driver, grid, block, shmem, hip_instance); diff --git a/core/src/HIP/Kokkos_HIP_Locks.cpp b/core/src/HIP/Kokkos_HIP_Locks.cpp index 76d3f6f5c8..62058b4161 100644 --- a/core/src/HIP/Kokkos_HIP_Locks.cpp +++ b/core/src/HIP/Kokkos_HIP_Locks.cpp @@ -55,8 +55,7 @@ HIPLockArrays g_host_hip_lock_arrays = {nullptr, 0}; void initialize_host_hip_lock_arrays() { #ifdef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS desul::Impl::init_lock_arrays(); - - DESUL_ENSURE_HIP_LOCK_ARRAYS_ON_DEVICE(); + desul::ensure_hip_lock_arrays_on_device(); #endif if (g_host_hip_lock_arrays.atomic != nullptr) return; @@ -65,8 +64,7 @@ void initialize_host_hip_lock_arrays() { sizeof(std::int32_t) * (KOKKOS_IMPL_HIP_SPACE_ATOMIC_MASK + 1))); g_host_hip_lock_arrays.n = HIPInternal::concurrency(); - - KOKKOS_COPY_HIP_LOCK_ARRAYS_TO_DEVICE(); + copy_hip_lock_arrays_to_device(); init_lock_array_kernel_atomic<<< (KOKKOS_IMPL_HIP_SPACE_ATOMIC_MASK + 1 + 255) / 256, 256, 0, nullptr>>>(); } @@ -81,7 +79,7 @@ void finalize_host_hip_lock_arrays() { g_host_hip_lock_arrays.atomic = nullptr; g_host_hip_lock_arrays.n = 0; #ifdef KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE - KOKKOS_COPY_HIP_LOCK_ARRAYS_TO_DEVICE(); + copy_hip_lock_arrays_to_device(); #endif } diff --git 
a/core/src/HIP/Kokkos_HIP_Locks.hpp b/core/src/HIP/Kokkos_HIP_Locks.hpp index 0d46d74074..e0bce0ccfb 100644 --- a/core/src/HIP/Kokkos_HIP_Locks.hpp +++ b/core/src/HIP/Kokkos_HIP_Locks.hpp @@ -60,7 +60,7 @@ void finalize_host_hip_lock_arrays(); /// instance of this global variable for the entire executable, /// whose definition will be in Kokkos_HIP_Locks.cpp (and whose declaration /// here must then be extern). -/// This one instance will be initialized by initialize_host_HIP_lock_arrays +/// This one instance will be initialized by initialize_host_hip_lock_arrays /// and need not be modified afterwards. /// /// When relocatable device code is disabled, an instance of this variable @@ -69,7 +69,7 @@ void finalize_host_hip_lock_arrays(); /// instances in other translation units, we must update this HIP global /// variable based on the Host global variable prior to running any kernels /// that will use it. -/// That is the purpose of the KOKKOS_ENSURE_HIP_LOCK_ARRAYS_ON_DEVICE macro. +/// That is the purpose of the ensure_hip_lock_arrays_on_device function. __device__ #ifdef KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE __constant__ extern @@ -94,7 +94,7 @@ __device__ inline bool lock_address_hip_space(void* ptr) { /// /// This function releases the lock for the hash value derived /// from the provided ptr. This function should only be called -/// after previously successfully aquiring a lock with +/// after previously successfully acquiring a lock with /// lock_address. __device__ inline void unlock_address_hip_space(void* ptr) { auto offset = reinterpret_cast(ptr); @@ -112,45 +112,49 @@ namespace Impl { namespace { static int lock_array_copied = 0; } // namespace -} // namespace Impl -} // namespace Kokkos -/* Dan Ibanez: it is critical that this code be a macro, so that it will - capture the right address for g_device_hip_lock_arrays! - putting this in an inline function will NOT do the right thing! 
*/ -#define KOKKOS_COPY_HIP_LOCK_ARRAYS_TO_DEVICE() \ - { \ - if (::Kokkos::Impl::lock_array_copied == 0) { \ - KOKKOS_IMPL_HIP_SAFE_CALL(hipMemcpyToSymbol( \ - HIP_SYMBOL(::Kokkos::Impl::g_device_hip_lock_arrays), \ - &::Kokkos::Impl::g_host_hip_lock_arrays, \ - sizeof(::Kokkos::Impl::HIPLockArrays))); \ - } \ - ::Kokkos::Impl::lock_array_copied = 1; \ +#ifdef KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE +inline +#else +inline static +#endif + void + copy_hip_lock_arrays_to_device() { + if (lock_array_copied == 0) { + KOKKOS_IMPL_HIP_SAFE_CALL( + hipMemcpyToSymbol(HIP_SYMBOL(g_device_hip_lock_arrays), + &g_host_hip_lock_arrays, sizeof(HIPLockArrays))); } + lock_array_copied = 1; +} #ifndef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS #ifdef KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE -#define KOKKOS_ENSURE_HIP_LOCK_ARRAYS_ON_DEVICE() +inline void ensure_hip_lock_arrays_on_device() {} #else -#define KOKKOS_ENSURE_HIP_LOCK_ARRAYS_ON_DEVICE() \ - KOKKOS_COPY_HIP_LOCK_ARRAYS_TO_DEVICE() +inline static void ensure_hip_lock_arrays_on_device() { + copy_hip_lock_arrays_to_device(); +} #endif #else #ifdef KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE -#define KOKKOS_ENSURE_HIP_LOCK_ARRAYS_ON_DEVICE() +inline void ensure_hip_lock_arrays_on_device() {} #else -// Still Need COPY_CUDA_LOCK_ARRAYS for team scratch etc. -#define KOKKOS_ENSURE_HIP_LOCK_ARRAYS_ON_DEVICE() \ - KOKKOS_COPY_HIP_LOCK_ARRAYS_TO_DEVICE() \ - DESUL_ENSURE_HIP_LOCK_ARRAYS_ON_DEVICE() +// Still Need copy_hip_lock_arrays for team scratch etc. 
+inline static void ensure_hip_lock_arrays_on_device() { + copy_hip_lock_arrays_to_device(); + desul::ensure_hip_lock_arrays_on_device(); +} #endif #endif /* defined( KOKKOS_ENABLE_IMPL_DESUL_ATOMICS ) */ +} // namespace Impl +} // namespace Kokkos + #endif /* defined( __HIPCC__ ) */ #endif /* #ifndef KOKKOS_HIP_LOCKS_HPP */ diff --git a/tpls/desul/include/desul/atomics/Lock_Array.hpp b/tpls/desul/include/desul/atomics/Lock_Array.hpp index a5af4c48c2..94ec4187a9 100644 --- a/tpls/desul/include/desul/atomics/Lock_Array.hpp +++ b/tpls/desul/include/desul/atomics/Lock_Array.hpp @@ -67,7 +67,7 @@ inline void ensure_lock_arrays_on_device() { #endif #ifdef DESUL_HAVE_HIP_ATOMICS - DESUL_ENSURE_HIP_LOCK_ARRAYS_ON_DEVICE(); + ensure_hip_lock_arrays_on_device(); #endif } diff --git a/tpls/desul/include/desul/atomics/Lock_Array_HIP.hpp b/tpls/desul/include/desul/atomics/Lock_Array_HIP.hpp index 1ce7673225..47c0e8c680 100644 --- a/tpls/desul/include/desul/atomics/Lock_Array_HIP.hpp +++ b/tpls/desul/include/desul/atomics/Lock_Array_HIP.hpp @@ -35,7 +35,7 @@ extern int32_t* HIP_SPACE_ATOMIC_LOCKS_NODE_h; template void init_lock_arrays_hip(); -/// \brief After this call, the g_host_cuda_lock_arrays variable has +/// \brief After this call, the g_host_hip_lock_arrays variable has /// all null pointers, and all array memory has been freed. /// /// This call is idempotent. @@ -64,10 +64,10 @@ namespace Impl { * be created in every translation unit that sees this header file (we make this * clear by marking it static, meaning no other translation unit can link to * it). Since the Kokkos_HIP_Locks.cpp translation unit cannot initialize the - * instances in other translation units, we must update this CUDA global + * instances in other translation units, we must update this HIP global * variable based on the Host global variable prior to running any kernels that * will use it. That is the purpose of the - * KOKKOS_ENSURE_HIP_LOCK_ARRAYS_ON_DEVICE macro. 
+ * ensure_hip_lock_arrays_on_device function. */ __device__ #ifdef DESUL_HIP_RDC @@ -132,31 +132,34 @@ namespace Impl { namespace { static int lock_array_copied = 0; } // namespace -} // namespace Impl -} // namespace desul -/* It is critical that this code be a macro, so that it will - capture the right address for g_device_hip_lock_arrays! - putting this in an inline function will NOT do the right thing! */ -#define DESUL_IMPL_COPY_HIP_LOCK_ARRAYS_TO_DEVICE() \ - { \ - if (::desul::Impl::lock_array_copied == 0) { \ - (void)hipMemcpyToSymbol( \ - HIP_SYMBOL(::desul::Impl::HIP_SPACE_ATOMIC_LOCKS_DEVICE), \ - &::desul::Impl::HIP_SPACE_ATOMIC_LOCKS_DEVICE_h, \ - sizeof(int32_t*)); \ - (void)hipMemcpyToSymbol(HIP_SYMBOL(::desul::Impl::HIP_SPACE_ATOMIC_LOCKS_NODE), \ - &::desul::Impl::HIP_SPACE_ATOMIC_LOCKS_NODE_h, \ - sizeof(int32_t*)); \ - } \ - ::desul::Impl::lock_array_copied = 1; \ +#ifdef DESUL_HIP_RDC +inline +#else +inline static +#endif + void + copy_hip_lock_arrays_to_device() { + if (lock_array_copied == 0) { + (void)hipMemcpyToSymbol(HIP_SYMBOL(HIP_SPACE_ATOMIC_LOCKS_DEVICE), + &HIP_SPACE_ATOMIC_LOCKS_DEVICE_h, + sizeof(int32_t*)); + (void)hipMemcpyToSymbol(HIP_SYMBOL(HIP_SPACE_ATOMIC_LOCKS_NODE), + &HIP_SPACE_ATOMIC_LOCKS_NODE_h, + sizeof(int32_t*)); } + lock_array_copied = 1; +} +} // namespace Impl #if defined(DESUL_HIP_RDC) || (!defined(__HIPCC__)) -#define DESUL_ENSURE_HIP_LOCK_ARRAYS_ON_DEVICE() +inline void ensure_hip_lock_arrays_on_device() {} #else -#define DESUL_ENSURE_HIP_LOCK_ARRAYS_ON_DEVICE() \ - DESUL_IMPL_COPY_HIP_LOCK_ARRAYS_TO_DEVICE() +static inline void ensure_hip_lock_arrays_on_device() { + Impl::copy_hip_lock_arrays_to_device(); +} #endif +} // namespace desul + #endif diff --git a/tpls/desul/src/Lock_Array_HIP.cpp b/tpls/desul/src/Lock_Array_HIP.cpp index 5ccc6f7d54..986f5475ae 100644 --- a/tpls/desul/src/Lock_Array_HIP.cpp +++ b/tpls/desul/src/Lock_Array_HIP.cpp @@ -70,7 +70,7 @@ void init_lock_arrays_hip() { "init_lock_arrays_hip: 
hipMallocHost host locks"); auto error_sync1 = hipDeviceSynchronize(); - DESUL_IMPL_COPY_HIP_LOCK_ARRAYS_TO_DEVICE(); + copy_hip_lock_arrays_to_device(); check_error_and_throw_hip(error_sync1, "init_lock_arrays_hip: post malloc"); init_lock_arrays_hip_kernel<<<(HIP_SPACE_ATOMIC_MASK + 1 + 255) / 256, 256>>>(); @@ -89,7 +89,7 @@ void finalize_lock_arrays_hip() { HIP_SPACE_ATOMIC_LOCKS_DEVICE_h = nullptr; HIP_SPACE_ATOMIC_LOCKS_NODE_h = nullptr; #ifdef DESUL_HIP_RDC - DESUL_IMPL_COPY_HIP_LOCK_ARRAYS_TO_DEVICE(); + copy_hip_lock_arrays_to_device(); #endif } From 5212d90b54cf50ceeb219131bc784845990a7b3e Mon Sep 17 00:00:00 2001 From: Dan Ibanez Date: Mon, 23 Jan 2023 21:26:32 -0700 Subject: [PATCH 082/496] Fix a bug in AVX512 simd_mask::operator[] this would cause a compiler error when instantiated because of const-correctness --- simd/src/Kokkos_SIMD_AVX512.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/simd/src/Kokkos_SIMD_AVX512.hpp b/simd/src/Kokkos_SIMD_AVX512.hpp index 1df0730ac4..dab32a4a68 100644 --- a/simd/src/Kokkos_SIMD_AVX512.hpp +++ b/simd/src/Kokkos_SIMD_AVX512.hpp @@ -114,7 +114,7 @@ class simd_mask> { } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION value_type operator[](std::size_t i) const { - return static_cast(reference(m_value, int(i))); + return reference(const_cast(this)->m_value, int(i)); } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd_mask operator||(simd_mask const& other) const { From cb67caf6ae716f60647a7ea6e4e9fd2b7370f543 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Tue, 24 Jan 2023 12:28:31 -0500 Subject: [PATCH 083/496] Warn at configuration time if attempting to disable desul atomics and force using it (#5801) * Error out at configuration time if attempting to disable desul atomics * Fixup set Kokkos_ENABLE_IMPL_DESUL_ATOMICS too * Fixup makefile comment need to be on a separate line --- Makefile.kokkos | 12 ++++++++++-- cmake/KokkosCore_config.h.in | 2 +- cmake/kokkos_enable_options.cmake | 10 ++++++---- 3 files changed, 
17 insertions(+), 7 deletions(-) diff --git a/Makefile.kokkos b/Makefile.kokkos index 37feada262..2a6c832867 100644 --- a/Makefile.kokkos +++ b/Makefile.kokkos @@ -23,7 +23,7 @@ KOKKOS_DEBUG ?= "no" KOKKOS_USE_TPLS ?= "" # Options: c++17,c++1z,c++20,c++2a,c++23,c++2b KOKKOS_CXX_STANDARD ?= "c++17" -# Options: aggressive_vectorization,disable_profiling,enable_large_mem_tests,disable_complex_align,disable_deprecated_code,enable_deprecation_warnings,disable_desul_atomics +# Options: aggressive_vectorization,disable_profiling,enable_large_mem_tests,disable_complex_align,disable_deprecated_code,enable_deprecation_warnings KOKKOS_OPTIONS ?= "" KOKKOS_CMAKE ?= "no" KOKKOS_TRIBITS ?= "no" @@ -86,6 +86,7 @@ KOKKOS_INTERNAL_CUDA_USE_CONSTEXPR := $(call kokkos_has_string,$(KOKKOS_CUDA_OPT KOKKOS_INTERNAL_HPX_ENABLE_ASYNC_DISPATCH := $(call kokkos_has_string,$(KOKKOS_HPX_OPTIONS),enable_async_dispatch) # deprecated KOKKOS_INTERNAL_ENABLE_DESUL_ATOMICS := $(call kokkos_has_string,$(KOKKOS_OPTIONS),enable_desul_atomics) +# deprecated KOKKOS_INTERNAL_DISABLE_DESUL_ATOMICS := $(call kokkos_has_string,$(KOKKOS_OPTIONS),disable_desul_atomics) KOKKOS_INTERNAL_DISABLE_BUNDLED_MDSPAN := $(call kokkos_has_string,$(KOKKOS_OPTIONS),impl_disable_bundled_mdspan) KOKKOS_INTERNAL_DISABLE_DEPRECATED_CODE := $(call kokkos_has_string,$(KOKKOS_OPTIONS),disable_deprecated_code) @@ -1172,8 +1173,15 @@ ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1) KOKKOS_LDFLAGS+=$(KOKKOS_INTERNAL_INTEL_ARCH_FLAG) endif +ifeq ($(KOKKOS_INTERNAL_DISABLE_DESUL_ATOMICS), 1) + $(warning disable_desul_atomics option has been removed. Desul atomics cannot be disabled.) + KOKKOS_INTERNAL_DISABLE_DESUL_ATOMICS := 0 +endif +ifeq ($(KOKKOS_INTERNAL_ENABLE_DESUL_ATOMICS), 1) + $(warning enable_desul_atomics option has been removed. Desul atomics are always enabled.) 
+endif ifeq ($(KOKKOS_INTERNAL_DISABLE_DESUL_ATOMICS), 0) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_IMPL_DESUL_ATOMICS") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_IMPL_DESUL_ATOMICS // deprecated") KOKKOS_CPPFLAGS+=-I$(KOKKOS_PATH)/tpls/desul/include else ifeq ($(KOKKOS_INTERNAL_ENABLE_DESUL_ATOMICS), 1) $(error Contradictory Desul atomics options: KOKKOS_OPTIONS=$(KOKKOS_OPTIONS) ) diff --git a/cmake/KokkosCore_config.h.in b/cmake/KokkosCore_config.h.in index 40b341d989..beddd4c4a1 100644 --- a/cmake/KokkosCore_config.h.in +++ b/cmake/KokkosCore_config.h.in @@ -50,7 +50,7 @@ #cmakedefine KOKKOS_ENABLE_DEPRECATION_WARNINGS #cmakedefine KOKKOS_ENABLE_LARGE_MEM_TESTS #cmakedefine KOKKOS_ENABLE_COMPLEX_ALIGN -#cmakedefine KOKKOS_ENABLE_IMPL_DESUL_ATOMICS +#cmakedefine KOKKOS_ENABLE_IMPL_DESUL_ATOMICS // deprecated #cmakedefine KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION // deprecated #cmakedefine KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION #cmakedefine KOKKOS_ENABLE_IMPL_MDSPAN diff --git a/cmake/kokkos_enable_options.cmake b/cmake/kokkos_enable_options.cmake index f9f1bc5a8b..5897ae73de 100644 --- a/cmake/kokkos_enable_options.cmake +++ b/cmake/kokkos_enable_options.cmake @@ -26,9 +26,6 @@ KOKKOS_CFG_DEPENDS(OPTIONS COMPILER_ID) # Put a check in just in case people are using this option KOKKOS_DEPRECATED_LIST(OPTIONS ENABLE) -# Set the Default for Desul Atomics usage. 
-set(_DESUL_ATOMICS_DEFAULT ON) - KOKKOS_ENABLE_OPTION(CUDA_RELOCATABLE_DEVICE_CODE OFF "Whether to enable relocatable device code (RDC) for CUDA") KOKKOS_ENABLE_OPTION(CUDA_UVM OFF "Whether to use unified memory (UM) for CUDA by default") KOKKOS_ENABLE_OPTION(CUDA_LDG_INTRINSIC OFF "Whether to use CUDA LDG intrinsics") @@ -62,7 +59,6 @@ KOKKOS_ENABLE_OPTION(COMPILE_AS_CMAKE_LANGUAGE OFF "Whether to use native cmake KOKKOS_ENABLE_OPTION(HIP_MULTIPLE_KERNEL_INSTANTIATIONS OFF "Whether multiple kernels are instantiated at compile time - improve performance but increase compile time") # This option will go away eventually, but allows fallback to old implementation when needed. -KOKKOS_ENABLE_OPTION(IMPL_DESUL_ATOMICS ON "Whether to use desul based atomics - option only during beta") KOKKOS_ENABLE_OPTION(DESUL_ATOMICS_EXTERNAL OFF "Whether to use an external desul installation") KOKKOS_ENABLE_OPTION(IMPL_MDSPAN OFF "Whether to enable experimental mdspan support") @@ -166,3 +162,9 @@ IF(Kokkos_ENABLE_CUDA_LDG_INTRINSIC) MESSAGE(FATAL_ERROR "Kokkos_ENABLE_CUDA_LDG_INTRINSIC has been removed. LDG intrinsics are always enabled.") ENDIF() ENDIF() + +IF(DEFINED Kokkos_ENABLE_IMPL_DESUL_ATOMICS) + MESSAGE(WARNING "Kokkos_ENABLE_IMPL_DESUL_ATOMICS option has been removed. 
Desul atomics cannot be disabled.") +ENDIF() +set(KOKKOS_ENABLE_IMPL_DESUL_ATOMICS ON) +set(Kokkos_ENABLE_IMPL_DESUL_ATOMICS ON) From 05cb3f5114897d622c9adaba7e0a17432b8e113d Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Tue, 24 Jan 2023 12:51:26 -0500 Subject: [PATCH 084/496] Purge logic around desul atomics being enabled at configuration time --- Makefile.kokkos | 22 +++++----------------- cmake/kokkos_enable_options.cmake | 1 - cmake/kokkos_tpls.cmake | 2 +- core/src/CMakeLists.txt | 10 +++++----- 4 files changed, 11 insertions(+), 24 deletions(-) diff --git a/Makefile.kokkos b/Makefile.kokkos index 2a6c832867..0be0dc564a 100644 --- a/Makefile.kokkos +++ b/Makefile.kokkos @@ -1097,10 +1097,8 @@ ifeq ($(KOKKOS_INTERNAL_USE_HIP), 1) KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/core/src/HIP/*.cpp) + KOKKOS_SRC += $(KOKKOS_PATH)/tpls/desul/src/Lock_Array_HIP.cpp KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/HIP/*.hpp) - ifeq ($(KOKKOS_INTERNAL_DISABLE_DESUL_ATOMICS), 0) - KOKKOS_SRC += $(KOKKOS_PATH)/tpls/desul/src/Lock_Array_HIP.cpp - endif KOKKOS_CXXFLAGS+=$(KOKKOS_INTERNAL_HIP_ARCH_FLAG) KOKKOS_LDFLAGS+=$(KOKKOS_INTERNAL_HIP_ARCH_FLAG) @@ -1180,12 +1178,8 @@ endif ifeq ($(KOKKOS_INTERNAL_ENABLE_DESUL_ATOMICS), 1) $(warning enable_desul_atomics option has been removed. Desul atomics are always enabled.) 
endif -ifeq ($(KOKKOS_INTERNAL_DISABLE_DESUL_ATOMICS), 0) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_IMPL_DESUL_ATOMICS // deprecated") - KOKKOS_CPPFLAGS+=-I$(KOKKOS_PATH)/tpls/desul/include -else ifeq ($(KOKKOS_INTERNAL_ENABLE_DESUL_ATOMICS), 1) - $(error Contradictory Desul atomics options: KOKKOS_OPTIONS=$(KOKKOS_OPTIONS) ) -endif +tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_IMPL_DESUL_ATOMICS // deprecated") +KOKKOS_CPPFLAGS+=-I$(KOKKOS_PATH)/tpls/desul/include ifeq ($(KOKKOS_INTERNAL_DISABLE_BUNDLED_MDSPAN), 0) KOKKOS_CPPFLAGS+=-I$(KOKKOS_PATH)/tpls/mdspan/include @@ -1270,9 +1264,7 @@ KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/containers/src/impl/*.cpp) ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1) KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/core/src/Cuda/*.cpp) - ifeq ($(KOKKOS_INTERNAL_DISABLE_DESUL_ATOMICS), 0) - KOKKOS_SRC += $(KOKKOS_PATH)/tpls/desul/src/Lock_Array_CUDA.cpp - endif + KOKKOS_SRC += $(KOKKOS_PATH)/tpls/desul/src/Lock_Array_CUDA.cpp KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/Cuda/*.hpp) ifneq ($(CUDA_PATH),) KOKKOS_CPPLAGS += -I$(CUDA_PATH)/include @@ -1388,11 +1380,7 @@ KOKKOS_LIBS := -lkokkos ${KOKKOS_LIBS} # Generating the header DESUL_INTERNAL_CONFIG_TMP=Desul_Config.tmp -ifeq ($(KOKKOS_INTERNAL_DISABLE_DESUL_ATOMICS), 0) - DESUL_CONFIG_HEADER=desul/atomics/Config.hpp -else - DESUL_CONFIG_HEADER=NothingToSeeHereMoveAlong -endif +DESUL_CONFIG_HEADER=desul/atomics/Config.hpp desul_append_header = $(shell echo $1 >> $(DESUL_INTERNAL_CONFIG_TMP)) tmp := $(call desul_append_header, "// generated by on-demand build system by crtrott" > $(DESUL_INTERNAL_CONFIG_TMP)) tmp := $(call desul_append_header, "$H""ifndef DESUL_ATOMICS_CONFIG_HPP_") diff --git a/cmake/kokkos_enable_options.cmake b/cmake/kokkos_enable_options.cmake index 5897ae73de..517b9aaca5 100644 --- a/cmake/kokkos_enable_options.cmake +++ b/cmake/kokkos_enable_options.cmake @@ -167,4 +167,3 @@ IF(DEFINED Kokkos_ENABLE_IMPL_DESUL_ATOMICS) 
MESSAGE(WARNING "Kokkos_ENABLE_IMPL_DESUL_ATOMICS option has been removed. Desul atomics cannot be disabled.") ENDIF() set(KOKKOS_ENABLE_IMPL_DESUL_ATOMICS ON) -set(Kokkos_ENABLE_IMPL_DESUL_ATOMICS ON) diff --git a/cmake/kokkos_tpls.cmake b/cmake/kokkos_tpls.cmake index ba66ee4d38..ac06f0848f 100644 --- a/cmake/kokkos_tpls.cmake +++ b/cmake/kokkos_tpls.cmake @@ -88,7 +88,7 @@ IF (NOT KOKKOS_ENABLE_COMPILE_AS_CMAKE_LANGUAGE) ENDIF() KOKKOS_IMPORT_TPL(LIBQUADMATH) -IF (Kokkos_ENABLE_IMPL_DESUL_ATOMICS AND Kokkos_ENABLE_DESUL_ATOMICS_EXTERNAL) +IF (Kokkos_ENABLE_DESUL_ATOMICS_EXTERNAL) find_package(desul REQUIRED COMPONENTS atomics) KOKKOS_EXPORT_CMAKE_TPL(desul REQUIRED COMPONENTS atomics) ENDIF() diff --git a/core/src/CMakeLists.txt b/core/src/CMakeLists.txt index 862c0c47dd..0f9de74707 100644 --- a/core/src/CMakeLists.txt +++ b/core/src/CMakeLists.txt @@ -3,7 +3,7 @@ KOKKOS_INCLUDE_DIRECTORIES( ${CMAKE_CURRENT_SOURCE_DIR} ${KOKKOS_TOP_BUILD_DIR} ) -IF (Kokkos_ENABLE_IMPL_DESUL_ATOMICS AND NOT desul_FOUND) +IF (NOT desul_FOUND) IF(KOKKOS_ENABLE_CUDA) SET(DESUL_ATOMICS_ENABLE_CUDA ON) ENDIF() @@ -88,7 +88,7 @@ IF (KOKKOS_ENABLE_SYCL) APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/SYCL/*.hpp) ENDIF() -IF (Kokkos_ENABLE_IMPL_DESUL_ATOMICS AND NOT desul_FOUND) +IF (NOT desul_FOUND) APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/../../tpls/desul/src/*.cpp) APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/../../tpls/desul/include/desul/*.hpp) APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/../../tpls/desul/include/desul/*/*.hpp) @@ -125,7 +125,7 @@ KOKKOS_LIB_INCLUDE_DIRECTORIES(kokkoscore ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_SOURCE_DIR} ) -IF (Kokkos_ENABLE_IMPL_DESUL_ATOMICS AND NOT desul_FOUND) +IF (NOT desul_FOUND) KOKKOS_LIB_INCLUDE_DIRECTORIES(kokkoscore ${CMAKE_CURRENT_SOURCE_DIR}/../../tpls/desul/include ) @@ -187,11 +187,11 @@ ENDIF() # libatomic # Most compilers only require libatomic for 128-bit CAS # I (CT) 
had removed 128bit CAS from desul to not need libatomic. -IF (Kokkos_ENABLE_IMPL_DESUL_ATOMICS AND KOKKOS_ENABLE_OPENMPTARGET) +IF (KOKKOS_ENABLE_OPENMPTARGET) target_link_libraries(kokkoscore PUBLIC atomic) ENDIF() -IF (Kokkos_ENABLE_IMPL_DESUL_ATOMICS AND desul_FOUND) +IF (desul_FOUND) target_link_libraries(kokkoscore PUBLIC desul_atomics) ENDIF() From 4c03c8d1059ff71675d1cf7922cca3c5e0facce8 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Tue, 24 Jan 2023 17:35:02 +0000 Subject: [PATCH 085/496] KOKKOS_SYCL_DEVICE_GLOBAL_SUPPORTED->KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED --- cmake/KokkosCore_config.h.in | 2 +- cmake/kokkos_arch.cmake | 4 ++-- core/src/SYCL/Kokkos_SYCL_Instance.cpp | 4 ++-- core/unit_test/TestAtomicOperations_complexdouble.hpp | 3 ++- core/unit_test/TestAtomics.hpp | 3 ++- 5 files changed, 9 insertions(+), 7 deletions(-) diff --git a/cmake/KokkosCore_config.h.in b/cmake/KokkosCore_config.h.in index 863b800a86..9162fefed4 100644 --- a/cmake/KokkosCore_config.h.in +++ b/cmake/KokkosCore_config.h.in @@ -26,7 +26,7 @@ #cmakedefine KOKKOS_ENABLE_MEMKIND #cmakedefine KOKKOS_ENABLE_LIBRT #cmakedefine KOKKOS_ENABLE_SYCL -#cmakedefine KOKKOS_SYCL_DEVICE_GLOBAL_SUPPORTED +#cmakedefine KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED /* General Settings */ #cmakedefine KOKKOS_ENABLE_CXX17 diff --git a/cmake/kokkos_arch.cmake b/cmake/kokkos_arch.cmake index 6e754bd903..0bde612b8f 100644 --- a/cmake/kokkos_arch.cmake +++ b/cmake/kokkos_arch.cmake @@ -544,9 +544,9 @@ IF(KOKKOS_ENABLE_SYCL) int main(){ return 0; } " - KOKKOS_SYCL_DEVICE_GLOBAL_SUPPORTED) + KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED) - IF(KOKKOS_SYCL_DEVICE_GLOBAL_SUPPORTED) + IF(KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED) COMPILER_SPECIFIC_FLAGS( DEFAULT -fsycl-device-code-split=off -DDESUL_SYCL_DEVICE_GLOBAL_SUPPORTED ) diff --git a/core/src/SYCL/Kokkos_SYCL_Instance.cpp b/core/src/SYCL/Kokkos_SYCL_Instance.cpp index ef3cd4a2b4..6a0e3b4934 100644 --- a/core/src/SYCL/Kokkos_SYCL_Instance.cpp +++ 
b/core/src/SYCL/Kokkos_SYCL_Instance.cpp @@ -127,7 +127,7 @@ void SYCLInternal::initialize(const sycl::queue& q) { Kokkos::Impl::throw_runtime_exception(msg.str()); } -#ifdef KOKKOS_SYCL_DEVICE_GLOBAL_SUPPORTED +#ifdef KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED // Init the array for used for arbitrarily sized atomics if (this == &singleton()) { desul::Impl::init_lock_arrays(); @@ -170,7 +170,7 @@ void SYCLInternal::finalize() { // deallocated once by the defualt instance if (this == &singleton()) { Impl::sycl_global_unique_token_locks(true); -#ifdef KOKKOS_SYCL_DEVICE_GLOBAL_SUPPORTED +#ifdef KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED desul::Impl::finalize_lock_arrays(); desul::Impl::finalize_lock_arrays_sycl(*m_queue); #endif diff --git a/core/unit_test/TestAtomicOperations_complexdouble.hpp b/core/unit_test/TestAtomicOperations_complexdouble.hpp index 1b3f52e2cc..852fade58b 100644 --- a/core/unit_test/TestAtomicOperations_complexdouble.hpp +++ b/core/unit_test/TestAtomicOperations_complexdouble.hpp @@ -18,7 +18,8 @@ namespace Test { TEST(TEST_CATEGORY, atomic_operations_complexdouble) { -#if defined(KOKKOS_ENABLE_SYCL) && !defined(KOKKOS_SYCL_DEVICE_GLOBAL_SUPPORTED) +#if defined(KOKKOS_ENABLE_SYCL) && \ + !defined(KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED) if (std::is_same_v) GTEST_SKIP() << "skipping since device_global variables are not available"; #endif diff --git a/core/unit_test/TestAtomics.hpp b/core/unit_test/TestAtomics.hpp index 479a698b52..fd94f40587 100644 --- a/core/unit_test/TestAtomics.hpp +++ b/core/unit_test/TestAtomics.hpp @@ -512,7 +512,8 @@ TEST(TEST_CATEGORY, atomics) { // FIXME_SYCL Replace macro by SYCL_EXT_ONEAPI_DEVICE_GLOBAL or remove // condition alltogether when possible. 
-#if defined(KOKKOS_ENABLE_SYCL) && !defined(KOKKOS_SYCL_DEVICE_GLOBAL_SUPPORTED) +#if defined(KOKKOS_ENABLE_SYCL) && \ + !defined(KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED) if (std::is_same_v) return; #endif ASSERT_TRUE( From f253bc48fa72feadbbbf8280001168d7dd185232 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Tue, 24 Jan 2023 12:51:08 -0500 Subject: [PATCH 086/496] Print KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED in print_configuration --- core/src/SYCL/Kokkos_SYCL.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/core/src/SYCL/Kokkos_SYCL.cpp b/core/src/SYCL/Kokkos_SYCL.cpp index e38b011c89..a1ab808c7d 100644 --- a/core/src/SYCL/Kokkos_SYCL.cpp +++ b/core/src/SYCL/Kokkos_SYCL.cpp @@ -88,6 +88,11 @@ void SYCL::print_configuration(std::ostream& os, bool verbose) const { os << "\nRuntime Configuration:\n"; os << "macro KOKKOS_ENABLE_SYCL : defined\n"; +#ifdef KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED + os << "macro KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED : defined\n"; +#else + os << "macro KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED : undefined\n"; +#endif if (verbose) SYCL::impl_sycl_info(os, m_space_instance->m_queue->get_device()); } From 153b4c1da4b1241aadda224d6ce3f9ede22df792 Mon Sep 17 00:00:00 2001 From: Dan Ibanez Date: Tue, 24 Jan 2023 12:00:01 -0700 Subject: [PATCH 087/496] remove const_cast with some code duplication --- simd/src/Kokkos_SIMD_AVX512.hpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/simd/src/Kokkos_SIMD_AVX512.hpp b/simd/src/Kokkos_SIMD_AVX512.hpp index dab32a4a68..a9e7ae1813 100644 --- a/simd/src/Kokkos_SIMD_AVX512.hpp +++ b/simd/src/Kokkos_SIMD_AVX512.hpp @@ -114,7 +114,8 @@ class simd_mask> { } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION value_type operator[](std::size_t i) const { - return reference(const_cast(this)->m_value, int(i)); + auto const bit_mask = __mmask8(std::int16_t(1 << i)); + return (m_value & bit_mask) != 0; } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd_mask operator||(simd_mask const& other) 
const { From c54547ef4ea7a2e408a11420c43062e4b9613cc1 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Tue, 24 Jan 2023 15:01:25 -0500 Subject: [PATCH 088/496] Fixup ROCm 5.4 ImplForceGlobalLaunch{Launch -> }_t typo in unit tests --- core/unit_test/TestMathematicalSpecialFunctions.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/unit_test/TestMathematicalSpecialFunctions.hpp b/core/unit_test/TestMathematicalSpecialFunctions.hpp index 071056ce34..7ae2027705 100644 --- a/core/unit_test/TestMathematicalSpecialFunctions.hpp +++ b/core/unit_test/TestMathematicalSpecialFunctions.hpp @@ -507,7 +507,7 @@ struct TestComplexBesselJ0Y0Function { // Call Bessel functions #if (HIP_VERSION_MAJOR == 5) && (HIP_VERSION_MINOR == 4) using Property = - Kokkos::Experimental::WorkItemProperty::ImplForceGlobalLaunchLaunch_t; + Kokkos::Experimental::WorkItemProperty::ImplForceGlobalLaunch_t; #else using Property = Kokkos::Experimental::WorkItemProperty::None_t; #endif From c3fe1d607b14f526a46c548ee6e1da17099dd554 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Tue, 24 Jan 2023 12:57:30 -0500 Subject: [PATCH 089/496] Purge macro guards for desul atomics being enabled or not --- core/src/Cuda/Kokkos_Cuda_Locks.cpp | 5 +- core/src/Cuda/Kokkos_Cuda_Locks.hpp | 17 +- core/src/Cuda/Kokkos_Cuda_UniqueToken.hpp | 12 +- core/src/HIP/Kokkos_HIP_Locks.cpp | 4 - core/src/HIP/Kokkos_HIP_Locks.hpp | 19 +- core/src/HIP/Kokkos_HIP_UniqueToken.hpp | 12 +- core/src/Kokkos_Atomic.hpp | 296 ------------------ .../Kokkos_Atomics_Desul_Volatile_Wrapper.hpp | 2 - core/src/Kokkos_Atomics_Desul_Wrapper.hpp | 2 - core/src/SYCL/Kokkos_SYCL_UniqueToken.hpp | 8 - core/src/impl/Kokkos_Memory_Fence.hpp | 29 -- 11 files changed, 7 insertions(+), 399 deletions(-) diff --git a/core/src/Cuda/Kokkos_Cuda_Locks.cpp b/core/src/Cuda/Kokkos_Cuda_Locks.cpp index b18fda80f0..f20b41cc48 100644 --- a/core/src/Cuda/Kokkos_Cuda_Locks.cpp +++ b/core/src/Cuda/Kokkos_Cuda_Locks.cpp @@ -49,10 +49,9 @@ namespace 
Impl { CudaLockArrays g_host_cuda_lock_arrays = {nullptr, 0}; void initialize_host_cuda_lock_arrays() { -#ifdef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS desul::Impl::init_lock_arrays(); desul::ensure_cuda_lock_arrays_on_device(); -#endif + if (g_host_cuda_lock_arrays.atomic != nullptr) return; KOKKOS_IMPL_CUDA_SAFE_CALL( cudaMalloc(&g_host_cuda_lock_arrays.atomic, @@ -68,9 +67,7 @@ void initialize_host_cuda_lock_arrays() { } void finalize_host_cuda_lock_arrays() { -#ifdef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS desul::Impl::finalize_lock_arrays(); -#endif if (g_host_cuda_lock_arrays.atomic == nullptr) return; cudaFree(g_host_cuda_lock_arrays.atomic); diff --git a/core/src/Cuda/Kokkos_Cuda_Locks.hpp b/core/src/Cuda/Kokkos_Cuda_Locks.hpp index 4c17997fe2..08f88895e2 100644 --- a/core/src/Cuda/Kokkos_Cuda_Locks.hpp +++ b/core/src/Cuda/Kokkos_Cuda_Locks.hpp @@ -25,9 +25,8 @@ #include -#ifdef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS +// FIXME do not include private headers #include -#endif namespace Kokkos { namespace Impl { @@ -136,18 +135,6 @@ inline static lock_array_copied = 1; } -#ifndef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS - -#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE -inline void ensure_cuda_lock_arrays_on_device() {} -#else -inline static void ensure_cuda_lock_arrays_on_device() { - copy_cuda_lock_arrays_to_device(); -} -#endif - -#else - #ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE inline void ensure_cuda_lock_arrays_on_device() {} #else @@ -158,8 +145,6 @@ inline static void ensure_cuda_lock_arrays_on_device() { } #endif -#endif /* defined( KOKKOS_ENABLE_IMPL_DESUL_ATOMICS ) */ - } // namespace Impl } // namespace Kokkos diff --git a/core/src/Cuda/Kokkos_Cuda_UniqueToken.hpp b/core/src/Cuda/Kokkos_Cuda_UniqueToken.hpp index 1ade4c34b6..8509f10c03 100644 --- a/core/src/Cuda/Kokkos_Cuda_UniqueToken.hpp +++ b/core/src/Cuda/Kokkos_Cuda_UniqueToken.hpp @@ -104,13 +104,9 @@ class UniqueToken { idx = idx % size(); } #endif -// Make sure that all writes in the previous lock owner 
are visible to me -#ifdef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS + // Make sure that all writes in the previous lock owner are visible to me desul::atomic_thread_fence(desul::MemoryOrderAcquire(), desul::MemoryScopeDevice()); -#else - Kokkos::memory_fence(); -#endif return idx; } @@ -125,13 +121,9 @@ class UniqueToken { /// \brief release an acquired value KOKKOS_INLINE_FUNCTION void release(size_type idx) const noexcept { -// Make sure my writes are visible to the next lock owner -#ifdef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS + // Make sure my writes are visible to the next lock owner desul::atomic_thread_fence(desul::MemoryOrderRelease(), desul::MemoryScopeDevice()); -#else - Kokkos::memory_fence(); -#endif (void)Kokkos::atomic_exchange(&m_locks(idx), 0); } }; diff --git a/core/src/HIP/Kokkos_HIP_Locks.cpp b/core/src/HIP/Kokkos_HIP_Locks.cpp index 62058b4161..3547286236 100644 --- a/core/src/HIP/Kokkos_HIP_Locks.cpp +++ b/core/src/HIP/Kokkos_HIP_Locks.cpp @@ -53,10 +53,8 @@ namespace Impl { HIPLockArrays g_host_hip_lock_arrays = {nullptr, 0}; void initialize_host_hip_lock_arrays() { -#ifdef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS desul::Impl::init_lock_arrays(); desul::ensure_hip_lock_arrays_on_device(); -#endif if (g_host_hip_lock_arrays.atomic != nullptr) return; KOKKOS_IMPL_HIP_SAFE_CALL(hipMalloc( @@ -70,9 +68,7 @@ void initialize_host_hip_lock_arrays() { } void finalize_host_hip_lock_arrays() { -#ifdef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS desul::Impl::finalize_lock_arrays(); -#endif if (g_host_hip_lock_arrays.atomic == nullptr) return; KOKKOS_IMPL_HIP_SAFE_CALL(hipFree(g_host_hip_lock_arrays.atomic)); diff --git a/core/src/HIP/Kokkos_HIP_Locks.hpp b/core/src/HIP/Kokkos_HIP_Locks.hpp index e0bce0ccfb..0ddd1c486d 100644 --- a/core/src/HIP/Kokkos_HIP_Locks.hpp +++ b/core/src/HIP/Kokkos_HIP_Locks.hpp @@ -23,9 +23,8 @@ #include -#ifdef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS +// FIXME do not include private headers #include -#endif namespace Kokkos { namespace Impl { @@ -128,8 +127,6 @@ 
inline static lock_array_copied = 1; } -#ifndef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS - #ifdef KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE inline void ensure_hip_lock_arrays_on_device() {} #else @@ -138,20 +135,6 @@ inline static void ensure_hip_lock_arrays_on_device() { } #endif -#else - -#ifdef KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE -inline void ensure_hip_lock_arrays_on_device() {} -#else -// Still Need copy_hip_lock_arrays for team scratch etc. -inline static void ensure_hip_lock_arrays_on_device() { - copy_hip_lock_arrays_to_device(); - desul::ensure_hip_lock_arrays_on_device(); -} -#endif - -#endif /* defined( KOKKOS_ENABLE_IMPL_DESUL_ATOMICS ) */ - } // namespace Impl } // namespace Kokkos diff --git a/core/src/HIP/Kokkos_HIP_UniqueToken.hpp b/core/src/HIP/Kokkos_HIP_UniqueToken.hpp index 13fc6216d6..313e5f5217 100644 --- a/core/src/HIP/Kokkos_HIP_UniqueToken.hpp +++ b/core/src/HIP/Kokkos_HIP_UniqueToken.hpp @@ -97,13 +97,9 @@ class UniqueToken { done_active = __ballot(done ? 1 : 0); } -// Make sure that all writes in the previous lock owner are visible to me -#ifdef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS + // Make sure that all writes in the previous lock owner are visible to me desul::atomic_thread_fence(desul::MemoryOrderAcquire(), desul::MemoryScopeDevice()); -#else - Kokkos::memory_fence(); -#endif return idx; } @@ -118,13 +114,9 @@ class UniqueToken { /// \brief release an acquired value KOKKOS_INLINE_FUNCTION void release(size_type idx) const noexcept { -// Make sure my writes are visible to the next lock owner -#ifdef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS + // Make sure my writes are visible to the next lock owner desul::atomic_thread_fence(desul::MemoryOrderRelease(), desul::MemoryScopeDevice()); -#else - Kokkos::memory_fence(); -#endif (void)Kokkos::atomic_exchange(m_locks.data() + idx, 0); } }; diff --git a/core/src/Kokkos_Atomic.hpp b/core/src/Kokkos_Atomic.hpp index 1347e09ebd..57d189cfcf 100644 --- a/core/src/Kokkos_Atomic.hpp +++ b/core/src/Kokkos_Atomic.hpp 
@@ -46,7 +46,6 @@ #include -#ifdef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS #include #include #include @@ -94,302 +93,7 @@ KOKKOS_INLINE_FUNCTION T desul_atomic_compare_exchange( } // namespace Impl } // namespace Kokkos -#else -#include -#include - -//---------------------------------------------------------------------------- - -// Need to fix this for pure clang on windows -#if defined(_WIN32) -#define KOKKOS_ENABLE_WINDOWS_ATOMICS - -#if defined(KOKKOS_ENABLE_CUDA) -#define KOKKOS_ENABLE_CUDA_ATOMICS -#if defined(KOKKOS_COMPILER_CLANG) -#define KOKKOS_ENABLE_GNU_ATOMICS -#endif -#endif - -#else // _WIN32 -#if defined(KOKKOS_ENABLE_CUDA) - -// Compiling NVIDIA device code, must use Cuda atomics: - -#define KOKKOS_ENABLE_CUDA_ATOMICS - -#elif defined(KOKKOS_ENABLE_HIP) - -#define KOKKOS_ENABLE_HIP_ATOMICS - -#endif - -#if !defined(KOKKOS_ENABLE_GNU_ATOMICS) && \ - !defined(KOKKOS_ENABLE_INTEL_ATOMICS) && \ - !defined(KOKKOS_ENABLE_OPENMP_ATOMICS) && \ - !defined(KOKKOS_ENABLE_STD_ATOMICS) && \ - !defined(KOKKOS_ENABLE_SERIAL_ATOMICS) - -// Compiling for non-Cuda atomic implementation has not been pre-selected. -// Choose the best implementation for the detected compiler. 
-// Preference: GCC, INTEL, OMP31 - -#if defined(KOKKOS_INTERNAL_NOT_PARALLEL) - -#define KOKKOS_ENABLE_SERIAL_ATOMICS - -#elif defined(KOKKOS_COMPILER_GNU) || defined(KOKKOS_COMPILER_CLANG) || \ - defined(KOKKOS_COMPILER_NVCC) - -#define KOKKOS_ENABLE_GNU_ATOMICS - -#elif defined(KOKKOS_COMPILER_INTEL) || defined(KOKKOS_COMPILER_CRAYC) - -#define KOKKOS_ENABLE_INTEL_ATOMICS - -#elif defined(_OPENMP) && (201107 <= _OPENMP) - -#define KOKKOS_ENABLE_OPENMP_ATOMICS - -#else - -#error "KOKKOS_ATOMICS_USE : Unsupported compiler" - -#endif - -#endif /* Not pre-selected atomic implementation */ -#endif - -#ifdef KOKKOS_ENABLE_CUDA -#include -#endif - -namespace Kokkos { -template -KOKKOS_INLINE_FUNCTION void atomic_add(volatile T* const dest, const T src); - -// Atomic increment -template -KOKKOS_INLINE_FUNCTION void atomic_increment(volatile T* a); - -template -KOKKOS_INLINE_FUNCTION void atomic_decrement(volatile T* a); -} // namespace Kokkos - -namespace Kokkos { - -inline const char* atomic_query_version() { -#if defined(KOKKOS_ENABLE_CUDA_ATOMICS) - return "KOKKOS_ENABLE_CUDA_ATOMICS"; -#elif defined(KOKKOS_ENABLE_GNU_ATOMICS) - return "KOKKOS_ENABLE_GNU_ATOMICS"; -#elif defined(KOKKOS_ENABLE_INTEL_ATOMICS) - return "KOKKOS_ENABLE_INTEL_ATOMICS"; -#elif defined(KOKKOS_ENABLE_OPENMP_ATOMICS) - return "KOKKOS_ENABLE_OPENMP_ATOMICS"; -#elif defined(KOKKOS_ENABLE_WINDOWS_ATOMICS) - return "KOKKOS_ENABLE_WINDOWS_ATOMICS"; -#elif defined(KOKKOS_ENABLE_SERIAL_ATOMICS) - return "KOKKOS_ENABLE_SERIAL_ATOMICS"; -#else -#error "No valid response for atomic_query_version!" 
-#endif -} - -} // namespace Kokkos - -//---------------------------------------------------------------------------- -// Atomic Memory Orders -// -// Implements Strongly-typed analogs of C++ standard memory orders -#include "impl/Kokkos_Atomic_Memory_Order.hpp" - -#if defined(KOKKOS_ENABLE_HIP) -#include -#endif - -#if defined(KOKKOS_ENABLE_WINDOWS_ATOMICS) -#include "impl/Kokkos_Atomic_Windows.hpp" -#endif -//---------------------------------------------------------------------------- -// Atomic Assembly -// -// Implements CAS128-bit in assembly - -#include "impl/Kokkos_Atomic_Assembly.hpp" - -//---------------------------------------------------------------------------- -// Memory fence -// -// All loads and stores from this thread will be globally consistent before -// continuing -// -// void memory_fence() {...}; -#include "impl/Kokkos_Memory_Fence.hpp" - -//---------------------------------------------------------------------------- -// Atomic exchange -// -// template< typename T > -// T atomic_exchange( volatile T* const dest , const T val ) -// { T tmp = *dest ; *dest = val ; return tmp ; } - -#include "impl/Kokkos_Atomic_Exchange.hpp" - -//---------------------------------------------------------------------------- -// Atomic compare-and-exchange -// -// template -// bool atomic_compare_exchange_strong(volatile T* const dest, const T compare, -// const T val) { bool equal = compare == *dest ; if ( equal ) { *dest = val ; } -// return equal ; } - -#include "impl/Kokkos_Atomic_Compare_Exchange_Strong.hpp" - -#include "impl/Kokkos_Atomic_Generic.hpp" - -//---------------------------------------------------------------------------- -// Atomic fetch and add -// -// template -// T atomic_fetch_add(volatile T* const dest, const T val) -// { T tmp = *dest ; *dest += val ; return tmp ; } - -#include "impl/Kokkos_Atomic_Fetch_Add.hpp" - -//---------------------------------------------------------------------------- -// Atomic increment -// -// template -// T 
atomic_increment(volatile T* const dest) -// { dest++; } - -#include "impl/Kokkos_Atomic_Increment.hpp" - -//---------------------------------------------------------------------------- -// Atomic Decrement -// -// template -// T atomic_decrement(volatile T* const dest) -// { dest--; } - -#include "impl/Kokkos_Atomic_Decrement.hpp" - -//---------------------------------------------------------------------------- -// Atomic fetch and sub -// -// template -// T atomic_fetch_sub(volatile T* const dest, const T val) -// { T tmp = *dest ; *dest -= val ; return tmp ; } - -#include "impl/Kokkos_Atomic_Fetch_Sub.hpp" - -//---------------------------------------------------------------------------- -// Atomic fetch and or -// -// template -// T atomic_fetch_or(volatile T* const dest, const T val) -// { T tmp = *dest ; *dest = tmp | val ; return tmp ; } - -#include "impl/Kokkos_Atomic_Fetch_Or.hpp" - -//---------------------------------------------------------------------------- -// Atomic fetch and and -// -// template -// T atomic_fetch_and(volatile T* const dest, const T val) -// { T tmp = *dest ; *dest = tmp & val ; return tmp ; } - -#include "impl/Kokkos_Atomic_Fetch_And.hpp" - -//---------------------------------------------------------------------------- -// Atomic MinMax -// -// template -// T atomic_min(volatile T* const dest, const T val) -// { T tmp = *dest ; *dest = min(*dest, val); return tmp ; } -// template -// T atomic_max(volatile T* const dest, const T val) -// { T tmp = *dest ; *dest = max(*dest, val); return tmp ; } - -#include "impl/Kokkos_Atomic_MinMax.hpp" - -//---------------------------------------------------------------------------- -// Provide volatile_load and safe_load -// -// T volatile_load(T const volatile * const ptr); -// -// T const& safe_load(T const * const ptr); -// XEON PHI -// T safe_load(T const * const ptr - -#include "impl/Kokkos_Volatile_Load.hpp" - -//---------------------------------------------------------------------------- 
-// Provide atomic loads and stores with memory order semantics - -#include "impl/Kokkos_Atomic_Load.hpp" -#include "impl/Kokkos_Atomic_Store.hpp" - -// Generic functions using the above defined functions -#include "impl/Kokkos_Atomic_Generic_Secondary.hpp" -//---------------------------------------------------------------------------- -// This atomic-style macro should be an inlined function, not a macro - -#if defined(KOKKOS_COMPILER_GNU) && !defined(__PGIC__) && \ - !defined(__CUDA_ARCH__) - -#define KOKKOS_NONTEMPORAL_PREFETCH_LOAD(addr) __builtin_prefetch(addr, 0, 0) -#define KOKKOS_NONTEMPORAL_PREFETCH_STORE(addr) __builtin_prefetch(addr, 1, 0) - -#else - -#define KOKKOS_NONTEMPORAL_PREFETCH_LOAD(addr) ((void)0) -#define KOKKOS_NONTEMPORAL_PREFETCH_STORE(addr) ((void)0) - -#endif - -//---------------------------------------------------------------------------- - -// Helper functions for places where we really should have called SeqCst atomics -// anyway These can go away when we call desul unconditionally -namespace Kokkos { -namespace Impl { -struct MemoryOrderSeqCst {}; -struct MemoryScopeDevice {}; - -template -KOKKOS_INLINE_FUNCTION void desul_atomic_dec(T* dest, MemoryOrderSeqCst, - MemoryScopeDevice) { - return Kokkos::atomic_decrement(dest); -} - -template -KOKKOS_INLINE_FUNCTION void desul_atomic_inc(T* dest, MemoryOrderSeqCst, - MemoryScopeDevice) { - return Kokkos::atomic_increment(dest); -} - -template -KOKKOS_INLINE_FUNCTION T -desul_atomic_exchange(T* dest, Kokkos::Impl::type_identity_t val, - MemoryOrderSeqCst, MemoryScopeDevice) { - return Kokkos::atomic_exchange(dest, val); -} - -template -KOKKOS_INLINE_FUNCTION T desul_atomic_compare_exchange( - T* dest, Kokkos::Impl::type_identity_t compare, - Kokkos::Impl::type_identity_t val, MemoryOrderSeqCst, - MemoryScopeDevice) { - return Kokkos::atomic_compare_exchange(dest, compare, val); -} - -} // namespace Impl -} // namespace Kokkos - -#endif /* !KOKKOS_ENABLE_IMPL_DESUL_ATOMICS */ #ifdef 
KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_ATOMIC #undef KOKKOS_IMPL_PUBLIC_INCLUDE #undef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_ATOMIC diff --git a/core/src/Kokkos_Atomics_Desul_Volatile_Wrapper.hpp b/core/src/Kokkos_Atomics_Desul_Volatile_Wrapper.hpp index 9da4b06110..1c43474632 100644 --- a/core/src/Kokkos_Atomics_Desul_Volatile_Wrapper.hpp +++ b/core/src/Kokkos_Atomics_Desul_Volatile_Wrapper.hpp @@ -22,7 +22,6 @@ static_assert(false, #ifndef KOKKOS_DESUL_ATOMICS_VOLATILE_WRAPPER_HPP_ #define KOKKOS_DESUL_ATOMICS_VOLATILE_WRAPPER_HPP_ #include -#ifdef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS #include #include @@ -195,5 +194,4 @@ T atomic_compare_exchange(volatile T* const dest, const T compare, const T desir #undef KOKKOS_DESUL_MEM_SCOPE // clang-format on -#endif // KOKKOS_ENABLE_IMPL_DESUL_ATOMICS #endif diff --git a/core/src/Kokkos_Atomics_Desul_Wrapper.hpp b/core/src/Kokkos_Atomics_Desul_Wrapper.hpp index fdc5e123f6..b8697d415a 100644 --- a/core/src/Kokkos_Atomics_Desul_Wrapper.hpp +++ b/core/src/Kokkos_Atomics_Desul_Wrapper.hpp @@ -23,7 +23,6 @@ static_assert(false, #define KOKKOS_DESUL_ATOMICS_WRAPPER_HPP_ #include -#ifdef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS #include #include @@ -278,5 +277,4 @@ namespace Impl { #undef KOKKOS_DESUL_MEM_SCOPE // clang-format on -#endif // KOKKOS_ENABLE_IMPL_DESUL_ATOMICS #endif diff --git a/core/src/SYCL/Kokkos_SYCL_UniqueToken.hpp b/core/src/SYCL/Kokkos_SYCL_UniqueToken.hpp index 2f0a67b3dd..b29c687f40 100644 --- a/core/src/SYCL/Kokkos_SYCL_UniqueToken.hpp +++ b/core/src/SYCL/Kokkos_SYCL_UniqueToken.hpp @@ -93,12 +93,8 @@ class UniqueToken { } // Make sure that all writes in the previous lock owner are visible to me -#ifdef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS desul::atomic_thread_fence(desul::MemoryOrderAcquire(), desul::MemoryScopeDevice()); -#else - Kokkos::memory_fence(); -#endif return idx; } @@ -114,12 +110,8 @@ class UniqueToken { KOKKOS_INLINE_FUNCTION void release(size_type idx) const noexcept { // Make sure my writes are visible to 
the next lock owner -#ifdef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS desul::atomic_thread_fence(desul::MemoryOrderRelease(), desul::MemoryScopeDevice()); -#else - Kokkos::memory_fence(); -#endif (void)Kokkos::atomic_exchange(&m_locks(idx), 0); } }; diff --git a/core/src/impl/Kokkos_Memory_Fence.hpp b/core/src/impl/Kokkos_Memory_Fence.hpp index 392116a56e..42a53b04fb 100644 --- a/core/src/impl/Kokkos_Memory_Fence.hpp +++ b/core/src/impl/Kokkos_Memory_Fence.hpp @@ -19,35 +19,6 @@ #define KOKKOS_MEMORY_FENCE_HPP namespace Kokkos { -//---------------------------------------------------------------------------- -#ifndef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS -KOKKOS_FORCEINLINE_FUNCTION -void memory_fence() { -#if defined(__CUDA_ARCH__) - __threadfence(); -#elif defined(KOKKOS_ENABLE_OPENMPTARGET) -#pragma omp flush -#elif defined(__HIP_DEVICE_COMPILE__) - __threadfence(); -#elif defined(KOKKOS_ENABLE_SYCL) && defined(__SYCL_DEVICE_ONLY__) - sycl::atomic_fence(sycl::memory_order::acq_rel, sycl::memory_scope::device); -#elif defined(KOKKOS_ENABLE_ASM) && defined(KOKKOS_ENABLE_ISA_X86_64) - asm volatile("mfence" ::: "memory"); -#elif defined(KOKKOS_ENABLE_GNU_ATOMICS) || \ - (defined(KOKKOS_COMPILER_NVCC) && defined(KOKKOS_ENABLE_INTEL_ATOMICS)) - __sync_synchronize(); -#elif defined(KOKKOS_ENABLE_INTEL_ATOMICS) - _mm_mfence(); -#elif defined(KOKKOS_ENABLE_OPENMP_ATOMICS) -#pragma omp flush -#elif defined(KOKKOS_ENABLE_WINDOWS_ATOMICS) - MemoryBarrier(); -#elif !defined(KOKKOS_ENABLE_SERIAL_ATOMICS) -#error "Error: memory_fence() not defined" -#endif -} -#endif - ////////////////////////////////////////////////////// // store_fence() // From 44140f7212e3fb3a2a4e2f21094ac0895c568721 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Tue, 24 Jan 2023 13:01:55 -0500 Subject: [PATCH 090/496] Get rid of #ifdef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS in unit tests --- core/unit_test/TestAtomicOperations.hpp | 10 ---------- core/unit_test/TestAtomicOperations_unsignedint.hpp | 2 -- 
.../unit_test/TestAtomicOperations_unsignedlongint.hpp | 2 -- 3 files changed, 14 deletions(-) diff --git a/core/unit_test/TestAtomicOperations.hpp b/core/unit_test/TestAtomicOperations.hpp index b8940378bd..9965041e52 100644 --- a/core/unit_test/TestAtomicOperations.hpp +++ b/core/unit_test/TestAtomicOperations.hpp @@ -56,7 +56,6 @@ struct InitFunctor { //--------------------------------------------------- //--------------atomic_load/store/assign--------------------- //--------------------------------------------------- -#ifdef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS template struct LoadStoreFunctor { using execution_space = DEVICE_TYPE; @@ -76,7 +75,6 @@ struct LoadStoreFunctor { } LoadStoreFunctor(T _i0, T _i1) : i0(_i0), i1(_i1) {} }; -#endif template bool LoadStoreAtomicTest(T i0, T i1) { @@ -89,14 +87,10 @@ bool LoadStoreAtomicTest(T i0, T i1) { Kokkos::parallel_for(1, f_init); execution_space().fence(); -#ifdef KOKKOS_ENABLE_DESUL_ATOMICS struct LoadStoreFunctor f(i0, i1); f.data = data; Kokkos::parallel_for(1, f); -#else - h_data() = i1; -#endif Kokkos::deep_copy(h_data, data); @@ -332,10 +326,8 @@ struct WrappingIncFunctor { KOKKOS_INLINE_FUNCTION void operator()(int) const { -#ifdef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS desul::atomic_fetch_inc_mod(&data(), (T)i1, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); -#endif } WrappingIncFunctor(T _i0, T _i1) : i0(_i0), i1(_i1) {} @@ -480,10 +472,8 @@ struct WrappingDecFunctor { KOKKOS_INLINE_FUNCTION void operator()(int) const { -#ifdef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS desul::atomic_fetch_dec_mod(&data(), (T)i1, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); -#endif } WrappingDecFunctor(T _i0, T _i1) : i0(_i0), i1(_i1) {} diff --git a/core/unit_test/TestAtomicOperations_unsignedint.hpp b/core/unit_test/TestAtomicOperations_unsignedint.hpp index 31b2693d64..f844c9062d 100644 --- a/core/unit_test/TestAtomicOperations_unsignedint.hpp +++ b/core/unit_test/TestAtomicOperations_unsignedint.hpp @@ -45,14 
+45,12 @@ TEST(TEST_CATEGORY, atomic_operations_unsigned) { unsigned int, TEST_EXECSPACE>(start, end - i, 12))); ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned int, TEST_EXECSPACE>(start, end - i, 13))); -#ifdef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS ASSERT_TRUE( (TestAtomicOperations::AtomicOperationsTestUnsignedIntegralType< unsigned int, TEST_EXECSPACE>(start, end - i, 1))); // Wrapping Inc ASSERT_TRUE( (TestAtomicOperations::AtomicOperationsTestUnsignedIntegralType< unsigned int, TEST_EXECSPACE>(start, end - i, 2))); // Wrapping Dec -#endif } } } // namespace Test diff --git a/core/unit_test/TestAtomicOperations_unsignedlongint.hpp b/core/unit_test/TestAtomicOperations_unsignedlongint.hpp index 98c2d28b20..8b6ca64e99 100644 --- a/core/unit_test/TestAtomicOperations_unsignedlongint.hpp +++ b/core/unit_test/TestAtomicOperations_unsignedlongint.hpp @@ -45,14 +45,12 @@ TEST(TEST_CATEGORY, atomic_operations_unsignedlong) { unsigned long int, TEST_EXECSPACE>(start, end - i, 12))); ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned long int, TEST_EXECSPACE>(start, end - i, 13))); -#ifdef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestUnsignedIntegralType< unsigned long int, TEST_EXECSPACE>(start, end - i, 1))); // Wrapping Inc ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestUnsignedIntegralType< unsigned long int, TEST_EXECSPACE>(start, end - i, 2))); // Wrapping Dec -#endif } } } // namespace Test From 52953c824bac56ba6031c260901247f7259e9ef9 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Tue, 24 Jan 2023 12:59:10 -0500 Subject: [PATCH 091/496] Remove a whole bunch of Kokkos leagacy atomics headers --- .../Cuda/Kokkos_Cuda_Atomic_Intrinsics.hpp | 979 ------------------ ...uda_Atomic_Intrinsics_Restore_Builtins.hpp | 41 - core/src/HIP/Kokkos_HIP_Atomic.hpp | 590 ----------- core/src/impl/Kokkos_Atomic_Assembly.hpp | 79 -- .../Kokkos_Atomic_Compare_Exchange_Strong.hpp | 
409 -------- .../Kokkos_Atomic_Compare_Exchange_Weak.hpp | 380 ------- core/src/impl/Kokkos_Atomic_Decrement.hpp | 119 --- core/src/impl/Kokkos_Atomic_Exchange.hpp | 376 ------- core/src/impl/Kokkos_Atomic_Fetch_Add.hpp | 360 ------- core/src/impl/Kokkos_Atomic_Fetch_And.hpp | 164 --- core/src/impl/Kokkos_Atomic_Fetch_Or.hpp | 165 --- core/src/impl/Kokkos_Atomic_Fetch_Sub.hpp | 295 ------ core/src/impl/Kokkos_Atomic_Generic.hpp | 527 ---------- .../impl/Kokkos_Atomic_Generic_Secondary.hpp | 58 -- core/src/impl/Kokkos_Atomic_Increment.hpp | 119 --- core/src/impl/Kokkos_Atomic_Load.hpp | 201 ---- core/src/impl/Kokkos_Atomic_MinMax.hpp | 291 ------ core/src/impl/Kokkos_Atomic_Store.hpp | 197 ---- core/src/impl/Kokkos_Atomic_Windows.hpp | 127 --- 19 files changed, 5477 deletions(-) delete mode 100644 core/src/Cuda/Kokkos_Cuda_Atomic_Intrinsics.hpp delete mode 100644 core/src/Cuda/Kokkos_Cuda_Atomic_Intrinsics_Restore_Builtins.hpp delete mode 100644 core/src/HIP/Kokkos_HIP_Atomic.hpp delete mode 100644 core/src/impl/Kokkos_Atomic_Assembly.hpp delete mode 100644 core/src/impl/Kokkos_Atomic_Compare_Exchange_Strong.hpp delete mode 100644 core/src/impl/Kokkos_Atomic_Compare_Exchange_Weak.hpp delete mode 100644 core/src/impl/Kokkos_Atomic_Decrement.hpp delete mode 100644 core/src/impl/Kokkos_Atomic_Exchange.hpp delete mode 100644 core/src/impl/Kokkos_Atomic_Fetch_Add.hpp delete mode 100644 core/src/impl/Kokkos_Atomic_Fetch_And.hpp delete mode 100644 core/src/impl/Kokkos_Atomic_Fetch_Or.hpp delete mode 100644 core/src/impl/Kokkos_Atomic_Fetch_Sub.hpp delete mode 100644 core/src/impl/Kokkos_Atomic_Generic.hpp delete mode 100644 core/src/impl/Kokkos_Atomic_Generic_Secondary.hpp delete mode 100644 core/src/impl/Kokkos_Atomic_Increment.hpp delete mode 100644 core/src/impl/Kokkos_Atomic_Load.hpp delete mode 100644 core/src/impl/Kokkos_Atomic_MinMax.hpp delete mode 100644 core/src/impl/Kokkos_Atomic_Store.hpp delete mode 100644 core/src/impl/Kokkos_Atomic_Windows.hpp diff --git 
a/core/src/Cuda/Kokkos_Cuda_Atomic_Intrinsics.hpp b/core/src/Cuda/Kokkos_Cuda_Atomic_Intrinsics.hpp deleted file mode 100644 index cb196f6e8f..0000000000 --- a/core/src/Cuda/Kokkos_Cuda_Atomic_Intrinsics.hpp +++ /dev/null @@ -1,979 +0,0 @@ -/* -@HEADER -================================================================================ - -ORIGINAL LICENSE ----------------- - -Copyright (c) 2018, NVIDIA Corporation - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. - -================================================================================ - -LICENSE ASSOCIATED WITH SUBSEQUENT MODIFICATIONS ------------------------------------------------- - -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. 
-// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -// ************************************************************************ -@HEADER -*/ - -#include -#if defined(__CUDA_ARCH__) && defined(KOKKOS_ENABLE_CUDA_ASM_ATOMICS) - -#include - -#ifndef _SIMT_DETAILS_CONFIG -#define _SIMT_DETAILS_CONFIG - -namespace Kokkos { -namespace Impl { - -#ifndef __simt_scope -// Modification: Kokkos GPU atomics should default to `gpu` scope -#define __simt_scope "gpu" -#endif - -#define __simt_fence_signal_() asm volatile("" ::: "memory") -#define __simt_fence_sc_() \ - asm volatile("fence.sc." __simt_scope ";" ::: "memory") -#define __simt_fence_() asm volatile("fence." __simt_scope ";" ::: "memory") - -#define __simt_load_acquire_8_as_32(ptr, ret) \ - asm volatile("ld.acquire." __simt_scope ".b8 %0, [%1];" \ - : "=r"(ret) \ - : "l"(ptr) \ - : "memory") -#define __simt_load_relaxed_8_as_32(ptr, ret) \ - asm volatile("ld.relaxed." __simt_scope ".b8 %0, [%1];" \ - : "=r"(ret) \ - : "l"(ptr) \ - : "memory") -#define __simt_store_release_8_as_32(ptr, desired) \ - asm volatile("st.release." __simt_scope ".b8 [%0], %1;" ::"l"(ptr), \ - "r"(desired) \ - : "memory") -#define __simt_store_relaxed_8_as_32(ptr, desired) \ - asm volatile("st.relaxed." __simt_scope ".b8 [%0], %1;" ::"l"(ptr), \ - "r"(desired) \ - : "memory") - -#define __simt_load_acquire_16(ptr, ret) \ - asm volatile("ld.acquire." __simt_scope ".b16 %0, [%1];" \ - : "=h"(ret) \ - : "l"(ptr) \ - : "memory") -#define __simt_load_relaxed_16(ptr, ret) \ - asm volatile("ld.relaxed." __simt_scope ".b16 %0, [%1];" \ - : "=h"(ret) \ - : "l"(ptr) \ - : "memory") -#define __simt_store_release_16(ptr, desired) \ - asm volatile("st.release." __simt_scope ".b16 [%0], %1;" ::"l"(ptr), \ - "h"(desired) \ - : "memory") -#define __simt_store_relaxed_16(ptr, desired) \ - asm volatile("st.relaxed." 
__simt_scope ".b16 [%0], %1;" ::"l"(ptr), \ - "h"(desired) \ - : "memory") - -#define __simt_load_acquire_32(ptr, ret) \ - asm volatile("ld.acquire." __simt_scope ".b32 %0, [%1];" \ - : "=r"(ret) \ - : "l"(ptr) \ - : "memory") -#define __simt_load_relaxed_32(ptr, ret) \ - asm volatile("ld.relaxed." __simt_scope ".b32 %0, [%1];" \ - : "=r"(ret) \ - : "l"(ptr) \ - : "memory") -#define __simt_store_release_32(ptr, desired) \ - asm volatile("st.release." __simt_scope ".b32 [%0], %1;" ::"l"(ptr), \ - "r"(desired) \ - : "memory") -#define __simt_store_relaxed_32(ptr, desired) \ - asm volatile("st.relaxed." __simt_scope ".b32 [%0], %1;" ::"l"(ptr), \ - "r"(desired) \ - : "memory") -#define __simt_exch_release_32(ptr, old, desired) \ - asm volatile("atom.exch.release." __simt_scope ".b32 %0, [%1], %2;" \ - : "=r"(old) \ - : "l"(ptr), "r"(desired) \ - : "memory") -#define __simt_exch_acquire_32(ptr, old, desired) \ - asm volatile("atom.exch.acquire." __simt_scope ".b32 %0, [%1], %2;" \ - : "=r"(old) \ - : "l"(ptr), "r"(desired) \ - : "memory") -#define __simt_exch_acq_rel_32(ptr, old, desired) \ - asm volatile("atom.exch.acq_rel." __simt_scope ".b32 %0, [%1], %2;" \ - : "=r"(old) \ - : "l"(ptr), "r"(desired) \ - : "memory") -#define __simt_exch_relaxed_32(ptr, old, desired) \ - asm volatile("atom.exch.relaxed." __simt_scope ".b32 %0, [%1], %2;" \ - : "=r"(old) \ - : "l"(ptr), "r"(desired) \ - : "memory") -#define __simt_cas_release_32(ptr, old, expected, desired) \ - asm volatile("atom.cas.release." __simt_scope ".b32 %0, [%1], %2, %3;" \ - : "=r"(old) \ - : "l"(ptr), "r"(expected), "r"(desired) \ - : "memory") -#define __simt_cas_acquire_32(ptr, old, expected, desired) \ - asm volatile("atom.cas.acquire." __simt_scope ".b32 %0, [%1], %2, %3;" \ - : "=r"(old) \ - : "l"(ptr), "r"(expected), "r"(desired) \ - : "memory") -#define __simt_cas_acq_rel_32(ptr, old, expected, desired) \ - asm volatile("atom.cas.acq_rel." 
__simt_scope ".b32 %0, [%1], %2, %3;" \ - : "=r"(old) \ - : "l"(ptr), "r"(expected), "r"(desired) \ - : "memory") -#define __simt_cas_relaxed_32(ptr, old, expected, desired) \ - asm volatile("atom.cas.relaxed." __simt_scope ".b32 %0, [%1], %2, %3;" \ - : "=r"(old) \ - : "l"(ptr), "r"(expected), "r"(desired) \ - : "memory") -#define __simt_add_release_32(ptr, old, addend) \ - asm volatile("atom.add.release." __simt_scope ".u32 %0, [%1], %2;" \ - : "=r"(old) \ - : "l"(ptr), "r"(addend) \ - : "memory") -#define __simt_add_acquire_32(ptr, old, addend) \ - asm volatile("atom.add.acquire." __simt_scope ".u32 %0, [%1], %2;" \ - : "=r"(old) \ - : "l"(ptr), "r"(addend) \ - : "memory") -#define __simt_add_acq_rel_32(ptr, old, addend) \ - asm volatile("atom.add.acq_rel." __simt_scope ".u32 %0, [%1], %2;" \ - : "=r"(old) \ - : "l"(ptr), "r"(addend) \ - : "memory") -#define __simt_add_relaxed_32(ptr, old, addend) \ - asm volatile("atom.add.relaxed." __simt_scope ".u32 %0, [%1], %2;" \ - : "=r"(old) \ - : "l"(ptr), "r"(addend) \ - : "memory") -#define __simt_and_release_32(ptr, old, andend) \ - asm volatile("atom.and.release." __simt_scope ".b32 %0, [%1], %2;" \ - : "=r"(old) \ - : "l"(ptr), "r"(andend) \ - : "memory") -#define __simt_and_acquire_32(ptr, old, andend) \ - asm volatile("atom.and.acquire." __simt_scope ".b32 %0, [%1], %2;" \ - : "=r"(old) \ - : "l"(ptr), "r"(andend) \ - : "memory") -#define __simt_and_acq_rel_32(ptr, old, andend) \ - asm volatile("atom.and.acq_rel." __simt_scope ".b32 %0, [%1], %2;" \ - : "=r"(old) \ - : "l"(ptr), "r"(andend) \ - : "memory") -#define __simt_and_relaxed_32(ptr, old, andend) \ - asm volatile("atom.and.relaxed." __simt_scope ".b32 %0, [%1], %2;" \ - : "=r"(old) \ - : "l"(ptr), "r"(andend) \ - : "memory") -#define __simt_or_release_32(ptr, old, orend) \ - asm volatile("atom.or.release." 
__simt_scope ".b32 %0, [%1], %2;" \ - : "=r"(old) \ - : "l"(ptr), "r"(orend) \ - : "memory") -#define __simt_or_acquire_32(ptr, old, orend) \ - asm volatile("atom.or.acquire." __simt_scope ".b32 %0, [%1], %2;" \ - : "=r"(old) \ - : "l"(ptr), "r"(orend) \ - : "memory") -#define __simt_or_acq_rel_32(ptr, old, orend) \ - asm volatile("atom.or.acq_rel." __simt_scope ".b32 %0, [%1], %2;" \ - : "=r"(old) \ - : "l"(ptr), "r"(orend) \ - : "memory") -#define __simt_or_relaxed_32(ptr, old, orend) \ - asm volatile("atom.or.relaxed." __simt_scope ".b32 %0, [%1], %2;" \ - : "=r"(old) \ - : "l"(ptr), "r"(orend) \ - : "memory") -#define __simt_xor_release_32(ptr, old, xorend) \ - asm volatile("atom.xor.release." __simt_scope ".b32 %0, [%1], %2;" \ - : "=r"(old) \ - : "l"(ptr), "r"(xorend) \ - : "memory") -#define __simt_xor_acquire_32(ptr, old, xorend) \ - asm volatile("atom.xor.acquire." __simt_scope ".b32 %0, [%1], %2;" \ - : "=r"(old) \ - : "l"(ptr), "r"(xorend) \ - : "memory") -#define __simt_xor_acq_rel_32(ptr, old, xorend) \ - asm volatile("atom.xor.acq_rel." __simt_scope ".b32 %0, [%1], %2;" \ - : "=r"(old) \ - : "l"(ptr), "r"(xorend) \ - : "memory") -#define __simt_xor_relaxed_32(ptr, old, xorend) \ - asm volatile("atom.xor.relaxed." __simt_scope ".b32 %0, [%1], %2;" \ - : "=r"(old) \ - : "l"(ptr), "r"(xorend) \ - : "memory") - -#define __simt_load_acquire_64(ptr, ret) \ - asm volatile("ld.acquire." __simt_scope ".b64 %0, [%1];" \ - : "=l"(ret) \ - : "l"(ptr) \ - : "memory") -#define __simt_load_relaxed_64(ptr, ret) \ - asm volatile("ld.relaxed." __simt_scope ".b64 %0, [%1];" \ - : "=l"(ret) \ - : "l"(ptr) \ - : "memory") -#define __simt_store_release_64(ptr, desired) \ - asm volatile("st.release." __simt_scope ".b64 [%0], %1;" ::"l"(ptr), \ - "l"(desired) \ - : "memory") -#define __simt_store_relaxed_64(ptr, desired) \ - asm volatile("st.relaxed." 
__simt_scope ".b64 [%0], %1;" ::"l"(ptr), \ - "l"(desired) \ - : "memory") -#define __simt_exch_release_64(ptr, old, desired) \ - asm volatile("atom.exch.release." __simt_scope ".b64 %0, [%1], %2;" \ - : "=l"(old) \ - : "l"(ptr), "l"(desired) \ - : "memory") -#define __simt_exch_acquire_64(ptr, old, desired) \ - asm volatile("atom.exch.acquire." __simt_scope ".b64 %0, [%1], %2;" \ - : "=l"(old) \ - : "l"(ptr), "l"(desired) \ - : "memory") -#define __simt_exch_acq_rel_64(ptr, old, desired) \ - asm volatile("atom.exch.acq_rel." __simt_scope ".b64 %0, [%1], %2;" \ - : "=l"(old) \ - : "l"(ptr), "l"(desired) \ - : "memory") -#define __simt_exch_relaxed_64(ptr, old, desired) \ - asm volatile("atom.exch.relaxed." __simt_scope ".b64 %0, [%1], %2;" \ - : "=l"(old) \ - : "l"(ptr), "l"(desired) \ - : "memory") -#define __simt_cas_release_64(ptr, old, expected, desired) \ - asm volatile("atom.cas.release." __simt_scope ".b64 %0, [%1], %2, %3;" \ - : "=l"(old) \ - : "l"(ptr), "l"(expected), "l"(desired) \ - : "memory") -#define __simt_cas_acquire_64(ptr, old, expected, desired) \ - asm volatile("atom.cas.acquire." __simt_scope ".b64 %0, [%1], %2, %3;" \ - : "=l"(old) \ - : "l"(ptr), "l"(expected), "l"(desired) \ - : "memory") -#define __simt_cas_acq_rel_64(ptr, old, expected, desired) \ - asm volatile("atom.cas.acq_rel." __simt_scope ".b64 %0, [%1], %2, %3;" \ - : "=l"(old) \ - : "l"(ptr), "l"(expected), "l"(desired) \ - : "memory") -#define __simt_cas_relaxed_64(ptr, old, expected, desired) \ - asm volatile("atom.cas.relaxed." __simt_scope ".b64 %0, [%1], %2, %3;" \ - : "=l"(old) \ - : "l"(ptr), "l"(expected), "l"(desired) \ - : "memory") -#define __simt_add_release_64(ptr, old, addend) \ - asm volatile("atom.add.release." __simt_scope ".u64 %0, [%1], %2;" \ - : "=l"(old) \ - : "l"(ptr), "l"(addend) \ - : "memory") -#define __simt_add_acquire_64(ptr, old, addend) \ - asm volatile("atom.add.acquire." 
__simt_scope ".u64 %0, [%1], %2;" \ - : "=l"(old) \ - : "l"(ptr), "l"(addend) \ - : "memory") -#define __simt_add_acq_rel_64(ptr, old, addend) \ - asm volatile("atom.add.acq_rel." __simt_scope ".u64 %0, [%1], %2;" \ - : "=l"(old) \ - : "l"(ptr), "l"(addend) \ - : "memory") -#define __simt_add_relaxed_64(ptr, old, addend) \ - asm volatile("atom.add.relaxed." __simt_scope ".u64 %0, [%1], %2;" \ - : "=l"(old) \ - : "l"(ptr), "l"(addend) \ - : "memory") -#define __simt_and_release_64(ptr, old, andend) \ - asm volatile("atom.and.release." __simt_scope ".b64 %0, [%1], %2;" \ - : "=l"(old) \ - : "l"(ptr), "l"(andend) \ - : "memory") -#define __simt_and_acquire_64(ptr, old, andend) \ - asm volatile("atom.and.acquire." __simt_scope ".b64 %0, [%1], %2;" \ - : "=l"(old) \ - : "l"(ptr), "l"(andend) \ - : "memory") -#define __simt_and_acq_rel_64(ptr, old, andend) \ - asm volatile("atom.and.acq_rel." __simt_scope ".b64 %0, [%1], %2;" \ - : "=l"(old) \ - : "l"(ptr), "l"(andend) \ - : "memory") -#define __simt_and_relaxed_64(ptr, old, andend) \ - asm volatile("atom.and.relaxed." __simt_scope ".b64 %0, [%1], %2;" \ - : "=l"(old) \ - : "l"(ptr), "l"(andend) \ - : "memory") -#define __simt_or_release_64(ptr, old, orend) \ - asm volatile("atom.or.release." __simt_scope ".b64 %0, [%1], %2;" \ - : "=l"(old) \ - : "l"(ptr), "l"(orend) \ - : "memory") -#define __simt_or_acquire_64(ptr, old, orend) \ - asm volatile("atom.or.acquire." __simt_scope ".b64 %0, [%1], %2;" \ - : "=l"(old) \ - : "l"(ptr), "l"(orend) \ - : "memory") -#define __simt_or_acq_rel_64(ptr, old, orend) \ - asm volatile("atom.or.acq_rel." __simt_scope ".b64 %0, [%1], %2;" \ - : "=l"(old) \ - : "l"(ptr), "l"(orend) \ - : "memory") -#define __simt_or_relaxed_64(ptr, old, orend) \ - asm volatile("atom.or.relaxed." __simt_scope ".b64 %0, [%1], %2;" \ - : "=l"(old) \ - : "l"(ptr), "l"(orend) \ - : "memory") -#define __simt_xor_release_64(ptr, old, xorend) \ - asm volatile("atom.xor.release." 
__simt_scope ".b64 %0, [%1], %2;" \ - : "=l"(old) \ - : "l"(ptr), "l"(xorend) \ - : "memory") -#define __simt_xor_acquire_64(ptr, old, xorend) \ - asm volatile("atom.xor.acquire." __simt_scope ".b64 %0, [%1], %2;" \ - : "=l"(old) \ - : "l"(ptr), "l"(xorend) \ - : "memory") -#define __simt_xor_acq_rel_64(ptr, old, xorend) \ - asm volatile("atom.xor.acq_rel." __simt_scope ".b64 %0, [%1], %2;" \ - : "=l"(old) \ - : "l"(ptr), "l"(xorend) \ - : "memory") -#define __simt_xor_relaxed_64(ptr, old, xorend) \ - asm volatile("atom.xor.relaxed." __simt_scope ".b64 %0, [%1], %2;" \ - : "=l"(old) \ - : "l"(ptr), "l"(xorend) \ - : "memory") - -#define __simt_nanosleep(timeout) \ - asm volatile("nanosleep.u32 %0;" ::"r"(unsigned(timeout)) :) - -/* - definitions -*/ - -#ifndef __GCC_ATOMIC_BOOL_LOCK_FREE -#define __GCC_ATOMIC_BOOL_LOCK_FREE 2 -#define __GCC_ATOMIC_CHAR_LOCK_FREE 2 -#define __GCC_ATOMIC_CHAR16_T_LOCK_FREE 2 -#define __GCC_ATOMIC_CHAR32_T_LOCK_FREE 2 -#define __GCC_ATOMIC_WCHAR_T_LOCK_FREE 2 -#define __GCC_ATOMIC_SHORT_LOCK_FREE 2 -#define __GCC_ATOMIC_INT_LOCK_FREE 2 -#define __GCC_ATOMIC_LONG_LOCK_FREE 2 -#define __GCC_ATOMIC_LLONG_LOCK_FREE 2 -#define __GCC_ATOMIC_POINTER_LOCK_FREE 2 -#endif - -#ifndef __ATOMIC_RELAXED -#define __ATOMIC_RELAXED 0 -#define __ATOMIC_CONSUME 1 -#define __ATOMIC_ACQUIRE 2 -#define __ATOMIC_RELEASE 3 -#define __ATOMIC_ACQ_REL 4 -#define __ATOMIC_SEQ_CST 5 -#endif - -inline __device__ int __stronger_order_simt_(int a, int b) { - if (b == __ATOMIC_SEQ_CST) return __ATOMIC_SEQ_CST; - if (b == __ATOMIC_RELAXED) return a; - switch (a) { - case __ATOMIC_SEQ_CST: - case __ATOMIC_ACQ_REL: return a; - case __ATOMIC_CONSUME: - case __ATOMIC_ACQUIRE: - if (b != __ATOMIC_ACQUIRE) - return __ATOMIC_ACQ_REL; - else - return __ATOMIC_ACQUIRE; - case __ATOMIC_RELEASE: - if (b != __ATOMIC_RELEASE) - return __ATOMIC_ACQ_REL; - else - return __ATOMIC_RELEASE; - case __ATOMIC_RELAXED: return b; - default: assert(0); - } - return __ATOMIC_SEQ_CST; -} - -/* 
- base -*/ - -#define DO__atomic_load_simt_(bytes, bits) \ - template = 0> \ - void __device__ __atomic_load_simt_(const type *ptr, type *ret, \ - int memorder) { \ - int##bits##_t tmp = 0; \ - switch (memorder) { \ - case __ATOMIC_SEQ_CST: __simt_fence_sc_(); \ - case __ATOMIC_CONSUME: \ - case __ATOMIC_ACQUIRE: __simt_load_acquire_##bits(ptr, tmp); break; \ - case __ATOMIC_RELAXED: __simt_load_relaxed_##bits(ptr, tmp); break; \ - default: assert(0); \ - } \ - memcpy(ret, &tmp, bytes); \ - } -DO__atomic_load_simt_(1, 32) DO__atomic_load_simt_(2, 16) - DO__atomic_load_simt_(4, 32) DO__atomic_load_simt_(8, 64) - - template - type __device__ __atomic_load_n_simt_(const type *ptr, int memorder) { - type ret; - __atomic_load_simt_(ptr, &ret, memorder); - return ret; -} - -#define DO__atomic_store_simt_(bytes, bits) \ - template = 0> \ - void __device__ __atomic_store_simt_(type *ptr, type *val, int memorder) { \ - int##bits##_t tmp = 0; \ - memcpy(&tmp, val, bytes); \ - switch (memorder) { \ - case __ATOMIC_RELEASE: __simt_store_release_##bits(ptr, tmp); break; \ - case __ATOMIC_SEQ_CST: __simt_fence_sc_(); \ - case __ATOMIC_RELAXED: __simt_store_relaxed_##bits(ptr, tmp); break; \ - default: assert(0); \ - } \ - } -DO__atomic_store_simt_(1, 32) DO__atomic_store_simt_(2, 16) - DO__atomic_store_simt_(4, 32) DO__atomic_store_simt_(8, 64) - - template - void __device__ - __atomic_store_n_simt_(type *ptr, type val, int memorder) { - __atomic_store_simt_(ptr, &val, memorder); -} - -#define DO__atomic_compare_exchange_simt_(bytes, bits) \ - template = 0> \ - bool __device__ __atomic_compare_exchange_simt_( \ - type *ptr, type *expected, const type *desired, bool, \ - int success_memorder, int failure_memorder) { \ - int##bits##_t tmp = 0, old = 0, old_tmp; \ - memcpy(&tmp, desired, bytes); \ - memcpy(&old, expected, bytes); \ - old_tmp = old; \ - switch (__stronger_order_simt_(success_memorder, failure_memorder)) { \ - case __ATOMIC_SEQ_CST: __simt_fence_sc_(); \ - case 
__ATOMIC_CONSUME: \ - case __ATOMIC_ACQUIRE: \ - __simt_cas_acquire_##bits(ptr, old, old_tmp, tmp); \ - break; \ - case __ATOMIC_ACQ_REL: \ - __simt_cas_acq_rel_##bits(ptr, old, old_tmp, tmp); \ - break; \ - case __ATOMIC_RELEASE: \ - __simt_cas_release_##bits(ptr, old, old_tmp, tmp); \ - break; \ - case __ATOMIC_RELAXED: \ - __simt_cas_relaxed_##bits(ptr, old, old_tmp, tmp); \ - break; \ - default: assert(0); \ - } \ - bool const ret = old == old_tmp; \ - memcpy(expected, &old, bytes); \ - return ret; \ - } -DO__atomic_compare_exchange_simt_(4, 32) - DO__atomic_compare_exchange_simt_(8, 64) - - template = 0> - bool __device__ - __atomic_compare_exchange_simt_(type *ptr, type *expected, - const type *desired, bool, - int success_memorder, - int failure_memorder) { - using R = std::conditional_t::value, volatile uint32_t, - uint32_t>; - auto const aligned = (R *)((intptr_t)ptr & ~(sizeof(uint32_t) - 1)); - auto const offset = uint32_t((intptr_t)ptr & (sizeof(uint32_t) - 1)) * 8; - auto const mask = ((1 << sizeof(type) * 8) - 1) << offset; - - uint32_t old = *expected << offset, old_value; - while (1) { - old_value = (old & mask) >> offset; - if (old_value != *expected) break; - uint32_t const attempt = (old & ~mask) | (*desired << offset); - if (__atomic_compare_exchange_simt_(aligned, &old, &attempt, true, - success_memorder, failure_memorder)) - return true; - } - *expected = old_value; - return false; -} - -template -bool __device__ __atomic_compare_exchange_n_simt_(type *ptr, type *expected, - type desired, bool weak, - int success_memorder, - int failure_memorder) { - return __atomic_compare_exchange_simt_(ptr, expected, &desired, weak, - success_memorder, failure_memorder); -} - -#define DO__atomic_exchange_simt_(bytes, bits) \ - template = 0> \ - void __device__ __atomic_exchange_simt_(type *ptr, type *val, type *ret, \ - int memorder) { \ - int##bits##_t tmp = 0; \ - memcpy(&tmp, val, bytes); \ - switch (memorder) { \ - case __ATOMIC_SEQ_CST: 
__simt_fence_sc_(); \ - case __ATOMIC_CONSUME: \ - case __ATOMIC_ACQUIRE: __simt_exch_acquire_##bits(ptr, tmp, tmp); break; \ - case __ATOMIC_ACQ_REL: __simt_exch_acq_rel_##bits(ptr, tmp, tmp); break; \ - case __ATOMIC_RELEASE: __simt_exch_release_##bits(ptr, tmp, tmp); break; \ - case __ATOMIC_RELAXED: __simt_exch_relaxed_##bits(ptr, tmp, tmp); break; \ - default: assert(0); \ - } \ - memcpy(ret, &tmp, bytes); \ - } -DO__atomic_exchange_simt_(4, 32) DO__atomic_exchange_simt_(8, 64) - - template = 0> - void __device__ - __atomic_exchange_simt_(type *ptr, type *val, type *ret, int memorder) { - type expected = __atomic_load_n_simt_(ptr, __ATOMIC_RELAXED); - while (!__atomic_compare_exchange_simt_(ptr, &expected, val, true, memorder, - memorder)) - ; - *ret = expected; -} - -template -type __device__ __atomic_exchange_n_simt_(type *ptr, type val, int memorder) { - type ret; - __atomic_exchange_simt_(ptr, &val, &ret, memorder); - return ret; -} - -#define DO__atomic_fetch_add_simt_(bytes, bits) \ - template = 0> \ - type __device__ __atomic_fetch_add_simt_(type *ptr, delta val, \ - int memorder) { \ - type ret; \ - switch (memorder) { \ - case __ATOMIC_SEQ_CST: __simt_fence_sc_(); \ - case __ATOMIC_CONSUME: \ - case __ATOMIC_ACQUIRE: __simt_add_acquire_##bits(ptr, ret, val); break; \ - case __ATOMIC_ACQ_REL: __simt_add_acq_rel_##bits(ptr, ret, val); break; \ - case __ATOMIC_RELEASE: __simt_add_release_##bits(ptr, ret, val); break; \ - case __ATOMIC_RELAXED: __simt_add_relaxed_##bits(ptr, ret, val); break; \ - default: assert(0); \ - } \ - return ret; \ - } -DO__atomic_fetch_add_simt_(4, 32) DO__atomic_fetch_add_simt_(8, 64) - - template = 0> - type __device__ - __atomic_fetch_add_simt_(type *ptr, delta val, int memorder) { - type expected = __atomic_load_n_simt_(ptr, __ATOMIC_RELAXED); - type const desired = expected + val; - while (!__atomic_compare_exchange_simt_(ptr, &expected, &desired, true, - memorder, memorder)) - ; - return expected; -} - -#define 
DO__atomic_fetch_sub_simt_(bytes, bits) \ - template = 0> \ - type __device__ __atomic_fetch_sub_simt_(type *ptr, delta val, \ - int memorder) { \ - type ret; \ - switch (memorder) { \ - case __ATOMIC_SEQ_CST: __simt_fence_sc_(); \ - case __ATOMIC_CONSUME: \ - case __ATOMIC_ACQUIRE: __simt_add_acquire_##bits(ptr, ret, -val); break; \ - case __ATOMIC_ACQ_REL: __simt_add_acq_rel_##bits(ptr, ret, -val); break; \ - case __ATOMIC_RELEASE: __simt_add_release_##bits(ptr, ret, -val); break; \ - case __ATOMIC_RELAXED: __simt_add_relaxed_##bits(ptr, ret, -val); break; \ - default: assert(0); \ - } \ - return ret; \ - } -DO__atomic_fetch_sub_simt_(4, 32) DO__atomic_fetch_sub_simt_(8, 64) - - template = 0> - type __device__ - __atomic_fetch_sub_simt_(type *ptr, delta val, int memorder) { - type expected = __atomic_load_n_simt_(ptr, __ATOMIC_RELAXED); - type const desired = expected - val; - while (!__atomic_compare_exchange_simt_(ptr, &expected, &desired, true, - memorder, memorder)) - ; - return expected; -} - -#define DO__atomic_fetch_and_simt_(bytes, bits) \ - template = 0> \ - type __device__ __atomic_fetch_and_simt_(type *ptr, type val, \ - int memorder) { \ - type ret; \ - switch (memorder) { \ - case __ATOMIC_SEQ_CST: __simt_fence_sc_(); \ - case __ATOMIC_CONSUME: \ - case __ATOMIC_ACQUIRE: __simt_and_acquire_##bits(ptr, ret, val); break; \ - case __ATOMIC_ACQ_REL: __simt_and_acq_rel_##bits(ptr, ret, val); break; \ - case __ATOMIC_RELEASE: __simt_and_release_##bits(ptr, ret, val); break; \ - case __ATOMIC_RELAXED: __simt_and_relaxed_##bits(ptr, ret, val); break; \ - default: assert(0); \ - } \ - return ret; \ - } -DO__atomic_fetch_and_simt_(4, 32) DO__atomic_fetch_and_simt_(8, 64) - - template = 0> - type __device__ - __atomic_fetch_and_simt_(type *ptr, delta val, int memorder) { - type expected = __atomic_load_n_simt_(ptr, __ATOMIC_RELAXED); - type const desired = expected & val; - while (!__atomic_compare_exchange_simt_(ptr, &expected, &desired, true, - memorder, 
memorder)) - ; - return expected; -} - -#define DO__atomic_fetch_xor_simt_(bytes, bits) \ - template = 0> \ - type __device__ __atomic_fetch_xor_simt_(type *ptr, type val, \ - int memorder) { \ - type ret; \ - switch (memorder) { \ - case __ATOMIC_SEQ_CST: __simt_fence_sc_(); \ - case __ATOMIC_CONSUME: \ - case __ATOMIC_ACQUIRE: __simt_xor_acquire_##bits(ptr, ret, val); break; \ - case __ATOMIC_ACQ_REL: __simt_xor_acq_rel_##bits(ptr, ret, val); break; \ - case __ATOMIC_RELEASE: __simt_xor_release_##bits(ptr, ret, val); break; \ - case __ATOMIC_RELAXED: __simt_xor_relaxed_##bits(ptr, ret, val); break; \ - default: assert(0); \ - } \ - return ret; \ - } -DO__atomic_fetch_xor_simt_(4, 32) DO__atomic_fetch_xor_simt_(8, 64) - - template = 0> - type __device__ - __atomic_fetch_xor_simt_(type *ptr, delta val, int memorder) { - type expected = __atomic_load_n_simt_(ptr, __ATOMIC_RELAXED); - type const desired = expected ^ val; - while (!__atomic_compare_exchange_simt_(ptr, &expected, &desired, true, - memorder, memorder)) - ; - return expected; -} - -#define DO__atomic_fetch_or_simt_(bytes, bits) \ - template = 0> \ - type __device__ __atomic_fetch_or_simt_(type *ptr, type val, int memorder) { \ - type ret; \ - switch (memorder) { \ - case __ATOMIC_SEQ_CST: __simt_fence_sc_(); \ - case __ATOMIC_CONSUME: \ - case __ATOMIC_ACQUIRE: __simt_or_acquire_##bits(ptr, ret, val); break; \ - case __ATOMIC_ACQ_REL: __simt_or_acq_rel_##bits(ptr, ret, val); break; \ - case __ATOMIC_RELEASE: __simt_or_release_##bits(ptr, ret, val); break; \ - case __ATOMIC_RELAXED: __simt_or_relaxed_##bits(ptr, ret, val); break; \ - default: assert(0); \ - } \ - return ret; \ - } -DO__atomic_fetch_or_simt_(4, 32) DO__atomic_fetch_or_simt_(8, 64) - - template = 0> - type __device__ - __atomic_fetch_or_simt_(type *ptr, delta val, int memorder) { - type expected = __atomic_load_n_simt_(ptr, __ATOMIC_RELAXED); - type const desired = expected | val; - while (!__atomic_compare_exchange_simt_(ptr, &expected, 
&desired, true, - memorder, memorder)) - ; - return expected; -} - -template -inline bool __device__ __atomic_test_and_set_simt_(type *ptr, int memorder) { - return __atomic_exchange_n_simt_((char *)ptr, (char)1, memorder) == 1; -} -template -inline void __device__ __atomic_clear_simt_(type *ptr, int memorder) { - return __atomic_store_n_simt_((char *)ptr, (char)0, memorder); -} - -inline constexpr __device__ bool __atomic_always_lock_free_simt_(size_t size, - void *) { - return size <= 8; -} -inline __device__ bool __atomic_is_lock_free_simt_(size_t size, void *ptr) { - return __atomic_always_lock_free_simt_(size, ptr); -} - -/* - fences -*/ - -inline void __device__ __atomic_thread_fence_simt(int memorder) { - switch (memorder) { - case __ATOMIC_SEQ_CST: __simt_fence_sc_(); break; - case __ATOMIC_CONSUME: - case __ATOMIC_ACQUIRE: - case __ATOMIC_ACQ_REL: - case __ATOMIC_RELEASE: __simt_fence_(); break; - case __ATOMIC_RELAXED: break; - default: assert(0); - } -} -inline void __device__ __atomic_signal_fence_simt(int memorder) { - __atomic_thread_fence_simt(memorder); -} - -/* - non-volatile -*/ - -template -type __device__ __atomic_load_n_simt(const type *ptr, int memorder) { - return __atomic_load_n_simt_(const_cast(ptr), memorder); -} -template -void __device__ __atomic_load_simt(const type *ptr, type *ret, int memorder) { - __atomic_load_simt_(const_cast(ptr), ret, memorder); -} -template -void __device__ __atomic_store_n_simt(type *ptr, type val, int memorder) { - __atomic_store_n_simt_(const_cast(ptr), val, memorder); -} -template -void __device__ __atomic_store_simt(type *ptr, type *val, int memorder) { - __atomic_store_simt_(const_cast(ptr), val, memorder); -} -template -type __device__ __atomic_exchange_n_simt(type *ptr, type val, int memorder) { - return __atomic_exchange_n_simt_(const_cast(ptr), val, memorder); -} -template -void __device__ __atomic_exchange_simt(type *ptr, type *val, type *ret, - int memorder) { - 
__atomic_exchange_simt_(const_cast(ptr), val, ret, memorder); -} -template -bool __device__ __atomic_compare_exchange_n_simt(type *ptr, type *expected, - type desired, bool weak, - int success_memorder, - int failure_memorder) { - return __atomic_compare_exchange_n_simt_(const_cast(ptr), expected, - desired, weak, success_memorder, - failure_memorder); -} -template -bool __device__ __atomic_compare_exchange_simt(type *ptr, type *expected, - type *desired, bool weak, - int success_memorder, - int failure_memorder) { - return __atomic_compare_exchange_simt_(const_cast(ptr), expected, - desired, weak, success_memorder, - failure_memorder); -} -template -type __device__ __atomic_fetch_add_simt(type *ptr, delta val, int memorder) { - return __atomic_fetch_add_simt_(const_cast(ptr), val, memorder); -} -template -type __device__ __atomic_fetch_sub_simt(type *ptr, delta val, int memorder) { - return __atomic_fetch_sub_simt_(const_cast(ptr), val, memorder); -} -template -type __device__ __atomic_fetch_and_simt(type *ptr, type val, int memorder) { - return __atomic_fetch_and_simt_(const_cast(ptr), val, memorder); -} -template -type __device__ __atomic_fetch_xor_simt(type *ptr, type val, int memorder) { - return __atomic_fetch_xor_simt_(const_cast(ptr), val, memorder); -} -template -type __device__ __atomic_fetch_or_simt(type *ptr, type val, int memorder) { - return __atomic_fetch_or_simt_(const_cast(ptr), val, memorder); -} -template -bool __device__ __atomic_test_and_set_simt(void *ptr, int memorder) { - return __atomic_test_and_set_simt_(const_cast(ptr), memorder); -} -template -void __device__ __atomic_clear_simt(void *ptr, int memorder) { - return __atomic_clear_simt_(const_cast(ptr), memorder); -} -inline bool __device__ __atomic_always_lock_free_simt(size_t size, void *ptr) { - return __atomic_always_lock_free_simt_(size, const_cast(ptr)); -} -inline bool __device__ __atomic_is_lock_free_simt(size_t size, void *ptr) { - return __atomic_is_lock_free_simt_(size, 
const_cast(ptr)); -} - -/* - volatile -*/ - -template -type __device__ __atomic_load_n_simt(const volatile type *ptr, int memorder) { - return __atomic_load_n_simt_(const_cast(ptr), memorder); -} -template -void __device__ __atomic_load_simt(const volatile type *ptr, type *ret, - int memorder) { - __atomic_load_simt_(const_cast(ptr), ret, memorder); -} -template -void __device__ __atomic_store_n_simt(volatile type *ptr, type val, - int memorder) { - __atomic_store_n_simt_(const_cast(ptr), val, memorder); -} -template -void __device__ __atomic_store_simt(volatile type *ptr, type *val, - int memorder) { - __atomic_store_simt_(const_cast(ptr), val, memorder); -} -template -type __device__ __atomic_exchange_n_simt(volatile type *ptr, type val, - int memorder) { - return __atomic_exchange_n_simt_(const_cast(ptr), val, memorder); -} -template -void __device__ __atomic_exchange_simt(volatile type *ptr, type *val, type *ret, - int memorder) { - __atomic_exchange_simt_(const_cast(ptr), val, ret, memorder); -} -template -bool __device__ __atomic_compare_exchange_n_simt(volatile type *ptr, - type *expected, type desired, - bool weak, - int success_memorder, - int failure_memorder) { - return __atomic_compare_exchange_n_simt_(const_cast(ptr), expected, - desired, weak, success_memorder, - failure_memorder); -} -template -bool __device__ __atomic_compare_exchange_simt(volatile type *ptr, - type *expected, type *desired, - bool weak, int success_memorder, - int failure_memorder) { - return __atomic_compare_exchange_simt_(const_cast(ptr), expected, - desired, weak, success_memorder, - failure_memorder); -} -template -type __device__ __atomic_fetch_add_simt(volatile type *ptr, delta val, - int memorder) { - return __atomic_fetch_add_simt_(const_cast(ptr), val, memorder); -} -template -type __device__ __atomic_fetch_sub_simt(volatile type *ptr, delta val, - int memorder) { - return __atomic_fetch_sub_simt_(const_cast(ptr), val, memorder); -} -template -type __device__ 
__atomic_fetch_and_simt(volatile type *ptr, type val, - int memorder) { - return __atomic_fetch_and_simt_(const_cast(ptr), val, memorder); -} -template -type __device__ __atomic_fetch_xor_simt(volatile type *ptr, type val, - int memorder) { - return __atomic_fetch_xor_simt_(const_cast(ptr), val, memorder); -} -template -type __device__ __atomic_fetch_or_simt(volatile type *ptr, type val, - int memorder) { - return __atomic_fetch_or_simt_(const_cast(ptr), val, memorder); -} -template -bool __device__ __atomic_test_and_set_simt(volatile void *ptr, int memorder) { - return __atomic_test_and_set_simt_(const_cast(ptr), memorder); -} -template -void __device__ __atomic_clear_simt(volatile void *ptr, int memorder) { - return __atomic_clear_simt_(const_cast(ptr), memorder); -} - -} // end namespace Impl -} // end namespace Kokkos - -#endif //_SIMT_DETAILS_CONFIG - -#ifndef KOKKOS_SIMT_ATOMIC_BUILTIN_REPLACEMENTS_DEFINED -/* - builtins -*/ - -#define __atomic_load_n __atomic_load_n_simt -#define __atomic_load __atomic_load_simt -#define __atomic_store_n __atomic_store_n_simt -#define __atomic_store __atomic_store_simt -#define __atomic_exchange_n __atomic_exchange_n_simt -#define __atomic_exchange __atomic_exchange_simt -#define __atomic_compare_exchange_n __atomic_compare_exchange_n_simt -#define __atomic_compare_exchange __atomic_compare_exchange_simt -#define __atomic_fetch_add __atomic_fetch_add_simt -#define __atomic_fetch_sub __atomic_fetch_sub_simt -#define __atomic_fetch_and __atomic_fetch_and_simt -#define __atomic_fetch_xor __atomic_fetch_xor_simt -#define __atomic_fetch_or __atomic_fetch_or_simt -#define __atomic_test_and_set __atomic_test_and_set_simt -#define __atomic_clear __atomic_clear_simt -#define __atomic_always_lock_free __atomic_always_lock_free_simt -#define __atomic_is_lock_free __atomic_is_lock_free_simt -#define __atomic_thread_fence __atomic_thread_fence_simt -#define __atomic_signal_fence __atomic_signal_fence_simt - -#define 
KOKKOS_SIMT_ATOMIC_BUILTIN_REPLACEMENTS_DEFINED - -#endif //__CUDA_ARCH__ && KOKKOS_ENABLE_CUDA_ASM_ATOMICS -#endif // KOKKOS_SIMT_ATOMIC_BUILTIN_REPLACEMENTS_DEFINED diff --git a/core/src/Cuda/Kokkos_Cuda_Atomic_Intrinsics_Restore_Builtins.hpp b/core/src/Cuda/Kokkos_Cuda_Atomic_Intrinsics_Restore_Builtins.hpp deleted file mode 100644 index a7dfc15d7a..0000000000 --- a/core/src/Cuda/Kokkos_Cuda_Atomic_Intrinsics_Restore_Builtins.hpp +++ /dev/null @@ -1,41 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifdef KOKKOS_SIMT_ATOMIC_BUILTIN_REPLACEMENTS_DEFINED - -#undef __atomic_load_n -#undef __atomic_load -#undef __atomic_store_n -#undef __atomic_store -#undef __atomic_exchange_n -#undef __atomic_exchange -#undef __atomic_compare_exchange_n -#undef __atomic_compare_exchange -#undef __atomic_fetch_add -#undef __atomic_fetch_sub -#undef __atomic_fetch_and -#undef __atomic_fetch_xor -#undef __atomic_fetch_or -#undef __atomic_test_and_set -#undef __atomic_clear -#undef __atomic_always_lock_free -#undef __atomic_is_lock_free -#undef __atomic_thread_fence -#undef __atomic_signal_fence - -#undef KOKKOS_SIMT_ATOMIC_BUILTIN_REPLACEMENTS_DEFINED - -#endif // KOKKOS_SIMT_ATOMIC_BUILTIN_REPLACEMENTS_DEFINED diff --git a/core/src/HIP/Kokkos_HIP_Atomic.hpp b/core/src/HIP/Kokkos_HIP_Atomic.hpp deleted file mode 100644 index 49f89ed332..0000000000 --- a/core/src/HIP/Kokkos_HIP_Atomic.hpp +++ /dev/null @@ -1,590 +0,0 @@ -//@HEADER -// 
************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOS_HIP_ATOMIC_HPP -#define KOKKOS_HIP_ATOMIC_HPP - -#include -#include -#include - -#if defined(KOKKOS_ENABLE_HIP_ATOMICS) -namespace Kokkos { -// HIP can do: -// Types int/unsigned int -// variants: -// atomic_exchange/compare_exchange/fetch_add/fetch_sub/fetch_max/fetch_min/fetch_and/fetch_or/fetch_xor/fetch_inc/fetch_dec - -// atomic_exchange ------------------------------------------------------------- - -__inline__ __device__ int atomic_exchange(volatile int *const dest, - const int val) { - return atomicExch(const_cast(dest), val); -} - -__inline__ __device__ unsigned int atomic_exchange( - volatile unsigned int *const dest, const unsigned int val) { - return atomicExch(const_cast(dest), val); -} - -__inline__ __device__ unsigned long long int atomic_exchange( - volatile unsigned long long int *const dest, - const unsigned long long int val) { - return atomicExch(const_cast(dest), val); -} - -__inline__ __device__ float atomic_exchange(volatile float *const dest, - const float val) { - return atomicExch(const_cast(dest), val); -} - -template -__inline__ __device__ T -atomic_exchange(volatile T *const dest, - std::enable_if_t val) { - int tmp = atomicExch(reinterpret_cast(const_cast(dest)), - *reinterpret_cast(const_cast(&val))); - return reinterpret_cast(tmp); -} - -template -__inline__ __device__ T atomic_exchange( - volatile T *const dest, - std::enable_if_t - val) { - using type = unsigned long long int; - - type tmp = 
atomicExch(reinterpret_cast(const_cast(dest)), - *reinterpret_cast(const_cast(&val))); - return reinterpret_cast(tmp); -} - -template -__inline__ __device__ T atomic_exchange( - volatile T *const dest, - std::enable_if_t &val) { - T return_val; - int done = 0; - unsigned int active = __ballot(1); - unsigned int done_active = 0; - while (active != done_active) { - if (!done) { - if (Impl::lock_address_hip_space((void *)dest)) { - return_val = *dest; - *dest = val; - Impl::unlock_address_hip_space((void *)dest); - done = 1; - } - } - done_active = __ballot(done); - } - return return_val; -} - -// atomic_assign --------------------------------------------------------------- - -template -__inline__ __device__ void atomic_assign( - volatile T *const dest, - std::enable_if_t val) { - atomicExch(reinterpret_cast(const_cast(dest)), - *reinterpret_cast(const_cast(&val))); -} - -template -__inline__ __device__ void atomic_assign( - volatile T *const dest, - std::enable_if_t - val) { - using type = unsigned long long int; - atomicExch(reinterpret_cast(const_cast(dest)), - *reinterpret_cast(const_cast(&val))); -} - -template -__inline__ __device__ void atomic_assign( - volatile T *const dest, - std::enable_if_t - val) { - atomic_exchange(dest, val); -} - -// atomic_compare_exchange ----------------------------------------------------- - -inline __device__ int atomic_compare_exchange(volatile int *dest, int compare, - const int &val) { - return atomicCAS(const_cast(dest), compare, val); -} - -inline __device__ unsigned int atomic_compare_exchange( - volatile unsigned int *dest, unsigned int compare, - const unsigned int &val) { - return atomicCAS(const_cast(dest), compare, val); -} - -inline __device__ unsigned long long int atomic_compare_exchange( - volatile unsigned long long int *dest, unsigned long long int compare, - const unsigned long long int &val) { - return atomicCAS(const_cast(dest), compare, val); -} - -template -__inline__ __device__ T atomic_compare_exchange( - 
volatile T *dest, T compare, - std::enable_if_t val) { - // FIXME_HIP UB - union U { - int i; - T f; - __inline__ __device__ U() {} - } idest, icompare, ival; - icompare.f = compare; - ival.f = val; - idest.i = atomicCAS(reinterpret_cast(const_cast(dest)), - icompare.i, ival.i); - return idest.f; -} - -template -__inline__ __device__ T atomic_compare_exchange( - volatile T *dest, T compare, - std::enable_if_t - val) { - // FIXME_HIP UB - union U { - unsigned long long int i; - T f; - __inline__ __device__ U() {} - } idest, icompare, ival; - icompare.f = compare; - ival.f = val; - idest.i = atomicCAS( - reinterpret_cast(const_cast(dest)), - icompare.i, ival.i); - return idest.f; -} - -template -__inline__ __device__ T atomic_compare_exchange( - volatile T *const dest, const T &compare, - std::enable_if_t &val) { - T return_val; - int done = 0; - unsigned int active = __ballot(1); - unsigned int done_active = 0; - while (active != done_active) { - if (!done) { - if (Impl::lock_address_hip_space((void *)dest)) { - return_val = *dest; - if (return_val == compare) *dest = val; - Impl::unlock_address_hip_space((void *)dest); - done = 1; - } - } - done_active = __ballot(done); - } - return return_val; -} - -// atomic_fetch_add ------------------------------------------------------------ - -inline __device__ int atomic_fetch_add(volatile int *dest, const int &val) { - return atomicAdd(const_cast(dest), val); -} - -inline __device__ unsigned int atomic_fetch_add(volatile unsigned int *dest, - const unsigned int &val) { - return atomicAdd(const_cast(dest), val); -} - -inline __device__ unsigned long long atomic_fetch_add( - volatile unsigned long long *dest, const unsigned long long &val) { - return atomicAdd(const_cast(dest), val); -} - -inline __device__ float atomic_fetch_add(volatile float *dest, - const float &val) { - return atomicAdd(const_cast(dest), val); -} - -template -inline __device__ T -atomic_fetch_add(volatile T *const dest, - std::enable_if_t val) { - // 
FIXME_HIP UB - union U { - int i; - T t; - __inline__ __device__ U() {} - } assume, oldval, newval; - - oldval.t = *dest; - - do { - assume.i = oldval.i; - newval.t = assume.t + val; - oldval.i = atomicCAS(reinterpret_cast(const_cast(dest)), - assume.i, newval.i); - } while (assume.i != oldval.i); - - return oldval.t; -} - -template -inline __device__ T atomic_fetch_add( - volatile T *const dest, - std::enable_if_t val) { - // FIXME_HIP UB - union U { - unsigned long long i; - T t; - __inline__ __device__ U() {} - } assume, oldval, newval; - - oldval.t = *dest; - - do { - assume.i = oldval.i; - newval.t = assume.t + val; - oldval.i = atomic_compare_exchange( - reinterpret_cast(dest), assume.i, - newval.i); - } while (assume.i != oldval.i); - - return oldval.t; -} - -__inline__ __device__ char atomic_fetch_add(volatile char *dest, - const char &val) { - unsigned int oldval, newval, assume; - oldval = *reinterpret_cast(&dest); - - do { - assume = oldval; - newval = assume & 0x7fffff00 + ((assume & 0xff) + val) & 0xff; - oldval = - atomicCAS(reinterpret_cast(const_cast(dest)), - assume, newval); - } while (assume != oldval); - - return oldval; -} - -__inline__ __device__ short atomic_fetch_add(volatile short *dest, - const short &val) { - unsigned int oldval, newval, assume; - oldval = *reinterpret_cast(&dest); - - do { - assume = oldval; - newval = assume & 0x7fff0000 + ((assume & 0xffff) + val) & 0xffff; - oldval = - atomicCAS(reinterpret_cast(const_cast(dest)), - assume, newval); - } while (assume != oldval); - - return oldval; -} - -__inline__ __device__ long long atomic_fetch_add(volatile long long *dest, - const long long &val) { - return atomicAdd( - reinterpret_cast(const_cast(dest)), - val); -} - -template -__inline__ __device__ T atomic_fetch_add( - volatile T *dest, - std::enable_if_t - val) { - T return_val; - int done = 0; - unsigned int active = __ballot(1); - unsigned int done_active = 0; - while (active != done_active) { - if (!done) { - if 
(Kokkos::Impl::lock_address_hip_space((void *)dest)) { - return_val = *dest; - *dest = return_val + val; - Kokkos::Impl::unlock_address_hip_space((void *)dest); - done = 1; - } - } - done_active = __ballot(done); - } - return return_val; -} - -// atmic_fetch_sub ------------------------------------------------------------- - -__inline__ __device__ int atomic_fetch_sub(volatile int *dest, int const &val) { - return atomicSub(const_cast(dest), val); -} - -__inline__ __device__ unsigned int atomic_fetch_sub(volatile unsigned int *dest, - unsigned int const &val) { - return atomicSub(const_cast(dest), val); -} - -__inline__ __device__ unsigned long long atomic_fetch_sub( - unsigned long long *dest, int64_t const &val) { - return atomicAdd(reinterpret_cast(dest), - -reinterpret_cast(val)); -} - -__inline__ __device__ char atomic_fetch_sub(volatile char *dest, - const char &val) { - unsigned int oldval, newval, assume; - oldval = *reinterpret_cast(dest); - - do { - assume = oldval; - newval = assume & 0x7fffff00 + ((assume & 0xff) - val) & 0xff; - oldval = - atomicCAS(reinterpret_cast(const_cast(dest)), - assume, newval); - } while (assume != oldval); - - return oldval; -} - -__inline__ __device__ short atomic_fetch_sub(volatile short *dest, - const short &val) { - unsigned int oldval, newval, assume; - oldval = *reinterpret_cast(dest); - - do { - assume = oldval; - newval = assume & 0x7fff0000 + ((assume & 0xffff) - val) & 0xffff; - oldval = - atomicCAS(reinterpret_cast(const_cast(dest)), - assume, newval); - } while (assume != oldval); - - return oldval; -} - -__inline__ __device__ long long atomic_fetch_sub(volatile long long *dest, - const long long &val) { - return static_cast(atomicAdd( - reinterpret_cast(const_cast(dest)), - -reinterpret_cast(val))); -} - -template -__inline__ __device__ T atomic_fetch_sub( - volatile T *dest, std::enable_if_t val) { - // FIXME_HIP UB - union U { - int i; - T t; - __inline__ __device__ U() {} - } assume, oldval, newval; - - 
oldval.t = *dest; - - do { - assume.i = oldval.i; - newval.t = assume.t - val; - oldval.i = atomic_compare_exchange(reinterpret_cast(dest), - assume.i, newval.i); - } while (assume.i != oldval.i); - - return oldval.t; -} - -template -inline __device__ T atomic_fetch_sub( - volatile T *const dest, - std::enable_if_t val) { - // FIXME_HIP UB - union U { - unsigned long long i; - T t; - __inline__ __device__ U() {} - } assume, oldval, newval; - - oldval.t = *dest; - - do { - assume.i = oldval.i; - newval.t = assume.t - val; - oldval.i = atomic_compare_exchange( - reinterpret_cast(dest), assume.i, - newval.i); - } while (assume.i != oldval.i); - - return oldval.t; -} - -template -__inline__ __device__ T atomic_fetch_sub( - volatile T *dest, std::enable_if_t val) { - unsigned int oldval, newval, assume; - oldval = *reinterpret_cast(dest); - - do { - assume = oldval; - newval = assume & 0x7fffff00 + ((assume & 0xff) - val) & 0xff; - oldval = atomicCAS(reinterpret_cast(dest), assume, newval); - } while (assume != oldval); - - return reinterpret_cast(oldval) & 0xff; -} - -template -__inline__ __device__ T atomic_fetch_sub( - volatile T *dest, std::enable_if_t val) { - unsigned int oldval, newval, assume; - oldval = *reinterpret_cast(dest); - - do { - assume = oldval; - newval = assume & 0x7fff0000 + ((assume & 0xffff) - val) & 0xffff; - oldval = atomicCAS(reinterpret_cast(dest), assume, newval); - } while (assume != oldval); - - return reinterpret_cast(oldval) & 0xffff; -} - -template -__inline__ __device__ T atomic_fetch_sub( - volatile T *const dest, - std::enable_if_t &val) { - T return_val; - int done = 0; - unsigned int active = __ballot(1); - unsigned int done_active = 0; - while (active != done_active) { - if (!done) { - if (Impl::lock_address_hip_space((void *)dest)) { - return_val = *dest; - *dest = return_val - val; - Impl::unlock_address_hip_space((void *)dest); - done = 1; - } - } - done_active = __ballot(done); - } - return return_val; -} - -// atomic_fetch_or 
------------------------------------------------------------- - -__inline__ __device__ int atomic_fetch_or(volatile int *const dest, - int const val) { - return atomicOr(const_cast(dest), val); -} - -__inline__ __device__ unsigned int atomic_fetch_or( - volatile unsigned int *const dest, unsigned int const val) { - return atomicOr(const_cast(dest), val); -} - -__inline__ __device__ unsigned long long int atomic_fetch_or( - volatile unsigned long long int *const dest, - unsigned long long int const val) { - return atomicOr(const_cast(dest), val); -} - -// atomic_fetch_and ------------------------------------------------------------ - -__inline__ __device__ int atomic_fetch_and(volatile int *const dest, - int const val) { - return atomicAnd(const_cast(dest), val); -} - -__inline__ __device__ unsigned int atomic_fetch_and( - volatile unsigned int *const dest, unsigned int const val) { - return atomicAnd(const_cast(dest), val); -} - -__inline__ __device__ unsigned long long int atomic_fetch_and( - volatile unsigned long long int *const dest, - unsigned long long int const val) { - return atomicAnd(const_cast(dest), val); -} - -namespace Impl { - -template -__inline__ __device__ void _atomic_store(T *ptr, T val, - memory_order_relaxed_t) { - (void)atomic_exchange(ptr, val); -} - -template -__inline__ __device__ void _atomic_store(T *ptr, T val, - memory_order_seq_cst_t) { - memory_fence(); - atomic_store(ptr, val, memory_order_relaxed); - memory_fence(); -} - -template -__inline__ __device__ void _atomic_store(T *ptr, T val, - memory_order_release_t) { - memory_fence(); - atomic_store(ptr, val, memory_order_relaxed); -} - -template -__inline__ __device__ void _atomic_store(T *ptr, T val) { - atomic_store(ptr, val, memory_order_relaxed); -} - -template -__inline__ __device__ T _atomic_load(T *ptr, memory_order_relaxed_t) { - T dummy{}; - return atomic_compare_exchange(ptr, dummy, dummy); -} - -template -__inline__ __device__ T _atomic_load(T *ptr, memory_order_seq_cst_t) 
{ - memory_fence(); - T rv = atomic_load(ptr, memory_order_relaxed); - memory_fence(); - return rv; -} - -template -__inline__ __device__ T _atomic_load(T *ptr, memory_order_acquire_t) { - T rv = atomic_load(ptr, memory_order_relaxed); - memory_fence(); - return rv; -} - -template -__inline__ __device__ T _atomic_load(T *ptr) { - return atomic_load(ptr, memory_order_relaxed); -} - -} // namespace Impl -} // namespace Kokkos -#endif - -#endif diff --git a/core/src/impl/Kokkos_Atomic_Assembly.hpp b/core/src/impl/Kokkos_Atomic_Assembly.hpp deleted file mode 100644 index 59d70e7f7c..0000000000 --- a/core/src/impl/Kokkos_Atomic_Assembly.hpp +++ /dev/null @@ -1,79 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#include -#if defined(KOKKOS_ATOMIC_HPP) && !defined(KOKKOS_ATOMIC_ASSEMBLY_HPP) -#define KOKKOS_ATOMIC_ASSEMBLY_HPP -namespace Kokkos { - -namespace Impl { - -#if !defined(_WIN32) -struct cas128_t { - uint64_t lower; - uint64_t upper; - - KOKKOS_INLINE_FUNCTION - cas128_t() { - lower = 0; - upper = 0; - } - - KOKKOS_INLINE_FUNCTION - cas128_t(const cas128_t& a) { - lower = a.lower; - upper = a.upper; - } - KOKKOS_INLINE_FUNCTION - cas128_t(volatile cas128_t* a) { - lower = a->lower; - upper = a->upper; - } - - KOKKOS_INLINE_FUNCTION - bool operator!=(const cas128_t& a) const { - return (lower != a.lower) || upper != a.upper; - } - - KOKKOS_INLINE_FUNCTION - void operator=(const cas128_t& a) { - lower = a.lower; - upper = a.upper; - } - KOKKOS_INLINE_FUNCTION - void operator=(const cas128_t& a) volatile { - lower = a.lower; - upper = a.upper; - } -} __attribute__((__aligned__(16))); -#endif - -#if defined(KOKKOS_ENABLE_ASM) && defined(KOKKOS_ENABLE_ISA_X86_64) -inline cas128_t cas128(volatile cas128_t* ptr, cas128_t cmp, cas128_t swap) { - bool swapped = false; - __asm__ __volatile__( - "lock cmpxchg16b %1\n\t" - "setz %0" - : "=q"(swapped), "+m"(*ptr), "+d"(cmp.upper), "+a"(cmp.lower) - : "c"(swap.upper), "b"(swap.lower), "q"(swapped)); - return cmp; -} -#endif - -} // namespace Impl -} // namespace Kokkos - -#endif diff --git a/core/src/impl/Kokkos_Atomic_Compare_Exchange_Strong.hpp b/core/src/impl/Kokkos_Atomic_Compare_Exchange_Strong.hpp deleted file mode 100644 index 08091ab9ce..0000000000 --- a/core/src/impl/Kokkos_Atomic_Compare_Exchange_Strong.hpp +++ /dev/null @@ -1,409 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. 
Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#if defined(KOKKOS_ENABLE_RFO_PREFETCH) -#include -#endif - -#include -#if defined(KOKKOS_ATOMIC_HPP) && \ - !defined(KOKKOS_ATOMIC_COMPARE_EXCHANGE_STRONG_HPP) -#define KOKKOS_ATOMIC_COMPARE_EXCHANGE_STRONG_HPP - -#include -#include - -#if defined(KOKKOS_ENABLE_CUDA) -#include -#endif - -namespace Kokkos { - -//---------------------------------------------------------------------------- -// Cuda native CAS supports int, unsigned int, and unsigned long long int -// (non-standard type). Must cast-away 'volatile' for the CAS call. - -#if defined(KOKKOS_ENABLE_CUDA) - -#if defined(__CUDA_ARCH__) || defined(KOKKOS_IMPL_CUDA_CLANG_WORKAROUND) -__inline__ __device__ int atomic_compare_exchange(volatile int* const dest, - const int compare, - const int val) { - return atomicCAS((int*)dest, compare, val); -} - -__inline__ __device__ unsigned int atomic_compare_exchange( - volatile unsigned int* const dest, const unsigned int compare, - const unsigned int val) { - return atomicCAS((unsigned int*)dest, compare, val); -} - -__inline__ __device__ unsigned long long int atomic_compare_exchange( - volatile unsigned long long int* const dest, - const unsigned long long int compare, const unsigned long long int val) { - return atomicCAS((unsigned long long int*)dest, compare, val); -} - -template -__inline__ __device__ T atomic_compare_exchange( - volatile T* const dest, const T& compare, - std::enable_if_t val) { - const int tmp = atomicCAS((int*)dest, *((int*)&compare), *((int*)&val)); - return *((T*)&tmp); -} - -template -__inline__ __device__ T atomic_compare_exchange( - volatile T* const dest, const T& compare, - std::enable_if_t - val) { - using type = unsigned long long int; - const type tmp = 
atomicCAS((type*)dest, *((type*)&compare), *((type*)&val)); - return *((T*)&tmp); -} - -template -__inline__ __device__ T atomic_compare_exchange( - volatile T* const dest, const T& compare, - std::enable_if_t<(sizeof(T) != 4) && (sizeof(T) != 8), const T>& val) { - T return_val; - // This is a way to (hopefully) avoid dead lock in a warp - int done = 0; - unsigned int mask = __activemask(); - unsigned int active = __ballot_sync(mask, 1); - unsigned int done_active = 0; - while (active != done_active) { - if (!done) { - if (Impl::lock_address_cuda_space((void*)dest)) { - Kokkos::memory_fence(); - return_val = *dest; - if (return_val == compare) *dest = val; - Kokkos::memory_fence(); - Impl::unlock_address_cuda_space((void*)dest); - done = 1; - } - } - done_active = __ballot_sync(mask, done); - } - return return_val; -} -#endif -#endif - -//---------------------------------------------------------------------------- -// GCC native CAS supports int, long, unsigned int, unsigned long. -// Intel native CAS support int and long with the same interface as GCC. 
-#if !defined(__CUDA_ARCH__) || defined(KOKKOS_IMPL_CUDA_CLANG_WORKAROUND) -#if defined(KOKKOS_ENABLE_WINDOWS_ATOMICS) -// atomic_compare_exchange are already defined in Kokkos_Atomic_Windows.hpp -#elif defined(KOKKOS_ENABLE_GNU_ATOMICS) || defined(KOKKOS_ENABLE_INTEL_ATOMICS) - -inline int atomic_compare_exchange(volatile int* const dest, const int compare, - const int val) { -#if defined(KOKKOS_ENABLE_RFO_PREFETCH) - _mm_prefetch((const char*)dest, _MM_HINT_ET0); -#endif - return __sync_val_compare_and_swap(dest, compare, val); -} - -inline long atomic_compare_exchange(volatile long* const dest, - const long compare, const long val) { -#if defined(KOKKOS_ENABLE_RFO_PREFETCH) - _mm_prefetch((const char*)dest, _MM_HINT_ET0); -#endif - return __sync_val_compare_and_swap(dest, compare, val); -} - -#if defined(KOKKOS_ENABLE_GNU_ATOMICS) - -// GCC supports unsigned - -inline unsigned int atomic_compare_exchange(volatile unsigned int* const dest, - const unsigned int compare, - const unsigned int val) { - return __sync_val_compare_and_swap(dest, compare, val); -} - -inline unsigned long atomic_compare_exchange(volatile unsigned long* const dest, - const unsigned long compare, - const unsigned long val) { - return __sync_val_compare_and_swap(dest, compare, val); -} - -inline unsigned long long atomic_compare_exchange( - volatile unsigned long long* const dest, const unsigned long long compare, - const unsigned long long val) { - return __sync_val_compare_and_swap(dest, compare, val); -} - -#endif - -template -inline T atomic_compare_exchange( - volatile T* const dest, const T& compare, - std::enable_if_t val) { - union U { - int i; - T t; - KOKKOS_INLINE_FUNCTION U() {} - } tmp; - -#if defined(KOKKOS_ENABLE_RFO_PREFETCH) - _mm_prefetch((const char*)dest, _MM_HINT_ET0); -#endif - - tmp.i = - __sync_val_compare_and_swap((int*)dest, *((int*)&compare), *((int*)&val)); - return tmp.t; -} - -template -inline T atomic_compare_exchange( - volatile T* const dest, const T& 
compare, - std::enable_if_t - val) { - union U { - long i; - T t; - KOKKOS_INLINE_FUNCTION U() {} - } tmp; - -#if defined(KOKKOS_ENABLE_RFO_PREFETCH) - _mm_prefetch((const char*)dest, _MM_HINT_ET0); -#endif - - tmp.i = __sync_val_compare_and_swap((long*)dest, *((long*)&compare), - *((long*)&val)); - return tmp.t; -} - -#if defined(KOKKOS_ENABLE_ASM) && defined(KOKKOS_ENABLE_ISA_X86_64) -template -inline T atomic_compare_exchange( - volatile T* const dest, const T& compare, - std::enable_if_t - val) { - union U { - Impl::cas128_t i; - T t; - KOKKOS_INLINE_FUNCTION U() {} - } tmp; - -#if defined(KOKKOS_ENABLE_RFO_PREFETCH) - _mm_prefetch((const char*)dest, _MM_HINT_ET0); -#endif - - tmp.i = Impl::cas128((Impl::cas128_t*)dest, *((Impl::cas128_t*)&compare), - *((Impl::cas128_t*)&val)); - return tmp.t; -} -#endif - -template -inline T atomic_compare_exchange( - volatile T* const dest, const T compare, - std::enable_if_t<(sizeof(T) != 4) && (sizeof(T) != 8) -#if defined(KOKKOS_ENABLE_ASM) && defined(KOKKOS_ENABLE_ISA_X86_64) - && (sizeof(T) != 16) -#endif - , - const T>& val) { -#if defined(KOKKOS_ENABLE_RFO_PREFETCH) - _mm_prefetch((const char*)dest, _MM_HINT_ET0); -#endif - - while (!Impl::lock_address_host_space((void*)dest)) - ; - Kokkos::memory_fence(); - T return_val = *dest; - if (return_val == compare) { - // Don't use the following line of code here: - // - // const T tmp = *dest = val; - // - // Instead, put each assignment in its own statement. This is - // because the overload of T::operator= for volatile *this should - // return void, not volatile T&. 
See Kokkos #177: - // - // https://github.com/kokkos/kokkos/issues/177 - *dest = val; - const T tmp = *dest; -#ifndef KOKKOS_COMPILER_CLANG - (void)tmp; -#endif - Kokkos::memory_fence(); - } - Impl::unlock_address_host_space((void*)dest); - return return_val; -} -//---------------------------------------------------------------------------- - -#elif defined(KOKKOS_ENABLE_OPENMP_ATOMICS) - -template -KOKKOS_INLINE_FUNCTION T atomic_compare_exchange(volatile T* const dest, - const T compare, const T val) { - T retval; -#pragma omp critical - { - retval = dest[0]; - if (retval == compare) dest[0] = val; - } - return retval; -} - -#elif defined(KOKKOS_ENABLE_SERIAL_ATOMICS) - -template -KOKKOS_INLINE_FUNCTION T atomic_compare_exchange(volatile T* const dest_v, - const T compare, const T val) { - T* dest = const_cast(dest_v); - T retval = *dest; - if (retval == compare) *dest = val; - return retval; -} - -#endif -#endif - -// dummy for non-CUDA Kokkos headers being processed by NVCC -#if defined(__CUDA_ARCH__) && !defined(KOKKOS_ENABLE_CUDA) -template -__inline__ __device__ T atomic_compare_exchange( - volatile T* const, const Kokkos::Impl::type_identity_t, - const Kokkos::Impl::type_identity_t) { - return T(); -} -#endif - -template -KOKKOS_INLINE_FUNCTION bool atomic_compare_exchange_strong( - volatile T* const dest, const T compare, const T val) { - return compare == atomic_compare_exchange(dest, compare, val); -} -//---------------------------------------------------------------------------- - -namespace Impl { -// memory-ordered versions are in the Impl namespace - -template -KOKKOS_INLINE_FUNCTION bool _atomic_compare_exchange_strong_fallback( - T* dest, T compare, T val, memory_order_seq_cst_t, MemoryOrderFailure) { - Kokkos::memory_fence(); - auto rv = Kokkos::atomic_compare_exchange_strong(dest, compare, val); - Kokkos::memory_fence(); - return rv; -} - -template -KOKKOS_INLINE_FUNCTION bool _atomic_compare_exchange_strong_fallback( - T* dest, T compare, T val, 
memory_order_acquire_t, MemoryOrderFailure) { - auto rv = Kokkos::atomic_compare_exchange_strong(dest, compare, val); - Kokkos::memory_fence(); - return rv; -} - -template -KOKKOS_INLINE_FUNCTION bool _atomic_compare_exchange_strong_fallback( - T* dest, T compare, T val, memory_order_release_t, MemoryOrderFailure) { - Kokkos::memory_fence(); - return Kokkos::atomic_compare_exchange_strong(dest, compare, val); -} - -template -KOKKOS_INLINE_FUNCTION bool _atomic_compare_exchange_strong_fallback( - T* dest, T compare, T val, memory_order_relaxed_t, MemoryOrderFailure) { - return Kokkos::atomic_compare_exchange_strong(dest, compare, val); -} - -#if (defined(KOKKOS_ENABLE_GNU_ATOMICS) && !defined(__CUDA_ARCH__)) || \ - (defined(KOKKOS_ENABLE_INTEL_ATOMICS) && !defined(__CUDA_ARCH__)) || \ - defined(KOKKOS_ENABLE_CUDA_ASM_ATOMICS) - -#if defined(__CUDA_ARCH__) -#define KOKKOS_INTERNAL_INLINE_DEVICE_IF_CUDA_ARCH __inline__ __device__ -#else -#define KOKKOS_INTERNAL_INLINE_DEVICE_IF_CUDA_ARCH inline -#endif - -template -KOKKOS_INTERNAL_INLINE_DEVICE_IF_CUDA_ARCH bool _atomic_compare_exchange_strong( - T* dest, T compare, T val, MemoryOrderSuccess, MemoryOrderFailure, - std::enable_if_t< - (sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4 || sizeof(T) == 8 || - sizeof(T) == 16) && - std::is_same>::value && - std::is_same>::value, - void const**> = nullptr) { - return __atomic_compare_exchange_n(dest, &compare, val, /* weak = */ false, - MemoryOrderSuccess::gnu_constant, - MemoryOrderFailure::gnu_constant); -} - -template -KOKKOS_INTERNAL_INLINE_DEVICE_IF_CUDA_ARCH bool _atomic_compare_exchange_strong( - T* dest, T compare, T val, MemoryOrderSuccess order_success, - MemoryOrderFailure order_failure, - std::enable_if_t< - !(sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4 || - sizeof(T) == 8 || sizeof(T) == 16) && - std::is_same>::value && - std::is_same>::value, - void const**> = nullptr) { - return _atomic_compare_exchange_fallback(dest, compare, val, order_success, - 
order_failure); -} - -#else - -template -KOKKOS_INLINE_FUNCTION bool _atomic_compare_exchange_strong( - T* dest, T compare, T val, MemoryOrderSuccess order_success, - MemoryOrderFailure order_failure) { - return _atomic_compare_exchange_strong_fallback(dest, compare, val, - order_success, order_failure); -} - -#endif - -// TODO static asserts in overloads that don't make sense (as listed in -// https://gcc.gnu.org/onlinedocs/gcc-5.2.0/gcc/_005f_005fatomic-Builtins.html) -template -KOKKOS_FORCEINLINE_FUNCTION bool atomic_compare_exchange_strong( - T* dest, T compare, T val, MemoryOrderSuccess order_success, - MemoryOrderFailure order_failure) { - return _atomic_compare_exchange_strong(dest, compare, val, order_success, - order_failure); -} - -} // end namespace Impl - -} // namespace Kokkos - -#if defined(KOKKOS_ENABLE_CUDA) -#include -#endif - -#endif diff --git a/core/src/impl/Kokkos_Atomic_Compare_Exchange_Weak.hpp b/core/src/impl/Kokkos_Atomic_Compare_Exchange_Weak.hpp deleted file mode 100644 index 8849277836..0000000000 --- a/core/src/impl/Kokkos_Atomic_Compare_Exchange_Weak.hpp +++ /dev/null @@ -1,380 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#if defined(KOKKOS_ENABLE_RFO_PREFETCH) -#include -#endif - -#include -#include -#ifndef KOKKOS_ATOMIC_COMPARE_EXCHANGE_WEAK_HPP -#define KOKKOS_ATOMIC_COMPARE_EXCHANGE_WEAK_HPP - -namespace Kokkos { - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- -// Cuda sm_70 or greater supports C++-like semantics directly - -#if defined(KOKKOS_ENABLE_CUDA) - -#if defined(__CUDA_ARCH__) || defined(KOKKOS_IMPL_CUDA_CLANG_WORKAROUND) - -#if __CUDA_ARCH__ >= 700 -// See: https://github.com/ogiroux/freestanding -#define kokkos_cuda_internal_cas_release_32(ptr, old, expected, desired) \ - asm volatile("atom.cas.release.sys.b32 %0, [%1], %2, %3;" \ - : "=r"(old) \ - : "l"(ptr), "r"(expected), "r"(desired) \ - : "memory") -#define kokkos_cuda_internal_cas_acquire_32(ptr, old, expected, desired) \ - asm volatile("atom.cas.acquire.sys.b32 %0, [%1], %2, %3;" \ - : "=r"(old) \ - : "l"(ptr), "r"(expected), "r"(desired) \ - : "memory") -#define kokkos_cuda_internal_cas_acq_rel_32(ptr, old, expected, desired) \ - asm volatile("atom.cas.acq_rel.sys.b32 %0, [%1], %2, %3;" \ - : "=r"(old) \ - : "l"(ptr), "r"(expected), "r"(desired) \ - : "memory") -#define kokkos_cuda_internal_cas_relaxed_32(ptr, old, expected, desired) \ - asm volatile("atom.cas.relaxed.sys.b32 %0, [%1], %2, %3;" \ - : "=r"(old) \ - : "l"(ptr), "r"(expected), "r"(desired) \ - : "memory") -#define kokkos_cuda_internal_fence_seq_cst() \ - asm volatile("fence.sc.sys;" : : : "memory") -#define kokkos_cuda_internal_fence_acq_rel() \ - asm volatile("fence.acq_rel.sys;" : : : "memory") -#else -#define kokkos_cuda_internal_fence_acq_rel() \ - asm volatile("membar.sys;" : : : "memory") -#define kokkos_cuda_internal_fence_seq_cst() \ - asm volatile("membar.sys;" : : : "memory") -#endif - -// 32-bit version -template = 0> -__inline__ __device__ bool 
atomic_compare_exchange_weak( - T volatile* const dest, T* const expected, T const desired, - std::memory_order success_order = std::memory_order_seq_cst, - std::memory_order failure_order = std::memory_order_seq_cst) { - // TODO assert that success_order >= failure_order - // See: https://github.com/ogiroux/freestanding - int32_t tmp = 0; - int32_t old = 0; - memcpy(&tmp, &desired, sizeof(T)); - memcpy(&old, expected, sizeof(T)); - int32_t old_tmp = old; -#if __CUDA_ARCH__ >= 700 - switch (success_order) { - case std::memory_order_seq_cst: - // sequentially consistent is just an acquire with a seq_cst fence - kokkos_cuda_internal_fence_seq_cst(); - kokkos_cuda_internal_cas_acquire_32((T*)dest, old, old_tmp, tmp); - break; - case std::memory_order_acquire: - kokkos_cuda_internal_cas_acquire_32((T*)dest, old, old_tmp, tmp); - break; - case std::memory_order_consume: - // same as acquire on PTX compatible platforms - kokkos_cuda_internal_cas_acquire_32((T*)dest, old, old_tmp, tmp); - break; - case std::memory_order_acq_rel: - kokkos_cuda_internal_cas_acq_rel_32((T*)dest, old, old_tmp, tmp); - break; - case std::memory_order_release: - kokkos_cuda_internal_cas_release_32((T*)dest, old, old_tmp, tmp); - break; - case std::memory_order_relaxed: - kokkos_cuda_internal_cas_relaxed_32((T*)dest, old, old_tmp, tmp); - break; - }; -#else - // All of the orders that require a fence before the relaxed atomic operation: - if (success_order == std::memory_order_release || - success_order == std::memory_order_acq_rel) { - kokkos_cuda_internal_fence_acq_rel(); - } else if (success_order == std::memory_order_seq_cst) { - kokkos_cuda_internal_fence_seq_cst(); - } - // This is relaxed: - // Cuda API requires casting away volatile - atomicCAS((T*)dest, old_tmp, tmp); -#endif - bool const rv = (old == old_tmp); -#if __CUDA_ARCH__ < 700 - if (rv) { - if (success_order == std::memory_order_acquire || - success_order == std::memory_order_consume || - success_order == 
std::memory_order_acq_rel) { - kokkos_cuda_internal_fence_acq_rel(); - } else if (success_order == std::memory_order_seq_cst) { - kokkos_cuda_internal_fence_seq_cst(); - } - } else { - if (failure_order == std::memory_order_acquire || - failure_order == std::memory_order_consume || - failure_order == std::memory_order_acq_rel) { - kokkos_cuda_internal_fence_acq_rel(); - } else if (failure_order == std::memory_order_seq_cst) { - kokkos_cuda_internal_fence_seq_cst(); - } - } -#endif - memcpy(expected, &old, sizeof(T)); - return rv; -} - -// 64-bit version -template = 0> -bool atomic_compare_exchange_weak( - T volatile* const dest, T* const expected, T const desired, - std::memory_order success_order = std::memory_order_seq_cst, - std::memory_order failure_order = std::memory_order_seq_cst) { - // TODO assert that success_order >= failure_order - // See: https://github.com/ogiroux/freestanding - int64_t tmp = 0; - int64_t old = 0; - memcpy(&tmp, &desired, sizeof(T)); - memcpy(&old, expected, sizeof(T)); - int64_t old_tmp = old; -#if __CUDA_ARCH__ >= 700 - switch (success_order) { - case std::memory_order_seq_cst: - // sequentially consistent is just an acquire with a seq_cst fence - kokkos_cuda_internal_fence_seq_cst(); - kokkos_cuda_internal_cas_acquire_64((T*)dest, old, old_tmp, tmp); - break; - case std::memory_order_acquire: - kokkos_cuda_internal_cas_acquire_64((T*)dest, old, old_tmp, tmp); - break; - case std::memory_order_consume: - // same as acquire on PTX compatible platforms - kokkos_cuda_internal_cas_acquire_64((T*)dest, old, old_tmp, tmp); - break; - case std::memory_order_acq_rel: - kokkos_cuda_internal_cas_acq_rel_64((T*)dest, old, old_tmp, tmp); - break; - case std::memory_order_release: - kokkos_cuda_internal_cas_release_64((T*)dest, old, old_tmp, tmp); - break; - case std::memory_order_relaxed: - kokkos_cuda_internal_cas_relaxed_64((T*)dest, old, old_tmp, tmp); - break; - }; -#else - // Cuda API requires casting away volatile - atomicCAS((T*)dest, 
old_tmp, tmp); -#endif - bool const rv = (old == old_tmp); - memcpy(expected, &old, sizeof(T)); - return rv; -} - -#endif // defined(__CUDA_ARCH__) || defined(KOKKOS_IMPL_CUDA_CLANG_WORKAROUND) - -#endif // defined( KOKKOS_ENABLE_CUDA ) - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -// GCC native CAS supports int, long, unsigned int, unsigned long. -// Intel native CAS support int and long with the same interface as GCC. -#if !defined(__CUDA_ARCH__) || defined(KOKKOS_IMPL_CUDA_CLANG_WORKAROUND) -#if defined(KOKKOS_ENABLE_GNU_ATOMICS) || defined(KOKKOS_ENABLE_INTEL_ATOMICS) - -inline int atomic_compare_exchange(volatile int* const dest, const int compare, - const int val) { -#if defined(KOKKOS_ENABLE_RFO_PREFETCH) - _mm_prefetch((const char*)dest, _MM_HINT_ET0); -#endif - return __sync_val_compare_and_swap(dest, compare, val); -} - -inline long atomic_compare_exchange(volatile long* const dest, - const long compare, const long val) { -#if defined(KOKKOS_ENABLE_RFO_PREFETCH) - _mm_prefetch((const char*)dest, _MM_HINT_ET0); -#endif - return __sync_val_compare_and_swap(dest, compare, val); -} - -#if defined(KOKKOS_ENABLE_GNU_ATOMICS) - -// GCC supports unsigned - -inline unsigned int atomic_compare_exchange(volatile unsigned int* const dest, - const unsigned int compare, - const unsigned int val) { - return __sync_val_compare_and_swap(dest, compare, val); -} - -inline unsigned long atomic_compare_exchange(volatile unsigned long* const dest, - const unsigned long compare, - const unsigned long val) { - return __sync_val_compare_and_swap(dest, compare, val); -} - -inline unsigned long long atomic_compare_exchange( - volatile unsigned long long* const dest, const unsigned long long compare, - const unsigned long long val) { - return __sync_val_compare_and_swap(dest, compare, val); -} - -#endif - -template -inline T atomic_compare_exchange( - volatile T* 
const dest, const T& compare, - std::enable_if_t val) { - union U { - int i; - T t; - KOKKOS_INLINE_FUNCTION U() {} - } tmp; - -#if defined(KOKKOS_ENABLE_RFO_PREFETCH) - _mm_prefetch((const char*)dest, _MM_HINT_ET0); -#endif - - tmp.i = - __sync_val_compare_and_swap((int*)dest, *((int*)&compare), *((int*)&val)); - return tmp.t; -} - -template -inline T atomic_compare_exchange( - volatile T* const dest, const T& compare, - std::enable_if_t - val) { - union U { - long i; - T t; - KOKKOS_INLINE_FUNCTION U() {} - } tmp; - -#if defined(KOKKOS_ENABLE_RFO_PREFETCH) - _mm_prefetch((const char*)dest, _MM_HINT_ET0); -#endif - - tmp.i = __sync_val_compare_and_swap((long*)dest, *((long*)&compare), - *((long*)&val)); - return tmp.t; -} - -#if defined(KOKKOS_ENABLE_ASM) && defined(KOKKOS_ENABLE_ISA_X86_64) -template -inline T atomic_compare_exchange( - volatile T* const dest, const T& compare, - std::enable_if_t - val) { - union U { - Impl::cas128_t i; - T t; - KOKKOS_INLINE_FUNCTION U() {} - } tmp; - -#if defined(KOKKOS_ENABLE_RFO_PREFETCH) - _mm_prefetch((const char*)dest, _MM_HINT_ET0); -#endif - - tmp.i = Impl::cas128((Impl::cas128_t*)dest, *((Impl::cas128_t*)&compare), - *((Impl::cas128_t*)&val)); - return tmp.t; -} -#endif - -template -inline T atomic_compare_exchange( - volatile T* const dest, const T compare, - std::enable_if_t<(sizeof(T) != 4) && (sizeof(T) != 8) -#if defined(KOKKOS_ENABLE_ASM) && defined(KOKKOS_ENABLE_ISA_X86_64) - && (sizeof(T) != 16) -#endif - , - const T>& val) { -#if defined(KOKKOS_ENABLE_RFO_PREFETCH) - _mm_prefetch((const char*)dest, _MM_HINT_ET0); -#endif - - while (!Impl::lock_address_host_space((void*)dest)) - ; - Kokkos::memory_fence(); - T return_val = *dest; - if (return_val == compare) { - // Don't use the following line of code here: - // - // const T tmp = *dest = val; - // - // Instead, put each assignment in its own statement. This is - // because the overload of T::operator= for volatile *this should - // return void, not volatile T&. 
See Kokkos #177: - // - // https://github.com/kokkos/kokkos/issues/177 - *dest = val; - const T tmp = *dest; -#ifndef KOKKOS_COMPILER_CLANG - (void)tmp; -#endif - Kokkos::memory_fence(); - } - Impl::unlock_address_host_space((void*)dest); - return return_val; -} -//---------------------------------------------------------------------------- - -#elif defined(KOKKOS_ENABLE_OPENMP_ATOMICS) - -template -KOKKOS_INLINE_FUNCTION T atomic_compare_exchange(volatile T* const dest, - const T compare, const T val) { - T retval; -#pragma omp critical - { - retval = dest[0]; - if (retval == compare) dest[0] = val; - } - return retval; -} - -#elif defined(KOKKOS_ENABLE_SERIAL_ATOMICS) - -template -KOKKOS_INLINE_FUNCTION T atomic_compare_exchange(volatile T* const dest_v, - const T compare, const T val) { - T* dest = const_cast(dest_v); - T retval = *dest; - if (retval == compare) *dest = val; - return retval; -} - -#endif -#endif - -template -KOKKOS_INLINE_FUNCTION bool atomic_compare_exchange_strong( - volatile T* const dest, const T compare, const T val) { - return compare == atomic_compare_exchange(dest, compare, val); -} -//---------------------------------------------------------------------------- - -} // namespace Kokkos - -#endif diff --git a/core/src/impl/Kokkos_Atomic_Decrement.hpp b/core/src/impl/Kokkos_Atomic_Decrement.hpp deleted file mode 100644 index aac5233b3a..0000000000 --- a/core/src/impl/Kokkos_Atomic_Decrement.hpp +++ /dev/null @@ -1,119 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#if defined(KOKKOS_ENABLE_RFO_PREFETCH) -#include -#endif - -#include -#if defined(KOKKOS_ATOMIC_HPP) && !defined(KOKKOS_ATOMIC_DECREMENT_HPP) -#define KOKKOS_ATOMIC_DECREMENT_HPP - -#include "impl/Kokkos_Atomic_Fetch_Sub.hpp" - -namespace Kokkos { - -// Atomic decrement -template <> -KOKKOS_INLINE_FUNCTION void atomic_decrement(volatile char* a) { -#if defined(KOKKOS_ENABLE_ASM) && defined(KOKKOS_ENABLE_ISA_X86_64) && \ - !defined(_WIN32) && !defined(__CUDA_ARCH__) -#if defined(KOKKOS_ENABLE_RFO_PREFETCH) - _mm_prefetch((const char*)a, _MM_HINT_ET0); -#endif - __asm__ __volatile__("lock decb %0" - : /* no output registers */ - : "m"(a[0]) - : "memory"); -#elif defined(KOKKOS_ENABLE_SERIAL_ATOMICS) - char* a_nv = const_cast(a); - --(*a_nv); -#else - Kokkos::atomic_fetch_sub(a, char(1)); -#endif -} - -template <> -KOKKOS_INLINE_FUNCTION void atomic_decrement(volatile short* a) { -#if defined(KOKKOS_ENABLE_ASM) && defined(KOKKOS_ENABLE_ISA_X86_64) && \ - !defined(_WIN32) && !defined(__CUDA_ARCH__) -#if defined(KOKKOS_ENABLE_RFO_PREFETCH) - _mm_prefetch((const char*)a, _MM_HINT_ET0); -#endif - __asm__ __volatile__("lock decw %0" - : /* no output registers */ - : "m"(a[0]) - : "memory"); -#elif defined(KOKKOS_ENABLE_SERIAL_ATOMICS) - short* a_nv = const_cast(a); - --(*a_nv); -#else - Kokkos::atomic_fetch_sub(a, short(1)); -#endif -} - -template <> -KOKKOS_INLINE_FUNCTION void atomic_decrement(volatile int* a) { -#if defined(KOKKOS_ENABLE_ASM) && defined(KOKKOS_ENABLE_ISA_X86_64) && \ - !defined(_WIN32) && !defined(__CUDA_ARCH__) -#if defined(KOKKOS_ENABLE_RFO_PREFETCH) - _mm_prefetch((const char*)a, _MM_HINT_ET0); -#endif - __asm__ __volatile__("lock decl %0" - : /* no output registers */ - : "m"(a[0]) - : "memory"); -#elif defined(KOKKOS_ENABLE_SERIAL_ATOMICS) - int* a_nv = const_cast(a); - --(*a_nv); -#else - Kokkos::atomic_fetch_sub(a, int(1)); -#endif -} - -template <> 
-KOKKOS_INLINE_FUNCTION void atomic_decrement( - volatile long long int* a) { -#if defined(KOKKOS_ENABLE_ASM) && defined(KOKKOS_ENABLE_ISA_X86_64) && \ - !defined(_WIN32) && !defined(__CUDA_ARCH__) -#if defined(KOKKOS_ENABLE_RFO_PREFETCH) - _mm_prefetch((const char*)a, _MM_HINT_ET0); -#endif - __asm__ __volatile__("lock decq %0" - : /* no output registers */ - : "m"(a[0]) - : "memory"); -#elif defined(KOKKOS_ENABLE_SERIAL_ATOMICS) - long long int* a_nv = const_cast(a); - --(*a_nv); -#else - using T = long long int; - Kokkos::atomic_fetch_sub(a, T(1)); -#endif -} - -template -KOKKOS_INLINE_FUNCTION void atomic_decrement(volatile T* a) { -#if defined(KOKKOS_ENABLE_SERIAL_ATOMICS) - T* a_nv = const_cast(a); - --(*a_nv); -#else - Kokkos::atomic_fetch_sub(a, T(1)); -#endif -} - -} // End of namespace Kokkos -#endif diff --git a/core/src/impl/Kokkos_Atomic_Exchange.hpp b/core/src/impl/Kokkos_Atomic_Exchange.hpp deleted file mode 100644 index abfc1f631a..0000000000 --- a/core/src/impl/Kokkos_Atomic_Exchange.hpp +++ /dev/null @@ -1,376 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#if defined(KOKKOS_ENABLE_RFO_PREFETCH) -#include -#endif - -#include -#if defined(KOKKOS_ATOMIC_HPP) && !defined(KOKKOS_ATOMIC_EXCHANGE_HPP) -#define KOKKOS_ATOMIC_EXCHANGE_HPP - -namespace Kokkos { - -//---------------------------------------------------------------------------- - -#if defined(KOKKOS_ENABLE_CUDA) -#if defined(__CUDA_ARCH__) || defined(KOKKOS_IMPL_CUDA_CLANG_WORKAROUND) - -__inline__ __device__ int atomic_exchange(volatile int* const dest, - const int val) { - // return __iAtomicExch( (int*) dest , val ); - return atomicExch((int*)dest, val); -} - -__inline__ __device__ unsigned int atomic_exchange( - volatile unsigned int* const dest, const unsigned int val) { - // return __uAtomicExch( (unsigned int*) dest , val ); - return atomicExch((unsigned int*)dest, val); -} - -__inline__ __device__ unsigned long long int atomic_exchange( - volatile unsigned long long int* const dest, - const unsigned long long int val) { - // return __ullAtomicExch( (unsigned long long*) dest , val ); - return atomicExch((unsigned long long*)dest, val); -} - -/** \brief Atomic exchange for any type with compatible size */ -template -__inline__ __device__ T -atomic_exchange(volatile T* const dest, - std::enable_if_t val) { - // int tmp = __ullAtomicExch( (int*) dest , *((int*)&val) ); -#if defined(KOKKOS_ENABLE_RFO_PREFETCH) - _mm_prefetch((const char*)dest, _MM_HINT_ET0); -#endif - - int tmp = atomicExch(((int*)dest), *((int*)&val)); - return *((T*)&tmp); -} - -template -__inline__ __device__ T atomic_exchange( - volatile T* const dest, - std::enable_if_t - val) { - using type = unsigned long long int; - -#if defined(KOKKOS_ENABLE_RFO_PREFETCH) - _mm_prefetch((const char*)dest, _MM_HINT_ET0); -#endif - - // type tmp = __ullAtomicExch( (type*) dest , *((type*)&val) ); - type tmp = atomicExch(((type*)dest), *((type*)&val)); - return *((T*)&tmp); -} - -template -__inline__ __device__ T 
atomic_exchange( - volatile T* const dest, - std::enable_if_t<(sizeof(T) != 4) && (sizeof(T) != 8), const T>& val) { - T return_val; - // This is a way to (hopefully) avoid dead lock in a warp -#if defined(KOKKOS_ENABLE_RFO_PREFETCH) - _mm_prefetch((const char*)dest, _MM_HINT_ET0); -#endif - - int done = 0; - unsigned int mask = __activemask(); - unsigned int active = __ballot_sync(mask, 1); - unsigned int done_active = 0; - while (active != done_active) { - if (!done) { - if (Impl::lock_address_cuda_space((void*)dest)) { - Kokkos::memory_fence(); - return_val = *dest; - *dest = val; - Kokkos::memory_fence(); - Impl::unlock_address_cuda_space((void*)dest); - done = 1; - } - } - done_active = __ballot_sync(mask, done); - } - return return_val; -} -/** \brief Atomic exchange for any type with compatible size */ -template -__inline__ __device__ void atomic_assign( - volatile T* const dest, - std::enable_if_t val) { - // (void) __ullAtomicExch( (int*) dest , *((int*)&val) ); - (void)atomicExch(((int*)dest), *((int*)&val)); -} - -template -__inline__ __device__ void atomic_assign( - volatile T* const dest, - std::enable_if_t - val) { - using type = unsigned long long int; - // (void) __ullAtomicExch( (type*) dest , *((type*)&val) ); - (void)atomicExch(((type*)dest), *((type*)&val)); -} - -template -__inline__ __device__ void atomic_assign( - volatile T* const dest, - std::enable_if_t - val) { - (void)atomic_exchange(dest, val); -} - -#endif -#endif - -//---------------------------------------------------------------------------- - -#if !defined(__CUDA_ARCH__) || defined(KOKKOS_IMPL_CUDA_CLANG_WORKAROUND) -#if defined(KOKKOS_ENABLE_GNU_ATOMICS) || defined(KOKKOS_ENABLE_INTEL_ATOMICS) - -template -inline T atomic_exchange( - volatile T* const dest, - std::enable_if_t - val) { - using type = std::conditional_t; -#if defined(KOKKOS_ENABLE_RFO_PREFETCH) - _mm_prefetch((const char*)dest, _MM_HINT_ET0); -#endif - - const type v = *((type*)&val); // Extract to be sure the value 
doesn't change - - type assumed; - - union U { - T val_T; - type val_type; - inline U() {} - } old; - - old.val_T = *dest; - - do { - assumed = old.val_type; - old.val_type = - __sync_val_compare_and_swap((volatile type*)dest, assumed, v); - } while (assumed != old.val_type); - - return old.val_T; -} - -#if defined(KOKKOS_ENABLE_ASM) && defined(KOKKOS_ENABLE_ISA_X86_64) -template -inline T atomic_exchange( - volatile T* const dest, - std::enable_if_t val) { -#if defined(KOKKOS_ENABLE_RFO_PREFETCH) - _mm_prefetch((const char*)dest, _MM_HINT_ET0); -#endif - - union U { - Impl::cas128_t i; - T t; - inline U() {} - } assume, oldval, newval; - - oldval.t = *dest; - newval.t = val; - - do { - assume.i = oldval.i; - oldval.i = Impl::cas128((volatile Impl::cas128_t*)dest, assume.i, newval.i); - } while (assume.i != oldval.i); - - return oldval.t; -} -#endif - -//---------------------------------------------------------------------------- - -template -inline T atomic_exchange(volatile T* const dest, - std::enable_if_t<(sizeof(T) != 4) && (sizeof(T) != 8) -#if defined(KOKKOS_ENABLE_ASM) && defined(KOKKOS_ENABLE_ISA_X86_64) - && (sizeof(T) != 16) -#endif - , - const T>& val) { - while (!Impl::lock_address_host_space((void*)dest)) - ; - Kokkos::memory_fence(); - T return_val = *dest; - // Don't use the following line of code here: - // - // const T tmp = *dest = val; - // - // Instead, put each assignment in its own statement. This is - // because the overload of T::operator= for volatile *this should - // return void, not volatile T&. 
See Kokkos #177: - // - // https://github.com/kokkos/kokkos/issues/177 - *dest = val; - const T tmp = *dest; -#ifndef KOKKOS_COMPILER_CLANG - (void)tmp; -#endif - Kokkos::memory_fence(); - Impl::unlock_address_host_space((void*)dest); - return return_val; -} - -template -inline void atomic_assign( - volatile T* const dest, - std::enable_if_t - val) { - using type = std::conditional_t; - -#if defined(KOKKOS_ENABLE_RFO_PREFETCH) - _mm_prefetch((const char*)dest, _MM_HINT_ET0); -#endif - - const type v = *((type*)&val); // Extract to be sure the value doesn't change - - type assumed; - - union U { - T val_T; - type val_type; - inline U() {} - } old; - - old.val_T = *dest; - - do { - assumed = old.val_type; - old.val_type = - __sync_val_compare_and_swap((volatile type*)dest, assumed, v); - } while (assumed != old.val_type); -} - -#if defined(KOKKOS_ENABLE_ASM) && defined(KOKKOS_ENABLE_ISA_X86_64) -template -inline void atomic_assign( - volatile T* const dest, - std::enable_if_t val) { -#if defined(KOKKOS_ENABLE_RFO_PREFETCH) - _mm_prefetch((const char*)dest, _MM_HINT_ET0); -#endif - - union U { - Impl::cas128_t i; - T t; - inline U() {} - } assume, oldval, newval; - - oldval.t = *dest; - newval.t = val; - do { - assume.i = oldval.i; - oldval.i = Impl::cas128((volatile Impl::cas128_t*)dest, assume.i, newval.i); - } while (assume.i != oldval.i); -} -#endif - -template -inline void atomic_assign(volatile T* const dest, - std::enable_if_t<(sizeof(T) != 4) && (sizeof(T) != 8) -#if defined(KOKKOS_ENABLE_ASM) && defined(KOKKOS_ENABLE_ISA_X86_64) - && (sizeof(T) != 16) -#endif - , - const T>& val) { - while (!Impl::lock_address_host_space((void*)dest)) - ; - Kokkos::memory_fence(); - // This is likely an aggregate type with a defined - // 'volatile T & operator = ( const T & ) volatile' - // member. The volatile return value implicitly defines a - // dereference that some compilers (gcc 4.7.2) warn is being ignored. - // Suppress warning by casting return to void. 
- //(void)( *dest = val ); - *dest = val; - Kokkos::memory_fence(); - Impl::unlock_address_host_space((void*)dest); -} -//---------------------------------------------------------------------------- - -#elif defined(KOKKOS_ENABLE_OPENMP_ATOMICS) - -template -inline T atomic_exchange(volatile T* const dest, const T val) { - T retval; - //#pragma omp atomic capture -#pragma omp critical - { - retval = dest[0]; - dest[0] = val; - } - return retval; -} - -template -inline void atomic_assign(volatile T* const dest, const T val) { - //#pragma omp atomic -#pragma omp critical - { dest[0] = val; } -} - -#elif defined(KOKKOS_ENABLE_SERIAL_ATOMICS) - -template -inline T atomic_exchange(volatile T* const dest_v, const T val) { - T* dest = const_cast(dest_v); - T retval = *dest; - *dest = val; - return retval; -} - -template -inline void atomic_assign(volatile T* const dest_v, const T val) { - T* dest = const_cast(dest_v); - *dest = val; -} - -#endif -#endif - -// dummy for non-CUDA Kokkos headers being processed by NVCC -#if defined(__CUDA_ARCH__) && !defined(KOKKOS_ENABLE_CUDA) -template -__inline__ __device__ T -atomic_exchange(volatile T* const, const Kokkos::Impl::type_identity_t) { - return T(); -} - -template -__inline__ __device__ void atomic_assign( - volatile T* const, const Kokkos::Impl::type_identity_t) {} -#endif - -} // namespace Kokkos - -#endif diff --git a/core/src/impl/Kokkos_Atomic_Fetch_Add.hpp b/core/src/impl/Kokkos_Atomic_Fetch_Add.hpp deleted file mode 100644 index a8c421fbdb..0000000000 --- a/core/src/impl/Kokkos_Atomic_Fetch_Add.hpp +++ /dev/null @@ -1,360 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. 
-// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#if defined(KOKKOS_ENABLE_RFO_PREFETCH) -#include -#endif - -#include -#if defined(KOKKOS_ATOMIC_HPP) && !defined(KOKKOS_ATOMIC_FETCH_ADD_HPP) -#define KOKKOS_ATOMIC_FETCH_ADD_HPP - -namespace Kokkos { - -//---------------------------------------------------------------------------- - -#if defined(KOKKOS_ENABLE_CUDA) -#if defined(__CUDA_ARCH__) || defined(KOKKOS_IMPL_CUDA_CLANG_WORKAROUND) - -// Support for int, unsigned int, unsigned long long int, and float - -__inline__ __device__ int atomic_fetch_add(volatile int* const dest, - const int val) { - return atomicAdd((int*)dest, val); -} - -__inline__ __device__ unsigned int atomic_fetch_add( - volatile unsigned int* const dest, const unsigned int val) { - return atomicAdd((unsigned int*)dest, val); -} - -__inline__ __device__ unsigned long long int atomic_fetch_add( - volatile unsigned long long int* const dest, - const unsigned long long int val) { - return atomicAdd((unsigned long long int*)dest, val); -} - -__inline__ __device__ float atomic_fetch_add(volatile float* const dest, - const float val) { - return atomicAdd((float*)dest, val); -} - -#if (600 <= __CUDA_ARCH__) -__inline__ __device__ double atomic_fetch_add(volatile double* const dest, - const double val) { - return atomicAdd((double*)dest, val); -} -#endif - -template -__inline__ __device__ T -atomic_fetch_add(volatile T* const dest, - std::enable_if_t val) { - // to work around a bug in the clang cuda compiler, the name here needs to be - // different from the one internal to the other overloads - union U1 { - int i; - T t; - KOKKOS_INLINE_FUNCTION U1() {} - } assume, oldval, newval; - - oldval.t = *dest; - - do { - assume.i = oldval.i; - newval.t = assume.t + val; - oldval.i = atomicCAS((int*)dest, assume.i, newval.i); - } while (assume.i 
!= oldval.i); - - return oldval.t; -} - -template -__inline__ __device__ T atomic_fetch_add( - volatile T* const dest, - std::enable_if_t - val) { - // to work around a bug in the clang cuda compiler, the name here needs to be - // different from the one internal to the other overloads - union U2 { - unsigned long long int i; - T t; - KOKKOS_INLINE_FUNCTION U2() {} - } assume, oldval, newval; - - oldval.t = *dest; - - do { - assume.i = oldval.i; - newval.t = assume.t + val; - oldval.i = atomicCAS((unsigned long long int*)dest, assume.i, newval.i); - } while (assume.i != oldval.i); - - return oldval.t; -} - -//---------------------------------------------------------------------------- - -template -__inline__ __device__ T atomic_fetch_add( - volatile T* const dest, - std::enable_if_t<(sizeof(T) != 4) && (sizeof(T) != 8), const T>& val) { - T return_val; - // This is a way to (hopefully) avoid dead lock in a warp - int done = 0; - unsigned int mask = __activemask(); - unsigned int active = __ballot_sync(mask, 1); - unsigned int done_active = 0; - while (active != done_active) { - if (!done) { - bool locked = Impl::lock_address_cuda_space((void*)dest); - if (locked) { - Kokkos::memory_fence(); - return_val = *dest; - *dest = return_val + val; - Kokkos::memory_fence(); - Impl::unlock_address_cuda_space((void*)dest); - done = 1; - } - } - - done_active = __ballot_sync(mask, done); - } - return return_val; -} -#endif -#endif -//---------------------------------------------------------------------------- -#if !defined(__CUDA_ARCH__) || defined(KOKKOS_IMPL_CUDA_CLANG_WORKAROUND) -#if defined(KOKKOS_ENABLE_GNU_ATOMICS) || defined(KOKKOS_ENABLE_INTEL_ATOMICS) - -#if defined(KOKKOS_ENABLE_ASM) && defined(KOKKOS_ENABLE_ISA_X86_64) -inline int atomic_fetch_add(volatile int* dest, const int val) { -#if defined(KOKKOS_ENABLE_RFO_PREFETCH) - _mm_prefetch((const char*)dest, _MM_HINT_ET0); -#endif - - int original = val; - - __asm__ __volatile__("lock xadd %1, %0" - : "+m"(*dest), 
"+r"(original) - : "m"(*dest), "r"(original) - : "memory"); - - return original; -} -#else -inline int atomic_fetch_add(volatile int* const dest, const int val) { -#if defined(KOKKOS_ENABLE_RFO_PREFETCH) - _mm_prefetch((const char*)dest, _MM_HINT_ET0); -#endif - return __sync_fetch_and_add(dest, val); -} -#endif - -inline long int atomic_fetch_add(volatile long int* const dest, - const long int val) { -#if defined(KOKKOS_ENABLE_RFO_PREFETCH) - _mm_prefetch((const char*)dest, _MM_HINT_ET0); -#endif - return __sync_fetch_and_add(dest, val); -} - -#if defined(KOKKOS_ENABLE_GNU_ATOMICS) - -inline unsigned int atomic_fetch_add(volatile unsigned int* const dest, - const unsigned int val) { -#if defined(KOKKOS_ENABLE_RFO_PREFETCH) - _mm_prefetch((const char*)dest, _MM_HINT_ET0); -#endif - return __sync_fetch_and_add(dest, val); -} - -inline unsigned long int atomic_fetch_add( - volatile unsigned long int* const dest, const unsigned long int val) { -#if defined(KOKKOS_ENABLE_RFO_PREFETCH) - _mm_prefetch((const char*)dest, _MM_HINT_ET0); -#endif - return __sync_fetch_and_add(dest, val); -} - -inline unsigned long long int atomic_fetch_add( - volatile unsigned long long int* const dest, - const unsigned long long int val) { -#if defined(KOKKOS_ENABLE_RFO_PREFETCH) - _mm_prefetch((const char*)dest, _MM_HINT_ET0); -#endif - return __sync_fetch_and_add(dest, val); -} - -#endif - -template -inline T atomic_fetch_add( - volatile T* const dest, - std::enable_if_t val) { - union U { - int i; - T t; - inline U() {} - } assume, oldval, newval; - -#if defined(KOKKOS_ENABLE_RFO_PREFETCH) - _mm_prefetch((const char*)dest, _MM_HINT_ET0); -#endif - - oldval.t = *dest; - - do { - assume.i = oldval.i; - newval.t = assume.t + val; - oldval.i = __sync_val_compare_and_swap((int*)dest, assume.i, newval.i); - } while (assume.i != oldval.i); - - return oldval.t; -} - -template -inline T atomic_fetch_add( - volatile T* const dest, - std::enable_if_t - val) { - union U { - long i; - T t; - inline 
U() {} - } assume, oldval, newval; - -#if defined(KOKKOS_ENABLE_RFO_PREFETCH) - _mm_prefetch((const char*)dest, _MM_HINT_ET0); -#endif - - oldval.t = *dest; - - do { - assume.i = oldval.i; - newval.t = assume.t + val; - oldval.i = __sync_val_compare_and_swap((long*)dest, assume.i, newval.i); - } while (assume.i != oldval.i); - - return oldval.t; -} - -#if defined(KOKKOS_ENABLE_ASM) && defined(KOKKOS_ENABLE_ISA_X86_64) -template -inline T atomic_fetch_add( - volatile T* const dest, - std::enable_if_t - val) { - union U { - Impl::cas128_t i; - T t; - inline U() {} - } assume, oldval, newval; - -#if defined(KOKKOS_ENABLE_RFO_PREFETCH) - _mm_prefetch((const char*)dest, _MM_HINT_ET0); -#endif - - oldval.t = *dest; - - do { - assume.i = oldval.i; - newval.t = assume.t + val; - oldval.i = Impl::cas128((volatile Impl::cas128_t*)dest, assume.i, newval.i); - } while (assume.i != oldval.i); - - return oldval.t; -} -#endif - -//---------------------------------------------------------------------------- - -template -inline T atomic_fetch_add(volatile T* const dest, - std::enable_if_t<(sizeof(T) != 4) && (sizeof(T) != 8) -#if defined(KOKKOS_ENABLE_ASM) && defined(KOKKOS_ENABLE_ISA_X86_64) - && (sizeof(T) != 16) -#endif - , - const T>& val) { - while (!Impl::lock_address_host_space((void*)dest)) - ; - Kokkos::memory_fence(); - T return_val = *dest; - - // Don't use the following line of code here: - // - // const T tmp = *dest = return_val + val; - // - // Instead, put each assignment in its own statement. This is - // because the overload of T::operator= for volatile *this should - // return void, not volatile T&. 
See Kokkos #177: - // - // https://github.com/kokkos/kokkos/issues/177 - *dest = return_val + val; - const T tmp = *dest; - (void)tmp; - Kokkos::memory_fence(); - Impl::unlock_address_host_space((void*)dest); - - return return_val; -} -//---------------------------------------------------------------------------- - -#elif defined(KOKKOS_ENABLE_OPENMP_ATOMICS) - -template -T atomic_fetch_add(volatile T* const dest, const T val) { - T retval; -#pragma omp atomic capture - { - retval = dest[0]; - dest[0] += val; - } - return retval; -} - -#elif defined(KOKKOS_ENABLE_SERIAL_ATOMICS) - -template -T atomic_fetch_add(volatile T* const dest_v, std::add_const_t val) { - T* dest = const_cast(dest_v); - T retval = *dest; - *dest += val; - return retval; -} - -#endif -#endif -//---------------------------------------------------------------------------- - -// dummy for non-CUDA Kokkos headers being processed by NVCC -#if defined(__CUDA_ARCH__) && !defined(KOKKOS_ENABLE_CUDA) -template -__inline__ __device__ T atomic_fetch_add(volatile T* const, - Kokkos::Impl::type_identity_t) { - return T(); -} -#endif - -} // namespace Kokkos -#endif diff --git a/core/src/impl/Kokkos_Atomic_Fetch_And.hpp b/core/src/impl/Kokkos_Atomic_Fetch_And.hpp deleted file mode 100644 index 25049db8f0..0000000000 --- a/core/src/impl/Kokkos_Atomic_Fetch_And.hpp +++ /dev/null @@ -1,164 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#if defined(KOKKOS_ENABLE_RFO_PREFETCH) -#include -#endif - -#include -#if defined(KOKKOS_ATOMIC_HPP) && !defined(KOKKOS_ATOMIC_FETCH_AND_HPP) -#define KOKKOS_ATOMIC_FETCH_AND_HPP - -namespace Kokkos { - -//---------------------------------------------------------------------------- - -#if defined(KOKKOS_ENABLE_CUDA) -#if defined(__CUDA_ARCH__) || defined(KOKKOS_IMPL_CUDA_CLANG_WORKAROUND) - -// Support for int, unsigned int, unsigned long long int, and float - -__inline__ __device__ int atomic_fetch_and(volatile int* const dest, - const int val) { - return atomicAnd((int*)dest, val); -} - -__inline__ __device__ unsigned int atomic_fetch_and( - volatile unsigned int* const dest, const unsigned int val) { - return atomicAnd((unsigned int*)dest, val); -} - -#if defined(__CUDA_ARCH__) && (350 <= __CUDA_ARCH__) -__inline__ __device__ unsigned long long int atomic_fetch_and( - volatile unsigned long long int* const dest, - const unsigned long long int val) { - return atomicAnd((unsigned long long int*)dest, val); -} -#endif -#endif -#endif - -// 08/05/20 Overload to work around https://bugs.llvm.org/show_bug.cgi?id=46922 - -#if (defined(KOKKOS_ENABLE_CUDA) && \ - (defined(__CUDA_ARCH__) || \ - defined(KOKKOS_IMPL_CUDA_CLANG_WORKAROUND))) || \ - (defined(KOKKOS_ENABLE_HIP)) -__inline__ __device__ unsigned long atomic_fetch_and( - volatile unsigned long* const dest, const unsigned long val) { - return atomic_fetch_and(dest, val); -} -__inline__ __device__ long atomic_fetch_and(volatile long* const dest, - long val) { - return atomic_fetch_and(dest, val); -} -#endif - -//---------------------------------------------------------------------------- -#if !defined(__CUDA_ARCH__) || defined(KOKKOS_IMPL_CUDA_CLANG_WORKAROUND) -#if defined(KOKKOS_ENABLE_GNU_ATOMICS) || defined(KOKKOS_ENABLE_INTEL_ATOMICS) - -inline int atomic_fetch_and(volatile int* const dest, const int val) { -#if 
defined(KOKKOS_ENABLE_RFO_PREFETCH) - _mm_prefetch((const char*)dest, _MM_HINT_ET0); -#endif - return __sync_fetch_and_and(dest, val); -} - -inline long int atomic_fetch_and(volatile long int* const dest, - const long int val) { -#if defined(KOKKOS_ENABLE_RFO_PREFETCH) - _mm_prefetch((const char*)dest, _MM_HINT_ET0); -#endif - return __sync_fetch_and_and(dest, val); -} - -#if defined(KOKKOS_ENABLE_GNU_ATOMICS) - -inline unsigned int atomic_fetch_and(volatile unsigned int* const dest, - const unsigned int val) { -#if defined(KOKKOS_ENABLE_RFO_PREFETCH) - _mm_prefetch((const char*)dest, _MM_HINT_ET0); -#endif - return __sync_fetch_and_and(dest, val); -} - -inline unsigned long int atomic_fetch_and( - volatile unsigned long int* const dest, const unsigned long int val) { -#if defined(KOKKOS_ENABLE_RFO_PREFETCH) - _mm_prefetch((const char*)dest, _MM_HINT_ET0); -#endif - return __sync_fetch_and_and(dest, val); -} - -inline unsigned long long int atomic_fetch_and( - volatile unsigned long long int* const dest, - const unsigned long long int val) { -#if defined(KOKKOS_ENABLE_RFO_PREFETCH) - _mm_prefetch((const char*)dest, _MM_HINT_ET0); -#endif - return __sync_fetch_and_and(dest, val); -} - -#endif - -//---------------------------------------------------------------------------- - -#elif defined(KOKKOS_ENABLE_OPENMP_ATOMICS) - -template -T atomic_fetch_and(volatile T* const dest, const T val) { - T retval; -#pragma omp atomic capture - { - retval = dest[0]; - dest[0] &= val; - } - return retval; -} - -#elif defined(KOKKOS_ENABLE_SERIAL_ATOMICS) - -template -T atomic_fetch_and(volatile T* const dest_v, const T val) { - T* dest = const_cast(dest_v); - T retval = *dest; - *dest &= val; - return retval; -} - -#endif -#endif -//---------------------------------------------------------------------------- - -// dummy for non-CUDA Kokkos headers being processed by NVCC -#if defined(__CUDA_ARCH__) && !defined(KOKKOS_ENABLE_CUDA) -template -__inline__ __device__ T 
atomic_fetch_and(volatile T* const, - Kokkos::Impl::type_identity_t) { - return T(); -} -#endif - -// Simpler version of atomic_fetch_and without the fetch -template -KOKKOS_INLINE_FUNCTION void atomic_and(volatile T* const dest, const T src) { - (void)atomic_fetch_and(dest, src); -} - -} // namespace Kokkos - -#endif diff --git a/core/src/impl/Kokkos_Atomic_Fetch_Or.hpp b/core/src/impl/Kokkos_Atomic_Fetch_Or.hpp deleted file mode 100644 index fa581bc155..0000000000 --- a/core/src/impl/Kokkos_Atomic_Fetch_Or.hpp +++ /dev/null @@ -1,165 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#if defined(KOKKOS_ENABLE_RFO_PREFETCH) -#include -#endif - -#include -#if defined(KOKKOS_ATOMIC_HPP) && !defined(KOKKOS_ATOMIC_FETCH_OR_HPP) -#define KOKKOS_ATOMIC_FETCH_OR_HPP - -namespace Kokkos { - -//---------------------------------------------------------------------------- - -#if defined(KOKKOS_ENABLE_CUDA) -#if defined(__CUDA_ARCH__) || defined(KOKKOS_IMPL_CUDA_CLANG_WORKAROUND) - -// Support for int, unsigned int, unsigned long long int, and float - -__inline__ __device__ int atomic_fetch_or(volatile int* const dest, - const int val) { - return atomicOr((int*)dest, val); -} - -__inline__ __device__ unsigned int atomic_fetch_or( - volatile unsigned int* const dest, const unsigned int val) { - return atomicOr((unsigned int*)dest, val); -} - -#if defined(__CUDA_ARCH__) && (350 <= __CUDA_ARCH__) -__inline__ __device__ unsigned long long int atomic_fetch_or( - volatile unsigned long long int* const dest, - const unsigned long long int val) { - return atomicOr((unsigned long long int*)dest, val); -} -#endif -#endif -#endif - -// 08/05/20 Overload to work around https://bugs.llvm.org/show_bug.cgi?id=46922 - -#if (defined(KOKKOS_ENABLE_CUDA) && \ - (defined(__CUDA_ARCH__) || \ - defined(KOKKOS_IMPL_CUDA_CLANG_WORKAROUND))) || \ - (defined(KOKKOS_ENABLE_HIP)) -__inline__ __device__ unsigned long atomic_fetch_or( - volatile unsigned long* const dest, const unsigned long val) { - return atomic_fetch_or(dest, val); -} - -__inline__ __device__ long atomic_fetch_or(volatile long* const dest, - long val) { - return atomic_fetch_or(dest, val); -} -#endif - -//---------------------------------------------------------------------------- -#if !defined(__CUDA_ARCH__) || defined(KOKKOS_IMPL_CUDA_CLANG_WORKAROUND) -#if defined(KOKKOS_ENABLE_GNU_ATOMICS) || defined(KOKKOS_ENABLE_INTEL_ATOMICS) - -inline int atomic_fetch_or(volatile int* const dest, const int val) { -#if 
defined(KOKKOS_ENABLE_RFO_PREFETCH) - _mm_prefetch((const char*)dest, _MM_HINT_ET0); -#endif - return __sync_fetch_and_or(dest, val); -} - -inline long int atomic_fetch_or(volatile long int* const dest, - const long int val) { -#if defined(KOKKOS_ENABLE_RFO_PREFETCH) - _mm_prefetch((const char*)dest, _MM_HINT_ET0); -#endif - return __sync_fetch_and_or(dest, val); -} - -#if defined(KOKKOS_ENABLE_GNU_ATOMICS) - -inline unsigned int atomic_fetch_or(volatile unsigned int* const dest, - const unsigned int val) { -#if defined(KOKKOS_ENABLE_RFO_PREFETCH) - _mm_prefetch((const char*)dest, _MM_HINT_ET0); -#endif - return __sync_fetch_and_or(dest, val); -} - -inline unsigned long int atomic_fetch_or(volatile unsigned long int* const dest, - const unsigned long int val) { -#if defined(KOKKOS_ENABLE_RFO_PREFETCH) - _mm_prefetch((const char*)dest, _MM_HINT_ET0); -#endif - return __sync_fetch_and_or(dest, val); -} - -inline unsigned long long int atomic_fetch_or( - volatile unsigned long long int* const dest, - const unsigned long long int val) { -#if defined(KOKKOS_ENABLE_RFO_PREFETCH) - _mm_prefetch((const char*)dest, _MM_HINT_ET0); -#endif - return __sync_fetch_and_or(dest, val); -} - -#endif - -//---------------------------------------------------------------------------- - -#elif defined(KOKKOS_ENABLE_OPENMP_ATOMICS) - -template -T atomic_fetch_or(volatile T* const dest, const T val) { - T retval; -#pragma omp atomic capture - { - retval = dest[0]; - dest[0] |= val; - } - return retval; -} - -#elif defined(KOKKOS_ENABLE_SERIAL_ATOMICS) - -template -T atomic_fetch_or(volatile T* const dest_v, const T val) { - T* dest = const_cast(dest_v); - T retval = *dest; - *dest |= val; - return retval; -} - -#endif -#endif -//---------------------------------------------------------------------------- - -// dummy for non-CUDA Kokkos headers being processed by NVCC -#if defined(__CUDA_ARCH__) && !defined(KOKKOS_ENABLE_CUDA) -template -__inline__ __device__ T atomic_fetch_or(volatile T* 
const, - Kokkos::Impl::type_identity_t) { - return T(); -} -#endif - -// Simpler version of atomic_fetch_or without the fetch -template -KOKKOS_INLINE_FUNCTION void atomic_or(volatile T* const dest, const T src) { - (void)atomic_fetch_or(dest, src); -} - -} // namespace Kokkos - -#endif diff --git a/core/src/impl/Kokkos_Atomic_Fetch_Sub.hpp b/core/src/impl/Kokkos_Atomic_Fetch_Sub.hpp deleted file mode 100644 index a4db7d7cf4..0000000000 --- a/core/src/impl/Kokkos_Atomic_Fetch_Sub.hpp +++ /dev/null @@ -1,295 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#if defined(KOKKOS_ENABLE_RFO_PREFETCH) -#include -#endif - -#include -#if defined(KOKKOS_ATOMIC_HPP) && !defined(KOKKOS_ATOMIC_FETCH_SUB_HPP) -#define KOKKOS_ATOMIC_FETCH_SUB_HPP - -namespace Kokkos { - -//---------------------------------------------------------------------------- - -#if defined(KOKKOS_ENABLE_CUDA) -#if defined(__CUDA_ARCH__) || defined(KOKKOS_IMPL_CUDA_CLANG_WORKAROUND) - -// Support for int, unsigned int, unsigned long long int, and float - -__inline__ __device__ int atomic_fetch_sub(volatile int* const dest, - const int val) { - return atomicSub((int*)dest, val); -} - -__inline__ __device__ unsigned int atomic_fetch_sub( - volatile unsigned int* const dest, const unsigned int val) { - return atomicSub((unsigned int*)dest, val); -} - -__inline__ __device__ unsigned int atomic_fetch_sub( - volatile int64_t* const dest, const int64_t val) { - return atomic_fetch_add(dest, -val); -} - -__inline__ 
__device__ unsigned int atomic_fetch_sub(volatile float* const dest, - const float val) { - return atomicAdd((float*)dest, -val); -} - -#if (600 <= __CUDA_ARCH__) -__inline__ __device__ unsigned int atomic_fetch_sub(volatile double* const dest, - const double val) { - return atomicAdd((double*)dest, -val); -} -#endif - -template -__inline__ __device__ T -atomic_fetch_sub(volatile T* const dest, - std::enable_if_t val) { - union U { - int i; - T t; - KOKKOS_INLINE_FUNCTION U() {} - } oldval, assume, newval; - - oldval.t = *dest; - - do { - assume.i = oldval.i; - newval.t = assume.t - val; - oldval.i = atomicCAS((int*)dest, assume.i, newval.i); - } while (assume.i != oldval.i); - - return oldval.t; -} - -template -__inline__ __device__ T atomic_fetch_sub( - volatile T* const dest, - std::enable_if_t - val) { - union U { - unsigned long long int i; - T t; - KOKKOS_INLINE_FUNCTION U() {} - } oldval, assume, newval; - - oldval.t = *dest; - - do { - assume.i = oldval.i; - newval.t = assume.t - val; - oldval.i = atomicCAS((unsigned long long int*)dest, assume.i, newval.i); - } while (assume.i != oldval.i); - - return oldval.t; -} - -//---------------------------------------------------------------------------- - -template -__inline__ __device__ T atomic_fetch_sub( - volatile T* const dest, - std::enable_if_t<(sizeof(T) != 4) && (sizeof(T) != 8), const T>& val) { - T return_val; - // This is a way to (hopefully) avoid dead lock in a warp - int done = 0; - unsigned int mask = __activemask(); - unsigned int active = __ballot_sync(mask, 1); - unsigned int done_active = 0; - while (active != done_active) { - if (!done) { - if (Impl::lock_address_cuda_space((void*)dest)) { - Kokkos::memory_fence(); - return_val = *dest; - *dest = return_val - val; - Kokkos::memory_fence(); - Impl::unlock_address_cuda_space((void*)dest); - done = 1; - } - } - done_active = __ballot_sync(mask, done); - } - return return_val; -} -#endif -#endif 
-//---------------------------------------------------------------------------- -#if !defined(__CUDA_ARCH__) || defined(KOKKOS_IMPL_CUDA_CLANG_WORKAROUND) -#if defined(KOKKOS_ENABLE_GNU_ATOMICS) || defined(KOKKOS_ENABLE_INTEL_ATOMICS) - -inline int atomic_fetch_sub(volatile int* const dest, const int val) { -#if defined(KOKKOS_ENABLE_RFO_PREFETCH) - _mm_prefetch((const char*)dest, _MM_HINT_ET0); -#endif - return __sync_fetch_and_sub(dest, val); -} - -inline long int atomic_fetch_sub(volatile long int* const dest, - const long int val) { -#if defined(KOKKOS_ENABLE_RFO_PREFETCH) - _mm_prefetch((const char*)dest, _MM_HINT_ET0); -#endif - return __sync_fetch_and_sub(dest, val); -} - -#if defined(KOKKOS_ENABLE_GNU_ATOMICS) - -inline unsigned int atomic_fetch_sub(volatile unsigned int* const dest, - const unsigned int val) { -#if defined(KOKKOS_ENABLE_RFO_PREFETCH) - _mm_prefetch((const char*)dest, _MM_HINT_ET0); -#endif - return __sync_fetch_and_sub(dest, val); -} - -inline unsigned long int atomic_fetch_sub( - volatile unsigned long int* const dest, const unsigned long int val) { -#if defined(KOKKOS_ENABLE_RFO_PREFETCH) - _mm_prefetch((const char*)dest, _MM_HINT_ET0); -#endif - return __sync_fetch_and_sub(dest, val); -} - -inline unsigned long long int atomic_fetch_sub( - volatile unsigned long long int* const dest, - const unsigned long long int val) { -#if defined(KOKKOS_ENABLE_RFO_PREFETCH) - _mm_prefetch((const char*)dest, _MM_HINT_ET0); -#endif - return __sync_fetch_and_sub(dest, val); -} - -#endif - -template -inline T atomic_fetch_sub( - volatile T* const dest, - std::enable_if_t val) { - union U { - int i; - T t; - KOKKOS_INLINE_FUNCTION U() {} - } oldval, assume, newval; - -#if defined(KOKKOS_ENABLE_RFO_PREFETCH) - _mm_prefetch((const char*)dest, _MM_HINT_ET0); -#endif - - oldval.t = *dest; - - do { - assume.i = oldval.i; - newval.t = assume.t - val; - oldval.i = __sync_val_compare_and_swap((int*)dest, assume.i, newval.i); - } while (assume.i != oldval.i); - - 
return oldval.t; -} - -template -inline T atomic_fetch_sub( - volatile T* const dest, - std::enable_if_t - val) { -#if defined(KOKKOS_ENABLE_RFO_PREFETCH) - _mm_prefetch((const char*)dest, _MM_HINT_ET0); -#endif - - union U { - long i; - T t; - KOKKOS_INLINE_FUNCTION U() {} - } oldval, assume, newval; - - oldval.t = *dest; - - do { - assume.i = oldval.i; - newval.t = assume.t - val; - oldval.i = __sync_val_compare_and_swap((long*)dest, assume.i, newval.i); - } while (assume.i != oldval.i); - - return oldval.t; -} - -//---------------------------------------------------------------------------- - -template -inline T atomic_fetch_sub( - volatile T* const dest, - std::enable_if_t<(sizeof(T) != 4) && (sizeof(T) != 8), const T>& val) { -#if defined(KOKKOS_ENABLE_RFO_PREFETCH) - _mm_prefetch((const char*)dest, _MM_HINT_ET0); -#endif - - while (!Impl::lock_address_host_space((void*)dest)) - ; - Kokkos::memory_fence(); - T return_val = *dest; - *dest = return_val - val; - Kokkos::memory_fence(); - Impl::unlock_address_host_space((void*)dest); - return return_val; -} - -//---------------------------------------------------------------------------- - -#elif defined(KOKKOS_ENABLE_OPENMP_ATOMICS) - -template -T atomic_fetch_sub(volatile T* const dest, const T val) { - T retval; -#pragma omp atomic capture - { - retval = dest[0]; - dest[0] -= val; - } - return retval; -} - -#elif defined(KOKKOS_ENABLE_SERIAL_ATOMICS) - -template -T atomic_fetch_sub(volatile T* const dest_v, const T val) { - T* dest = const_cast(dest_v); - T retval = *dest; - *dest -= val; - return retval; -} - -#endif -#endif - -// dummy for non-CUDA Kokkos headers being processed by NVCC -#if defined(__CUDA_ARCH__) && !defined(KOKKOS_ENABLE_CUDA) -template -__inline__ __device__ T atomic_fetch_sub(volatile T* const, - Kokkos::Impl::type_identity_t) { - return T(); -} -#endif - -} // namespace Kokkos - -#include -#endif diff --git a/core/src/impl/Kokkos_Atomic_Generic.hpp 
b/core/src/impl/Kokkos_Atomic_Generic.hpp deleted file mode 100644 index 69d101fb8e..0000000000 --- a/core/src/impl/Kokkos_Atomic_Generic.hpp +++ /dev/null @@ -1,527 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#include -#if defined(KOKKOS_ATOMIC_HPP) && !defined(KOKKOS_ATOMIC_GENERIC_HPP) -#define KOKKOS_ATOMIC_GENERIC_HPP -#include - -// Combination operands to be used in an Compare and Exchange based atomic -// operation -namespace Kokkos { -namespace Impl { - -template -struct _check_early_exit_impl { - KOKKOS_FORCEINLINE_FUNCTION - static constexpr bool check(Op const&, Scalar1 const&, - Scalar2 const&) noexcept { - return false; - } -}; - -template -struct _check_early_exit_impl< - Op, Scalar1, Scalar2, - decltype(std::declval().check_early_exit( - std::declval(), std::declval()))> { - KOKKOS_FORCEINLINE_FUNCTION - static constexpr bool check(Op const& op, Scalar1 const& v1, - Scalar2 const& v2) { - return op.check_early_exit(v1, v2); - } -}; - -template -KOKKOS_FORCEINLINE_FUNCTION constexpr bool check_early_exit( - Op const& op, Scalar1 const& v1, Scalar2 const& v2) noexcept { - return _check_early_exit_impl::check(op, v1, v2); -} - -template -struct MaxOper { - KOKKOS_FORCEINLINE_FUNCTION - static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) { - return (val1 > val2 ? 
val1 : val2); - } - KOKKOS_FORCEINLINE_FUNCTION - static constexpr bool check_early_exit(Scalar1 const& val1, - Scalar2 const& val2) noexcept { - return (val1 > val2); - } -}; - -template -struct MinOper { - KOKKOS_FORCEINLINE_FUNCTION - static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) { - return (val1 < val2 ? val1 : val2); - } - KOKKOS_FORCEINLINE_FUNCTION - static constexpr bool check_early_exit(Scalar1 const& val1, - Scalar2 const& val2) noexcept { - return (val1 < val2); - } -}; - -template -struct AddOper { - KOKKOS_FORCEINLINE_FUNCTION - static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) { - return val1 + val2; - } -}; - -template -struct SubOper { - KOKKOS_FORCEINLINE_FUNCTION - static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) { - return val1 - val2; - } -}; - -template -struct MulOper { - KOKKOS_FORCEINLINE_FUNCTION - static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) { - return val1 * val2; - } -}; - -template -struct DivOper { - KOKKOS_FORCEINLINE_FUNCTION - static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) { - return val1 / val2; - } -}; - -template -struct ModOper { - KOKKOS_FORCEINLINE_FUNCTION - static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) { - return val1 % val2; - } -}; - -template -struct AndOper { - KOKKOS_FORCEINLINE_FUNCTION - static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) { - return val1 & val2; - } -}; - -template -struct OrOper { - KOKKOS_FORCEINLINE_FUNCTION - static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) { - return val1 | val2; - } -}; - -template -struct XorOper { - KOKKOS_FORCEINLINE_FUNCTION - static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) { - return val1 ^ val2; - } -}; - -template -struct LShiftOper { - KOKKOS_FORCEINLINE_FUNCTION - static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) { - return val1 << val2; - } -}; - -template -struct RShiftOper { - KOKKOS_FORCEINLINE_FUNCTION - static Scalar1 
apply(const Scalar1& val1, const Scalar2& val2) { - return val1 >> val2; - } -}; - -template -KOKKOS_INLINE_FUNCTION T atomic_fetch_oper( - const Oper& op, volatile T* const dest, - std::enable_if_t - val) { - union U { - unsigned long long int i; - T t; - KOKKOS_INLINE_FUNCTION U() {} - } oldval, assume, newval; - - oldval.t = *dest; - - do { - if (check_early_exit(op, oldval.t, val)) return oldval.t; - assume.i = oldval.i; - newval.t = op.apply(assume.t, val); - oldval.i = Kokkos::atomic_compare_exchange((unsigned long long int*)dest, - assume.i, newval.i); - } while (assume.i != oldval.i); - - return oldval.t; -} - -template -KOKKOS_INLINE_FUNCTION T atomic_oper_fetch( - const Oper& op, volatile T* const dest, - std::enable_if_t - val) { - union U { - unsigned long long int i; - T t; - KOKKOS_INLINE_FUNCTION U() {} - } oldval, assume, newval; - - oldval.t = *dest; - - do { - if (check_early_exit(op, oldval.t, val)) return oldval.t; - assume.i = oldval.i; - newval.t = op.apply(assume.t, val); - oldval.i = Kokkos::atomic_compare_exchange((unsigned long long int*)dest, - assume.i, newval.i); - } while (assume.i != oldval.i); - - return newval.t; -} - -template -KOKKOS_INLINE_FUNCTION T -atomic_fetch_oper(const Oper& op, volatile T* const dest, - std::enable_if_t val) { - union U { - int i; - T t; - KOKKOS_INLINE_FUNCTION U() {} - } oldval, assume, newval; - - oldval.t = *dest; - - do { - if (check_early_exit(op, oldval.t, val)) return oldval.t; - assume.i = oldval.i; - newval.t = op.apply(assume.t, val); - oldval.i = Kokkos::atomic_compare_exchange((int*)dest, assume.i, newval.i); - } while (assume.i != oldval.i); - - return oldval.t; -} - -template -KOKKOS_INLINE_FUNCTION T -atomic_oper_fetch(const Oper& op, volatile T* const dest, - std::enable_if_t val) { - union U { - int i; - T t; - KOKKOS_INLINE_FUNCTION U() {} - } oldval, assume, newval; - - oldval.t = *dest; - - do { - if (check_early_exit(op, oldval.t, val)) return oldval.t; - assume.i = oldval.i; - 
newval.t = op.apply(assume.t, val); - oldval.i = Kokkos::atomic_compare_exchange((int*)dest, assume.i, newval.i); - } while (assume.i != oldval.i); - - return newval.t; -} - -template -KOKKOS_INLINE_FUNCTION T atomic_fetch_oper( - const Oper& op, volatile T* const dest, - std::enable_if_t<(sizeof(T) != 4) && (sizeof(T) != 8), const T> val) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST - while (!Impl::lock_address_host_space((void*)dest)) - ; - Kokkos::memory_fence(); - T return_val = *dest; - *dest = op.apply(return_val, val); - Kokkos::memory_fence(); - Impl::unlock_address_host_space((void*)dest); - return return_val; -#elif defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA) - // This is a way to (hopefully) avoid dead lock in a warp - T return_val; - int done = 0; - unsigned int mask = __activemask(); - unsigned int active = __ballot_sync(mask, 1); - unsigned int done_active = 0; - while (active != done_active) { - if (!done) { - if (Impl::lock_address_cuda_space((void*)dest)) { - Kokkos::memory_fence(); - return_val = *dest; - *dest = op.apply(return_val, val); - Kokkos::memory_fence(); - Impl::unlock_address_cuda_space((void*)dest); - done = 1; - } - } - done_active = __ballot_sync(mask, done); - } - return return_val; -#elif defined(__HIP_DEVICE_COMPILE__) - T return_val = *dest; - int done = 0; - unsigned int active = __ballot(1); - unsigned int done_active = 0; - while (active != done_active) { - if (!done) { - if (Impl::lock_address_hip_space((void*)dest)) { - return_val = *dest; - *dest = op.apply(return_val, val); - Impl::unlock_address_hip_space((void*)dest); - done = 1; - } - } - done_active = __ballot(done); - } - return return_val; -#elif defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL) - // FIXME_SYCL - Kokkos::abort("Not implemented!"); - (void)op; - (void)dest; - (void)val; - return 0; -#endif -} - -template -KOKKOS_INLINE_FUNCTION T -atomic_oper_fetch(const Oper& op, volatile T* const dest, - std::enable_if_t<(sizeof(T) != 4) && 
(sizeof(T) != 8) -#if defined(KOKKOS_ENABLE_ASM) && \ - defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST) - && (sizeof(T) != 16) -#endif - , - const T>& val) { - -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST - while (!Impl::lock_address_host_space((void*)dest)) - ; - Kokkos::memory_fence(); - T return_val = op.apply(*dest, val); - *dest = return_val; - Kokkos::memory_fence(); - Impl::unlock_address_host_space((void*)dest); - return return_val; -#elif defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA) - T return_val; - // This is a way to (hopefully) avoid dead lock in a warp - int done = 0; - unsigned int mask = __activemask(); - unsigned int active = __ballot_sync(mask, 1); - unsigned int done_active = 0; - while (active != done_active) { - if (!done) { - if (Impl::lock_address_cuda_space((void*)dest)) { - Kokkos::memory_fence(); - return_val = op.apply(*dest, val); - *dest = return_val; - Kokkos::memory_fence(); - Impl::unlock_address_cuda_space((void*)dest); - done = 1; - } - } - done_active = __ballot_sync(mask, done); - } - return return_val; -#elif defined(__HIP_DEVICE_COMPILE__) - T return_val; - int done = 0; - unsigned int active = __ballot(1); - unsigned int done_active = 0; - while (active != done_active) { - if (!done) { - if (Impl::lock_address_hip_space((void*)dest)) { - return_val = op.apply(*dest, val); - *dest = return_val; - Impl::unlock_address_hip_space((void*)dest); - done = 1; - } - } - done_active = __ballot(done); - } - return return_val; -#elif defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL) - // FIXME_SYCL - std::abort(); - (void)op; - (void)dest; - (void)val; - return 0; -#endif -} - -} // namespace Impl -} // namespace Kokkos - -namespace Kokkos { - -// Fetch_Oper atomics: return value before operation -template -KOKKOS_INLINE_FUNCTION T atomic_fetch_max(volatile T* const dest, const T val) { - return Impl::atomic_fetch_oper(Impl::MaxOper(), dest, val); -} - -template -KOKKOS_INLINE_FUNCTION T atomic_fetch_min(volatile T* const 
dest, const T val) { - return Impl::atomic_fetch_oper(Impl::MinOper(), dest, val); -} - -template -KOKKOS_INLINE_FUNCTION T atomic_fetch_mul(volatile T* const dest, const T val) { - return Impl::atomic_fetch_oper(Impl::MulOper(), dest, val); -} - -template -KOKKOS_INLINE_FUNCTION T atomic_fetch_div(volatile T* const dest, const T val) { - return Impl::atomic_fetch_oper(Impl::DivOper(), dest, val); -} - -template -KOKKOS_INLINE_FUNCTION T atomic_fetch_mod(volatile T* const dest, const T val) { - return Impl::atomic_fetch_oper(Impl::ModOper(), dest, val); -} - -#if !defined(KOKKOS_ENABLE_SERIAL_ATOMICS) - -template -KOKKOS_INLINE_FUNCTION T atomic_fetch_and(volatile T* const dest, const T val) { - return Impl::atomic_fetch_oper(Impl::AndOper(), dest, val); -} - -template -KOKKOS_INLINE_FUNCTION T atomic_fetch_or(volatile T* const dest, const T val) { - return Impl::atomic_fetch_oper(Impl::OrOper(), dest, val); -} - -#endif - -template -KOKKOS_INLINE_FUNCTION T atomic_fetch_xor(volatile T* const dest, const T val) { - return Impl::atomic_fetch_oper(Impl::XorOper(), dest, val); -} - -template -KOKKOS_INLINE_FUNCTION T atomic_fetch_lshift(volatile T* const dest, - const unsigned int val) { - return Impl::atomic_fetch_oper(Impl::LShiftOper(), - dest, val); -} - -template -KOKKOS_INLINE_FUNCTION T atomic_fetch_rshift(volatile T* const dest, - const unsigned int val) { - return Impl::atomic_fetch_oper(Impl::RShiftOper(), - dest, val); -} - -// Oper Fetch atomics: return value after operation -template -KOKKOS_INLINE_FUNCTION T atomic_max_fetch(volatile T* const dest, const T val) { - return Impl::atomic_oper_fetch(Impl::MaxOper(), dest, val); -} - -template -KOKKOS_INLINE_FUNCTION T atomic_min_fetch(volatile T* const dest, const T val) { - return Impl::atomic_oper_fetch(Impl::MinOper(), dest, val); -} - -template -KOKKOS_INLINE_FUNCTION T atomic_mul_fetch(volatile T* const dest, const T val) { - return Impl::atomic_oper_fetch(Impl::MulOper(), dest, val); -} - -template 
-KOKKOS_INLINE_FUNCTION T atomic_div_fetch(volatile T* const dest, const T val) { - return Impl::atomic_oper_fetch(Impl::DivOper(), dest, val); -} - -template -KOKKOS_INLINE_FUNCTION T atomic_mod_fetch(volatile T* const dest, const T val) { - return Impl::atomic_oper_fetch(Impl::ModOper(), dest, val); -} - -template -KOKKOS_INLINE_FUNCTION T atomic_and_fetch(volatile T* const dest, const T val) { - return Impl::atomic_oper_fetch(Impl::AndOper(), dest, val); -} - -template -KOKKOS_INLINE_FUNCTION T atomic_or_fetch(volatile T* const dest, const T val) { - return Impl::atomic_oper_fetch(Impl::OrOper(), dest, val); -} - -template -KOKKOS_INLINE_FUNCTION T atomic_xor_fetch(volatile T* const dest, const T val) { - return Impl::atomic_oper_fetch(Impl::XorOper(), dest, val); -} - -template -KOKKOS_INLINE_FUNCTION T atomic_lshift_fetch(volatile T* const dest, - const unsigned int val) { - return Impl::atomic_oper_fetch(Impl::LShiftOper(), - dest, val); -} - -template -KOKKOS_INLINE_FUNCTION T atomic_rshift_fetch(volatile T* const dest, - const unsigned int val) { - return Impl::atomic_oper_fetch(Impl::RShiftOper(), - dest, val); -} - -#ifdef _WIN32 -template -KOKKOS_INLINE_FUNCTION T atomic_add_fetch(volatile T* const dest, const T val) { - return Impl::atomic_oper_fetch(Impl::AddOper(), dest, val); -} - -template -KOKKOS_INLINE_FUNCTION T atomic_sub_fetch(volatile T* const dest, const T val) { - return Impl::atomic_oper_fetch(Impl::SubOper(), dest, val); -} - -template -KOKKOS_INLINE_FUNCTION T atomic_fetch_add(volatile T* const dest, const T val) { - return Impl::atomic_fetch_oper(Impl::AddOper(), dest, val); -} - -template -KOKKOS_INLINE_FUNCTION T atomic_fetch_sub(volatile T* const dest, const T val) { - return Impl::atomic_fetch_oper(Impl::SubOper(), dest, val); -} -#endif - -} // namespace Kokkos -#endif diff --git a/core/src/impl/Kokkos_Atomic_Generic_Secondary.hpp b/core/src/impl/Kokkos_Atomic_Generic_Secondary.hpp deleted file mode 100644 index 
af43bf6679..0000000000 --- a/core/src/impl/Kokkos_Atomic_Generic_Secondary.hpp +++ /dev/null @@ -1,58 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#if defined(KOKKOS_ATOMIC_HPP) && !defined(KOKKOS_ATOMIC_GENERIC_SECONDARY_HPP) -#define KOKKOS_ATOMIC_GENERIC_SECONDARY_HPP -#include - -namespace Kokkos { - -#ifndef KOKKOS_ENABLE_SERIAL_ATOMICS -template -KOKKOS_INLINE_FUNCTION T atomic_exchange(volatile T* const dest, const T val) { - T oldval = *dest; - T assume; - do { - assume = oldval; - oldval = atomic_compare_exchange(dest, assume, val); - } while (assume != oldval); - - return oldval; -} -#endif - -template -KOKKOS_INLINE_FUNCTION void atomic_add(volatile T* const dest, const T val) { - (void)atomic_fetch_add(dest, val); -} - -template -KOKKOS_INLINE_FUNCTION void atomic_sub(volatile T* const dest, const T val) { - (void)atomic_fetch_sub(dest, val); -} - -template -KOKKOS_INLINE_FUNCTION void atomic_mul(volatile T* const dest, const T val) { - (void)atomic_fetch_mul(dest, val); -} - -template -KOKKOS_INLINE_FUNCTION void atomic_div(volatile T* const dest, const T val) { - (void)atomic_fetch_div(dest, val); -} - -} // namespace Kokkos -#endif diff --git a/core/src/impl/Kokkos_Atomic_Increment.hpp b/core/src/impl/Kokkos_Atomic_Increment.hpp deleted file mode 100644 index b40e7dfecb..0000000000 --- a/core/src/impl/Kokkos_Atomic_Increment.hpp +++ /dev/null @@ -1,119 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// 
Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#if defined(KOKKOS_ENABLE_RFO_PREFETCH) -#include -#endif - -#include -#if defined(KOKKOS_ATOMIC_HPP) && !defined(KOKKOS_ATOMIC_INCREMENT_HPP) -#define KOKKOS_ATOMIC_INCREMENT_HPP - -namespace Kokkos { - -// Atomic increment -template <> -KOKKOS_INLINE_FUNCTION void atomic_increment(volatile char* a) { -#if defined(KOKKOS_ENABLE_ASM) && defined(KOKKOS_ENABLE_ISA_X86_64) && \ - !defined(_WIN32) && !defined(__CUDA_ARCH__) -#if defined(KOKKOS_ENABLE_RFO_PREFETCH) - _mm_prefetch((const char*)a, _MM_HINT_ET0); -#endif - __asm__ __volatile__("lock incb %0" - : /* no output registers */ - : "m"(a[0]) - : "memory"); -#elif defined(KOKKOS_ENABLE_SERIAL_ATOMICS) - char* a_nv = const_cast(a); - ++(*a_nv); -#else - Kokkos::atomic_fetch_add(a, char(1)); -#endif -} - -template <> -KOKKOS_INLINE_FUNCTION void atomic_increment(volatile short* a) { -#if defined(KOKKOS_ENABLE_ASM) && defined(KOKKOS_ENABLE_ISA_X86_64) && \ - !defined(_WIN32) && !defined(__CUDA_ARCH__) -#if defined(KOKKOS_ENABLE_RFO_PREFETCH) - _mm_prefetch((const char*)a, _MM_HINT_ET0); -#endif - __asm__ __volatile__("lock incw %0" - : /* no output registers */ - : "m"(a[0]) - : "memory"); -#elif defined(KOKKOS_ENABLE_SERIAL_ATOMICS) - short* a_nv = const_cast(a); - ++(*a_nv); -#else - Kokkos::atomic_fetch_add(a, short(1)); -#endif -} - -#ifndef _WIN32 -template <> -KOKKOS_INLINE_FUNCTION void atomic_increment(volatile int* a) { -#if defined(KOKKOS_ENABLE_ASM) && defined(KOKKOS_ENABLE_ISA_X86_64) && \ - !defined(_WIN32) && !defined(__CUDA_ARCH__) -#if 
defined(KOKKOS_ENABLE_RFO_PREFETCH) - _mm_prefetch((const char*)a, _MM_HINT_ET0); -#endif - __asm__ __volatile__("lock incl %0" - : /* no output registers */ - : "m"(a[0]) - : "memory"); -#elif defined(KOKKOS_ENABLE_SERIAL_ATOMICS) - int* a_nv = const_cast(a); - ++(*a_nv); -#else - Kokkos::atomic_fetch_add(a, int(1)); -#endif -} -#endif - -template <> -KOKKOS_INLINE_FUNCTION void atomic_increment( - volatile long long int* a) { -#if defined(KOKKOS_ENABLE_ASM) && defined(KOKKOS_ENABLE_ISA_X86_64) && \ - !defined(_WIN32) && !defined(__CUDA_ARCH__) -#if defined(KOKKOS_ENABLE_RFO_PREFETCH) - _mm_prefetch((const char*)a, _MM_HINT_ET0); -#endif - __asm__ __volatile__("lock incq %0" - : /* no output registers */ - : "m"(a[0]) - : "memory"); -#elif defined(KOKKOS_ENABLE_SERIAL_ATOMICS) - long long int* a_nv = const_cast(a); - ++(*a_nv); -#else - using T = long long int; - Kokkos::atomic_fetch_add(a, T(1)); -#endif -} - -template -KOKKOS_INLINE_FUNCTION void atomic_increment(volatile T* a) { -#if defined(KOKKOS_ENABLE_SERIAL_ATOMICS) - T* a_nv = const_cast(a); - ++(*a_nv); -#else - Kokkos::atomic_fetch_add(a, T(1)); -#endif -} - -} // End of namespace Kokkos -#endif diff --git a/core/src/impl/Kokkos_Atomic_Load.hpp b/core/src/impl/Kokkos_Atomic_Load.hpp deleted file mode 100644 index fc4a04b501..0000000000 --- a/core/src/impl/Kokkos_Atomic_Load.hpp +++ /dev/null @@ -1,201 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOS_IMPL_KOKKOS_ATOMIC_LOAD_HPP -#define KOKKOS_IMPL_KOKKOS_ATOMIC_LOAD_HPP - -#include -#if defined(KOKKOS_ATOMIC_HPP) - -#include -#include - -#if defined(KOKKOS_ENABLE_CUDA) -#include -#endif - -namespace Kokkos { -namespace Impl { - -// Olivier's implementation helpfully binds to the same builtins as GNU, so -// we make this code common across multiple options -#if (defined(KOKKOS_ENABLE_GNU_ATOMICS) && !defined(__CUDA_ARCH__)) || \ - (defined(KOKKOS_ENABLE_INTEL_ATOMICS) && !defined(__CUDA_ARCH__)) || \ - defined(KOKKOS_ENABLE_CUDA_ASM_ATOMICS) - -#if defined(__CUDA_ARCH__) && defined(KOKKOS_ENABLE_CUDA_ASM_ATOMICS) -#define KOKKOS_INTERNAL_INLINE_DEVICE_IF_CUDA_ARCH __inline__ __device__ -#else -#define KOKKOS_INTERNAL_INLINE_DEVICE_IF_CUDA_ARCH inline -#endif - -template -KOKKOS_INTERNAL_INLINE_DEVICE_IF_CUDA_ARCH T _atomic_load( - T* ptr, MemoryOrder, - std::enable_if_t<(sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4 || - sizeof(T) == 8) && - std::is_same>::value, - void const**> = nullptr) { - return __atomic_load_n(ptr, MemoryOrder::gnu_constant); -} - -template -KOKKOS_INTERNAL_INLINE_DEVICE_IF_CUDA_ARCH T _atomic_load( - T* ptr, MemoryOrder, - std::enable_if_t::value && - std::is_same>::value, - void const**> = nullptr) { - T rv{}; - __atomic_load(ptr, &rv, MemoryOrder::gnu_constant); - return rv; -} - -#undef KOKKOS_INTERNAL_INLINE_DEVICE_IF_CUDA_ARCH - -#elif defined(__CUDA_ARCH__) - -// Not compiling for Volta or later, or Cuda ASM atomics were manually disabled - -template -__device__ __inline__ T _relaxed_atomic_load_impl( - T* ptr, std::enable_if_t<(sizeof(T) == 1 || sizeof(T) == 2 || - sizeof(T) == 4 || sizeof(T) == 8), - void const**> = nullptr) { - return *ptr; -} - -template -struct NoOpOper { - __device__ __inline__ static constexpr T apply(T const& t, - T const&) noexcept { - return t; - } -}; - -template -__device__ __inline__ T 
_relaxed_atomic_load_impl( - T* ptr, std::enable_if_t = nullptr) { - T rv{}; - // TODO remove a copy operation here? - return Kokkos::Impl::atomic_oper_fetch(NoOpOper{}, ptr, rv); -} - -template -__device__ __inline__ T _atomic_load(T* ptr, memory_order_seq_cst_t) { - Kokkos::memory_fence(); - T rv = Impl::_relaxed_atomic_load_impl(ptr); - Kokkos::memory_fence(); - return rv; -} - -template -__device__ __inline__ T _atomic_load(T* ptr, memory_order_acquire_t) { - T rv = Impl::_relaxed_atomic_load_impl(ptr); - Kokkos::memory_fence(); - return rv; -} - -template -__device__ __inline__ T _atomic_load(T* ptr, memory_order_relaxed_t) { - return _relaxed_atomic_load_impl(ptr); -} - -#elif defined(KOKKOS_ENABLE_OPENMP_ATOMICS) - -template -inline T _atomic_load(T* ptr, MemoryOrder) { - // AFAICT, all OpenMP atomics are sequentially consistent, so memory order - // doesn't matter - T retval{}; -#pragma omp atomic read - { retval = *ptr; } - return retval; -} - -#elif defined(KOKKOS_ENABLE_SERIAL_ATOMICS) - -template -inline T _atomic_load(T* ptr, MemoryOrder) { - return *ptr; -} - -#elif defined(KOKKOS_ENABLE_WINDOWS_ATOMICS) - -template -inline T _atomic_load(T* ptr, MemoryOrder) { - atomic_compare_exchange(ptr, 0, 0); - return *ptr; -} - -#endif // end of all atomic implementations - -template -KOKKOS_FORCEINLINE_FUNCTION T atomic_load(T* ptr, - Impl::memory_order_seq_cst_t) { - return _atomic_load(ptr, Impl::memory_order_seq_cst); -} - -template -KOKKOS_FORCEINLINE_FUNCTION T atomic_load(T* ptr, - Impl::memory_order_acquire_t) { - return _atomic_load(ptr, Impl::memory_order_acquire); -} - -template -KOKKOS_FORCEINLINE_FUNCTION T atomic_load(T* ptr, - Impl::memory_order_relaxed_t) { - return _atomic_load(ptr, Impl::memory_order_relaxed); -} - -template -KOKKOS_FORCEINLINE_FUNCTION T atomic_load(T* /*ptr*/, - Impl::memory_order_release_t) { - static_assert( - sizeof(T) == 0, // just something that will always be false, but only on - // instantiation - "atomic_load with 
memory order release doesn't make any sense!"); -} - -template -KOKKOS_FORCEINLINE_FUNCTION T atomic_load(T* /*ptr*/, - Impl::memory_order_acq_rel_t) { - static_assert( - sizeof(T) == 0, // just something that will always be false, but only on - // instantiation - "atomic_load with memory order acq_rel doesn't make any sense!"); -} - -template -KOKKOS_FORCEINLINE_FUNCTION T atomic_load(T* ptr) { - // relaxed by default! - return _atomic_load(ptr, Impl::memory_order_relaxed); -} - -} // end namespace Impl -} // end namespace Kokkos - -#if defined(KOKKOS_ENABLE_CUDA) -#include -#endif - -#endif // defined(KOKKOS_ATOMIC_HPP) -#endif // KOKKOS_IMPL_KOKKOS_ATOMIC_LOAD_HPP diff --git a/core/src/impl/Kokkos_Atomic_MinMax.hpp b/core/src/impl/Kokkos_Atomic_MinMax.hpp deleted file mode 100644 index 42898c82a4..0000000000 --- a/core/src/impl/Kokkos_Atomic_MinMax.hpp +++ /dev/null @@ -1,291 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#include -#if defined(KOKKOS_ATOMIC_HPP) && !defined(KOKKOS_ATOMIC_MINMAX_HPP) -#define KOKKOS_ATOMIC_MINMAX_HPP - -namespace Kokkos { - -//---------------------------------------------------------------------------- - -#if defined(KOKKOS_ENABLE_CUDA) -#if defined(__CUDA_ARCH__) || defined(KOKKOS_IMPL_CUDA_CLANG_WORKAROUND) - -// Support for int, unsigned int, unsigned long long int, and float - -// Atomic_fetch_{min,max} - -#ifdef KOKKOS_IMPL_CUDA_CLANG_WORKAROUND - -// Host implementations for CLANG compiler - -inline __host__ int atomic_fetch_min(volatile int* const dest, const int val) { - return Impl::atomic_fetch_oper(Impl::MinOper(), dest, - val); -} - -inline __host__ unsigned int atomic_fetch_min(volatile unsigned int* const dest, - const unsigned int val) { - return Impl::atomic_fetch_oper( - Impl::MinOper(), dest, val); -} - -inline __host__ unsigned long long int atomic_fetch_min( - volatile unsigned long long int* const dest, - const unsigned long long int val) { - return Impl::atomic_fetch_oper(Impl::MinOper(), - dest, val); -} - -inline __host__ int atomic_fetch_max(volatile int* const dest, const int val) { - return Impl::atomic_fetch_oper(Impl::MaxOper(), dest, - val); -} - -inline __host__ unsigned int atomic_fetch_max(volatile unsigned int* const dest, - const unsigned int val) { - return Impl::atomic_fetch_oper( - Impl::MaxOper(), dest, val); -} - -inline __host__ unsigned long long int atomic_fetch_max( - volatile unsigned long long int* const dest, - const unsigned long long int val) { - return Impl::atomic_fetch_oper(Impl::MaxOper(), - dest, val); -} - -#endif - -#if (350 > __CUDA_ARCH__) - -// Fallback for atomic{Min,Max} for Kepler - -inline __device__ int atomic_fetch_min(volatile int* const dest, - const int val) { - return Impl::atomic_fetch_oper(Impl::MinOper(), dest, - val); -} - -inline __device__ unsigned int atomic_fetch_min( - volatile unsigned int* const 
dest, const unsigned int val) { - return Impl::atomic_fetch_oper( - Impl::MinOper(), dest, val); -} - -inline __device__ unsigned long long int atomic_fetch_min( - volatile unsigned long long int* const dest, - const unsigned long long int val) { - return Impl::atomic_fetch_oper(Impl::MinOper(), - dest, val); -} - -inline __device__ int atomic_fetch_max(volatile int* const dest, - const int val) { - return Impl::atomic_fetch_oper(Impl::MaxOper(), dest, - val); -} - -inline __device__ unsigned int atomic_fetch_max( - volatile unsigned int* const dest, const unsigned int val) { - return Impl::atomic_fetch_oper( - Impl::MaxOper(), dest, val); -} - -inline __device__ unsigned long long int atomic_fetch_max( - volatile unsigned long long int* const dest, - const unsigned long long int val) { - return Impl::atomic_fetch_oper(Impl::MaxOper(), - dest, val); -} - -#else // Supported by devices of compute capability 3.5 and higher - -inline __device__ int atomic_fetch_min(volatile int* const dest, - const int val) { - return atomicMin((int*)dest, val); -} - -inline __device__ unsigned int atomic_fetch_min( - volatile unsigned int* const dest, const unsigned int val) { - return atomicMin((unsigned int*)dest, val); -} - -inline __device__ unsigned long long int atomic_fetch_min( - volatile unsigned long long int* const dest, - const unsigned long long int val) { - return atomicMin((unsigned long long int*)dest, val); -} - -inline __device__ int atomic_fetch_max(volatile int* const dest, - const int val) { - return atomicMax((int*)dest, val); -} - -inline __device__ unsigned int atomic_fetch_max( - volatile unsigned int* const dest, const unsigned int val) { - return atomicMax((unsigned int*)dest, val); -} - -inline __device__ unsigned long long int atomic_fetch_max( - volatile unsigned long long int* const dest, - const unsigned long long int val) { - return atomicMax((unsigned long long int*)dest, val); -} - -#endif - -// Atomic_{min,max}_fetch - -#ifdef 
KOKKOS_IMPL_CUDA_CLANG_WORKAROUND - -// Host implementations for CLANG compiler - -inline __host__ int atomic_min_fetch(volatile int* const dest, const int val) { - return Impl::atomic_oper_fetch(Impl::MinOper(), dest, - val); -} - -inline __host__ unsigned int atomic_min_fetch(volatile unsigned int* const dest, - const unsigned int val) { - return Impl::atomic_oper_fetch( - Impl::MinOper(), dest, val); -} - -inline __host__ unsigned long long int atomic_min_fetch( - volatile unsigned long long int* const dest, - const unsigned long long int val) { - return Impl::atomic_oper_fetch(Impl::MinOper(), - dest, val); -} - -inline __host__ int atomic_max_fetch(volatile int* const dest, const int val) { - return Impl::atomic_oper_fetch(Impl::MaxOper(), dest, - val); -} - -inline __host__ unsigned int atomic_max_fetch(volatile unsigned int* const dest, - const unsigned int val) { - return Impl::atomic_oper_fetch( - Impl::MaxOper(), dest, val); -} - -inline __host__ unsigned long long int atomic_max_fetch( - volatile unsigned long long int* const dest, - const unsigned long long int val) { - return Impl::atomic_oper_fetch(Impl::MaxOper(), - dest, val); -} -#endif - -#if (350 > __CUDA_ARCH__) - -// Fallback for atomic{Min,Max} for Kepler - -inline __device__ int atomic_min_fetch(volatile int* const dest, - const int val) { - return Impl::atomic_oper_fetch(Impl::MinOper(), dest, - val); -} - -inline __device__ unsigned int atomic_min_fetch( - volatile unsigned int* const dest, const unsigned int val) { - return Impl::atomic_oper_fetch( - Impl::MinOper(), dest, val); -} - -inline __device__ unsigned long long int atomic_min_fetch( - volatile unsigned long long int* const dest, - const unsigned long long int val) { - return Impl::atomic_oper_fetch(Impl::MinOper(), - dest, val); -} - -inline __device__ int atomic_max_fetch(volatile int* const dest, - const int val) { - return Impl::atomic_oper_fetch(Impl::MaxOper(), dest, - val); -} - -inline __device__ unsigned int 
atomic_max_fetch( - volatile unsigned int* const dest, const unsigned int val) { - return Impl::atomic_oper_fetch( - Impl::MaxOper(), dest, val); -} - -inline __device__ unsigned long long int atomic_max_fetch( - volatile unsigned long long int* const dest, - const unsigned long long int val) { - return Impl::atomic_oper_fetch(Impl::MaxOper(), - dest, val); -} - -#else // Supported by devices of compute capability 3.5 and higher - -inline __device__ int atomic_min_fetch(volatile int* const dest, - const int val) { - const int old = atomicMin((int*)dest, val); - return old < val ? old : val; -} - -inline __device__ unsigned int atomic_min_fetch( - volatile unsigned int* const dest, const unsigned int val) { - const unsigned int old = atomicMin((unsigned int*)dest, val); - return old < val ? old : val; -} - -inline __device__ unsigned long long int atomic_min_fetch( - volatile unsigned long long int* const dest, - const unsigned long long int val) { - const unsigned long long old = atomicMin((unsigned long long*)dest, val); - return old < val ? old : val; -} - -inline __device__ int atomic_max_fetch(volatile int* const dest, - const int val) { - const int old = atomicMax((int*)dest, val); - return old >= val ? old : val; -} - -inline __device__ unsigned int atomic_max_fetch( - volatile unsigned int* const dest, const unsigned int val) { - const unsigned int old = atomicMax((unsigned int*)dest, val); - return old >= val ? old : val; -} - -inline __device__ unsigned long long int atomic_max_fetch( - volatile unsigned long long int* const dest, - const unsigned long long int val) { - const unsigned long long old = atomicMax((unsigned long long*)dest, val); - return old >= val ? 
old : val; -} - -#endif - -#endif -#endif -} // namespace Kokkos - -#endif diff --git a/core/src/impl/Kokkos_Atomic_Store.hpp b/core/src/impl/Kokkos_Atomic_Store.hpp deleted file mode 100644 index 3c82e0e3dd..0000000000 --- a/core/src/impl/Kokkos_Atomic_Store.hpp +++ /dev/null @@ -1,197 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOS_IMPL_KOKKOS_ATOMIC_STORE_HPP -#define KOKKOS_IMPL_KOKKOS_ATOMIC_STORE_HPP - -#include -#if defined(KOKKOS_ATOMIC_HPP) - -#include -#include - -#if defined(KOKKOS_ENABLE_CUDA) -#include -#endif - -namespace Kokkos { -namespace Impl { - -// Olivier's implementation helpfully binds to the same builtins as GNU, so -// we make this code common across multiple options -#if (defined(KOKKOS_ENABLE_GNU_ATOMICS) && !defined(__CUDA_ARCH__)) || \ - (defined(KOKKOS_ENABLE_INTEL_ATOMICS) && !defined(__CUDA_ARCH__)) || \ - defined(KOKKOS_ENABLE_CUDA_ASM_ATOMICS) - -#if defined(__CUDA_ARCH__) && defined(KOKKOS_ENABLE_CUDA_ASM_ATOMICS) -#define KOKKOS_INTERNAL_INLINE_DEVICE_IF_CUDA_ARCH __inline__ __device__ -#else -#define KOKKOS_INTERNAL_INLINE_DEVICE_IF_CUDA_ARCH inline -#endif - -template -KOKKOS_INTERNAL_INLINE_DEVICE_IF_CUDA_ARCH void _atomic_store( - T* ptr, T val, MemoryOrder, - std::enable_if_t<(sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4 || - sizeof(T) == 8) && - std::is_same>::value, - void const**> = nullptr) { - __atomic_store_n(ptr, val, MemoryOrder::gnu_constant); -} - -template -KOKKOS_INTERNAL_INLINE_DEVICE_IF_CUDA_ARCH 
void _atomic_store( - T* ptr, T val, MemoryOrder, - std::enable_if_t::value && - std::is_same>::value, - void const**> = nullptr) { - __atomic_store(ptr, &val, MemoryOrder::gnu_constant); -} - -#undef KOKKOS_INTERNAL_INLINE_DEVICE_IF_CUDA_ARCH - -#elif defined(__CUDA_ARCH__) - -// Not compiling for Volta or later, or Cuda ASM atomics were manually disabled - -template -__device__ __inline__ void _relaxed_atomic_store_impl( - T* ptr, T val, - std::enable_if_t<(sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4 || - sizeof(T) == 8), - void const**> = nullptr) { - *ptr = val; -} - -template -struct StoreOper { - __device__ __inline__ static constexpr T apply(T const&, - T const& val) noexcept { - return val; - } -}; - -template -__device__ __inline__ void _relaxed_atomic_store_impl( - T* ptr, T val, - std::enable_if_t = nullptr) { - Kokkos::Impl::atomic_oper_fetch(StoreOper{}, ptr, (T &&) val); -} - -template -__device__ __inline__ void _atomic_store(T* ptr, T val, - memory_order_seq_cst_t) { - Kokkos::memory_fence(); - Impl::_relaxed_atomic_store_impl(ptr, val); - Kokkos::memory_fence(); -} - -template -__device__ __inline__ void _atomic_store(T* ptr, T val, - memory_order_release_t) { - Kokkos::memory_fence(); - _relaxed_atomic_store_impl(ptr, val); -} - -template -__device__ __inline__ void _atomic_store(T* ptr, T val, - memory_order_relaxed_t) { - _relaxed_atomic_store_impl(ptr, val); -} - -#elif defined(KOKKOS_ENABLE_OPENMP_ATOMICS) - -template -inline void _atomic_store(T* ptr, T val, MemoryOrder) { - // AFAICT, all OpenMP atomics are sequentially consistent, so memory order - // doesn't matter -#pragma omp atomic write - { *ptr = val; } -} - -#elif defined(KOKKOS_ENABLE_SERIAL_ATOMICS) - -template -inline void _atomic_store(T* ptr, T val, MemoryOrder) { - *ptr = val; -} - -#elif defined(KOKKOS_ENABLE_WINDOWS_ATOMICS) - -template -inline void _atomic_store(T* ptr, T val, MemoryOrder) { - atomic_exchange(ptr, val); -} - -#endif // end of all atomic 
implementations - -template -KOKKOS_FORCEINLINE_FUNCTION void atomic_store(T* ptr, T val, - Impl::memory_order_seq_cst_t) { - _atomic_store(ptr, val, Impl::memory_order_seq_cst); -} - -template -KOKKOS_FORCEINLINE_FUNCTION void atomic_store(T* ptr, T val, - Impl::memory_order_release_t) { - _atomic_store(ptr, val, Impl::memory_order_release); -} - -template -KOKKOS_FORCEINLINE_FUNCTION void atomic_store(T* ptr, T val, - Impl::memory_order_relaxed_t) { - _atomic_store(ptr, val, Impl::memory_order_relaxed); -} - -template -KOKKOS_FORCEINLINE_FUNCTION void atomic_store(T* /*ptr*/, T /*val*/, - Impl::memory_order_acquire_t) { - static_assert( - sizeof(T) == 0, // just something that will always be false, but only on - // instantiation - "atomic_store with memory order acquire doesn't make any sense!"); -} - -template -KOKKOS_FORCEINLINE_FUNCTION void atomic_store(T* /*ptr*/, T /*val*/, - Impl::memory_order_acq_rel_t) { - static_assert( - sizeof(T) == 0, // just something that will always be false, but only on - // instantiation - "atomic_store with memory order acq_rel doesn't make any sense!"); -} - -template -KOKKOS_FORCEINLINE_FUNCTION void atomic_store(T* ptr, T val) { - // relaxed by default! - _atomic_store(ptr, val, Impl::memory_order_relaxed); -} - -} // end namespace Impl -} // end namespace Kokkos - -#if defined(KOKKOS_ENABLE_CUDA) -#include -#endif - -#endif // defined(KOKKOS_ATOMIC_HPP) -#endif // KOKKOS_IMPL_KOKKOS_ATOMIC_STORE_HPP diff --git a/core/src/impl/Kokkos_Atomic_Windows.hpp b/core/src/impl/Kokkos_Atomic_Windows.hpp deleted file mode 100644 index ffb0d2bae2..0000000000 --- a/core/src/impl/Kokkos_Atomic_Windows.hpp +++ /dev/null @@ -1,127 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. 
Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER -#ifndef KOKKOS_ATOMIC_WINDOWS_HPP -#define KOKKOS_ATOMIC_WINDOWS_HPP - -#ifdef _WIN32 - -#ifndef NOMINMAX -#define NOMINMAX -#endif -#include -#include - -namespace Kokkos { -namespace Impl { -#ifdef _MSC_VER -_declspec(align(16)) -#endif - struct cas128_t { - LONGLONG lower; - LONGLONG upper; - KOKKOS_INLINE_FUNCTION - bool operator!=(const cas128_t& a) const { - return (lower != a.lower) || upper != a.upper; - } -} -#if defined(__GNUC__) || defined(__clang__) -__attribute__((aligned(16))) -#endif -; -} // namespace Impl - -#if !defined(__CUDA_ARCH__) || defined(__clang__) -template -inline T atomic_compare_exchange( - volatile T* const dest, const T& compare, - std::enable_if_t val) { - union U { - CHAR i; - T t; - KOKKOS_INLINE_FUNCTION U(){}; - } tmp; - - tmp.i = _InterlockedCompareExchange8((CHAR*)dest, *((CHAR*)&val), - *((CHAR*)&compare)); - return tmp.t; -} - -template -inline T atomic_compare_exchange( - volatile T* const dest, const T& compare, - std::enable_if_t val) { - union U { - SHORT i; - T t; - KOKKOS_INLINE_FUNCTION U(){}; - } tmp; - - tmp.i = _InterlockedCompareExchange16((SHORT*)dest, *((SHORT*)&val), - *((SHORT*)&compare)); - return tmp.t; -} - -template -inline T atomic_compare_exchange( - volatile T* const dest, const T& compare, - std::enable_if_t val) { - union U { - LONG i; - T t; - KOKKOS_INLINE_FUNCTION U() {} - } tmp; - - tmp.i = _InterlockedCompareExchange((LONG*)dest, *((LONG*)&val), - *((LONG*)&compare)); - return tmp.t; -} - -template -inline T atomic_compare_exchange( - volatile T* const dest, const T& compare, - std::enable_if_t val) { - union U { - LONGLONG i; - T t; - KOKKOS_INLINE_FUNCTION U() {} - } tmp; - - tmp.i = _InterlockedCompareExchange64((LONGLONG*)dest, 
*((LONGLONG*)&val), - *((LONGLONG*)&compare)); - return tmp.t; -} - -template -inline T atomic_compare_exchange( - volatile T* const dest, const T& compare, - std::enable_if_t val) { - T compare_and_result(compare); - union U { - Impl::cas128_t i; - T t; - KOKKOS_INLINE_FUNCTION U(){}; - } newval; - newval.t = val; - _InterlockedCompareExchange128((LONGLONG*)dest, newval.i.upper, - newval.i.lower, - ((LONGLONG*)&compare_and_result)); - return compare_and_result; -} -#endif - -} // namespace Kokkos -#endif -#endif From 7f5ea60095120aa02e1f348d09adfdd98511d8c6 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Tue, 24 Jan 2023 12:59:52 -0500 Subject: [PATCH 092/496] Update diff_files (might be worth revisiting logic) --- scripts/diff_files | 1 - 1 file changed, 1 deletion(-) diff --git a/scripts/diff_files b/scripts/diff_files index 125568d345..8b13789179 100644 --- a/scripts/diff_files +++ b/scripts/diff_files @@ -1,2 +1 @@ -core/src/Cuda/Kokkos_Cuda_Atomic_Intrinsics.hpp From 796e964880159e8660b975bde4dfc793dd58e19a Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Tue, 24 Jan 2023 13:09:20 -0500 Subject: [PATCH 093/496] Drop `KOKKOS_ENABLE_IMPL_DESUL_ATOMICS` macro define altogether --- Makefile.kokkos | 1 - cmake/KokkosCore_config.h.in | 1 - cmake/kokkos_enable_options.cmake | 1 - 3 files changed, 3 deletions(-) diff --git a/Makefile.kokkos b/Makefile.kokkos index 0be0dc564a..c0ca398570 100644 --- a/Makefile.kokkos +++ b/Makefile.kokkos @@ -1178,7 +1178,6 @@ endif ifeq ($(KOKKOS_INTERNAL_ENABLE_DESUL_ATOMICS), 1) $(warning enable_desul_atomics option has been removed. Desul atomics are always enabled.) 
endif -tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_IMPL_DESUL_ATOMICS // deprecated") KOKKOS_CPPFLAGS+=-I$(KOKKOS_PATH)/tpls/desul/include ifeq ($(KOKKOS_INTERNAL_DISABLE_BUNDLED_MDSPAN), 0) diff --git a/cmake/KokkosCore_config.h.in b/cmake/KokkosCore_config.h.in index beddd4c4a1..e1e3ac6cd2 100644 --- a/cmake/KokkosCore_config.h.in +++ b/cmake/KokkosCore_config.h.in @@ -50,7 +50,6 @@ #cmakedefine KOKKOS_ENABLE_DEPRECATION_WARNINGS #cmakedefine KOKKOS_ENABLE_LARGE_MEM_TESTS #cmakedefine KOKKOS_ENABLE_COMPLEX_ALIGN -#cmakedefine KOKKOS_ENABLE_IMPL_DESUL_ATOMICS // deprecated #cmakedefine KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION // deprecated #cmakedefine KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION #cmakedefine KOKKOS_ENABLE_IMPL_MDSPAN diff --git a/cmake/kokkos_enable_options.cmake b/cmake/kokkos_enable_options.cmake index 517b9aaca5..478821c525 100644 --- a/cmake/kokkos_enable_options.cmake +++ b/cmake/kokkos_enable_options.cmake @@ -166,4 +166,3 @@ ENDIF() IF(DEFINED Kokkos_ENABLE_IMPL_DESUL_ATOMICS) MESSAGE(WARNING "Kokkos_ENABLE_IMPL_DESUL_ATOMICS option has been removed. 
Desul atomics cannot be disabled.") ENDIF() -set(KOKKOS_ENABLE_IMPL_DESUL_ATOMICS ON) From d745c31735f983d8a40c0729c69f0f08a4467725 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Tue, 24 Jan 2023 16:03:11 -0500 Subject: [PATCH 094/496] Fixup deleted wrong branch in HIP locks Co-Authored-By: Daniel Arndt --- core/src/HIP/Kokkos_HIP_Locks.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/core/src/HIP/Kokkos_HIP_Locks.hpp b/core/src/HIP/Kokkos_HIP_Locks.hpp index 0ddd1c486d..e2ea06c11f 100644 --- a/core/src/HIP/Kokkos_HIP_Locks.hpp +++ b/core/src/HIP/Kokkos_HIP_Locks.hpp @@ -132,6 +132,7 @@ inline void ensure_hip_lock_arrays_on_device() {} #else inline static void ensure_hip_lock_arrays_on_device() { copy_hip_lock_arrays_to_device(); + desul::ensure_hip_lock_arrays_on_device(); } #endif From 7869915c0896fd5331115af3c413d0f0dc40dd15 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Tue, 24 Jan 2023 18:01:18 -0500 Subject: [PATCH 095/496] Move Kokkos::{Impl:: -> }::ALL_t definition and add using-declaration in Impl:: namespace for backward compatibility --- core/src/impl/Kokkos_ViewMapping.hpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/core/src/impl/Kokkos_ViewMapping.hpp b/core/src/impl/Kokkos_ViewMapping.hpp index 3ab8237cd1..b1c9e375b7 100644 --- a/core/src/impl/Kokkos_ViewMapping.hpp +++ b/core/src/impl/Kokkos_ViewMapping.hpp @@ -286,7 +286,6 @@ struct ViewDimensionAssignable, //---------------------------------------------------------------------------- namespace Kokkos { -namespace Impl { struct ALL_t { KOKKOS_INLINE_FUNCTION @@ -296,6 +295,8 @@ struct ALL_t { constexpr bool operator==(const ALL_t&) const { return true; } }; +namespace Impl { +using Kokkos::ALL_t; } // namespace Impl } // namespace Kokkos From 5304a40801b2a083031a9efb5509332ac8fe0e2a Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Tue, 24 Jan 2023 18:04:28 -0500 Subject: [PATCH 096/496] Stay off Kokkos::Impl::ALL_t --- containers/src/Kokkos_OffsetView.hpp | 3 +- 
containers/src/Kokkos_ScatterView.hpp | 2 +- core/src/Kokkos_CopyViews.hpp | 69 ++-- core/src/Kokkos_View.hpp | 2 +- core/src/impl/Kokkos_ViewMapping.hpp | 15 +- core/unit_test/TestViewSubview.hpp | 570 +++++++++++++------------- 6 files changed, 320 insertions(+), 341 deletions(-) diff --git a/containers/src/Kokkos_OffsetView.hpp b/containers/src/Kokkos_OffsetView.hpp index 35b28999c1..39de9ebbab 100644 --- a/containers/src/Kokkos_OffsetView.hpp +++ b/containers/src/Kokkos_OffsetView.hpp @@ -1251,8 +1251,7 @@ shift_input(const T arg, const int64_t offset) { } KOKKOS_INLINE_FUNCTION -Kokkos::Impl::ALL_t shift_input(const Kokkos::Impl::ALL_t arg, - const int64_t /*offset*/) { +Kokkos::ALL_t shift_input(const Kokkos::ALL_t arg, const int64_t /*offset*/) { return arg; } diff --git a/containers/src/Kokkos_ScatterView.hpp b/containers/src/Kokkos_ScatterView.hpp index 3b30996ad6..dbcab7c7e5 100644 --- a/containers/src/Kokkos_ScatterView.hpp +++ b/containers/src/Kokkos_ScatterView.hpp @@ -532,7 +532,7 @@ void args_to_array(size_t* array, int pos, T dim0, Dims... dims) { subview where the index specified is the largest-stride one. */ template struct Slice { - using next = Slice; + using next = Slice; using value_type = typename next::value_type; static value_type get(V const& src, const size_t i, Args... 
args) { diff --git a/core/src/Kokkos_CopyViews.hpp b/core/src/Kokkos_CopyViews.hpp index 6d5d9548c7..e02cbee589 100644 --- a/core/src/Kokkos_CopyViews.hpp +++ b/core/src/Kokkos_CopyViews.hpp @@ -874,7 +874,7 @@ struct ViewRemap { } else { p_type ext1(0, std::min(dst.extent(1), src.extent(1))); using sv_adapter_type = - CommonSubview; + CommonSubview; sv_adapter_type common_subview(dst, src, Kokkos::ALL, ext1); view_copy(exec_space..., common_subview.dst_sub, common_subview.src_sub); @@ -883,7 +883,7 @@ struct ViewRemap { if (dst.extent(1) == src.extent(1)) { p_type ext0(0, std::min(dst.extent(0), src.extent(0))); using sv_adapter_type = - CommonSubview; + CommonSubview; sv_adapter_type common_subview(dst, src, ext0, Kokkos::ALL); view_copy(exec_space..., common_subview.dst_sub, common_subview.src_sub); @@ -915,8 +915,7 @@ struct ViewRemap { if (dst.extent(2) == src.extent(2)) { p_type ext1(0, std::min(dst.extent(1), src.extent(1))); using sv_adapter_type = - CommonSubview; + CommonSubview; sv_adapter_type common_subview(dst, src, Kokkos::ALL, ext1, Kokkos::ALL); view_copy(exec_space..., common_subview.dst_sub, @@ -925,8 +924,7 @@ struct ViewRemap { p_type ext1(0, std::min(dst.extent(1), src.extent(1))); p_type ext2(0, std::min(dst.extent(2), src.extent(2))); using sv_adapter_type = - CommonSubview; + CommonSubview; sv_adapter_type common_subview(dst, src, Kokkos::ALL, ext1, ext2); view_copy(exec_space..., common_subview.dst_sub, common_subview.src_sub); @@ -935,8 +933,8 @@ struct ViewRemap { if (dst.extent(2) == src.extent(2)) { p_type ext0(0, std::min(dst.extent(0), src.extent(0))); p_type ext1(0, std::min(dst.extent(1), src.extent(1))); - using sv_adapter_type = CommonSubview; + using sv_adapter_type = + CommonSubview; sv_adapter_type common_subview(dst, src, ext0, ext1, Kokkos::ALL); view_copy(exec_space..., common_subview.dst_sub, common_subview.src_sub); @@ -970,8 +968,7 @@ struct ViewRemap { p_type ext1(0, std::min(dst.extent(1), src.extent(1))); p_type 
ext2(0, std::min(dst.extent(2), src.extent(2))); using sv_adapter_type = - CommonSubview; + CommonSubview; sv_adapter_type common_subview(dst, src, Kokkos::ALL, ext1, ext2, Kokkos::ALL); view_copy(exec_space..., common_subview.dst_sub, @@ -981,8 +978,7 @@ struct ViewRemap { p_type ext2(0, std::min(dst.extent(2), src.extent(2))); p_type ext3(0, std::min(dst.extent(3), src.extent(3))); using sv_adapter_type = - CommonSubview; + CommonSubview; sv_adapter_type common_subview(dst, src, Kokkos::ALL, ext1, ext2, ext3); view_copy(exec_space..., common_subview.dst_sub, common_subview.src_sub); @@ -993,8 +989,7 @@ struct ViewRemap { p_type ext1(0, std::min(dst.extent(1), src.extent(1))); p_type ext2(0, std::min(dst.extent(2), src.extent(2))); using sv_adapter_type = - CommonSubview; + CommonSubview; sv_adapter_type common_subview(dst, src, ext0, ext1, ext2, Kokkos::ALL); view_copy(exec_space..., common_subview.dst_sub, common_subview.src_sub); @@ -1029,9 +1024,8 @@ struct ViewRemap { p_type ext1(0, std::min(dst.extent(1), src.extent(1))); p_type ext2(0, std::min(dst.extent(2), src.extent(2))); p_type ext3(0, std::min(dst.extent(3), src.extent(3))); - using sv_adapter_type = - CommonSubview; + using sv_adapter_type = CommonSubview; sv_adapter_type common_subview(dst, src, Kokkos::ALL, ext1, ext2, ext3, Kokkos::ALL); view_copy(exec_space..., common_subview.dst_sub, @@ -1041,9 +1035,8 @@ struct ViewRemap { p_type ext2(0, std::min(dst.extent(2), src.extent(2))); p_type ext3(0, std::min(dst.extent(3), src.extent(3))); p_type ext4(0, std::min(dst.extent(4), src.extent(4))); - using sv_adapter_type = - CommonSubview; + using sv_adapter_type = CommonSubview; sv_adapter_type common_subview(dst, src, Kokkos::ALL, ext1, ext2, ext3, ext4); view_copy(exec_space..., common_subview.dst_sub, @@ -1055,9 +1048,8 @@ struct ViewRemap { p_type ext1(0, std::min(dst.extent(1), src.extent(1))); p_type ext2(0, std::min(dst.extent(2), src.extent(2))); p_type ext3(0, std::min(dst.extent(3), 
src.extent(3))); - using sv_adapter_type = - CommonSubview; + using sv_adapter_type = CommonSubview; sv_adapter_type common_subview(dst, src, ext0, ext1, ext2, ext3, Kokkos::ALL); view_copy(exec_space..., common_subview.dst_sub, @@ -1095,8 +1087,8 @@ struct ViewRemap { p_type ext3(0, std::min(dst.extent(3), src.extent(3))); p_type ext4(0, std::min(dst.extent(4), src.extent(4))); using sv_adapter_type = - CommonSubview; + CommonSubview; sv_adapter_type common_subview(dst, src, Kokkos::ALL, ext1, ext2, ext3, ext4, Kokkos::ALL); view_copy(exec_space..., common_subview.dst_sub, @@ -1108,8 +1100,8 @@ struct ViewRemap { p_type ext4(0, std::min(dst.extent(4), src.extent(4))); p_type ext5(0, std::min(dst.extent(5), src.extent(5))); using sv_adapter_type = - CommonSubview; + CommonSubview; sv_adapter_type common_subview(dst, src, Kokkos::ALL, ext1, ext2, ext3, ext4, ext5); view_copy(exec_space..., common_subview.dst_sub, @@ -1125,7 +1117,7 @@ struct ViewRemap { using sv_adapter_type = CommonSubview; + p_type, ALL_t>; sv_adapter_type common_subview(dst, src, ext0, ext1, ext2, ext3, ext4, Kokkos::ALL); view_copy(exec_space..., common_subview.dst_sub, @@ -1169,8 +1161,8 @@ struct ViewRemap { p_type ext4(0, std::min(dst.extent(4), src.extent(4))); p_type ext5(0, std::min(dst.extent(5), src.extent(5))); using sv_adapter_type = - CommonSubview; + CommonSubview; sv_adapter_type common_subview(dst, src, Kokkos::ALL, ext1, ext2, ext3, ext4, ext5, Kokkos::ALL); view_copy(exec_space..., common_subview.dst_sub, @@ -1183,8 +1175,8 @@ struct ViewRemap { p_type ext5(0, std::min(dst.extent(5), src.extent(5))); p_type ext6(0, std::min(dst.extent(6), src.extent(6))); using sv_adapter_type = - CommonSubview; + CommonSubview; sv_adapter_type common_subview(dst, src, Kokkos::ALL, ext1, ext2, ext3, ext4, ext5, ext6); view_copy(exec_space..., common_subview.dst_sub, @@ -1200,7 +1192,7 @@ struct ViewRemap { p_type ext5(0, std::min(dst.extent(5), src.extent(5))); using sv_adapter_type = 
CommonSubview; + p_type, p_type, ALL_t>; sv_adapter_type common_subview(dst, src, ext0, ext1, ext2, ext3, ext4, ext5, Kokkos::ALL); view_copy(exec_space..., common_subview.dst_sub, @@ -1245,9 +1237,8 @@ struct ViewRemap { p_type ext5(0, std::min(dst.extent(5), src.extent(5))); p_type ext6(0, std::min(dst.extent(6), src.extent(6))); using sv_adapter_type = - CommonSubview; + CommonSubview; sv_adapter_type common_subview(dst, src, Kokkos::ALL, ext1, ext2, ext3, ext4, ext5, ext6, Kokkos::ALL); view_copy(exec_space..., common_subview.dst_sub, @@ -1261,8 +1252,8 @@ struct ViewRemap { p_type ext6(0, std::min(dst.extent(6), src.extent(6))); p_type ext7(0, std::min(dst.extent(7), src.extent(7))); using sv_adapter_type = - CommonSubview; + CommonSubview; sv_adapter_type common_subview(dst, src, Kokkos::ALL, ext1, ext2, ext3, ext4, ext5, ext6, ext7); view_copy(exec_space..., common_subview.dst_sub, @@ -1279,7 +1270,7 @@ struct ViewRemap { p_type ext6(0, std::min(dst.extent(6), src.extent(6))); using sv_adapter_type = CommonSubview; + p_type, p_type, p_type, ALL_t>; sv_adapter_type common_subview(dst, src, ext0, ext1, ext2, ext3, ext4, ext5, ext6, Kokkos::ALL); view_copy(exec_space..., common_subview.dst_sub, diff --git a/core/src/Kokkos_View.hpp b/core/src/Kokkos_View.hpp index 85957ba8fa..7563a820a4 100644 --- a/core/src/Kokkos_View.hpp +++ b/core/src/Kokkos_View.hpp @@ -496,7 +496,7 @@ namespace Kokkos { namespace { -constexpr Kokkos::Impl::ALL_t ALL = Kokkos::Impl::ALL_t(); +constexpr Kokkos::ALL_t ALL = Kokkos::ALL_t(); constexpr Kokkos::Impl::WithoutInitializing_t WithoutInitializing = Kokkos::Impl::WithoutInitializing_t(); diff --git a/core/src/impl/Kokkos_ViewMapping.hpp b/core/src/impl/Kokkos_ViewMapping.hpp index b1c9e375b7..b395583f54 100644 --- a/core/src/impl/Kokkos_ViewMapping.hpp +++ b/core/src/impl/Kokkos_ViewMapping.hpp @@ -305,7 +305,7 @@ namespace Impl { template struct is_integral_extent_type { - enum : bool { value = std::is_same::value ? 
1 : 0 }; + enum : bool { value = std::is_same::value ? 1 : 0 }; }; template @@ -354,8 +354,7 @@ struct SubviewLegalArgsCompileTime::value)) || ((CurrentArg >= RankDest) && (std::is_integral::value)) || - ((CurrentArg < RankDest) && - (std::is_same::value)) || + ((CurrentArg < RankDest) && (std::is_same::value)) || ((CurrentArg == 0) && (Kokkos::Impl::is_integral_extent_type::value))) && (SubviewLegalArgsCompileTime::value)) || ((CurrentArg >= RankSrc - RankDest) && - (std::is_same::value))) && + (std::is_same::value))) && (SubviewLegalArgsCompileTime::value) @@ -397,8 +396,7 @@ template struct SubviewLegalArgsCompileTime { enum { - value = ((CurrentArg == RankSrc - 1) && - (std::is_same::value)) + value = ((CurrentArg == RankSrc - 1) && (std::is_same::value)) }; }; @@ -464,8 +462,7 @@ struct SubviewExtents { KOKKOS_FORCEINLINE_FUNCTION bool set(unsigned domain_rank, unsigned range_rank, const ViewDimension& dim, - const Kokkos::Impl::ALL_t, - Args... args) { + const ALL_t, Args... args) { m_begin[domain_rank] = 0; m_length[range_rank] = dim.extent(domain_rank); m_index[range_rank] = domain_rank; @@ -560,7 +557,7 @@ struct SubviewExtents { // std::pair range template void error(char* buf, int buf_len, unsigned domain_rank, unsigned range_rank, - const ViewDimension& dim, const Kokkos::Impl::ALL_t, + const ViewDimension& dim, const ALL_t, Args... args) const { const int n = std::min(buf_len, snprintf(buf, buf_len, " Kokkos::ALL %c", int(sizeof...(Args) ? 
',' : ')'))); diff --git a/core/unit_test/TestViewSubview.hpp b/core/unit_test/TestViewSubview.hpp index f1cf9e4bab..f33b5611bf 100644 --- a/core/unit_test/TestViewSubview.hpp +++ b/core/unit_test/TestViewSubview.hpp @@ -1471,30 +1471,30 @@ void test_3d_subview_5d_impl_layout() { } inline void test_subview_legal_args_right() { + ASSERT_EQ( + 0, (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::ALL_t, + Kokkos::ALL_t, Kokkos::pair, int, int>::value)); + ASSERT_EQ( + 0, (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::ALL_t, + Kokkos::ALL_t, Kokkos::ALL_t, int, int>::value)); + ASSERT_EQ( + 0, (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::ALL_t, + Kokkos::pair, Kokkos::pair, int, int>::value)); + ASSERT_EQ( + 0, (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::ALL_t, + Kokkos::pair, Kokkos::ALL_t, int, int>::value)); ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, - Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, - Kokkos::pair, int, int>::value)); - ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< - Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, - Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, - Kokkos::Impl::ALL_t, int, int>::value)); - ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< - Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, - Kokkos::Impl::ALL_t, Kokkos::pair, - Kokkos::pair, int, int>::value)); - ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< - Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, - Kokkos::Impl::ALL_t, Kokkos::pair, - Kokkos::Impl::ALL_t, int, int>::value)); - ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< - Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, - Kokkos::pair, Kokkos::Impl::ALL_t, + Kokkos::pair, Kokkos::ALL_t, 
Kokkos::pair, int, int>::value)); ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, - Kokkos::pair, Kokkos::Impl::ALL_t, - Kokkos::Impl::ALL_t, int, int>::value)); + Kokkos::pair, Kokkos::ALL_t, Kokkos::ALL_t, int, + int>::value)); ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::pair, Kokkos::pair, @@ -1502,98 +1502,101 @@ inline void test_subview_legal_args_right() { ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::pair, Kokkos::pair, - Kokkos::Impl::ALL_t, int, int>::value)); + Kokkos::ALL_t, int, int>::value)); + ASSERT_EQ( + 0, (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::ALL_t, + int, Kokkos::ALL_t, Kokkos::pair, int>::value)); + ASSERT_EQ( + 0, (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::ALL_t, + int, Kokkos::ALL_t, Kokkos::ALL_t, int>::value)); + ASSERT_EQ( + 0, (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::ALL_t, + int, Kokkos::pair, Kokkos::pair, int>::value)); + ASSERT_EQ( + 0, (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::ALL_t, + int, Kokkos::pair, Kokkos::ALL_t, int>::value)); ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, - Kokkos::Impl::ALL_t, int, Kokkos::Impl::ALL_t, - Kokkos::pair, int>::value)); - ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< - Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, - Kokkos::Impl::ALL_t, int, Kokkos::Impl::ALL_t, - Kokkos::Impl::ALL_t, int>::value)); - ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< - Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, - Kokkos::Impl::ALL_t, int, Kokkos::pair, - Kokkos::pair, 
int>::value)); - ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< - Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, - Kokkos::Impl::ALL_t, int, Kokkos::pair, - Kokkos::Impl::ALL_t, int>::value)); - ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< - Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, - Kokkos::pair, int, Kokkos::Impl::ALL_t, + Kokkos::pair, int, Kokkos::ALL_t, Kokkos::pair, int>::value)); ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, - Kokkos::pair, int, Kokkos::Impl::ALL_t, - Kokkos::Impl::ALL_t, int>::value)); + Kokkos::pair, int, Kokkos::ALL_t, Kokkos::ALL_t, + int>::value)); ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::pair, int, Kokkos::pair, Kokkos::pair, int>::value)); ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, - Kokkos::pair, int, Kokkos::Impl::ALL_t, + Kokkos::pair, int, Kokkos::ALL_t, Kokkos::pair, int>::value)); + ASSERT_EQ( + 0, (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::ALL_t, + Kokkos::ALL_t, int, Kokkos::pair, int>::value)); + ASSERT_EQ( + 0, (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::ALL_t, + Kokkos::ALL_t, int, Kokkos::ALL_t, int>::value)); + ASSERT_EQ( + 0, (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::ALL_t, + Kokkos::pair, int, Kokkos::pair, int>::value)); + ASSERT_EQ( + 0, (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::ALL_t, + Kokkos::pair, int, Kokkos::ALL_t, int>::value)); ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, - Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int, - Kokkos::pair, int>::value)); - ASSERT_EQ(0, 
(Kokkos::Impl::SubviewLegalArgsCompileTime< - Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, - Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int, - Kokkos::Impl::ALL_t, int>::value)); - ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< - Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, - Kokkos::Impl::ALL_t, Kokkos::pair, int, - Kokkos::pair, int>::value)); - ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< - Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, - Kokkos::Impl::ALL_t, Kokkos::pair, int, - Kokkos::Impl::ALL_t, int>::value)); - ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< - Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, - Kokkos::pair, Kokkos::Impl::ALL_t, int, + Kokkos::pair, Kokkos::ALL_t, int, Kokkos::pair, int>::value)); ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, - Kokkos::pair, Kokkos::Impl::ALL_t, int, - Kokkos::Impl::ALL_t, int>::value)); + Kokkos::pair, Kokkos::ALL_t, int, Kokkos::ALL_t, + int>::value)); ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::pair, Kokkos::pair, int, Kokkos::pair, int>::value)); ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, - Kokkos::pair, Kokkos::Impl::ALL_t, int, + Kokkos::pair, Kokkos::ALL_t, int, Kokkos::pair, int>::value)); + ASSERT_EQ( + 0, + (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::ALL_t, + Kokkos::ALL_t, Kokkos::pair, int>::value)); ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, - Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, - Kokkos::pair, int>::value)); - ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< - Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, - Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, - Kokkos::Impl::ALL_t, int>::value)); - 
ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< - Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, - Kokkos::Impl::ALL_t, Kokkos::pair, - Kokkos::pair, int>::value)); - ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< - Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, - Kokkos::Impl::ALL_t, Kokkos::pair, - Kokkos::Impl::ALL_t, int>::value)); + Kokkos::ALL_t, Kokkos::ALL_t, Kokkos::ALL_t, int>::value)); + ASSERT_EQ( + 0, + (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::ALL_t, + Kokkos::pair, Kokkos::pair, int>::value)); + ASSERT_EQ( + 0, + (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::ALL_t, + Kokkos::pair, Kokkos::ALL_t, int>::value)); ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, - Kokkos::pair, Kokkos::Impl::ALL_t, + Kokkos::pair, Kokkos::ALL_t, Kokkos::pair, int>::value)); - ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< - Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, - Kokkos::pair, Kokkos::Impl::ALL_t, - Kokkos::Impl::ALL_t, int>::value)); + ASSERT_EQ( + 0, + (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, + Kokkos::pair, Kokkos::ALL_t, Kokkos::ALL_t, int>::value)); ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::pair, Kokkos::pair, @@ -1601,32 +1604,35 @@ inline void test_subview_legal_args_right() { ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::pair, Kokkos::pair, - Kokkos::Impl::ALL_t, int>::value)); + Kokkos::ALL_t, int>::value)); + ASSERT_EQ( + 0, + (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::ALL_t, + Kokkos::ALL_t, int, Kokkos::pair>::value)); 
ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, - Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int, - Kokkos::pair>::value)); - ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< - Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, - Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int, - Kokkos::Impl::ALL_t>::value)); - ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< - Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, - Kokkos::Impl::ALL_t, Kokkos::pair, int, - Kokkos::pair>::value)); - ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< - Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, - Kokkos::Impl::ALL_t, Kokkos::pair, int, - Kokkos::Impl::ALL_t>::value)); + Kokkos::ALL_t, Kokkos::ALL_t, int, Kokkos::ALL_t>::value)); + ASSERT_EQ( + 0, + (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::ALL_t, + Kokkos::pair, int, Kokkos::pair>::value)); + ASSERT_EQ( + 0, + (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::ALL_t, + Kokkos::pair, int, Kokkos::ALL_t>::value)); ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, - Kokkos::pair, Kokkos::Impl::ALL_t, int, + Kokkos::pair, Kokkos::ALL_t, int, Kokkos::pair>::value)); - ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< - Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, - Kokkos::pair, Kokkos::Impl::ALL_t, int, - Kokkos::Impl::ALL_t>::value)); + ASSERT_EQ( + 0, + (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, + Kokkos::pair, Kokkos::ALL_t, int, Kokkos::ALL_t>::value)); ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::pair, Kokkos::pair, int, @@ -1634,32 +1640,31 @@ inline void test_subview_legal_args_right() { 
ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::pair, Kokkos::pair, int, - Kokkos::Impl::ALL_t>::value)); + Kokkos::ALL_t>::value)); - ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< - Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, int, - Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, - Kokkos::pair>::value)); + ASSERT_EQ(0, + (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, int, + Kokkos::ALL_t, Kokkos::ALL_t, Kokkos::pair>::value)); ASSERT_EQ(1, (Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, int, - Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, - Kokkos::Impl::ALL_t>::value)); + Kokkos::ALL_t, Kokkos::ALL_t, Kokkos::ALL_t>::value)); ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, int, - Kokkos::Impl::ALL_t, Kokkos::pair, + Kokkos::ALL_t, Kokkos::pair, Kokkos::pair>::value)); + ASSERT_EQ(0, + (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, int, + Kokkos::ALL_t, Kokkos::pair, Kokkos::ALL_t>::value)); ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, int, - Kokkos::Impl::ALL_t, Kokkos::pair, - Kokkos::Impl::ALL_t>::value)); - ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< - Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, int, - Kokkos::pair, Kokkos::Impl::ALL_t, + Kokkos::pair, Kokkos::ALL_t, Kokkos::pair>::value)); - ASSERT_EQ(1, (Kokkos::Impl::SubviewLegalArgsCompileTime< - Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, int, - Kokkos::pair, Kokkos::Impl::ALL_t, - Kokkos::Impl::ALL_t>::value)); + ASSERT_EQ(1, + (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, int, + Kokkos::pair, Kokkos::ALL_t, Kokkos::ALL_t>::value)); 
ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, int, Kokkos::pair, Kokkos::pair, @@ -1667,36 +1672,35 @@ inline void test_subview_legal_args_right() { ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, int, Kokkos::pair, Kokkos::pair, - Kokkos::Impl::ALL_t>::value)); + Kokkos::ALL_t>::value)); ASSERT_EQ(1, (Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 3, 0, - Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, - Kokkos::Impl::ALL_t>::value)); - ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< - Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 3, 0, - Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, - Kokkos::pair>::value)); - ASSERT_EQ(1, (Kokkos::Impl::SubviewLegalArgsCompileTime< - Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 3, 0, - Kokkos::pair, Kokkos::Impl::ALL_t, - Kokkos::Impl::ALL_t>::value)); - ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< - Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 3, 0, - Kokkos::pair, Kokkos::Impl::ALL_t, - Kokkos::pair>::value)); - ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< - Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 3, 0, - Kokkos::Impl::ALL_t, Kokkos::pair, - Kokkos::Impl::ALL_t>::value)); + Kokkos::ALL_t, Kokkos::ALL_t, Kokkos::ALL_t>::value)); + ASSERT_EQ( + 0, (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 3, 0, Kokkos::ALL_t, + Kokkos::ALL_t, Kokkos::pair>::value)); + ASSERT_EQ(1, + (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 3, 0, + Kokkos::pair, Kokkos::ALL_t, Kokkos::ALL_t>::value)); ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 3, 0, - Kokkos::Impl::ALL_t, Kokkos::pair, + Kokkos::pair, Kokkos::ALL_t, Kokkos::pair>::value)); + ASSERT_EQ( + 0, (Kokkos::Impl::SubviewLegalArgsCompileTime< + 
Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 3, 0, Kokkos::ALL_t, + Kokkos::pair, Kokkos::ALL_t>::value)); + ASSERT_EQ( + 0, (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 3, 0, Kokkos::ALL_t, + Kokkos::pair, Kokkos::pair>::value)); ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 3, 0, Kokkos::pair, Kokkos::pair, - Kokkos::Impl::ALL_t>::value)); + Kokkos::ALL_t>::value)); ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 3, 0, Kokkos::pair, Kokkos::pair, @@ -1704,34 +1708,30 @@ inline void test_subview_legal_args_right() { } inline void test_subview_legal_args_left() { + ASSERT_EQ(1, + (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::ALL_t, + Kokkos::ALL_t, Kokkos::pair, int, int>::value)); + ASSERT_EQ(1, + (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::ALL_t, + Kokkos::ALL_t, Kokkos::ALL_t, int, int>::value)); ASSERT_EQ( - 1, - (Kokkos::Impl::SubviewLegalArgsCompileTime< - Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t, - Kokkos::Impl::ALL_t, Kokkos::pair, int, int>::value)); - ASSERT_EQ( - 1, - (Kokkos::Impl::SubviewLegalArgsCompileTime< - Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t, - Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int, int>::value)); - ASSERT_EQ( - 0, - (Kokkos::Impl::SubviewLegalArgsCompileTime< - Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t, - Kokkos::pair, Kokkos::pair, int, int>::value)); - ASSERT_EQ( - 0, - (Kokkos::Impl::SubviewLegalArgsCompileTime< - Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t, - Kokkos::pair, Kokkos::Impl::ALL_t, int, int>::value)); + 0, (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::ALL_t, + Kokkos::pair, Kokkos::pair, 
int, int>::value)); + ASSERT_EQ(0, + (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::ALL_t, + Kokkos::pair, Kokkos::ALL_t, int, int>::value)); ASSERT_EQ(1, (Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, - Kokkos::pair, Kokkos::Impl::ALL_t, + Kokkos::pair, Kokkos::ALL_t, Kokkos::pair, int, int>::value)); ASSERT_EQ(1, (Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, - Kokkos::pair, Kokkos::Impl::ALL_t, - Kokkos::Impl::ALL_t, int, int>::value)); + Kokkos::pair, Kokkos::ALL_t, Kokkos::ALL_t, int, + int>::value)); ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::pair, Kokkos::pair, @@ -1739,106 +1739,101 @@ inline void test_subview_legal_args_left() { ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::pair, Kokkos::pair, - Kokkos::Impl::ALL_t, int, int>::value)); - - ASSERT_EQ( - 0, - (Kokkos::Impl::SubviewLegalArgsCompileTime< - Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t, - int, Kokkos::Impl::ALL_t, Kokkos::pair, int>::value)); - ASSERT_EQ( - 0, - (Kokkos::Impl::SubviewLegalArgsCompileTime< - Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t, - int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int>::value)); - ASSERT_EQ( - 0, - (Kokkos::Impl::SubviewLegalArgsCompileTime< - Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t, - int, Kokkos::pair, Kokkos::pair, int>::value)); + Kokkos::ALL_t, int, int>::value)); + + ASSERT_EQ(0, + (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::ALL_t, + int, Kokkos::ALL_t, Kokkos::pair, int>::value)); + ASSERT_EQ(0, + (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::ALL_t, + int, Kokkos::ALL_t, 
Kokkos::ALL_t, int>::value)); ASSERT_EQ( - 0, - (Kokkos::Impl::SubviewLegalArgsCompileTime< - Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t, - int, Kokkos::pair, Kokkos::Impl::ALL_t, int>::value)); + 0, (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::ALL_t, + int, Kokkos::pair, Kokkos::pair, int>::value)); + ASSERT_EQ(0, + (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::ALL_t, + int, Kokkos::pair, Kokkos::ALL_t, int>::value)); ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, - Kokkos::pair, int, Kokkos::Impl::ALL_t, + Kokkos::pair, int, Kokkos::ALL_t, Kokkos::pair, int>::value)); ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, - Kokkos::pair, int, Kokkos::Impl::ALL_t, - Kokkos::Impl::ALL_t, int>::value)); + Kokkos::pair, int, Kokkos::ALL_t, Kokkos::ALL_t, + int>::value)); ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::pair, int, Kokkos::pair, Kokkos::pair, int>::value)); ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, - Kokkos::pair, int, Kokkos::Impl::ALL_t, + Kokkos::pair, int, Kokkos::ALL_t, Kokkos::pair, int>::value)); + ASSERT_EQ(0, + (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::ALL_t, + Kokkos::ALL_t, int, Kokkos::pair, int>::value)); + ASSERT_EQ(0, + (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::ALL_t, + Kokkos::ALL_t, int, Kokkos::ALL_t, int>::value)); ASSERT_EQ( - 0, - (Kokkos::Impl::SubviewLegalArgsCompileTime< - Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t, - Kokkos::Impl::ALL_t, int, Kokkos::pair, int>::value)); - ASSERT_EQ( - 0, - 
(Kokkos::Impl::SubviewLegalArgsCompileTime< - Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t, - Kokkos::Impl::ALL_t, int, Kokkos::Impl::ALL_t, int>::value)); - ASSERT_EQ( - 0, - (Kokkos::Impl::SubviewLegalArgsCompileTime< - Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t, - Kokkos::pair, int, Kokkos::pair, int>::value)); - ASSERT_EQ( - 0, - (Kokkos::Impl::SubviewLegalArgsCompileTime< - Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t, - Kokkos::pair, int, Kokkos::Impl::ALL_t, int>::value)); + 0, (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::ALL_t, + Kokkos::pair, int, Kokkos::pair, int>::value)); + ASSERT_EQ(0, + (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::ALL_t, + Kokkos::pair, int, Kokkos::ALL_t, int>::value)); ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, - Kokkos::pair, Kokkos::Impl::ALL_t, int, + Kokkos::pair, Kokkos::ALL_t, int, Kokkos::pair, int>::value)); ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, - Kokkos::pair, Kokkos::Impl::ALL_t, int, - Kokkos::Impl::ALL_t, int>::value)); + Kokkos::pair, Kokkos::ALL_t, int, Kokkos::ALL_t, + int>::value)); ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::pair, Kokkos::pair, int, Kokkos::pair, int>::value)); ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, - Kokkos::pair, Kokkos::Impl::ALL_t, int, + Kokkos::pair, Kokkos::ALL_t, int, Kokkos::pair, int>::value)); + ASSERT_EQ( + 0, + (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::ALL_t, + Kokkos::ALL_t, Kokkos::pair, int>::value)); ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< 
Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, - Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, - Kokkos::pair, int>::value)); - ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< - Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, - Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, - Kokkos::Impl::ALL_t, int>::value)); - ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< - Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, - Kokkos::Impl::ALL_t, Kokkos::pair, - Kokkos::pair, int>::value)); - ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< - Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, - Kokkos::Impl::ALL_t, Kokkos::pair, - Kokkos::Impl::ALL_t, int>::value)); + Kokkos::ALL_t, Kokkos::ALL_t, Kokkos::ALL_t, int>::value)); + ASSERT_EQ( + 0, + (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::ALL_t, + Kokkos::pair, Kokkos::pair, int>::value)); + ASSERT_EQ( + 0, + (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::ALL_t, + Kokkos::pair, Kokkos::ALL_t, int>::value)); ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, - Kokkos::pair, Kokkos::Impl::ALL_t, + Kokkos::pair, Kokkos::ALL_t, Kokkos::pair, int>::value)); - ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< - Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, - Kokkos::pair, Kokkos::Impl::ALL_t, - Kokkos::Impl::ALL_t, int>::value)); + ASSERT_EQ( + 0, + (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, + Kokkos::pair, Kokkos::ALL_t, Kokkos::ALL_t, int>::value)); ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::pair, Kokkos::pair, @@ -1846,32 +1841,35 @@ inline void test_subview_legal_args_left() { ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, 
Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::pair, Kokkos::pair, - Kokkos::Impl::ALL_t, int>::value)); + Kokkos::ALL_t, int>::value)); + ASSERT_EQ( + 0, + (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::ALL_t, + Kokkos::ALL_t, int, Kokkos::pair>::value)); ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, - Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int, - Kokkos::pair>::value)); - ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< - Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, - Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int, - Kokkos::Impl::ALL_t>::value)); - ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< - Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, - Kokkos::Impl::ALL_t, Kokkos::pair, int, - Kokkos::pair>::value)); - ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< - Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, - Kokkos::Impl::ALL_t, Kokkos::pair, int, - Kokkos::Impl::ALL_t>::value)); + Kokkos::ALL_t, Kokkos::ALL_t, int, Kokkos::ALL_t>::value)); + ASSERT_EQ( + 0, + (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::ALL_t, + Kokkos::pair, int, Kokkos::pair>::value)); + ASSERT_EQ( + 0, + (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::ALL_t, + Kokkos::pair, int, Kokkos::ALL_t>::value)); ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, - Kokkos::pair, Kokkos::Impl::ALL_t, int, + Kokkos::pair, Kokkos::ALL_t, int, Kokkos::pair>::value)); - ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< - Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, - Kokkos::pair, Kokkos::Impl::ALL_t, int, - Kokkos::Impl::ALL_t>::value)); + ASSERT_EQ( + 0, + (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutLeft, 
Kokkos::LayoutLeft, 3, 5, 0, int, + Kokkos::pair, Kokkos::ALL_t, int, Kokkos::ALL_t>::value)); ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::pair, Kokkos::pair, int, @@ -1879,32 +1877,31 @@ inline void test_subview_legal_args_left() { ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::pair, Kokkos::pair, int, - Kokkos::Impl::ALL_t>::value)); + Kokkos::ALL_t>::value)); + ASSERT_EQ(0, + (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, int, + Kokkos::ALL_t, Kokkos::ALL_t, Kokkos::pair>::value)); ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, int, - Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, - Kokkos::pair>::value)); + Kokkos::ALL_t, Kokkos::ALL_t, Kokkos::ALL_t>::value)); ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, int, - Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, - Kokkos::Impl::ALL_t>::value)); - ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< - Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, int, - Kokkos::Impl::ALL_t, Kokkos::pair, + Kokkos::ALL_t, Kokkos::pair, Kokkos::pair>::value)); + ASSERT_EQ(0, + (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, int, + Kokkos::ALL_t, Kokkos::pair, Kokkos::ALL_t>::value)); ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, int, - Kokkos::Impl::ALL_t, Kokkos::pair, - Kokkos::Impl::ALL_t>::value)); - ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< - Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, int, - Kokkos::pair, Kokkos::Impl::ALL_t, + Kokkos::pair, Kokkos::ALL_t, Kokkos::pair>::value)); - ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< - Kokkos::LayoutLeft, 
Kokkos::LayoutLeft, 3, 5, 0, int, int, - Kokkos::pair, Kokkos::Impl::ALL_t, - Kokkos::Impl::ALL_t>::value)); + ASSERT_EQ(0, + (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, int, + Kokkos::pair, Kokkos::ALL_t, Kokkos::ALL_t>::value)); ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, int, Kokkos::pair, Kokkos::pair, @@ -1912,40 +1909,35 @@ inline void test_subview_legal_args_left() { ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, int, Kokkos::pair, Kokkos::pair, - Kokkos::Impl::ALL_t>::value)); + Kokkos::ALL_t>::value)); - ASSERT_EQ( - 1, - (Kokkos::Impl::SubviewLegalArgsCompileTime< - Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 3, 0, Kokkos::Impl::ALL_t, - Kokkos::Impl::ALL_t, Kokkos::pair>::value)); - ASSERT_EQ( - 1, - (Kokkos::Impl::SubviewLegalArgsCompileTime< - Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 3, 0, Kokkos::Impl::ALL_t, - Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t>::value)); + ASSERT_EQ(1, + (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 3, 0, Kokkos::ALL_t, + Kokkos::ALL_t, Kokkos::pair>::value)); ASSERT_EQ(1, (Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 3, 0, - Kokkos::pair, Kokkos::Impl::ALL_t, - Kokkos::pair>::value)); + Kokkos::ALL_t, Kokkos::ALL_t, Kokkos::ALL_t>::value)); ASSERT_EQ(1, (Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 3, 0, - Kokkos::pair, Kokkos::Impl::ALL_t, - Kokkos::Impl::ALL_t>::value)); - ASSERT_EQ( - 0, - (Kokkos::Impl::SubviewLegalArgsCompileTime< - Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 3, 0, Kokkos::Impl::ALL_t, - Kokkos::pair, Kokkos::Impl::ALL_t>::value)); - ASSERT_EQ( - 0, - (Kokkos::Impl::SubviewLegalArgsCompileTime< - Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 3, 0, Kokkos::Impl::ALL_t, - Kokkos::pair, 
Kokkos::pair>::value)); + Kokkos::pair, Kokkos::ALL_t, + Kokkos::pair>::value)); + ASSERT_EQ(1, + (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 3, 0, + Kokkos::pair, Kokkos::ALL_t, Kokkos::ALL_t>::value)); + ASSERT_EQ(0, + (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 3, 0, Kokkos::ALL_t, + Kokkos::pair, Kokkos::ALL_t>::value)); + ASSERT_EQ(0, + (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 3, 0, Kokkos::ALL_t, + Kokkos::pair, Kokkos::pair>::value)); ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 3, 0, Kokkos::pair, Kokkos::pair, - Kokkos::Impl::ALL_t>::value)); + Kokkos::ALL_t>::value)); ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 3, 0, Kokkos::pair, Kokkos::pair, From e91f7e880916b4f217f32450f17ac966f835e4a4 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Tue, 24 Jan 2023 18:05:05 -0500 Subject: [PATCH 097/496] Guard using-declaration in Impl:: namespace with #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 --- core/src/impl/Kokkos_ViewMapping.hpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/core/src/impl/Kokkos_ViewMapping.hpp b/core/src/impl/Kokkos_ViewMapping.hpp index b395583f54..947aaa53d5 100644 --- a/core/src/impl/Kokkos_ViewMapping.hpp +++ b/core/src/impl/Kokkos_ViewMapping.hpp @@ -295,9 +295,11 @@ struct ALL_t { constexpr bool operator==(const ALL_t&) const { return true; } }; +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 namespace Impl { using Kokkos::ALL_t; } // namespace Impl +#endif } // namespace Kokkos namespace Kokkos { From 4519e4ca891259d6e0a4d39b5838cadee4a78bca Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Tue, 24 Jan 2023 18:07:43 -0500 Subject: [PATCH 098/496] Drop anonymous namespace around definitions of ALL, WithoutInitializing, and AllowPadding --- core/src/Kokkos_View.hpp | 12 +++--------- 1 file changed, 3 
insertions(+), 9 deletions(-) diff --git a/core/src/Kokkos_View.hpp b/core/src/Kokkos_View.hpp index 7563a820a4..fb03c10e48 100644 --- a/core/src/Kokkos_View.hpp +++ b/core/src/Kokkos_View.hpp @@ -494,17 +494,11 @@ constexpr bool is_assignable(const Kokkos::View& dst, namespace Kokkos { -namespace { +inline constexpr Kokkos::ALL_t ALL{}; -constexpr Kokkos::ALL_t ALL = Kokkos::ALL_t(); +inline constexpr Kokkos::Impl::WithoutInitializing_t WithoutInitializing{}; -constexpr Kokkos::Impl::WithoutInitializing_t WithoutInitializing = - Kokkos::Impl::WithoutInitializing_t(); - -constexpr Kokkos::Impl::AllowPadding_t AllowPadding = - Kokkos::Impl::AllowPadding_t(); - -} // namespace +inline constexpr Kokkos::Impl::AllowPadding_t AllowPadding{}; /** \brief Create View allocation parameter bundle from argument list. * From 05f6a9aab37d9d6b022a3ae2c2c21a5b25affd0d Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Wed, 25 Jan 2023 08:56:41 -0500 Subject: [PATCH 099/496] Per review dropped superfluous const-qualifiers --- core/src/impl/Kokkos_ViewMapping.hpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/core/src/impl/Kokkos_ViewMapping.hpp b/core/src/impl/Kokkos_ViewMapping.hpp index 947aaa53d5..fd02c95fd7 100644 --- a/core/src/impl/Kokkos_ViewMapping.hpp +++ b/core/src/impl/Kokkos_ViewMapping.hpp @@ -464,7 +464,7 @@ struct SubviewExtents { KOKKOS_FORCEINLINE_FUNCTION bool set(unsigned domain_rank, unsigned range_rank, const ViewDimension& dim, - const ALL_t, Args... args) { + ALL_t, Args... args) { m_begin[domain_rank] = 0; m_length[range_rank] = dim.extent(domain_rank); m_index[range_rank] = domain_rank; @@ -559,8 +559,7 @@ struct SubviewExtents { // std::pair range template void error(char* buf, int buf_len, unsigned domain_rank, unsigned range_rank, - const ViewDimension& dim, const ALL_t, - Args... args) const { + const ViewDimension& dim, ALL_t, Args... 
args) const { const int n = std::min(buf_len, snprintf(buf, buf_len, " Kokkos::ALL %c", int(sizeof...(Args) ? ',' : ')'))); From 6d90db37bc5b0f18dee459ebdb1f833f044093c8 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Wed, 25 Jan 2023 09:48:18 -0500 Subject: [PATCH 100/496] Move all SYCL headers into SYCL directory --- core/src/SYCL/Kokkos_SYCL.cpp | 2 +- core/src/{ => SYCL}/Kokkos_SYCL.hpp | 2 +- core/src/SYCL/Kokkos_SYCL_DeepCopy.hpp | 2 +- core/src/SYCL/Kokkos_SYCL_Space.cpp | 4 ++-- core/src/{ => SYCL}/Kokkos_SYCL_Space.hpp | 0 core/src/SYCL/Kokkos_SYCL_UniqueToken.hpp | 2 +- core/src/decl/Kokkos_Declare_SYCL.hpp | 2 +- 7 files changed, 7 insertions(+), 7 deletions(-) rename core/src/{ => SYCL}/Kokkos_SYCL.hpp (99%) rename core/src/{ => SYCL}/Kokkos_SYCL_Space.hpp (100%) diff --git a/core/src/SYCL/Kokkos_SYCL.cpp b/core/src/SYCL/Kokkos_SYCL.cpp index e38b011c89..d786385454 100644 --- a/core/src/SYCL/Kokkos_SYCL.cpp +++ b/core/src/SYCL/Kokkos_SYCL.cpp @@ -20,7 +20,7 @@ #include #include -#include +#include #include #include #include diff --git a/core/src/Kokkos_SYCL.hpp b/core/src/SYCL/Kokkos_SYCL.hpp similarity index 99% rename from core/src/Kokkos_SYCL.hpp rename to core/src/SYCL/Kokkos_SYCL.hpp index 0f8e744eb6..e87e6bd0cf 100644 --- a/core/src/Kokkos_SYCL.hpp +++ b/core/src/SYCL/Kokkos_SYCL.hpp @@ -31,7 +31,7 @@ static_assert(false, #else #include #endif -#include +#include #include #include #include diff --git a/core/src/SYCL/Kokkos_SYCL_DeepCopy.hpp b/core/src/SYCL/Kokkos_SYCL_DeepCopy.hpp index 62b7977fcc..c8285584b3 100644 --- a/core/src/SYCL/Kokkos_SYCL_DeepCopy.hpp +++ b/core/src/SYCL/Kokkos_SYCL_DeepCopy.hpp @@ -18,7 +18,7 @@ #define KOKKOS_SYCLDEEPCOPY_HPP #include -#include +#include #include diff --git a/core/src/SYCL/Kokkos_SYCL_Space.cpp b/core/src/SYCL/Kokkos_SYCL_Space.cpp index 6151b3eaaf..50ee3a3e11 100644 --- a/core/src/SYCL/Kokkos_SYCL_Space.cpp +++ b/core/src/SYCL/Kokkos_SYCL_Space.cpp @@ -21,8 +21,8 @@ #include #include -#include 
-#include +#include +#include #include #include #include diff --git a/core/src/Kokkos_SYCL_Space.hpp b/core/src/SYCL/Kokkos_SYCL_Space.hpp similarity index 100% rename from core/src/Kokkos_SYCL_Space.hpp rename to core/src/SYCL/Kokkos_SYCL_Space.hpp diff --git a/core/src/SYCL/Kokkos_SYCL_UniqueToken.hpp b/core/src/SYCL/Kokkos_SYCL_UniqueToken.hpp index 2f0a67b3dd..efd3f3bf4d 100644 --- a/core/src/SYCL/Kokkos_SYCL_UniqueToken.hpp +++ b/core/src/SYCL/Kokkos_SYCL_UniqueToken.hpp @@ -18,7 +18,7 @@ #define KOKKOS_SYCL_UNIQUE_TOKEN_HPP #include -#include +#include #include namespace Kokkos { diff --git a/core/src/decl/Kokkos_Declare_SYCL.hpp b/core/src/decl/Kokkos_Declare_SYCL.hpp index 0c8dddbeb3..aa884c1065 100644 --- a/core/src/decl/Kokkos_Declare_SYCL.hpp +++ b/core/src/decl/Kokkos_Declare_SYCL.hpp @@ -18,7 +18,7 @@ #define KOKKOS_DECLARE_SYCL_HPP #if defined(KOKKOS_ENABLE_SYCL) -#include +#include #include #include #include From 236e892a2884f0b2b7431d5ff680f9af9de57746 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Wed, 25 Jan 2023 09:57:48 -0500 Subject: [PATCH 101/496] Fixup GH Actions compiler warnings (#5780) * Enable compiler warnings in GH Actions builds * Disable warning about Intel C++ Compiler Classic being deprecated in GH Actions * deal with ICC warnings in GitHub CI * deal with icpx warnings in GitHub CI * Clean up continuous-integration-workflow.yml * Use -Werror in GitHub CI Co-authored-by: Daniel Arndt --- .github/workflows/continuous-integration-workflow.yml | 9 ++++----- core/perf_test/BenchmarkMain.cpp | 8 ++++++++ core/perf_test/Benchmark_Context.hpp | 8 ++++++++ core/perf_test/PerfTest_ViewCopy.hpp | 10 ++++++++++ core/src/Kokkos_Macros.hpp | 4 ++-- 5 files changed, 32 insertions(+), 7 deletions(-) diff --git a/.github/workflows/continuous-integration-workflow.yml b/.github/workflows/continuous-integration-workflow.yml index 3f7c22afd0..0715911f31 100644 --- a/.github/workflows/continuous-integration-workflow.yml +++ 
b/.github/workflows/continuous-integration-workflow.yml @@ -19,26 +19,24 @@ jobs: include: - distro: 'fedora:intel' cxx: 'icpc' + cxx_extra_flags: '-diag-disable=177,10441' cmake_build_type: 'Release' backend: 'OPENMP' - clang-tidy: '' - distro: 'fedora:intel' cxx: 'icpc' + cxx_extra_flags: '-diag-disable=177,10441' cmake_build_type: 'Debug' backend: 'OPENMP' - clang-tidy: '' - distro: 'fedora:intel' cxx: 'icpx' cxx_extra_flags: '-fp-model=precise' cmake_build_type: 'Release' backend: 'OPENMP' - clang-tidy: '' - distro: 'fedora:intel' cxx: 'icpx' cxx_extra_flags: '-fp-model=precise' cmake_build_type: 'Debug' backend: 'OPENMP' - clang-tidy: '' - distro: 'ubuntu:latest' cxx: 'clang++' cmake_build_type: 'RelWithDebInfo' @@ -101,7 +99,8 @@ jobs: -DKokkos_ENABLE_EXAMPLES=ON \ -DKokkos_ENABLE_DEPRECATED_CODE_4=ON \ -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \ - -DCMAKE_CXX_FLAGS="${{ matrix.cxx_extra_flags }}" \ + -DKokkos_ENABLE_COMPILER_WARNINGS=ON \ + -DCMAKE_CXX_FLAGS="-Werror ${{ matrix.cxx_extra_flags }}" \ -DCMAKE_CXX_COMPILER=${{ matrix.cxx }} \ -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \ -DCMAKE_BUILD_TYPE=${{ matrix.cmake_build_type }} diff --git a/core/perf_test/BenchmarkMain.cpp b/core/perf_test/BenchmarkMain.cpp index c28eb80e1b..ac252b23a7 100644 --- a/core/perf_test/BenchmarkMain.cpp +++ b/core/perf_test/BenchmarkMain.cpp @@ -14,7 +14,15 @@ // //@HEADER +// Avoid deprecation warning for ICC +#ifdef __INTEL_COMPILER +#pragma warning(push) +#pragma warning(disable : 1786) #include +#pragma warning(pop) +#else +#include +#endif #include "Benchmark_Context.hpp" #include diff --git a/core/perf_test/Benchmark_Context.hpp b/core/perf_test/Benchmark_Context.hpp index ba6eff6646..e6c086561a 100644 --- a/core/perf_test/Benchmark_Context.hpp +++ b/core/perf_test/Benchmark_Context.hpp @@ -19,7 +19,15 @@ #include +// Avoid deprecation warning for ICC +#ifdef __INTEL_COMPILER +#pragma warning(push) +#pragma warning(disable : 1786) #include +#pragma warning(pop) +#else 
+#include +#endif #include #include diff --git a/core/perf_test/PerfTest_ViewCopy.hpp b/core/perf_test/PerfTest_ViewCopy.hpp index b0216ca6fc..573237a447 100644 --- a/core/perf_test/PerfTest_ViewCopy.hpp +++ b/core/perf_test/PerfTest_ViewCopy.hpp @@ -18,7 +18,17 @@ #define KOKKOS_CORE_PERFTEST_BENCHMARK_VIEW_COPY_HPP #include + +// Avoid deprecation warning for ICC +#ifdef __INTEL_COMPILER +#pragma warning(push) +#pragma warning(disable : 1786) +#include +#pragma warning(pop) +#else #include +#endif + #include namespace Test { diff --git a/core/src/Kokkos_Macros.hpp b/core/src/Kokkos_Macros.hpp index f251c2dab9..c1bd64b652 100644 --- a/core/src/Kokkos_Macros.hpp +++ b/core/src/Kokkos_Macros.hpp @@ -183,8 +183,8 @@ // Intel compiler macros #if defined(KOKKOS_COMPILER_INTEL) -// FIXME_SYCL -#if !defined(KOKKOS_ENABLE_SYCL) +// FIXME_ICPX +#if !defined(__INTEL_LLVM_COMPILER) #define KOKKOS_ENABLE_PRAGMA_UNROLL 1 #define KOKKOS_ENABLE_PRAGMA_LOOPCOUNT 1 #define KOKKOS_ENABLE_PRAGMA_VECTOR 1 From ddefe61809380bc8205018a72ec5d16446e2f32d Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Wed, 25 Jan 2023 10:08:33 -0500 Subject: [PATCH 102/496] Issue warnings when using Kokkos::Impl::ALL_t Co-authored-by: Daniel Arndt --- core/src/impl/Kokkos_ViewMapping.hpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/core/src/impl/Kokkos_ViewMapping.hpp b/core/src/impl/Kokkos_ViewMapping.hpp index fd02c95fd7..be67f64f09 100644 --- a/core/src/impl/Kokkos_ViewMapping.hpp +++ b/core/src/impl/Kokkos_ViewMapping.hpp @@ -297,7 +297,8 @@ struct ALL_t { #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 namespace Impl { -using Kokkos::ALL_t; +using ALL_t KOKKOS_DEPRECATED_WITH_COMMENT("Use Kokkos::ALL_t instead!") = + Kokkos::ALL_t; } // namespace Impl #endif } // namespace Kokkos From adb3141e8d99dcc6449d9b8bab143662d761265c Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Wed, 25 Jan 2023 12:30:14 -0500 Subject: [PATCH 103/496] Drop desul_* helper functions in tasking Use desul atomics 
directly instead --- core/src/Kokkos_Atomic.hpp | 45 ------------------- core/src/Kokkos_TaskScheduler.hpp | 11 +++-- core/src/impl/Kokkos_TaskBase.hpp | 10 ++--- core/src/impl/Kokkos_TaskNode.hpp | 5 +-- core/src/impl/Kokkos_TaskQueue.hpp | 5 +-- core/src/impl/Kokkos_TaskQueueCommon.hpp | 10 ++--- .../impl/Kokkos_TaskQueueMemoryManager.hpp | 12 ++--- core/src/impl/Kokkos_TaskQueueMultiple.hpp | 14 +++--- core/src/impl/Kokkos_TaskQueue_impl.hpp | 43 +++++++++--------- 9 files changed, 50 insertions(+), 105 deletions(-) diff --git a/core/src/Kokkos_Atomic.hpp b/core/src/Kokkos_Atomic.hpp index 57d189cfcf..6fc903f274 100644 --- a/core/src/Kokkos_Atomic.hpp +++ b/core/src/Kokkos_Atomic.hpp @@ -48,51 +48,6 @@ #include #include -#include - -// Helper functions for places where we really should have called SeqCst atomics -// anyway These can go away when we call desul unconditionally Non-Desul -// versions are below -namespace Kokkos { -namespace Impl { -using desul::MemoryOrderSeqCst; -using desul::MemoryScopeDevice; - -template -KOKKOS_INLINE_FUNCTION void desul_atomic_dec(T* dest, MemoryOrderSeqCst, - MemoryScopeDevice) { - return desul::atomic_dec(const_cast(dest), desul::MemoryOrderSeqCst(), - desul::MemoryScopeDevice()); -} - -template -KOKKOS_INLINE_FUNCTION void desul_atomic_inc(T* dest, MemoryOrderSeqCst, - MemoryScopeDevice) { - return desul::atomic_inc(const_cast(dest), desul::MemoryOrderSeqCst(), - desul::MemoryScopeDevice()); -} - -template -KOKKOS_INLINE_FUNCTION T -desul_atomic_exchange(T* dest, const Kokkos::Impl::type_identity_t val, - MemoryOrderSeqCst, MemoryScopeDevice) { - return desul::atomic_exchange(const_cast(dest), val, - desul::MemoryOrderSeqCst(), - desul::MemoryScopeDevice()); -} - -template -KOKKOS_INLINE_FUNCTION T desul_atomic_compare_exchange( - T* dest, Kokkos::Impl::type_identity_t compare, - Kokkos::Impl::type_identity_t val, MemoryOrderSeqCst, - MemoryScopeDevice) { - return desul::atomic_compare_exchange(dest, compare, val, - 
desul::MemoryOrderSeqCst(), - desul::MemoryScopeDevice()); -} - -} // namespace Impl -} // namespace Kokkos #ifdef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_ATOMIC #undef KOKKOS_IMPL_PUBLIC_INCLUDE diff --git a/core/src/Kokkos_TaskScheduler.hpp b/core/src/Kokkos_TaskScheduler.hpp index 690c845b30..869a5f8ec2 100644 --- a/core/src/Kokkos_TaskScheduler.hpp +++ b/core/src/Kokkos_TaskScheduler.hpp @@ -347,9 +347,8 @@ class BasicTaskScheduler : public Impl::TaskSchedulerBase { if (nullptr != t) { // Increment reference count to track subsequent assignment. // This likely has to be SeqCst - Kokkos::Impl::desul_atomic_inc(&(t->m_ref_count), - Kokkos::Impl::MemoryOrderSeqCst(), - Kokkos::Impl::MemoryScopeDevice()); + desul::atomic_inc(&(t->m_ref_count), desul::MemoryOrderSeqCst(), + desul::MemoryScopeDevice()); if (q != static_cast(t->m_queue)) { Kokkos::abort( "Kokkos when_all Futures must be in the same scheduler"); @@ -445,9 +444,9 @@ class BasicTaskScheduler : public Impl::TaskSchedulerBase { //} // Increment reference count to track subsequent assignment. // This increment likely has to be SeqCst - Kokkos::Impl::desul_atomic_inc(&(arg_f.m_task->m_ref_count), - Kokkos::Impl::MemoryOrderSeqCst(), - Kokkos::Impl::MemoryScopeDevice()); + desul::atomic_inc(&(arg_f.m_task->m_ref_count), + desul::MemoryOrderSeqCst(), + desul::MemoryScopeDevice()); dep[i] = arg_f.m_task; } } diff --git a/core/src/impl/Kokkos_TaskBase.hpp b/core/src/impl/Kokkos_TaskBase.hpp index ac99d07159..1c4c158217 100644 --- a/core/src/impl/Kokkos_TaskBase.hpp +++ b/core/src/impl/Kokkos_TaskBase.hpp @@ -174,17 +174,15 @@ class TaskBase { // Assign dependence to m_next. It will be processed in the subsequent // call to schedule. Error if the dependence is reset. 
- if (lock != Kokkos::Impl::desul_atomic_exchange( - &m_next, dep, Kokkos::Impl::MemoryOrderSeqCst(), - Kokkos::Impl::MemoryScopeDevice())) { + if (lock != desul::atomic_exchange(&m_next, dep, desul::MemoryOrderSeqCst(), + desul::MemoryScopeDevice())) { Kokkos::abort("TaskScheduler ERROR: resetting task dependence"); } if (nullptr != dep) { // The future may be destroyed upon returning from this call // so increment reference count to track this assignment. - Kokkos::Impl::desul_atomic_inc(&(dep->m_ref_count), - Kokkos::Impl::MemoryOrderSeqCst(), - Kokkos::Impl::MemoryScopeDevice()); + desul::atomic_inc(&(dep->m_ref_count), desul::MemoryOrderSeqCst(), + desul::MemoryScopeDevice()); } } diff --git a/core/src/impl/Kokkos_TaskNode.hpp b/core/src/impl/Kokkos_TaskNode.hpp index 789ba14696..81c874b5d9 100644 --- a/core/src/impl/Kokkos_TaskNode.hpp +++ b/core/src/impl/Kokkos_TaskNode.hpp @@ -131,9 +131,8 @@ class ReferenceCountedBase { KOKKOS_INLINE_FUNCTION void increment_reference_count() { - Kokkos::Impl::desul_atomic_inc(&m_ref_count, - Kokkos::Impl::MemoryOrderSeqCst(), - Kokkos::Impl::MemoryScopeDevice()); + desul::atomic_inc(&m_ref_count, desul::MemoryOrderSeqCst(), + desul::MemoryScopeDevice()); } }; diff --git a/core/src/impl/Kokkos_TaskQueue.hpp b/core/src/impl/Kokkos_TaskQueue.hpp index 5f182dc33f..8312fbc103 100644 --- a/core/src/impl/Kokkos_TaskQueue.hpp +++ b/core/src/impl/Kokkos_TaskQueue.hpp @@ -160,9 +160,8 @@ class TaskQueue : public TaskQueueBase { task_root_type* const rhs) { if (*lhs) decrement(*lhs); if (rhs) { - Kokkos::Impl::desul_atomic_inc(&rhs->m_ref_count, - Kokkos::Impl::MemoryOrderSeqCst(), - Kokkos::Impl::MemoryScopeDevice()); + desul::atomic_inc(&rhs->m_ref_count, desul::MemoryOrderSeqCst(), + desul::MemoryScopeDevice()); } // Force write of *lhs diff --git a/core/src/impl/Kokkos_TaskQueueCommon.hpp b/core/src/impl/Kokkos_TaskQueueCommon.hpp index 18dc3c757b..4ff9c6b700 100644 --- a/core/src/impl/Kokkos_TaskQueueCommon.hpp +++ 
b/core/src/impl/Kokkos_TaskQueueCommon.hpp @@ -129,17 +129,15 @@ class TaskQueueCommonMixin { KOKKOS_INLINE_FUNCTION void _increment_ready_count() { // TODO @tasking @memory_order DSH memory order - Kokkos::Impl::desul_atomic_inc(&this->m_ready_count, - Kokkos::Impl::MemoryOrderSeqCst(), - Kokkos::Impl::MemoryScopeDevice()); + desul::atomic_inc(&this->m_ready_count, desul::MemoryOrderSeqCst(), + desul::MemoryScopeDevice()); } KOKKOS_INLINE_FUNCTION void _decrement_ready_count() { // TODO @tasking @memory_order DSH memory order - Kokkos::Impl::desul_atomic_dec(&this->m_ready_count, - Kokkos::Impl::MemoryOrderSeqCst(), - Kokkos::Impl::MemoryScopeDevice()); + desul::atomic_dec(&this->m_ready_count, desul::MemoryOrderSeqCst(), + desul::MemoryScopeDevice()); } public: diff --git a/core/src/impl/Kokkos_TaskQueueMemoryManager.hpp b/core/src/impl/Kokkos_TaskQueueMemoryManager.hpp index aef919e834..e2bb9d2b61 100644 --- a/core/src/impl/Kokkos_TaskQueueMemoryManager.hpp +++ b/core/src/impl/Kokkos_TaskQueueMemoryManager.hpp @@ -73,9 +73,9 @@ class TaskQueueMemoryManager : public TaskQueueBase { } else { void* data = m_pool.allocate(static_cast(requested_size)); - Kokkos::Impl::desul_atomic_inc( - &m_count_alloc, Kokkos::Impl::MemoryOrderSeqCst(), - Kokkos::Impl::MemoryScopeDevice()); // TODO? memory_order_relaxed + desul::atomic_inc( + &m_count_alloc, desul::MemoryOrderSeqCst(), + desul::MemoryScopeDevice()); // TODO? memory_order_relaxed // TODO @tasking @minor DSH make this thread safe? (otherwise, it's just // an approximation, which is probably fine...) if (m_max_alloc < m_count_alloc) m_max_alloc = m_count_alloc; @@ -171,9 +171,9 @@ class TaskQueueMemoryManager : public TaskQueueBase { KOKKOS_INLINE_FUNCTION void deallocate( PoolAllocatedObjectBase&& obj) { m_pool.deallocate((void*)&obj, 1); - Kokkos::Impl::desul_atomic_dec( - &m_count_alloc, Kokkos::Impl::MemoryOrderSeqCst(), - Kokkos::Impl::MemoryScopeDevice()); // TODO? 
memory_order_relaxed + desul::atomic_dec( + &m_count_alloc, desul::MemoryOrderSeqCst(), + desul::MemoryScopeDevice()); // TODO? memory_order_relaxed } KOKKOS_INLINE_FUNCTION diff --git a/core/src/impl/Kokkos_TaskQueueMultiple.hpp b/core/src/impl/Kokkos_TaskQueueMultiple.hpp index e653f70fe9..39d70f331d 100644 --- a/core/src/impl/Kokkos_TaskQueueMultiple.hpp +++ b/core/src/impl/Kokkos_TaskQueueMultiple.hpp @@ -128,14 +128,12 @@ class TaskQueueMultiple : public TaskQueue { // task stolen. // first increment our ready count, then decrement the ready count // on the other queue: - Kokkos::Impl::desul_atomic_inc( - &this->m_ready_count, Kokkos::Impl::MemoryOrderSeqCst(), - Kokkos::Impl::MemoryScopeDevice()); // TODO? - // memory_order_relaxed - Kokkos::Impl::desul_atomic_dec( - &steal_from.m_ready_count, Kokkos::Impl::MemoryOrderSeqCst(), - Kokkos::Impl::MemoryScopeDevice()); // TODO? - // memory_order_relaxed + desul::atomic_inc( + &this->m_ready_count, desul::MemoryOrderSeqCst(), + desul::MemoryScopeDevice()); // TODO? memory_order_relaxed + desul::atomic_dec( + &steal_from.m_ready_count, desul::MemoryOrderSeqCst(), + desul::MemoryScopeDevice()); // TODO? memory_order_relaxed return rv; } } diff --git a/core/src/impl/Kokkos_TaskQueue_impl.hpp b/core/src/impl/Kokkos_TaskQueue_impl.hpp index 68ff36579f..074dc7bb98 100644 --- a/core/src/impl/Kokkos_TaskQueue_impl.hpp +++ b/core/src/impl/Kokkos_TaskQueue_impl.hpp @@ -119,9 +119,9 @@ KOKKOS_FUNCTION void *TaskQueue::allocate(size_t n) { void *const p = m_memory.allocate(n); if (p) { - Kokkos::Impl::desul_atomic_inc( - &m_count_alloc, Kokkos::Impl::MemoryOrderSeqCst(), - Kokkos::Impl::MemoryScopeDevice()); // TODO? memory_order_relaxed + desul::atomic_inc( + &m_count_alloc, desul::MemoryOrderSeqCst(), + desul::MemoryScopeDevice()); // TODO? 
memory_order_relaxed // if ( m_max_alloc < m_count_alloc ) m_max_alloc = m_count_alloc ; } @@ -133,9 +133,8 @@ template KOKKOS_FUNCTION void TaskQueue::deallocate(void *p, size_t n) { m_memory.deallocate(p, n); - Kokkos::Impl::desul_atomic_dec( - &m_count_alloc, Kokkos::Impl::MemoryOrderSeqCst(), - Kokkos::Impl::MemoryScopeDevice()); // TODO? memory_order_relaxed + desul::atomic_dec(&m_count_alloc, desul::MemoryOrderSeqCst(), + desul::MemoryScopeDevice()); // TODO? memory_order_relaxed } //---------------------------------------------------------------------------- @@ -186,9 +185,9 @@ KOKKOS_FUNCTION bool TaskQueue::push_task( // *queue = task; // } // old_head = *queue; - old_head = Kokkos::Impl::desul_atomic_compare_exchange( + old_head = desul::atomic_compare_exchange( const_cast(queue), old_head, task, - Kokkos::Impl::MemoryOrderSeqCst(), Kokkos::Impl::MemoryScopeDevice()); + desul::MemoryOrderSeqCst(), desul::MemoryScopeDevice()); if (old_head_tmp == old_head) return true; } @@ -237,9 +236,9 @@ TaskQueue::pop_ready_task( task_root_type *const x = task; // task = Kokkos::atomic_compare_exchange(queue, x, lock); - task = Kokkos::Impl::desul_atomic_compare_exchange( - const_cast(queue), x, lock, - Kokkos::Impl::MemoryOrderSeqCst(), Kokkos::Impl::MemoryScopeDevice()); + task = desul::atomic_compare_exchange(const_cast(queue), + x, lock, desul::MemoryOrderSeqCst(), + desul::MemoryScopeDevice()); if (x == task) { // CAS succeeded and queue is locked @@ -383,9 +382,9 @@ KOKKOS_FUNCTION void TaskQueue::schedule_runnable( // to track number of ready + executing tasks. // The ready count will be decremented when the task is complete. - Kokkos::Impl::desul_atomic_inc( - &m_ready_count, Kokkos::Impl::MemoryOrderSeqCst(), - Kokkos::Impl::MemoryScopeDevice()); // TODO? memory_order_relaxed + desul::atomic_inc( + &m_ready_count, desul::MemoryOrderSeqCst(), + desul::MemoryScopeDevice()); // TODO? 
memory_order_relaxed task_root_type *volatile *const ready_queue = &m_ready[t.m_priority][t.m_task_type]; @@ -538,9 +537,9 @@ KOKKOS_FUNCTION void TaskQueue::reschedule( task_root_type *const zero = nullptr; task_root_type *const lock = (task_root_type *)task_root_type::LockTag; - if (lock != Kokkos::Impl::desul_atomic_exchange( - &task->m_next, zero, Kokkos::Impl::MemoryOrderSeqCst(), - Kokkos::Impl::MemoryScopeDevice())) { + if (lock != desul::atomic_exchange(&task->m_next, zero, + desul::MemoryOrderSeqCst(), + desul::MemoryScopeDevice())) { Kokkos::abort("TaskScheduler::respawn ERROR: already respawned"); } } @@ -587,9 +586,9 @@ KOKKOS_FUNCTION void TaskQueue::complete( // Stop other tasks from adding themselves to this task's wait queue // by locking the head of this task's wait queue. - task_root_type *x = Kokkos::Impl::desul_atomic_exchange( + task_root_type *x = desul::atomic_exchange( const_cast(&t.m_wait), lock, - Kokkos::Impl::MemoryOrderSeqCst(), Kokkos::Impl::MemoryScopeDevice()); + desul::MemoryOrderSeqCst(), desul::MemoryScopeDevice()); if (x != (task_root_type *)lock) { // This thread has transitioned this 'task' to complete. @@ -632,9 +631,9 @@ KOKKOS_FUNCTION void TaskQueue::complete( // A runnable task was popped from a ready queue and executed. // If respawned into a ready queue then the ready count was incremented // so decrement whether respawned or not. - Kokkos::Impl::desul_atomic_dec( - &m_ready_count, Kokkos::Impl::MemoryOrderSeqCst(), - Kokkos::Impl::MemoryScopeDevice()); // TODO? memory_order_relaxed + desul::atomic_dec( + &m_ready_count, desul::MemoryOrderSeqCst(), + desul::MemoryScopeDevice()); // TODO? 
memory_order_relaxed } } From b0be8e6c7dcc9bf388c17c05be4083d77970c6ad Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Wed, 25 Jan 2023 16:35:49 -0500 Subject: [PATCH 104/496] Disable tests failing with SYCL+Cuda after update to oneAPI 2023.0.0 --- algorithms/unit_tests/TestStdAlgorithmsCopyIf.cpp | 3 +++ algorithms/unit_tests/TestStdAlgorithmsRemove.cpp | 3 +++ .../unit_tests/TestStdAlgorithmsRemoveCopy.cpp | 3 +++ .../unit_tests/TestStdAlgorithmsRemoveCopyIf.cpp | 3 +++ algorithms/unit_tests/TestStdAlgorithmsRemoveIf.cpp | 3 +++ algorithms/unit_tests/TestStdAlgorithmsReplaceIf.cpp | 3 +++ algorithms/unit_tests/TestStdAlgorithmsRotate.cpp | 3 +++ algorithms/unit_tests/TestStdAlgorithmsShiftLeft.cpp | 3 +++ .../unit_tests/TestStdAlgorithmsShiftRight.cpp | 3 +++ algorithms/unit_tests/TestStdAlgorithmsUnique.cpp | 3 +++ .../unit_tests/TestStdAlgorithmsUniqueCopy.cpp | 3 +++ containers/unit_tests/TestScatterView.hpp | 7 +++++++ core/perf_test/PerfTest_ExecSpacePartitioning.cpp | 12 ++++++++++++ core/unit_test/TestCrs.hpp | 6 ++++++ core/unit_test/sycl/TestSYCL_TeamScratchStreams.cpp | 8 +++++--- 15 files changed, 63 insertions(+), 3 deletions(-) diff --git a/algorithms/unit_tests/TestStdAlgorithmsCopyIf.cpp b/algorithms/unit_tests/TestStdAlgorithmsCopyIf.cpp index e21d50f69b..a1d6548267 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsCopyIf.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsCopyIf.cpp @@ -270,6 +270,9 @@ void run_all_scenarios() { } TEST(std_algorithms_mod_seq_ops, copy_if) { +#if defined(KOKKOS_ENABLE_SYCL) && !defined(KOKKOS_ARCH_INTEL_GPU) + GTEST_SKIP() << "skipping for SYCL+Cuda"; +#endif run_all_scenarios(); run_all_scenarios(); } diff --git a/algorithms/unit_tests/TestStdAlgorithmsRemove.cpp b/algorithms/unit_tests/TestStdAlgorithmsRemove.cpp index 8832d71f95..e075ca78e0 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsRemove.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsRemove.cpp @@ -195,6 +195,9 @@ void run_all_scenarios() { } 
TEST(std_algorithms_mod_seq_ops, remove) { +#if defined(KOKKOS_ENABLE_SYCL) && !defined(KOKKOS_ARCH_INTEL_GPU) + GTEST_SKIP() << "skipping for SYCL+Cuda"; +#endif run_all_scenarios(); run_all_scenarios(); } diff --git a/algorithms/unit_tests/TestStdAlgorithmsRemoveCopy.cpp b/algorithms/unit_tests/TestStdAlgorithmsRemoveCopy.cpp index 949f8f60c9..59fd63a0b1 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsRemoveCopy.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsRemoveCopy.cpp @@ -224,6 +224,9 @@ void run_all_scenarios() { } TEST(std_algorithms_mod_seq_ops, remove_copy) { +#if defined(KOKKOS_ENABLE_SYCL) && !defined(KOKKOS_ARCH_INTEL_GPU) + GTEST_SKIP() << "skipping for SYCL+Cuda"; +#endif run_all_scenarios(); run_all_scenarios(); run_all_scenarios(); diff --git a/algorithms/unit_tests/TestStdAlgorithmsRemoveCopyIf.cpp b/algorithms/unit_tests/TestStdAlgorithmsRemoveCopyIf.cpp index 9dc1e4a7e1..c4d6e99f2a 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsRemoveCopyIf.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsRemoveCopyIf.cpp @@ -208,6 +208,9 @@ void run_all_scenarios() { } TEST(std_algorithms_mod_seq_ops, remove_copy_if) { +#if defined(KOKKOS_ENABLE_SYCL) && !defined(KOKKOS_ARCH_INTEL_GPU) + GTEST_SKIP() << "skipping for SYCL+Cuda"; +#endif run_all_scenarios(); run_all_scenarios(); } diff --git a/algorithms/unit_tests/TestStdAlgorithmsRemoveIf.cpp b/algorithms/unit_tests/TestStdAlgorithmsRemoveIf.cpp index e9d15f29d8..2e96f8727e 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsRemoveIf.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsRemoveIf.cpp @@ -192,6 +192,9 @@ void run_all_scenarios() { } TEST(std_algorithms_mod_seq_ops, remove_if) { +#if defined(KOKKOS_ENABLE_SYCL) && !defined(KOKKOS_ARCH_INTEL_GPU) + GTEST_SKIP() << "skipping for SYCL+Cuda"; +#endif run_all_scenarios(); run_all_scenarios(); } diff --git a/algorithms/unit_tests/TestStdAlgorithmsReplaceIf.cpp b/algorithms/unit_tests/TestStdAlgorithmsReplaceIf.cpp index f481144e1c..548eb347b2 
100644 --- a/algorithms/unit_tests/TestStdAlgorithmsReplaceIf.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsReplaceIf.cpp @@ -216,6 +216,9 @@ void run_all_scenarios() { } TEST(std_algorithms_replace_ops_test, replace_if) { +#if defined(KOKKOS_ENABLE_SYCL) && !defined(KOKKOS_ARCH_INTEL_GPU) + GTEST_SKIP() << "skipping for SYCL+Cuda"; +#endif run_all_scenarios(); run_all_scenarios(); run_all_scenarios(); diff --git a/algorithms/unit_tests/TestStdAlgorithmsRotate.cpp b/algorithms/unit_tests/TestStdAlgorithmsRotate.cpp index a5a6f99bac..4de968c07c 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsRotate.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsRotate.cpp @@ -234,6 +234,9 @@ void run_all_scenarios() { } TEST(std_algorithms_mod_seq_ops, rotate) { +#if defined(KOKKOS_ENABLE_SYCL) && !defined(KOKKOS_ARCH_INTEL_GPU) + GTEST_SKIP() << "skipping for SYCL+Cuda"; +#endif run_all_scenarios(); run_all_scenarios(); run_all_scenarios(); diff --git a/algorithms/unit_tests/TestStdAlgorithmsShiftLeft.cpp b/algorithms/unit_tests/TestStdAlgorithmsShiftLeft.cpp index 8e4ced9635..f3e3fc6260 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsShiftLeft.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsShiftLeft.cpp @@ -202,6 +202,9 @@ void run_all_scenarios() { } TEST(std_algorithms_mod_seq_ops, shift_left) { +#if defined(KOKKOS_ENABLE_SYCL) && !defined(KOKKOS_ARCH_INTEL_GPU) + GTEST_SKIP() << "skipping for SYCL+Cuda"; +#endif run_all_scenarios(); run_all_scenarios(); run_all_scenarios(); diff --git a/algorithms/unit_tests/TestStdAlgorithmsShiftRight.cpp b/algorithms/unit_tests/TestStdAlgorithmsShiftRight.cpp index a1614be027..d6b631ea7a 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsShiftRight.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsShiftRight.cpp @@ -206,6 +206,9 @@ void run_all_scenarios() { } TEST(std_algorithms_mod_seq_ops, shift_right) { +#if defined(KOKKOS_ENABLE_SYCL) && !defined(KOKKOS_ARCH_INTEL_GPU) + GTEST_SKIP() << "skipping for SYCL+Cuda"; +#endif 
run_all_scenarios(); run_all_scenarios(); run_all_scenarios(); diff --git a/algorithms/unit_tests/TestStdAlgorithmsUnique.cpp b/algorithms/unit_tests/TestStdAlgorithmsUnique.cpp index a810d31d82..636e5f15ba 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsUnique.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsUnique.cpp @@ -273,6 +273,9 @@ void run_all_scenarios() { } TEST(std_algorithms_mod_seq_ops, unique) { +#if defined(KOKKOS_ENABLE_SYCL) && !defined(KOKKOS_ARCH_INTEL_GPU) + GTEST_SKIP() << "skipping for SYCL+Cuda"; +#endif run_all_scenarios(); run_all_scenarios(); } diff --git a/algorithms/unit_tests/TestStdAlgorithmsUniqueCopy.cpp b/algorithms/unit_tests/TestStdAlgorithmsUniqueCopy.cpp index f609d8517e..9116ca263b 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsUniqueCopy.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsUniqueCopy.cpp @@ -322,6 +322,9 @@ void run_all_scenarios() { } TEST(std_algorithms_mod_seq_ops, unique_copy) { +#if defined(KOKKOS_ENABLE_SYCL) && !defined(KOKKOS_ARCH_INTEL_GPU) + GTEST_SKIP() << "skipping for SYCL+Cuda"; +#endif run_all_scenarios(); run_all_scenarios(); } diff --git a/containers/unit_tests/TestScatterView.hpp b/containers/unit_tests/TestScatterView.hpp index 347e914ea5..8dc3c423a7 100644 --- a/containers/unit_tests/TestScatterView.hpp +++ b/containers/unit_tests/TestScatterView.hpp @@ -758,6 +758,9 @@ void test_scatter_view(int64_t n) { } TEST(TEST_CATEGORY, scatterview) { +#if defined(KOKKOS_ENABLE_SYCL) && !defined(KOKKOS_ARCH_INTEL_GPU) + GTEST_SKIP() << "skipping for SYCL+Cuda"; +#endif test_scatter_view( 10); @@ -789,6 +792,10 @@ TEST(TEST_CATEGORY, scatterview) { } TEST(TEST_CATEGORY, scatterview_devicetype) { +#if defined(KOKKOS_ENABLE_SYCL) && !defined(KOKKOS_ARCH_INTEL_GPU) + GTEST_SKIP() << "skipping for SYCL+Cuda"; +#endif + using device_type = Kokkos::Device; diff --git a/core/perf_test/PerfTest_ExecSpacePartitioning.cpp b/core/perf_test/PerfTest_ExecSpacePartitioning.cpp index 48419c7ec6..2a07dfa286 
100644 --- a/core/perf_test/PerfTest_ExecSpacePartitioning.cpp +++ b/core/perf_test/PerfTest_ExecSpacePartitioning.cpp @@ -155,6 +155,10 @@ struct FunctorTeamReduce { }; TEST(default_exec, overlap_range_policy) { +#if defined(KOKKOS_ENABLE_SYCL) && !defined(KOKKOS_ARCH_INTEL_GPU) + GTEST_SKIP() << "skipping for SYCL+Cuda"; +#endif + int N = 2000; int M = 10000; int R = 10; @@ -316,6 +320,10 @@ TEST(default_exec, overlap_range_policy) { } TEST(default_exec, overlap_mdrange_policy) { +#if defined(KOKKOS_ENABLE_SYCL) && !defined(KOKKOS_ARCH_INTEL_GPU) + GTEST_SKIP() << "skipping for SYCL+Cuda"; +#endif + int N = 200; int M = 10000; int R = 10; @@ -495,6 +503,10 @@ TEST(default_exec, overlap_mdrange_policy) { } TEST(default_exec, overlap_team_policy) { +#if defined(KOKKOS_ENABLE_SYCL) && !defined(KOKKOS_ARCH_INTEL_GPU) + GTEST_SKIP() << "skipping for SYCL+Cuda"; +#endif + int N = 20; int M = 1000000; int R = 10; diff --git a/core/unit_test/TestCrs.hpp b/core/unit_test/TestCrs.hpp index 34fc4d0514..9efebb8a54 100644 --- a/core/unit_test/TestCrs.hpp +++ b/core/unit_test/TestCrs.hpp @@ -174,6 +174,9 @@ void test_constructor(std::int32_t nrows) { } // anonymous namespace TEST(TEST_CATEGORY, crs_count_fill) { +#if defined(KOKKOS_ENABLE_SYCL) && !defined(KOKKOS_ARCH_INTEL_GPU) + GTEST_SKIP() << "skipping for SYCL+Cuda"; +#endif test_count_fill(0); test_count_fill(1); test_count_fill(2); @@ -185,6 +188,9 @@ TEST(TEST_CATEGORY, crs_count_fill) { } TEST(TEST_CATEGORY, crs_copy_constructor) { +#if defined(KOKKOS_ENABLE_SYCL) && !defined(KOKKOS_ARCH_INTEL_GPU) + GTEST_SKIP() << "skipping for SYCL+Cuda"; +#endif test_constructor(0); test_constructor(1); test_constructor(2); diff --git a/core/unit_test/sycl/TestSYCL_TeamScratchStreams.cpp b/core/unit_test/sycl/TestSYCL_TeamScratchStreams.cpp index 11207a5480..8f2d6c68bd 100644 --- a/core/unit_test/sycl/TestSYCL_TeamScratchStreams.cpp +++ b/core/unit_test/sycl/TestSYCL_TeamScratchStreams.cpp @@ -76,13 +76,12 @@ void 
sycl_queue_scratch_test( Kokkos::Experimental::SYCL default_space; sycl::context default_context = default_space.sycl_queue().get_context(); - sycl::default_selector device_selector; - sycl::queue queue(default_context, device_selector); + sycl::queue queue(default_context, sycl::default_selector_v); std::array sycl; for (int i = 0; i < K; i++) { sycl[i] = Kokkos::Experimental::SYCL( - sycl::queue(default_context, device_selector)); + sycl::queue(default_context, sycl::default_selector_v)); } // Test that growing scratch size in subsequent calls doesn't crash things @@ -111,6 +110,9 @@ void sycl_queue_scratch_test( } // namespace Impl TEST(sycl, team_scratch_1_queues) { +#if defined(KOKKOS_ENABLE_SYCL) && !defined(KOKKOS_ARCH_INTEL_GPU) + GTEST_SKIP() << "skipping for SYCL+Cuda"; +#endif int N = 1000000; int T = 10; int M_base = 150; From 0180ff565b6cf8e265429c46b4c2e5666de81f07 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Wed, 25 Jan 2023 16:36:57 -0500 Subject: [PATCH 105/496] Update architecture flags for SYCL+Cuda --- cmake/kokkos_arch.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/kokkos_arch.cmake b/cmake/kokkos_arch.cmake index b051f8e3bd..16f742029d 100644 --- a/cmake/kokkos_arch.cmake +++ b/cmake/kokkos_arch.cmake @@ -704,7 +704,7 @@ IF (KOKKOS_ENABLE_SYCL) IF(CUDA_ARCH_ALREADY_SPECIFIED) IF(KOKKOS_ENABLE_UNSUPPORTED_ARCHS) COMPILER_SPECIFIC_FLAGS( - DEFAULT -fsycl-targets=nvptx64-nvidia-cuda -Xsycl-target-backend "${CUDA_ARCH_FLAG}=${KOKKOS_CUDA_ARCH_FLAG}" + DEFAULT -fsycl-targets=nvptx64-nvidia-cuda -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=${KOKKOS_CUDA_ARCH_FLAG} ) ELSE() MESSAGE(SEND_ERROR "Setting a CUDA architecture for SYCL is only allowed with Kokkos_ENABLE_UNSUPPORTED_ARCHS=ON!") From b5b05044c9c096de9e6bd92bac662c62f4485e83 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Wed, 25 Jan 2023 16:42:07 -0500 Subject: [PATCH 106/496] Update minimal compiler requirements for SYCL --- 
cmake/kokkos_compiler_id.cmake | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cmake/kokkos_compiler_id.cmake b/cmake/kokkos_compiler_id.cmake index 27a3102a6b..fc4c06ce2b 100644 --- a/cmake/kokkos_compiler_id.cmake +++ b/cmake/kokkos_compiler_id.cmake @@ -152,7 +152,7 @@ SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n Clang(CUDA) 10.0.0 or SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n GCC 8.2.0 or higher") SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n Intel 19.0.5 or higher") SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n IntelLLVM(CPU) 2021.1.1 or higher") -SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n IntelLLVM(SYCL) 2022.0.0 or higher") #FIXME +SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n IntelLLVM(SYCL) 2023.0.0 or higher") SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n NVCC 11.0.0 or higher") SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n HIPCC 5.2.0 or higher") SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n NVHPC/PGI 22.3 or higher") @@ -181,7 +181,7 @@ ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL IntelLLVM AND NOT Kokkos_ENABLE_SYCL) MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") ENDIF() ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL IntelLLVM AND Kokkos_ENABLE_SYCL) - IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 2022.0.0) #FIXME 2022.2.0 + IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 2023.0.0) MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") ENDIF() ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) From 05d008dd933959dd56d059b0881bacbe237e0488 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Wed, 25 Jan 2023 16:44:53 -0500 Subject: [PATCH 107/496] Address deprecations in oneAPI 2023.0.0 --- core/src/SYCL/Kokkos_SYCL.cpp | 17 ------- core/src/SYCL/Kokkos_SYCL_Parallel_Reduce.hpp | 28 +++-------- core/src/SYCL/Kokkos_SYCL_Parallel_Scan.hpp | 9 ++-- core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp | 50 ++++++++----------- core/unit_test/sycl/TestSYCL_InterOp_Init.cpp | 3 +- .../sycl/TestSYCL_InterOp_Init_Context.cpp | 6 +-- 
.../sycl/TestSYCL_InterOp_Streams.cpp | 3 +- 7 files changed, 36 insertions(+), 80 deletions(-) diff --git a/core/src/SYCL/Kokkos_SYCL.cpp b/core/src/SYCL/Kokkos_SYCL.cpp index e38b011c89..8ef4b13012 100644 --- a/core/src/SYCL/Kokkos_SYCL.cpp +++ b/core/src/SYCL/Kokkos_SYCL.cpp @@ -144,7 +144,6 @@ std::ostream& SYCL::impl_sycl_info(std::ostream& os, using namespace sycl::info; return os << "Name: " << device.get_info() << "\nDriver Version: " << device.get_info() - << "\nIs Host: " << device.is_host() << "\nIs CPU: " << device.is_cpu() << "\nIs GPU: " << device.is_gpu() << "\nIs Accelerator: " << device.is_accelerator() @@ -184,7 +183,6 @@ std::ostream& SYCL::impl_sycl_info(std::ostream& os, << "\nNative Vector Width Half: " << device.get_info() << "\nAddress Bits: " << device.get_info() - << "\nImage Support: " << device.get_info() << "\nMax Mem Alloc Size: " << device.get_info() << "\nMax Read Image Args: " @@ -217,26 +215,11 @@ std::ostream& SYCL::impl_sycl_info(std::ostream& os, << "\nLocal Mem Size: " << device.get_info() << "\nError Correction Support: " << device.get_info() - << "\nHost Unified Memory: " - << device.get_info() << "\nProfiling Timer Resolution: " << device.get_info() - << "\nIs Endian Little: " - << device.get_info() << "\nIs Available: " << device.get_info() - << "\nIs Compiler Available: " - << device.get_info() - << "\nIs Linker Available: " - << device.get_info() - << "\nQueue Profiling: " - << device.get_info() << "\nVendor: " << device.get_info() - << "\nProfile: " << device.get_info() << "\nVersion: " << device.get_info() - << "\nPrintf Buffer Size: " - << device.get_info() - << "\nPreferred Interop User Sync: " - << device.get_info() << "\nPartition Max Sub Devices: " << device.get_info() << "\nReference Count: " diff --git a/core/src/SYCL/Kokkos_SYCL_Parallel_Reduce.hpp b/core/src/SYCL/Kokkos_SYCL_Parallel_Reduce.hpp index c7959c1c1c..5144e57a71 100644 --- a/core/src/SYCL/Kokkos_SYCL_Parallel_Reduce.hpp +++ 
b/core/src/SYCL/Kokkos_SYCL_Parallel_Reduce.hpp @@ -293,12 +293,8 @@ class ParallelReduce, ReducerType, instance.scratch_flags(sizeof(unsigned int))); auto reduction_lambda_factory = - [&](sycl::accessor - local_mem, - sycl::accessor - num_teams_done, + [&](sycl::local_accessor local_mem, + sycl::local_accessor num_teams_done, sycl::device_ptr results_ptr) { const auto begin = policy.begin(); @@ -410,9 +406,7 @@ class ParallelReduce, ReducerType, }; auto parallel_reduce_event = q.submit([&](sycl::handler& cgh) { - sycl::accessor - num_teams_done(1, cgh); + sycl::local_accessor num_teams_done(1, cgh); auto dummy_reduction_lambda = reduction_lambda_factory({1, cgh}, num_teams_done, nullptr); @@ -453,10 +447,8 @@ class ParallelReduce, ReducerType, wgroup_size - 1) / wgroup_size; - sycl::accessor - local_mem(sycl::range<1>(wgroup_size) * std::max(value_count, 1u), - cgh); + sycl::local_accessor local_mem( + sycl::range<1>(wgroup_size) * std::max(value_count, 1u), cgh); cgh.depends_on(memcpy_events); @@ -665,13 +657,9 @@ class ParallelReduce, ReducerType, if (size > 1) { auto n_wgroups = (size + wgroup_size - 1) / wgroup_size; auto parallel_reduce_event = q.submit([&](sycl::handler& cgh) { - sycl::accessor - local_mem(sycl::range<1>(wgroup_size) * std::max(value_count, 1u), - cgh); - sycl::accessor - num_teams_done(1, cgh); + sycl::local_accessor local_mem( + sycl::range<1>(wgroup_size) * std::max(value_count, 1u), cgh); + sycl::local_accessor num_teams_done(1, cgh); const BarePolicy bare_policy = m_policy; diff --git a/core/src/SYCL/Kokkos_SYCL_Parallel_Scan.hpp b/core/src/SYCL/Kokkos_SYCL_Parallel_Scan.hpp index cf651ced95..76c73b3452 100644 --- a/core/src/SYCL/Kokkos_SYCL_Parallel_Scan.hpp +++ b/core/src/SYCL/Kokkos_SYCL_Parallel_Scan.hpp @@ -136,11 +136,10 @@ class ParallelScanSYCLBase { q.get_device() .template get_info() .front(); - sycl::accessor - local_mem(sycl::range<1>((wgroup_size + min_subgroup_size - 1) / - min_subgroup_size), - cgh); + 
sycl::local_accessor local_mem( + sycl::range<1>((wgroup_size + min_subgroup_size - 1) / + min_subgroup_size), + cgh); cgh.parallel_for( sycl::nd_range<1>(n_wgroups * wgroup_size, wgroup_size), diff --git a/core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp b/core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp index 601580b2d8..489180361f 100644 --- a/core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp +++ b/core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp @@ -398,12 +398,10 @@ class ParallelFor, auto parallel_for_event = q.submit([&](sycl::handler& cgh) { // FIXME_SYCL accessors seem to need a size greater than zero at least for // host queues - sycl::accessor - team_scratch_memory_L0( - sycl::range<1>( - std::max(m_scratch_size[0] + m_shmem_begin, size_t(1))), - cgh); + sycl::local_accessor team_scratch_memory_L0( + sycl::range<1>( + std::max(m_scratch_size[0] + m_shmem_begin, size_t(1))), + cgh); // Avoid capturing *this since it might not be trivially copyable const auto shmem_begin = m_shmem_begin; @@ -432,8 +430,7 @@ class ParallelFor, auto max_sg_size = kernel .get_info( - q.get_device(), - sycl::range<3>(m_team_size, m_vector_size, 1)); + q.get_device()); auto final_vector_size = std::min(m_vector_size, max_sg_size); // FIXME_SYCL For some reason, explicitly enforcing the kernel bundle to // be used gives a runtime error. 
@@ -592,12 +589,10 @@ class ParallelReduce, auto parallel_reduce_event = q.submit([&](sycl::handler& cgh) { // FIXME_SYCL accessors seem to need a size greater than zero at least // for host queues - sycl::accessor - team_scratch_memory_L0( - sycl::range<1>( - std::max(m_scratch_size[0] + m_shmem_begin, size_t(1))), - cgh); + sycl::local_accessor team_scratch_memory_L0( + sycl::range<1>( + std::max(m_scratch_size[0] + m_shmem_begin, size_t(1))), + cgh); // Avoid capturing *this since it might not be trivially copyable const auto shmem_begin = m_shmem_begin; @@ -645,12 +640,10 @@ class ParallelReduce, // FIXME_SYCL accessors seem to need a size greater than zero at least // for host queues - sycl::accessor - team_scratch_memory_L0( - sycl::range<1>( - std::max(m_scratch_size[0] + m_shmem_begin, size_t(1))), - cgh); + sycl::local_accessor team_scratch_memory_L0( + sycl::range<1>( + std::max(m_scratch_size[0] + m_shmem_begin, size_t(1))), + cgh); // Avoid capturing *this since it might not be trivially copyable const auto shmem_begin = m_shmem_begin; @@ -658,9 +651,7 @@ class ParallelReduce, sycl::device_ptr const global_scratch_ptr = m_global_scratch_ptr; auto team_reduction_factory = - [&](sycl::accessor - local_mem, + [&](sycl::local_accessor local_mem, sycl::device_ptr results_ptr) { sycl::global_ptr device_accessible_result_ptr = m_result_ptr_device_accessible ? m_result_ptr : nullptr; @@ -793,7 +784,7 @@ class ParallelReduce, }(); auto max_sg_size = kernel.get_info< sycl::info::kernel_device_specific::max_sub_group_size>( - q.get_device(), sycl::range<3>(m_team_size, m_vector_size, 1)); + q.get_device()); auto final_vector_size = std::min(m_vector_size, max_sg_size); // FIXME_SYCL For some reason, explicitly enforcing the kernel bundle to // be used gives a runtime error. 
@@ -802,12 +793,11 @@ class ParallelReduce, auto wgroup_size = m_team_size * final_vector_size; std::size_t size = std::size_t(m_league_size) * wgroup_size; - sycl::accessor - local_mem(sycl::range<1>(wgroup_size) * std::max(value_count, 1u) + - (sizeof(unsigned int) + sizeof(value_type) - 1) / - sizeof(value_type), - cgh); + sycl::local_accessor local_mem( + sycl::range<1>(wgroup_size) * std::max(value_count, 1u) + + (sizeof(unsigned int) + sizeof(value_type) - 1) / + sizeof(value_type), + cgh); const auto init_size = std::max((size + wgroup_size - 1) / wgroup_size, 1); diff --git a/core/unit_test/sycl/TestSYCL_InterOp_Init.cpp b/core/unit_test/sycl/TestSYCL_InterOp_Init.cpp index 25c5c9a50c..4b2530316d 100644 --- a/core/unit_test/sycl/TestSYCL_InterOp_Init.cpp +++ b/core/unit_test/sycl/TestSYCL_InterOp_Init.cpp @@ -29,8 +29,7 @@ TEST(sycl, raw_sycl_interop) { Kokkos::Experimental::SYCL default_space; sycl::context default_context = default_space.sycl_queue().get_context(); - sycl::default_selector device_selector; - sycl::queue queue(default_context, device_selector); + sycl::queue queue(default_context, sycl::default_selector_v); constexpr int n = 100; int* p = sycl::malloc_device(n, queue); { diff --git a/core/unit_test/sycl/TestSYCL_InterOp_Init_Context.cpp b/core/unit_test/sycl/TestSYCL_InterOp_Init_Context.cpp index 336a5d59c3..bbd3d2af94 100644 --- a/core/unit_test/sycl/TestSYCL_InterOp_Init_Context.cpp +++ b/core/unit_test/sycl/TestSYCL_InterOp_Init_Context.cpp @@ -27,8 +27,7 @@ TEST(sycl, raw_sycl_interop_context_1) { Kokkos::Experimental::SYCL default_space; sycl::context default_context = default_space.sycl_queue().get_context(); - sycl::default_selector device_selector; - sycl::queue queue(default_context, device_selector); + sycl::queue queue(default_context, sycl::default_selector_v); constexpr int n = 100; int* p = sycl::malloc_device(n, queue); @@ -61,8 +60,7 @@ TEST(sycl, raw_sycl_interop_context_2) { Kokkos::Experimental::SYCL default_space; 
sycl::context default_context = default_space.sycl_queue().get_context(); - sycl::default_selector device_selector; - sycl::queue queue(default_context, device_selector); + sycl::queue queue(default_context, sycl::default_selector_v); constexpr int n = 100; Kokkos::Experimental::SYCL space(queue); diff --git a/core/unit_test/sycl/TestSYCL_InterOp_Streams.cpp b/core/unit_test/sycl/TestSYCL_InterOp_Streams.cpp index 13810d861c..0cfaab8813 100644 --- a/core/unit_test/sycl/TestSYCL_InterOp_Streams.cpp +++ b/core/unit_test/sycl/TestSYCL_InterOp_Streams.cpp @@ -25,8 +25,7 @@ TEST(sycl, raw_sycl_queues) { Kokkos::Experimental::SYCL default_space; sycl::context default_context = default_space.sycl_queue().get_context(); - sycl::default_selector device_selector; - sycl::queue queue(default_context, device_selector); + sycl::queue queue(default_context, sycl::default_selector_v); int* p = sycl::malloc_device(100, queue); using MemorySpace = typename TEST_EXECSPACE::memory_space; From 61d8569fe7828a29549e1a0d1ea629d182afd86a Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Wed, 25 Jan 2023 16:46:05 -0500 Subject: [PATCH 108/496] Update Dockerfile used for SYCL+Cuda CI --- scripts/docker/Dockerfile.sycl | 37 +++++++++++++++++++++++++++++----- 1 file changed, 32 insertions(+), 5 deletions(-) diff --git a/scripts/docker/Dockerfile.sycl b/scripts/docker/Dockerfile.sycl index d7d764e8aa..1b93199918 100644 --- a/scripts/docker/Dockerfile.sycl +++ b/scripts/docker/Dockerfile.sycl @@ -47,7 +47,7 @@ RUN CMAKE_URL=https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSIO ENV PATH=${CMAKE_DIR}/bin:$PATH ENV SYCL_DIR=/opt/sycl -RUN SYCL_VERSION=20220112 && \ +RUN SYCL_VERSION=20221201 && \ SYCL_URL=https://github.com/intel/llvm/archive/sycl-nightly && \ SYCL_ARCHIVE=${SYCL_VERSION}.tar.gz && \ SCRATCH_DIR=/scratch && mkdir -p ${SCRATCH_DIR} && cd ${SCRATCH_DIR} && \ @@ -55,10 +55,37 @@ RUN SYCL_VERSION=20220112 && \ mkdir llvm && \ tar -xf ${SYCL_ARCHIVE} -C llvm 
--strip-components=1 && \ cd llvm && \ - python3 buildbot/configure.py --cuda && \ - python3 buildbot/compile.py && \ - mkdir -p ${SYCL_DIR} && \ - mv ${SCRATCH_DIR}/llvm/build/install/* ${SYCL_DIR} && \ + mkdir build && \ + cd build && \ + cmake -G Ninja \ + -DCMAKE_BUILD_TYPE=Release \ + -DLLVM_ENABLE_ASSERTIONS=ON \ + -DLLVM_TARGETS_TO_BUILD="X86;NVPTX" \ + -DLLVM_EXTERNAL_PROJECTS="sycl;llvm-spirv;opencl;xpti;xptifw;libdevice" \ + -DLLVM_EXTERNAL_SYCL_SOURCE_DIR=/scratch/llvm/sycl \ + -DLLVM_EXTERNAL_LLVM_SPIRV_SOURCE_DIR=/scratch/llvm/llvm-spirv \ + -DLLVM_EXTERNAL_XPTI_SOURCE_DIR=/scratch/llvm/xpti \ + -DXPTI_SOURCE_DIR=/scratch/llvm/xpti \ + -DLLVM_EXTERNAL_XPTIFW_SOURCE_DIR=/scratch/llvm/xptifw \ + -DLLVM_EXTERNAL_LIBDEVICE_SOURCE_DIR=/scratch/llvm/libdevice \ + -DLLVM_ENABLE_PROJECTS="clang;sycl;llvm-spirv;opencl;xpti;xptifw;libdevice;libclc" \ + -DLIBCLC_TARGETS_TO_BUILD=";nvptx64--;nvptx64--nvidiacl" \ + -DLIBCLC_GENERATE_REMANGLED_VARIANTS=ON \ + -DLLVM_BUILD_TOOLS=OFF \ + -DSYCL_ENABLE_WERROR=OFF \ + -DCMAKE_INSTALL_PREFIX=${SYCL_DIR} \ + -DSYCL_INCLUDE_TESTS=OFF \ + -DLLVM_ENABLE_DOXYGEN=OFF \ + -DLLVM_ENABLE_SPHINX=OFF \ + -DBUILD_SHARED_LIBS=OFF \ + -DSYCL_ENABLE_XPTI_TRACING=ON \ + -DLLVM_ENABLE_LLD=OFF \ + -DXPTI_ENABLE_WERROR=OFF \ + -DSYCL_ENABLE_PLUGINS="opencl;cuda" \ + /scratch/llvm/llvm && \ + ninja -j8 deploy-sycl-toolchain && \ + ninja -j8 install && \ + cp bin/* ${SYCL_DIR}/bin && \ echo "${SYCL_DIR}/lib" > /etc/ld.so.conf.d/sycl.conf && ldconfig && \ rm -rf ${SCRATCH_DIR} ENV PATH=${SYCL_DIR}/bin:$PATH From 5d93865b0d4e2b0bca636772139296f227fa3d19 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Wed, 25 Jan 2023 22:50:41 -0500 Subject: [PATCH 109/496] Break lock array dependence of Cuda and HIP teams impl --- core/src/Cuda/Kokkos_Cuda_Instance.cpp | 9 +++--- core/src/Cuda/Kokkos_Cuda_Instance.hpp | 3 +- core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp | 32 ++++++++++-------- core/src/HIP/Kokkos_HIP_Instance.cpp | 8 +++-- 
core/src/HIP/Kokkos_HIP_Instance.hpp | 3 +- core/src/HIP/Kokkos_HIP_Parallel_Team.hpp | 36 ++++++++++++--------- 6 files changed, 54 insertions(+), 37 deletions(-) diff --git a/core/src/Cuda/Kokkos_Cuda_Instance.cpp b/core/src/Cuda/Kokkos_Cuda_Instance.cpp index 3dbe179d66..364d8c6416 100644 --- a/core/src/Cuda/Kokkos_Cuda_Instance.cpp +++ b/core/src/Cuda/Kokkos_Cuda_Instance.cpp @@ -29,7 +29,6 @@ #include #include #include -#include #include #include #include @@ -420,10 +419,11 @@ Kokkos::Cuda::initialize WARNING: Cuda is allocating into UVMSpace by default m_team_scratch_ptr[i] = nullptr; } + m_num_scratch_locks = concurrency(); KOKKOS_IMPL_CUDA_SAFE_CALL( - cudaMalloc(&m_scratch_locks, sizeof(int32_t) * concurrency())); + cudaMalloc(&m_scratch_locks, sizeof(int32_t) * m_num_scratch_locks)); KOKKOS_IMPL_CUDA_SAFE_CALL( - cudaMemset(m_scratch_locks, 0, sizeof(int32_t) * concurrency())); + cudaMemset(m_scratch_locks, 0, sizeof(int32_t) * m_num_scratch_locks)); } //---------------------------------------------------------------------------- @@ -618,7 +618,8 @@ void CudaInternal::finalize() { } KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFree(m_scratch_locks)); - m_scratch_locks = nullptr; + m_scratch_locks = nullptr; + m_num_scratch_locks = 0; } //---------------------------------------------------------------------------- diff --git a/core/src/Cuda/Kokkos_Cuda_Instance.hpp b/core/src/Cuda/Kokkos_Cuda_Instance.hpp index af34a5b9db..68abc2140e 100644 --- a/core/src/Cuda/Kokkos_Cuda_Instance.hpp +++ b/core/src/Cuda/Kokkos_Cuda_Instance.hpp @@ -138,7 +138,8 @@ class CudaInternal { mutable int64_t m_team_scratch_current_size[10]; mutable void* m_team_scratch_ptr[10]; mutable std::atomic_int m_team_scratch_pool[10]; - std::int32_t* m_scratch_locks; + int32_t* m_scratch_locks; + size_t m_num_scratch_locks; bool was_initialized = false; bool was_finalized = false; diff --git a/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp b/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp index 
cdff86ccfc..5855e38847 100644 --- a/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp +++ b/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp @@ -31,7 +31,6 @@ #include #include #include -#include #include #include #include @@ -408,14 +407,15 @@ class TeamPolicyInternal }; __device__ inline int64_t cuda_get_scratch_index(Cuda::size_type league_size, - int32_t* scratch_locks) { + int32_t* scratch_locks, + size_t num_scratch_locks) { int64_t threadid = 0; __shared__ int64_t base_thread_id; if (threadIdx.x == 0 && threadIdx.y == 0) { int64_t const wraparound_len = Kokkos::max( - int64_t(1), Kokkos::min(int64_t(league_size), - (int64_t(g_device_cuda_lock_arrays.n)) / - (blockDim.x * blockDim.y))); + int64_t(1), + Kokkos::min(int64_t(league_size), + int64_t(num_scratch_locks) / (blockDim.x * blockDim.y))); threadid = (blockIdx.x * blockDim.z + threadIdx.z) % wraparound_len; threadid *= blockDim.x * blockDim.y; int done = 0; @@ -477,6 +477,7 @@ class ParallelFor, size_t m_scratch_size[2]; int m_scratch_pool_id = -1; int32_t* m_scratch_locks; + size_t m_num_scratch_locks; template __device__ inline std::enable_if_t::value> exec_team( @@ -497,7 +498,8 @@ class ParallelFor, // Iterate this block through the league int64_t threadid = 0; if (m_scratch_size[1] > 0) { - threadid = cuda_get_scratch_index(m_league_size, m_scratch_locks); + threadid = cuda_get_scratch_index(m_league_size, m_scratch_locks, + m_num_scratch_locks); } const int int_league_size = (int)m_league_size; @@ -668,6 +670,7 @@ class ParallelReduce, size_t m_scratch_size[2]; int m_scratch_pool_id = -1; int32_t* m_scratch_locks; + size_t m_num_scratch_locks; const size_type m_league_size; int m_team_size; const size_type m_vector_size; @@ -690,7 +693,8 @@ class ParallelReduce, __device__ inline void operator()() const { int64_t threadid = 0; if (m_scratch_size[1] > 0) { - threadid = cuda_get_scratch_index(m_league_size, m_scratch_locks); + threadid = cuda_get_scratch_index(m_league_size, m_scratch_locks, + 
m_num_scratch_locks); } using ReductionTag = std::conditional_t, m_shmem_size = m_policy.scratch_size(0, m_team_size) + FunctorTeamShmemSize::value(arg_functor, m_team_size); - m_scratch_size[0] = m_shmem_size; - m_scratch_size[1] = m_policy.scratch_size(1, m_team_size); - m_scratch_locks = internal_space_instance->m_scratch_locks; + m_scratch_size[0] = m_shmem_size; + m_scratch_size[1] = m_policy.scratch_size(1, m_team_size); + m_scratch_locks = internal_space_instance->m_scratch_locks; + m_num_scratch_locks = internal_space_instance->m_num_scratch_locks; if (m_team_size <= 0) { m_scratch_ptr[1] = nullptr; } else { @@ -1031,9 +1036,10 @@ class ParallelReduce, m_shmem_size = m_policy.scratch_size(0, m_team_size) + FunctorTeamShmemSize::value(arg_functor, m_team_size); - m_scratch_size[0] = m_shmem_size; - m_scratch_size[1] = m_policy.scratch_size(1, m_team_size); - m_scratch_locks = internal_space_instance->m_scratch_locks; + m_scratch_size[0] = m_shmem_size; + m_scratch_size[1] = m_policy.scratch_size(1, m_team_size); + m_scratch_locks = internal_space_instance->m_scratch_locks; + m_num_scratch_locks = internal_space_instance->m_num_scratch_locks; if (m_team_size <= 0) { m_scratch_ptr[1] = nullptr; } else { diff --git a/core/src/HIP/Kokkos_HIP_Instance.cpp b/core/src/HIP/Kokkos_HIP_Instance.cpp index c66cee0c28..0927a0d99d 100644 --- a/core/src/HIP/Kokkos_HIP_Instance.cpp +++ b/core/src/HIP/Kokkos_HIP_Instance.cpp @@ -180,10 +180,11 @@ void HIPInternal::initialize(hipStream_t stream, bool manage_stream) { Kokkos::Impl::throw_runtime_exception(msg.str()); } + m_num_scratch_locks = concurrency(); KOKKOS_IMPL_HIP_SAFE_CALL( - hipMalloc(&m_scratch_locks, sizeof(int32_t) * concurrency())); + hipMalloc(&m_scratch_locks, sizeof(int32_t) * m_num_scratch_locks)); KOKKOS_IMPL_HIP_SAFE_CALL( - hipMemset(m_scratch_locks, 0, sizeof(int32_t) * concurrency())); + hipMemset(m_scratch_locks, 0, sizeof(int32_t) * m_num_scratch_locks)); } 
//---------------------------------------------------------------------------- @@ -363,7 +364,8 @@ void HIPInternal::finalize() { } KOKKOS_IMPL_HIP_SAFE_CALL(hipFree(m_scratch_locks)); - m_scratch_locks = nullptr; + m_scratch_locks = nullptr; + m_num_scratch_locks = 0; } //---------------------------------------------------------------------------- diff --git a/core/src/HIP/Kokkos_HIP_Instance.hpp b/core/src/HIP/Kokkos_HIP_Instance.hpp index 06fab84b56..51b3f79a9d 100644 --- a/core/src/HIP/Kokkos_HIP_Instance.hpp +++ b/core/src/HIP/Kokkos_HIP_Instance.hpp @@ -104,7 +104,8 @@ class HIPInternal { mutable int64_t m_team_scratch_current_size[10] = {}; mutable void *m_team_scratch_ptr[10] = {}; mutable std::atomic_int m_team_scratch_pool[10] = {}; - std::int32_t *m_scratch_locks; + int32_t *m_scratch_locks = nullptr; + size_t m_num_scratch_locks = 0; bool was_finalized = false; diff --git a/core/src/HIP/Kokkos_HIP_Parallel_Team.hpp b/core/src/HIP/Kokkos_HIP_Parallel_Team.hpp index 442ca8aef2..f823514042 100644 --- a/core/src/HIP/Kokkos_HIP_Parallel_Team.hpp +++ b/core/src/HIP/Kokkos_HIP_Parallel_Team.hpp @@ -22,7 +22,6 @@ #if defined(__HIPCC__) #include -#include #include #include #include @@ -384,14 +383,14 @@ class TeamPolicyInternal }; __device__ inline int64_t hip_get_scratch_index(HIP::size_type league_size, - int32_t* scratch_locks) { + int32_t* scratch_locks, + size_t num_scratch_locks) { int64_t threadid = 0; __shared__ int64_t base_thread_id; if (threadIdx.x == 0 && threadIdx.y == 0) { int64_t const wraparound_len = Kokkos::min(int64_t(league_size), - (int64_t(Kokkos::Impl::g_device_hip_lock_arrays.n)) / - (blockDim.x * blockDim.y)); + int64_t(num_scratch_locks) / (blockDim.x * blockDim.y)); threadid = (blockIdx.x * blockDim.z + threadIdx.z) % wraparound_len; threadid *= blockDim.x * blockDim.y; int done = 0; @@ -448,6 +447,7 @@ class ParallelFor, HIP> { size_t m_scratch_size[2]; int m_scratch_pool_id = -1; int32_t* m_scratch_locks; + size_t 
m_num_scratch_locks; template __device__ inline std::enable_if_t::value> exec_team( @@ -466,7 +466,8 @@ class ParallelFor, HIP> { // Iterate this block through the league int64_t threadid = 0; if (m_scratch_size[1] > 0) { - threadid = hip_get_scratch_index(m_league_size, m_scratch_locks); + threadid = hip_get_scratch_index(m_league_size, m_scratch_locks, + m_num_scratch_locks); } int const int_league_size = static_cast(m_league_size); @@ -514,9 +515,10 @@ class ParallelFor, HIP> { m_shmem_size = (m_policy.scratch_size(0, m_team_size) + FunctorTeamShmemSize::value(m_functor, m_team_size)); - m_scratch_size[0] = m_policy.scratch_size(0, m_team_size); - m_scratch_size[1] = m_policy.scratch_size(1, m_team_size); - m_scratch_locks = internal_space_instance->m_scratch_locks; + m_scratch_size[0] = m_policy.scratch_size(0, m_team_size); + m_scratch_size[1] = m_policy.scratch_size(1, m_team_size); + m_scratch_locks = internal_space_instance->m_scratch_locks; + m_num_scratch_locks = internal_space_instance->m_num_scratch_locks; // Functor's reduce memory, team scan memory, and team shared memory depend // upon team size. 
@@ -618,6 +620,7 @@ class ParallelReduce, size_t m_scratch_size[2]; int m_scratch_pool_id = -1; int32_t* m_scratch_locks; + size_t m_num_scratch_locks; const size_type m_league_size; int m_team_size; const size_type m_vector_size; @@ -656,7 +659,8 @@ class ParallelReduce, __device__ inline void operator()() const { int64_t threadid = 0; if (m_scratch_size[1] > 0) { - threadid = hip_get_scratch_index(m_league_size, m_scratch_locks); + threadid = hip_get_scratch_index(m_league_size, m_scratch_locks, + m_num_scratch_locks); } using ReductionTag = std::conditional_t, m_shmem_size = m_policy.scratch_size(0, m_team_size) + FunctorTeamShmemSize::value(arg_functor, m_team_size); - m_scratch_size[0] = m_shmem_size; - m_scratch_size[1] = m_policy.scratch_size(1, m_team_size); - m_scratch_locks = internal_space_instance->m_scratch_locks; + m_scratch_size[0] = m_shmem_size; + m_scratch_size[1] = m_policy.scratch_size(1, m_team_size); + m_scratch_locks = internal_space_instance->m_scratch_locks; + m_num_scratch_locks = internal_space_instance->m_num_scratch_locks; if (m_team_size <= 0) { m_scratch_ptr[1] = nullptr; } else { @@ -936,9 +941,10 @@ class ParallelReduce, m_shmem_size = m_policy.scratch_size(0, m_team_size) + FunctorTeamShmemSize::value(arg_functor, m_team_size); - m_scratch_size[0] = m_shmem_size; - m_scratch_size[1] = m_policy.scratch_size(1, m_team_size); - m_scratch_locks = internal_space_instance->m_scratch_locks; + m_scratch_size[0] = m_shmem_size; + m_scratch_size[1] = m_policy.scratch_size(1, m_team_size); + m_scratch_locks = internal_space_instance->m_scratch_locks; + m_num_scratch_locks = internal_space_instance->m_num_scratch_locks; if (m_team_size <= 0) { m_scratch_ptr[1] = nullptr; } else { From 258bac69a8c3ca37d6b458c47924f16c31e6808f Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Thu, 26 Jan 2023 12:36:41 -0500 Subject: [PATCH 110/496] Add unit test capturing Tpetra custom atomics use case --- core/unit_test/TestAtomics.hpp | 41 
++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/core/unit_test/TestAtomics.hpp b/core/unit_test/TestAtomics.hpp index e5866bb89b..d9685868d0 100644 --- a/core/unit_test/TestAtomics.hpp +++ b/core/unit_test/TestAtomics.hpp @@ -539,4 +539,45 @@ TEST(TEST_CATEGORY, atomics) { #endif } +// see https://github.com/trilinos/Trilinos/pull/11506 +struct TpetraUseCase { + template + struct WrapScalarAndCompareAbsMax { + Scalar value; + + private: + friend KOKKOS_FUNCTION bool operator<( + WrapScalarAndCompareAbsMax const& lhs, + WrapScalarAndCompareAbsMax const& rhs) { + return Kokkos::abs(lhs.value) < Kokkos::abs(rhs.value); + } + friend KOKKOS_FUNCTION bool operator>( + WrapScalarAndCompareAbsMax const& lhs, + WrapScalarAndCompareAbsMax const& rhs) { + return Kokkos::abs(lhs.value) > Kokkos::abs(rhs.value); + } + }; + + using T = int; + Kokkos::View d_{"lbl"}; + KOKKOS_FUNCTION void operator()(int i) const { + // 0, -1, 2, -3, ... + auto v_i = static_cast(i); + if (i % 2 == 1) v_i = -v_i; + Kokkos::atomic_max(reinterpret_cast*>(&d_()), + WrapScalarAndCompareAbsMax{v_i}); + } + TpetraUseCase() { + Kokkos::deep_copy(d_, Kokkos::Experimental::finite_min_v); + Kokkos::parallel_for(10, *this); + } + void check() { + T v; + Kokkos::deep_copy(v, d_); + ASSERT_EQ(v, -9); + } +}; + +TEST(TEST_CATEGORY, atomics_tpetra_max_abs) { TpetraUseCase().check(); } + } // namespace Test From 2f07a04e2d64f683a59b5d2d09270226b6e7f381 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Thu, 26 Jan 2023 14:23:18 -0500 Subject: [PATCH 111/496] Fix initial value (identity element) for max abs Co-authored-by: Daniel Arndt --- core/unit_test/TestAtomics.hpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/core/unit_test/TestAtomics.hpp b/core/unit_test/TestAtomics.hpp index d9685868d0..0d87bc3b79 100644 --- a/core/unit_test/TestAtomics.hpp +++ b/core/unit_test/TestAtomics.hpp @@ -567,10 +567,9 @@ struct TpetraUseCase { 
Kokkos::atomic_max(reinterpret_cast*>(&d_()), WrapScalarAndCompareAbsMax{v_i}); } - TpetraUseCase() { - Kokkos::deep_copy(d_, Kokkos::Experimental::finite_min_v); - Kokkos::parallel_for(10, *this); - } + + TpetraUseCase() { Kokkos::parallel_for(10, *this); } + void check() { T v; Kokkos::deep_copy(v, d_); From a6a02379016ef23eed28d57b528700977ef6eaf7 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Thu, 26 Jan 2023 17:42:08 -0500 Subject: [PATCH 112/496] Deprecate `KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_*` macros --- core/src/Kokkos_Macros.hpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/core/src/Kokkos_Macros.hpp b/core/src/Kokkos_Macros.hpp index c1bd64b652..84aeeab70e 100644 --- a/core/src/Kokkos_Macros.hpp +++ b/core/src/Kokkos_Macros.hpp @@ -451,6 +451,7 @@ //---------------------------------------------------------------------------- // Determine for what space the code is being compiled: +#if defined(KOKKOS_ENABLE_DEPRECARED_CODE_4) #if defined(__CUDACC__) && defined(__CUDA_ARCH__) && defined(KOKKOS_ENABLE_CUDA) #define KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA @@ -463,6 +464,7 @@ #define KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST #endif +#endif //---------------------------------------------------------------------------- // Remove surrounding parentheses if present From 6935f7054ad991369dc4e0342821000eef718750 Mon Sep 17 00:00:00 2001 From: Christian Trott Date: Thu, 26 Jan 2023 12:19:59 -0700 Subject: [PATCH 113/496] Intel ICE Sacado: rewrite HostIterateTile The instance now owns a copy of the functor and mdrange policy. This allows removal of instances of both in the various ParallelFor and ParallelReduce classes. That change in and of itself is not sufficient to fix the Intel ICE, but it is part of the necessary change. 
--- core/src/impl/KokkosExp_Host_IterateTile.hpp | 44 ++++++++------------ 1 file changed, 17 insertions(+), 27 deletions(-) diff --git a/core/src/impl/KokkosExp_Host_IterateTile.hpp b/core/src/impl/KokkosExp_Host_IterateTile.hpp index 4aa113a158..e2b606004f 100644 --- a/core/src/impl/KokkosExp_Host_IterateTile.hpp +++ b/core/src/impl/KokkosExp_Host_IterateTile.hpp @@ -1886,8 +1886,8 @@ struct HostIterateTile::value, int, Tag> m_tag; }; @@ -1902,12 +1902,10 @@ struct HostIterateTile - inline void operator()(IType tile_idx) const { + inline void operator()(IType tile_idx, value_type& val) const { point_type m_offset; point_type m_tiledims; @@ -1968,7 +1966,7 @@ struct HostIterateTile::apply(m_v, m_func, full_tile, m_offset, m_rp.m_tile, + Tag>::apply(val, m_func, full_tile, m_offset, m_rp.m_tile, m_tiledims); } @@ -2287,7 +2285,6 @@ struct HostIterateTile std::enable_if_t<(sizeof...(Args) == RP::rank && std::is_void::value), @@ -2302,10 +2299,10 @@ struct HostIterateTile::value, int, Tag> m_tag; }; @@ -2324,15 +2321,10 @@ struct HostIterateTile - inline void operator()(IType tile_idx) const { + inline void operator()(IType tile_idx, value_type* val) const { point_type m_offset; point_type m_tiledims; @@ -2387,7 +2379,7 @@ struct HostIterateTile::apply(m_v, m_func, full_tile, m_offset, m_rp.m_tile, + Tag>::apply(val, m_func, full_tile, m_offset, m_rp.m_tile, m_tiledims); } @@ -2706,8 +2698,6 @@ struct HostIterateTile std::enable_if_t<(sizeof...(Args) == RP::rank && std::is_void::value), void> @@ -2721,10 +2711,10 @@ struct HostIterateTile::value, int, Tag> m_tag; }; From 80c770d4eadccb9b820bfb00db5b728281090d9d Mon Sep 17 00:00:00 2001 From: Christian Trott Date: Thu, 26 Jan 2023 12:22:22 -0700 Subject: [PATCH 114/496] Intel ICE Sacado: use new HostIterateTile API in Serial --- .../Serial/Kokkos_Serial_Parallel_MDRange.hpp | 31 ++++++++----------- 1 file changed, 13 insertions(+), 18 deletions(-) diff --git a/core/src/Serial/Kokkos_Serial_Parallel_MDRange.hpp 
b/core/src/Serial/Kokkos_Serial_Parallel_MDRange.hpp index b500993d97..afdecd2f05 100644 --- a/core/src/Serial/Kokkos_Serial_Parallel_MDRange.hpp +++ b/core/src/Serial/Kokkos_Serial_Parallel_MDRange.hpp @@ -33,13 +33,12 @@ class ParallelFor, using iterate_type = typename Kokkos::Impl::HostIterateTile< MDRangePolicy, FunctorType, typename MDRangePolicy::work_tag, void>; - const FunctorType m_functor; - const MDRangePolicy m_mdr_policy; + const iterate_type m_iter; void exec() const { - const typename Policy::member_type e = m_mdr_policy.m_num_tiles; + const typename Policy::member_type e = m_iter.m_rp.m_num_tiles; for (typename Policy::member_type i = 0; i < e; ++i) { - iterate_type(m_mdr_policy, m_functor)(i); + m_iter(i); } } @@ -56,7 +55,7 @@ class ParallelFor, } inline ParallelFor(const FunctorType& arg_functor, const MDRangePolicy& arg_policy) - : m_functor(arg_functor), m_mdr_policy(arg_policy) {} + : m_iter(arg_policy, arg_functor) {} }; template @@ -86,16 +85,14 @@ class ParallelReduce, ReducerType, using iterate_type = typename Kokkos::Impl::HostIterateTile; - - const FunctorType m_functor; - const MDRangePolicy m_mdr_policy; + const iterate_type m_iter; const ReducerType m_reducer; const pointer_type m_result_ptr; inline void exec(reference_type update) const { - const typename Policy::member_type e = m_mdr_policy.m_num_tiles; + const typename Policy::member_type e = m_iter.m_rp.m_num_tiles; for (typename Policy::member_type i = 0; i < e; ++i) { - iterate_type(m_mdr_policy, m_functor, update)(i); + m_iter(i, update); } } @@ -110,14 +107,14 @@ class ParallelReduce, ReducerType, return 1024; } inline void execute() const { - const size_t pool_reduce_size = - Analysis::value_size(ReducerConditional::select(m_functor, m_reducer)); + const size_t pool_reduce_size = Analysis::value_size( + ReducerConditional::select(m_iter.m_func, m_reducer)); const size_t team_reduce_size = 0; // Never shrinks const size_t team_shared_size = 0; // Never shrinks const size_t 
thread_local_size = 0; // Never shrinks auto* internal_instance = - m_mdr_policy.space().impl_internal_space_instance(); + m_iter.m_rp.space().impl_internal_space_instance(); // Need to lock resize_thread_team_data std::lock_guard lock( internal_instance->m_thread_team_data_mutex); @@ -132,7 +129,7 @@ class ParallelReduce, ReducerType, internal_instance->m_thread_team_data.pool_reduce_local()); typename Analysis::Reducer final_reducer( - &ReducerConditional::select(m_functor, m_reducer)); + &ReducerConditional::select(m_iter.m_func, m_reducer)); reference_type update = final_reducer.init(ptr); @@ -148,8 +145,7 @@ class ParallelReduce, ReducerType, std::enable_if_t::value && !Kokkos::is_reducer::value, void*> = nullptr) - : m_functor(arg_functor), - m_mdr_policy(arg_policy), + : m_iter(arg_policy, arg_functor), m_reducer(InvalidType()), m_result_ptr(arg_result_view.data()) { static_assert(Kokkos::is_view::value, @@ -163,8 +159,7 @@ class ParallelReduce, ReducerType, inline ParallelReduce(const FunctorType& arg_functor, MDRangePolicy arg_policy, const ReducerType& reducer) - : m_functor(arg_functor), - m_mdr_policy(arg_policy), + : m_iter(arg_policy, arg_functor), m_reducer(reducer), m_result_ptr(reducer.view().data()) { /*static_assert( std::is_same< typename ViewType::memory_space From b98e824775130a87761398efdb8d542803e0baa1 Mon Sep 17 00:00:00 2001 From: Christian Trott Date: Thu, 26 Jan 2023 12:23:01 -0700 Subject: [PATCH 115/496] Intel ICE Sacado: use new HostIterateTile API in Threads --- .../Kokkos_Threads_Parallel_MDRange.hpp | 109 +++++++----------- 1 file changed, 43 insertions(+), 66 deletions(-) diff --git a/core/src/Threads/Kokkos_Threads_Parallel_MDRange.hpp b/core/src/Threads/Kokkos_Threads_Parallel_MDRange.hpp index 539308354c..35392e3bfb 100644 --- a/core/src/Threads/Kokkos_Threads_Parallel_MDRange.hpp +++ b/core/src/Threads/Kokkos_Threads_Parallel_MDRange.hpp @@ -39,20 +39,11 @@ class ParallelFor, using iterate_type = typename 
Kokkos::Impl::HostIterateTile< MDRangePolicy, FunctorType, typename MDRangePolicy::work_tag, void>; - const FunctorType m_functor; - const MDRangePolicy m_mdr_policy; - const Policy m_policy; // construct as RangePolicy( 0, num_tiles - // ).set_chunk_size(1) in ctor - - inline static void exec_range(const MDRangePolicy &mdr_policy, - const FunctorType &functor, const Member ibeg, - const Member iend) { -#if defined(KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION) && \ - defined(KOKKOS_ENABLE_PRAGMA_IVDEP) -#pragma ivdep -#endif + const iterate_type m_iter; + + inline void exec_range(const Member ibeg, const Member iend) const { for (Member i = ibeg; i < iend; ++i) { - iterate_type(mdr_policy, functor)(i); + m_iter(i); } } @@ -65,10 +56,11 @@ class ParallelFor, exec_schedule(ThreadsExec &exec, const void *arg) { const ParallelFor &self = *((const ParallelFor *)arg); - WorkRange range(self.m_policy, exec.pool_rank(), exec.pool_size()); + auto const num_tiles = self.m_iter.m_rp.m_num_tiles; + WorkRange range(Policy(0, num_tiles).set_chunk_size(1), exec.pool_rank(), + exec.pool_size()); - ParallelFor::exec_range(self.m_mdr_policy, self.m_functor, range.begin(), - range.end()); + self.exec_range(range.begin(), range.end()); exec.fan_in(); } @@ -78,23 +70,21 @@ class ParallelFor, exec_schedule(ThreadsExec &exec, const void *arg) { const ParallelFor &self = *((const ParallelFor *)arg); - WorkRange range(self.m_policy, exec.pool_rank(), exec.pool_size()); + auto const num_tiles = self.m_iter.m_rp.m_num_tiles; + WorkRange range(Policy(0, num_tiles).set_chunk_size(1), exec.pool_rank(), + exec.pool_size()); - exec.set_work_range(range.begin(), range.end(), self.m_policy.chunk_size()); + exec.set_work_range(range.begin(), range.end(), 1); exec.reset_steal_target(); exec.barrier(); long work_index = exec.get_work_index(); while (work_index != -1) { - const Member begin = - static_cast(work_index) * self.m_policy.chunk_size(); - const Member end = - begin + self.m_policy.chunk_size() < 
self.m_policy.end() - ? begin + self.m_policy.chunk_size() - : self.m_policy.end(); - - ParallelFor::exec_range(self.m_mdr_policy, self.m_functor, begin, end); + const Member begin = static_cast(work_index); + const Member end = begin + 1 < num_tiles ? begin + 1 : num_tiles; + + self.exec_range(begin, end); work_index = exec.get_work_index(); } @@ -108,9 +98,7 @@ class ParallelFor, } ParallelFor(const FunctorType &arg_functor, const MDRangePolicy &arg_policy) - : m_functor(arg_functor), - m_mdr_policy(arg_policy), - m_policy(Policy(0, m_mdr_policy.m_num_tiles).set_chunk_size(1)) {} + : m_iter(arg_policy, arg_functor) {} template static int max_tile_size_product(const Policy &, const Functor &) { @@ -152,22 +140,14 @@ class ParallelReduce, ReducerType, typename Kokkos::Impl::HostIterateTile; - const FunctorType m_functor; - const MDRangePolicy m_mdr_policy; - const Policy m_policy; // construct as RangePolicy( 0, num_tiles - // ).set_chunk_size(1) in ctor + const iterate_type m_iter; const ReducerType m_reducer; const pointer_type m_result_ptr; - inline static void exec_range(const MDRangePolicy &mdr_policy, - const FunctorType &functor, const Member &ibeg, - const Member &iend, reference_type update) { -#if defined(KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION) && \ - defined(KOKKOS_ENABLE_PRAGMA_IVDEP) -#pragma ivdep -#endif + inline void exec_range(const Member &ibeg, const Member &iend, + reference_type update) const { for (Member i = ibeg; i < iend; ++i) { - iterate_type(mdr_policy, functor, update)(i); + m_iter(i, update); } } @@ -179,13 +159,16 @@ class ParallelReduce, ReducerType, static std::enable_if_t::value> exec_schedule(ThreadsExec &exec, const void *arg) { const ParallelReduce &self = *((const ParallelReduce *)arg); - const WorkRange range(self.m_policy, exec.pool_rank(), exec.pool_size()); + + const auto num_tiles = self.m_iter.m_rp.m_num_tiles; + const WorkRange range(Policy(0, num_tiles).set_chunk_size(1), + exec.pool_rank(), exec.pool_size()); typename 
Analysis::Reducer reducer( - &ReducerConditional::select(self.m_functor, self.m_reducer)); + &ReducerConditional::select(self.m_iter.m_func, self.m_reducer)); - ParallelReduce::exec_range( - self.m_mdr_policy, self.m_functor, range.begin(), range.end(), + self.exec_range( + range.begin(), range.end(), reducer.init(static_cast(exec.reduce_memory()))); exec.fan_in_reduce(reducer); @@ -195,27 +178,25 @@ class ParallelReduce, ReducerType, static std::enable_if_t::value> exec_schedule(ThreadsExec &exec, const void *arg) { const ParallelReduce &self = *((const ParallelReduce *)arg); - const WorkRange range(self.m_policy, exec.pool_rank(), exec.pool_size()); - exec.set_work_range(range.begin(), range.end(), self.m_policy.chunk_size()); + const auto num_tiles = self.m_iter.m_rp.m_num_tiles; + const WorkRange range(Policy(0, num_tiles).set_chunk_size(1), + exec.pool_rank(), exec.pool_size()); + + exec.set_work_range(range.begin(), range.end(), 1); exec.reset_steal_target(); exec.barrier(); long work_index = exec.get_work_index(); typename Analysis::Reducer reducer( - &ReducerConditional::select(self.m_functor, self.m_reducer)); + &ReducerConditional::select(self.m_iter.m_func, self.m_reducer)); reference_type update = reducer.init(static_cast(exec.reduce_memory())); while (work_index != -1) { - const Member begin = - static_cast(work_index) * self.m_policy.chunk_size(); - const Member end = - begin + self.m_policy.chunk_size() < self.m_policy.end() - ? begin + self.m_policy.chunk_size() - : self.m_policy.end(); - ParallelReduce::exec_range(self.m_mdr_policy, self.m_functor, begin, end, - update); + const Member begin = static_cast(work_index); + const Member end = begin + 1 < num_tiles ? 
begin + 1 : num_tiles; + self.exec_range(begin, end, update); work_index = exec.get_work_index(); } @@ -224,9 +205,9 @@ class ParallelReduce, ReducerType, public: inline void execute() const { - ThreadsExec::resize_scratch( - Analysis::value_size(ReducerConditional::select(m_functor, m_reducer)), - 0); + ThreadsExec::resize_scratch(Analysis::value_size(ReducerConditional::select( + m_iter.m_func, m_reducer)), + 0); ThreadsExec::start(&ParallelReduce::exec, this); @@ -237,7 +218,7 @@ class ParallelReduce, ReducerType, (pointer_type)ThreadsExec::root_reduce_scratch(); const unsigned n = Analysis::value_count( - ReducerConditional::select(m_functor, m_reducer)); + ReducerConditional::select(m_iter.m_func, m_reducer)); for (unsigned i = 0; i < n; ++i) { m_result_ptr[i] = data[i]; } @@ -251,9 +232,7 @@ class ParallelReduce, ReducerType, std::enable_if_t::value && !Kokkos::is_reducer::value, void *> = nullptr) - : m_functor(arg_functor), - m_mdr_policy(arg_policy), - m_policy(Policy(0, m_mdr_policy.m_num_tiles).set_chunk_size(1)), + : m_iter(arg_policy, arg_functor), m_reducer(InvalidType()), m_result_ptr(arg_result_view.data()) { static_assert(Kokkos::is_view::value, @@ -266,9 +245,7 @@ class ParallelReduce, ReducerType, inline ParallelReduce(const FunctorType &arg_functor, MDRangePolicy arg_policy, const ReducerType &reducer) - : m_functor(arg_functor), - m_mdr_policy(arg_policy), - m_policy(Policy(0, m_mdr_policy.m_num_tiles).set_chunk_size(1)), + : m_iter(arg_policy, arg_functor), m_reducer(reducer), m_result_ptr(reducer.view().data()) { /*static_assert( std::is_same< typename ViewType::memory_space From 6688cad1aef663412bc2636928c600875ed0d99f Mon Sep 17 00:00:00 2001 From: Christian Trott Date: Thu, 26 Jan 2023 12:23:18 -0700 Subject: [PATCH 116/496] Intel ICE Sacado: use new HostIterateTile API in HPX --- core/src/Kokkos_HPX.hpp | 46 ++++++++++++++++++----------------------- 1 file changed, 20 insertions(+), 26 deletions(-) diff --git a/core/src/Kokkos_HPX.hpp 
b/core/src/Kokkos_HPX.hpp index 7b9d4f219f..18965a12ee 100644 --- a/core/src/Kokkos_HPX.hpp +++ b/core/src/Kokkos_HPX.hpp @@ -1065,17 +1065,16 @@ class ParallelFor, typename Kokkos::Impl::HostIterateTile; - const FunctorType m_functor; - const MDRangePolicy m_mdr_policy; + const iterate_type m_iter; const Policy m_policy; public: - void execute() const { dispatch_execute_task(this, m_mdr_policy.space()); } + void execute() const { dispatch_execute_task(this, m_iter.m_rp.space()); } inline void execute_task() const { // See [note 1] for an explanation. Kokkos::Experimental::HPX::reset_on_exit_parallel reset_on_exit( - m_mdr_policy.space()); + m_iter.m_rp.space()); auto exec = Kokkos::Experimental::HPX::impl_get_executor(); @@ -1087,9 +1086,8 @@ class ParallelFor, for_loop(par.on(exec).with( static_chunk_size(get_hpx_adjusted_chunk_size(m_policy))), - m_policy.begin(), m_policy.end(), [this](const Member i) { - iterate_type(m_mdr_policy, m_functor)(i); - }); + m_policy.begin(), m_policy.end(), + [this](const Member i) { iterate_type(i); }); #elif KOKKOS_HPX_IMPLEMENTATION == 1 using hpx::for_loop_strided; @@ -1101,15 +1099,14 @@ class ParallelFor, const Member i_end = (std::min)(i_begin + chunk_size, m_policy.end()); for (Member i = i_begin; i < i_end; ++i) { - iterate_type(m_mdr_policy, m_functor)(i); + m_iter(i); } }); #endif } inline ParallelFor(const FunctorType &arg_functor, MDRangePolicy arg_policy) - : m_functor(arg_functor), - m_mdr_policy(arg_policy), + : m_iter(arg_policy, arg_functor), m_policy(Policy(0, arg_policy.m_num_tiles).set_chunk_size(1)) {} template static int max_tile_size_product(const Policy &, const Functor &) { @@ -1406,8 +1403,7 @@ class ParallelReduce, ReducerType, typename Kokkos::Impl::HostIterateTile; - const FunctorType m_functor; - const MDRangePolicy m_mdr_policy; + const iterate_type m_iter; const Policy m_policy; const ReducerType m_reducer; const pointer_type m_result_ptr; @@ -1416,19 +1412,19 @@ class ParallelReduce, ReducerType, 
public: void execute() const { - dispatch_execute_task(this, m_mdr_policy.space(), m_force_synchronous); + dispatch_execute_task(this, m_iter.m_rp.space(), m_force_synchronous); } inline void execute_task() const { // See [note 1] for an explanation. Kokkos::Experimental::HPX::reset_on_exit_parallel reset_on_exit( - m_mdr_policy.space()); + m_iter.m_rp.space()); const int num_worker_threads = Kokkos::Experimental::HPX::concurrency(); - const std::size_t value_size = - Analysis::value_size(ReducerConditional::select(m_functor, m_reducer)); + const std::size_t value_size = Analysis::value_size( + ReducerConditional::select(m_iter.m_func, m_reducer)); - thread_buffer &buffer = m_mdr_policy.space().impl_get_buffer(); + thread_buffer &buffer = m_iter.m_rp.space().impl_get_buffer(); buffer.resize(num_worker_threads, value_size); using hpx::for_loop; @@ -1438,7 +1434,7 @@ class ParallelReduce, ReducerType, auto exec = Kokkos::Experimental::HPX::impl_get_executor(); typename Analysis::Reducer final_reducer( - &ReducerConditional::select(m_functor, m_reducer)); + &ReducerConditional::select(m_iter.m_func, m_reducer)); #if KOKKOS_HPX_IMPLEMENTATION == 0 @@ -1454,7 +1450,7 @@ class ParallelReduce, ReducerType, reference_type update = Analysis::Reducer::reference( reinterpret_cast(buffer.get( Kokkos::Experimental::HPX::impl_hardware_thread_id()))); - iterate_type(m_mdr_policy, m_functor, update)(i); + m_iter(i, update); }); #elif KOKKOS_HPX_IMPLEMENTATION == 1 @@ -1477,7 +1473,7 @@ class ParallelReduce, ReducerType, const Member i_end = (std::min)(i_begin + chunk_size, m_policy.end()); for (Member i = i_begin; i < i_end; ++i) { - iterate_type(m_mdr_policy, m_functor, update)(i); + m_iter(i, update); } }); #endif @@ -1491,7 +1487,7 @@ class ParallelReduce, ReducerType, if (m_result_ptr != nullptr) { const int n = Analysis::value_count( - ReducerConditional::select(m_functor, m_reducer)); + ReducerConditional::select(m_iter.m_func, m_reducer)); for (int j = 0; j < n; ++j) { 
m_result_ptr[j] = reinterpret_cast(buffer.get(0))[j]; @@ -1506,8 +1502,7 @@ class ParallelReduce, ReducerType, std::enable_if_t::value && !Kokkos::is_reducer::value, void *> = nullptr) - : m_functor(arg_functor), - m_mdr_policy(arg_policy), + : m_iter(arg_policy, arg_functor), m_policy(Policy(0, arg_policy.m_num_tiles).set_chunk_size(1)), m_reducer(InvalidType()), m_result_ptr(arg_view.data()), @@ -1515,9 +1510,8 @@ class ParallelReduce, ReducerType, inline ParallelReduce(const FunctorType &arg_functor, MDRangePolicy arg_policy, const ReducerType &reducer) - : m_functor(arg_functor), - m_mdr_policy(arg_policy), - m_policy(Policy(0, m_mdr_policy.m_num_tiles).set_chunk_size(1)), + : m_iter(arg_policy, arg_functor), + m_policy(Policy(0, arg_policy.m_num_tiles).set_chunk_size(1)), m_reducer(reducer), m_result_ptr(reducer.view().data()), m_force_synchronous(!reducer.view().impl_track().has_record()) {} From 6701772fcfaef7f1bc1f1074d513a7ca1a02cdfd Mon Sep 17 00:00:00 2001 From: Christian Trott Date: Thu, 26 Jan 2023 12:24:19 -0700 Subject: [PATCH 117/496] Intel ICE Sacado: use new HostIterateTile API in OpenMP --- core/src/OpenMP/Kokkos_OpenMP_Parallel.hpp | 79 ++++++++++------------ 1 file changed, 34 insertions(+), 45 deletions(-) diff --git a/core/src/OpenMP/Kokkos_OpenMP_Parallel.hpp b/core/src/OpenMP/Kokkos_OpenMP_Parallel.hpp index 8e3e97b7a2..ebd586a3b2 100644 --- a/core/src/OpenMP/Kokkos_OpenMP_Parallel.hpp +++ b/core/src/OpenMP/Kokkos_OpenMP_Parallel.hpp @@ -44,8 +44,8 @@ namespace Kokkos { namespace Impl { -inline bool execute_in_serial() { - return (OpenMP::in_parallel() && +inline bool execute_in_serial(OpenMP const& space = OpenMP()) { + return (OpenMP::in_parallel(space) && !(omp_get_nested() && (omp_get_level() == 1))); } @@ -111,7 +111,7 @@ class ParallelFor, Kokkos::OpenMP> { public: inline void execute() const { - if (execute_in_serial()) { + if (execute_in_serial(m_policy.space())) { exec_range(m_functor, m_policy.begin(), m_policy.end()); return; } 
@@ -179,15 +179,12 @@ class ParallelFor, MDRangePolicy, FunctorType, typename MDRangePolicy::work_tag, void>; OpenMPInternal* m_instance; - const FunctorType m_functor; - const MDRangePolicy m_mdr_policy; + const iterate_type m_iter; - inline static void exec_range(const MDRangePolicy& mdr_policy, - const FunctorType& functor, const Member ibeg, - const Member iend) { + inline void exec_range(const Member ibeg, const Member iend) const { KOKKOS_PRAGMA_IVDEP_IF_ENABLED for (Member iwork = ibeg; iwork < iend; ++iwork) { - iterate_type(mdr_policy, functor)(iwork); + m_iter(iwork); } } @@ -198,8 +195,8 @@ class ParallelFor, #pragma omp parallel for schedule(dynamic, 1) \ num_threads(m_instance->thread_pool_size()) KOKKOS_PRAGMA_IVDEP_IF_ENABLED - for (index_type iwork = 0; iwork < m_mdr_policy.m_num_tiles; ++iwork) { - iterate_type(m_mdr_policy, m_functor)(iwork); + for (index_type iwork = 0; iwork < m_iter.m_rp.m_num_tiles; ++iwork) { + m_iter(iwork); } } @@ -210,16 +207,15 @@ class ParallelFor, #pragma omp parallel for schedule(static, 1) \ num_threads(m_instance->thread_pool_size()) KOKKOS_PRAGMA_IVDEP_IF_ENABLED - for (index_type iwork = 0; iwork < m_mdr_policy.m_num_tiles; ++iwork) { - iterate_type(m_mdr_policy, m_functor)(iwork); + for (index_type iwork = 0; iwork < m_iter.m_rp.m_num_tiles; ++iwork) { + m_iter(iwork); } } public: inline void execute() const { - if (execute_in_serial()) { - ParallelFor::exec_range(m_mdr_policy, m_functor, 0, - m_mdr_policy.m_num_tiles); + if (execute_in_serial(m_iter.m_rp.space())) { + exec_range(0, m_iter.m_rp.m_num_tiles); return; } @@ -234,7 +230,7 @@ class ParallelFor, { HostThreadTeamData& data = *(m_instance->get_thread_data()); - data.set_work_partition(m_mdr_policy.m_num_tiles, 1); + data.set_work_partition(m_iter.m_rp.m_num_tiles, 1); if (is_dynamic) { // Make sure work partition is set before stealing @@ -247,8 +243,7 @@ class ParallelFor, range = is_dynamic ? 
data.get_work_stealing_chunk() : data.get_work_partition(); - ParallelFor::exec_range(m_mdr_policy, m_functor, range.first, - range.second); + exec_range(range.first, range.second); } while (is_dynamic && 0 <= range.first); } @@ -257,7 +252,7 @@ class ParallelFor, } inline ParallelFor(const FunctorType& arg_functor, MDRangePolicy arg_policy) - : m_instance(nullptr), m_functor(arg_functor), m_mdr_policy(arg_policy) { + : m_instance(nullptr), m_iter(arg_policy, arg_functor) { #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 if (t_openmp_instance) { m_instance = t_openmp_instance; @@ -367,7 +362,7 @@ class ParallelReduce, ReducerType, 0 // thread_local_bytes ); - if (execute_in_serial()) { + if (execute_in_serial(m_policy.space())) { const pointer_type ptr = m_result_ptr ? m_result_ptr @@ -520,23 +515,21 @@ class ParallelReduce, ReducerType, WorkTag, reference_type>; OpenMPInternal* m_instance; - const FunctorType m_functor; - const MDRangePolicy m_mdr_policy; + const iterate_type m_iter; const ReducerType m_reducer; const pointer_type m_result_ptr; - inline static void exec_range(const MDRangePolicy& mdr_policy, - const FunctorType& functor, const Member ibeg, - const Member iend, reference_type update) { + inline void exec_range(const Member ibeg, const Member iend, + reference_type update) const { for (Member iwork = ibeg; iwork < iend; ++iwork) { - iterate_type(mdr_policy, functor, update)(iwork); + m_iter(iwork, update); } } public: inline void execute() const { - const size_t pool_reduce_bytes = - Analysis::value_size(ReducerConditional::select(m_functor, m_reducer)); + const size_t pool_reduce_bytes = Analysis::value_size( + ReducerConditional::select(m_iter.m_func, m_reducer)); m_instance->acquire_lock(); @@ -548,9 +541,9 @@ class ParallelReduce, ReducerType, ); typename Analysis::Reducer final_reducer( - &ReducerConditional::select(m_functor, m_reducer)); + &ReducerConditional::select(m_iter.m_func, m_reducer)); - if (execute_in_serial()) { + if 
(execute_in_serial(m_iter.m_rp.space())) { const pointer_type ptr = m_result_ptr ? m_result_ptr @@ -559,8 +552,7 @@ class ParallelReduce, ReducerType, reference_type update = final_reducer.init(ptr); - ParallelReduce::exec_range(m_mdr_policy, m_functor, 0, - m_mdr_policy.m_num_tiles, update); + ParallelReduce::exec_range(0, m_iter.m_rp.m_num_tiles, update); final_reducer.final(ptr); @@ -579,7 +571,7 @@ class ParallelReduce, ReducerType, { HostThreadTeamData& data = *(m_instance->get_thread_data()); - data.set_work_partition(m_mdr_policy.m_num_tiles, 1); + data.set_work_partition(m_iter.m_rp.m_num_tiles, 1); if (is_dynamic) { // Make sure work partition is set before stealing @@ -595,8 +587,7 @@ class ParallelReduce, ReducerType, range = is_dynamic ? data.get_work_stealing_chunk() : data.get_work_partition(); - ParallelReduce::exec_range(m_mdr_policy, m_functor, range.first, - range.second, update); + ParallelReduce::exec_range(range.first, range.second, update); } while (is_dynamic && 0 <= range.first); } @@ -617,7 +608,7 @@ class ParallelReduce, ReducerType, if (m_result_ptr) { const int n = Analysis::value_count( - ReducerConditional::select(m_functor, m_reducer)); + ReducerConditional::select(m_iter.m_func, m_reducer)); for (int j = 0; j < n; ++j) { m_result_ptr[j] = ptr[j]; @@ -637,8 +628,7 @@ class ParallelReduce, ReducerType, !Kokkos::is_reducer::value, void*> = nullptr) : m_instance(nullptr), - m_functor(arg_functor), - m_mdr_policy(arg_policy), + m_iter(arg_policy, arg_functor), m_reducer(InvalidType()), m_result_ptr(arg_view.data()) { #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 @@ -659,8 +649,7 @@ class ParallelReduce, ReducerType, inline ParallelReduce(const FunctorType& arg_functor, MDRangePolicy arg_policy, const ReducerType& reducer) : m_instance(nullptr), - m_functor(arg_functor), - m_mdr_policy(arg_policy), + m_iter(arg_policy, arg_functor), m_reducer(reducer), m_result_ptr(reducer.view().data()) { #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 @@ -748,7 +737,7 
@@ class ParallelScan, 0 // thread_local_bytes ); - if (execute_in_serial()) { + if (execute_in_serial(m_policy.space())) { typename Analysis::Reducer final_reducer(&m_functor); reference_type update = final_reducer.init( @@ -880,7 +869,7 @@ class ParallelScanWithTotal, 0 // thread_local_bytes ); - if (execute_in_serial()) { + if (execute_in_serial(m_policy.space())) { typename Analysis::Reducer final_reducer(&m_functor); reference_type update = final_reducer.init( @@ -1054,7 +1043,7 @@ class ParallelFor, m_instance->resize_thread_data(pool_reduce_size, team_reduce_size, team_shared_size, thread_local_size); - if (execute_in_serial()) { + if (execute_in_serial(m_policy.space())) { ParallelFor::template exec_team( m_functor, *(m_instance->get_thread_data()), 0, m_policy.league_size(), m_policy.league_size()); @@ -1224,7 +1213,7 @@ class ParallelReduce, m_instance->resize_thread_data(pool_reduce_size, team_reduce_size, team_shared_size, thread_local_size); - if (execute_in_serial()) { + if (execute_in_serial(m_policy.space())) { HostThreadTeamData& data = *(m_instance->get_thread_data()); pointer_type ptr = m_result_ptr ? m_result_ptr : pointer_type(data.pool_reduce_local()); From 2b5c31a453669b9d5cf72190b4eef0b5df01d98a Mon Sep 17 00:00:00 2001 From: Christian Trott Date: Thu, 26 Jan 2023 12:26:07 -0700 Subject: [PATCH 118/496] Intel ICE Sacado: turn off support for nested OpenMP with ICPC This turns off support for calling parallel_for inside a OpenMP parallel region if nesting is not enabled actually. 
--- core/src/OpenMP/Kokkos_OpenMP_Parallel.hpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/core/src/OpenMP/Kokkos_OpenMP_Parallel.hpp b/core/src/OpenMP/Kokkos_OpenMP_Parallel.hpp index ebd586a3b2..f818429d68 100644 --- a/core/src/OpenMP/Kokkos_OpenMP_Parallel.hpp +++ b/core/src/OpenMP/Kokkos_OpenMP_Parallel.hpp @@ -214,10 +214,12 @@ class ParallelFor, public: inline void execute() const { +#ifndef KOKKOS_COMPILER_INTEL if (execute_in_serial(m_iter.m_rp.space())) { exec_range(0, m_iter.m_rp.m_num_tiles); return; } +#endif #ifndef KOKKOS_INTERNAL_DISABLE_NATIVE_OPENMP execute_parallel(); @@ -543,6 +545,7 @@ class ParallelReduce, ReducerType, typename Analysis::Reducer final_reducer( &ReducerConditional::select(m_iter.m_func, m_reducer)); +#ifndef KOKKOS_COMPILER_INTEL if (execute_in_serial(m_iter.m_rp.space())) { const pointer_type ptr = m_result_ptr @@ -560,6 +563,7 @@ class ParallelReduce, ReducerType, return; } +#endif enum { is_dynamic = std::is_same Date: Thu, 26 Jan 2023 20:43:38 -0500 Subject: [PATCH 119/496] Change `#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_{4 -> 3}` --- core/src/Kokkos_Macros.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/src/Kokkos_Macros.hpp b/core/src/Kokkos_Macros.hpp index 84aeeab70e..f79713eb37 100644 --- a/core/src/Kokkos_Macros.hpp +++ b/core/src/Kokkos_Macros.hpp @@ -451,7 +451,7 @@ //---------------------------------------------------------------------------- // Determine for what space the code is being compiled: -#if defined(KOKKOS_ENABLE_DEPRECARED_CODE_4) +#if defined(KOKKOS_ENABLE_DEPRECARED_CODE_3) #if defined(__CUDACC__) && defined(__CUDA_ARCH__) && defined(KOKKOS_ENABLE_CUDA) #define KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA From f78d87ac84a930e0929ca91f8ba881d5191c6d55 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Thu, 26 Jan 2023 14:12:00 -0500 Subject: [PATCH 120/496] Unwire initializing/finalizing Kokkos lock arrays --- core/src/Cuda/Kokkos_Cuda_Instance.cpp | 6 ++++-- 
core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp | 3 +-- core/src/HIP/Kokkos_HIP.cpp | 3 +-- core/src/HIP/Kokkos_HIP_Instance.cpp | 2 +- core/src/HIP/Kokkos_HIP_KernelLaunch.hpp | 3 +-- 5 files changed, 8 insertions(+), 9 deletions(-) diff --git a/core/src/Cuda/Kokkos_Cuda_Instance.cpp b/core/src/Cuda/Kokkos_Cuda_Instance.cpp index 364d8c6416..efe6cb3a52 100644 --- a/core/src/Cuda/Kokkos_Cuda_Instance.cpp +++ b/core/src/Cuda/Kokkos_Cuda_Instance.cpp @@ -400,7 +400,9 @@ Kokkos::Cuda::initialize WARNING: Cuda is allocating into UVMSpace by default #endif // Init the array for used for arbitrarily sized atomics - if (this == &singleton()) Impl::initialize_host_cuda_lock_arrays(); + if (this == &singleton()) { + desul::Impl::init_lock_arrays(); // FIXME + } // Allocate a staging buffer for constant mem in pinned host memory // and an event to avoid overwriting driver for previous kernel launches @@ -574,7 +576,7 @@ void CudaInternal::finalize() { // Only finalize this if we're the singleton if (this == &singleton()) { (void)Impl::cuda_global_unique_token_locks(true); - Impl::finalize_host_cuda_lock_arrays(); + desul::Impl::finalize_lock_arrays(); // FIXME KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFreeHost(constantMemHostStaging)); KOKKOS_IMPL_CUDA_SAFE_CALL(cudaEventDestroy(constantMemReusable)); diff --git a/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp b/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp index 45cc6f5d2f..ce6379ff69 100644 --- a/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp +++ b/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp @@ -28,7 +28,6 @@ #include #include #include -#include #include #include #include @@ -652,7 +651,7 @@ struct CudaParallelLaunchImpl< shmem, desired_occupancy); } - ensure_cuda_lock_arrays_on_device(); + desul::ensure_cuda_lock_arrays_on_device(); // Invoke the driver function on the device base_t::invoke_kernel(driver, grid, block, shmem, cuda_instance); diff --git a/core/src/HIP/Kokkos_HIP.cpp b/core/src/HIP/Kokkos_HIP.cpp index 766f815c72..94cd97b2b9 100644 --- 
a/core/src/HIP/Kokkos_HIP.cpp +++ b/core/src/HIP/Kokkos_HIP.cpp @@ -20,7 +20,6 @@ #include #include -#include #include #include @@ -79,7 +78,7 @@ void HIP::impl_initialize(InitializationSettings const& settings) { Impl::HIPInternal::m_maxWavesPerCU * Impl::HIPTraits::WarpSize; // Init the array for used for arbitrarily sized atomics - Impl::initialize_host_hip_lock_arrays(); + desul::Impl::init_lock_arrays(); // FIXME // Allocate a staging buffer for constant mem in pinned host memory // and an event to avoid overwriting driver for previous kernel launches diff --git a/core/src/HIP/Kokkos_HIP_Instance.cpp b/core/src/HIP/Kokkos_HIP_Instance.cpp index 0927a0d99d..7840ad905b 100644 --- a/core/src/HIP/Kokkos_HIP_Instance.cpp +++ b/core/src/HIP/Kokkos_HIP_Instance.cpp @@ -327,7 +327,7 @@ void HIPInternal::finalize() { if (this == &singleton()) { (void)Kokkos::Impl::hip_global_unique_token_locks(true); - Impl::finalize_host_hip_lock_arrays(); + desul::Impl::finalize_lock_arrays(); // FIXME KOKKOS_IMPL_HIP_SAFE_CALL(hipHostFree(constantMemHostStaging)); KOKKOS_IMPL_HIP_SAFE_CALL(hipEventDestroy(constantMemReusable)); diff --git a/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp b/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp index 11975fc25b..d387474fe8 100644 --- a/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp +++ b/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp @@ -24,7 +24,6 @@ #include #include #include -#include // Must use global variable on the device with HIP-Clang #ifdef __HIP__ @@ -462,7 +461,7 @@ struct HIPParallelLaunch< "HIPParallelLaunch FAILED: shared memory request is too large"); } - ensure_hip_lock_arrays_on_device(); + desul::ensure_hip_lock_arrays_on_device(); // Invoke the driver function on the device base_t::invoke_kernel(driver, grid, block, shmem, hip_instance); From cd8eb9c10e7ba584e38034bb451a286e26909f59 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Thu, 26 Jan 2023 14:12:57 -0500 Subject: [PATCH 121/496] Remove Cuda and HIP lock arrays altogether --- 
Makefile.targets | 4 - core/src/Cuda/Kokkos_Cuda_Locks.cpp | 89 ---------------- core/src/Cuda/Kokkos_Cuda_Locks.hpp | 153 ---------------------------- core/src/HIP/Kokkos_HIP_Locks.cpp | 84 --------------- core/src/HIP/Kokkos_HIP_Locks.hpp | 144 -------------------------- 5 files changed, 474 deletions(-) delete mode 100644 core/src/Cuda/Kokkos_Cuda_Locks.cpp delete mode 100644 core/src/Cuda/Kokkos_Cuda_Locks.hpp delete mode 100644 core/src/HIP/Kokkos_HIP_Locks.cpp delete mode 100644 core/src/HIP/Kokkos_HIP_Locks.hpp diff --git a/Makefile.targets b/Makefile.targets index 185c7067bd..4e08a46c69 100644 --- a/Makefile.targets +++ b/Makefile.targets @@ -51,8 +51,6 @@ Kokkos_CudaSpace.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cu $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_CudaSpace.cpp Kokkos_Cuda_Task.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Task.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Task.cpp -Kokkos_Cuda_Locks.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Locks.cpp - $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Locks.cpp Lock_Array_CUDA.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/tpls/desul/src/Lock_Array_CUDA.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/tpls/desul/src/Lock_Array_CUDA.cpp endif @@ -77,8 +75,6 @@ Kokkos_HIP_Space.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/HIP/Kokkos_HIP $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/HIP/Kokkos_HIP_Space.cpp Kokkos_HIP_Instance.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/HIP/Kokkos_HIP_Instance.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/HIP/Kokkos_HIP_Instance.cpp -Kokkos_HIP_Locks.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/HIP/Kokkos_HIP_Locks.cpp - $(CXX) 
$(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/HIP/Kokkos_HIP_Locks.cpp Lock_Array_HIP.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/tpls/desul/src/Lock_Array_HIP.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/tpls/desul/src/Lock_Array_HIP.cpp endif diff --git a/core/src/Cuda/Kokkos_Cuda_Locks.cpp b/core/src/Cuda/Kokkos_Cuda_Locks.cpp deleted file mode 100644 index f20b41cc48..0000000000 --- a/core/src/Cuda/Kokkos_Cuda_Locks.cpp +++ /dev/null @@ -1,89 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE -#define KOKKOS_IMPL_PUBLIC_INCLUDE -#endif - -#include -#ifdef KOKKOS_ENABLE_CUDA -#include -#include - -#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE -namespace Kokkos { -namespace Impl { -__device__ __constant__ CudaLockArrays g_device_cuda_lock_arrays = {nullptr, 0}; -} -} // namespace Kokkos -#endif - -namespace Kokkos { - -namespace { - -__global__ void init_lock_array_kernel_atomic() { - unsigned i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < CUDA_SPACE_ATOMIC_MASK + 1) { - Kokkos::Impl::g_device_cuda_lock_arrays.atomic[i] = 0; - } -} - -} // namespace - -namespace Impl { - -CudaLockArrays g_host_cuda_lock_arrays = {nullptr, 0}; - -void initialize_host_cuda_lock_arrays() { - desul::Impl::init_lock_arrays(); - desul::ensure_cuda_lock_arrays_on_device(); - - if (g_host_cuda_lock_arrays.atomic != nullptr) return; - KOKKOS_IMPL_CUDA_SAFE_CALL( - 
cudaMalloc(&g_host_cuda_lock_arrays.atomic, - sizeof(int) * (CUDA_SPACE_ATOMIC_MASK + 1))); - Impl::cuda_device_synchronize( - "Kokkos::Impl::initialize_host_cuda_lock_arrays: Pre Init Lock Arrays"); - g_host_cuda_lock_arrays.n = CudaInternal::concurrency(); - copy_cuda_lock_arrays_to_device(); - init_lock_array_kernel_atomic<<<(CUDA_SPACE_ATOMIC_MASK + 1 + 255) / 256, - 256>>>(); - Impl::cuda_device_synchronize( - "Kokkos::Impl::initialize_host_cuda_lock_arrays: Post Init Lock Arrays"); -} - -void finalize_host_cuda_lock_arrays() { - desul::Impl::finalize_lock_arrays(); - - if (g_host_cuda_lock_arrays.atomic == nullptr) return; - cudaFree(g_host_cuda_lock_arrays.atomic); - g_host_cuda_lock_arrays.atomic = nullptr; - g_host_cuda_lock_arrays.n = 0; -#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE - copy_cuda_lock_arrays_to_device(); -#endif -} - -} // namespace Impl - -} // namespace Kokkos - -#else - -void KOKKOS_CORE_SRC_CUDA_CUDA_LOCKS_PREVENT_LINK_ERROR() {} - -#endif diff --git a/core/src/Cuda/Kokkos_Cuda_Locks.hpp b/core/src/Cuda/Kokkos_Cuda_Locks.hpp deleted file mode 100644 index 08f88895e2..0000000000 --- a/core/src/Cuda/Kokkos_Cuda_Locks.hpp +++ /dev/null @@ -1,153 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOS_CUDA_LOCKS_HPP -#define KOKKOS_CUDA_LOCKS_HPP - -#include - -#ifdef KOKKOS_ENABLE_CUDA - -#include - -#include - -// FIXME do not include private headers -#include - -namespace Kokkos { -namespace Impl { - -struct CudaLockArrays { - std::int32_t* atomic; - std::int32_t n; -}; - -/// \brief This global variable in Host space is the central definition -/// of these arrays. -extern CudaLockArrays g_host_cuda_lock_arrays; - -/// \brief After this call, the g_host_cuda_lock_arrays variable has -/// valid, initialized arrays. -/// -/// This call is idempotent. -void initialize_host_cuda_lock_arrays(); - -/// \brief After this call, the g_host_cuda_lock_arrays variable has -/// all null pointers, and all array memory has been freed. -/// -/// This call is idempotent. -void finalize_host_cuda_lock_arrays(); - -} // namespace Impl -} // namespace Kokkos - -namespace Kokkos { -namespace Impl { - -/// \brief This global variable in CUDA space is what kernels use -/// to get access to the lock arrays. -/// -/// When relocatable device code is enabled, there can be one single -/// instance of this global variable for the entire executable, -/// whose definition will be in Kokkos_Cuda_Locks.cpp (and whose declaration -/// here must then be extern. -/// This one instance will be initialized by initialize_host_cuda_lock_arrays -/// and need not be modified afterwards. -/// -/// When relocatable device code is disabled, an instance of this variable -/// will be created in every translation unit that sees this header file -/// (we make this clear by marking it static, meaning no other translation -/// unit can link to it). -/// Since the Kokkos_Cuda_Locks.cpp translation unit cannot initialize the -/// instances in other translation units, we must update this CUDA global -/// variable based on the Host global variable prior to running any kernels -/// that will use it. 
-/// That is the purpose of the ensure_cuda_lock_arrays_on_device function. -__device__ -#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE - __constant__ extern -#endif - CudaLockArrays g_device_cuda_lock_arrays; - -#define CUDA_SPACE_ATOMIC_MASK 0x1FFFF - -/// \brief Acquire a lock for the address -/// -/// This function tries to acquire the lock for the hash value derived -/// from the provided ptr. If the lock is successfully acquired the -/// function returns true. Otherwise it returns false. -__device__ inline bool lock_address_cuda_space(void* ptr) { - size_t offset = size_t(ptr); - offset = offset >> 2; - offset = offset & CUDA_SPACE_ATOMIC_MASK; - return (0 == atomicCAS(&g_device_cuda_lock_arrays.atomic[offset], 0, 1)); -} - -/// \brief Release lock for the address -/// -/// This function releases the lock for the hash value derived -/// from the provided ptr. This function should only be called -/// after previously successfully acquiring a lock with -/// lock_address. -__device__ inline void unlock_address_cuda_space(void* ptr) { - size_t offset = size_t(ptr); - offset = offset >> 2; - offset = offset & CUDA_SPACE_ATOMIC_MASK; - atomicExch(&g_device_cuda_lock_arrays.atomic[offset], 0); -} - -} // namespace Impl -} // namespace Kokkos - -// Make lock_array_copied an explicit translation unit scope thingy -namespace Kokkos { -namespace Impl { -namespace { -static int lock_array_copied = 0; -} // namespace - -#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE -inline -#else -inline static -#endif - void - copy_cuda_lock_arrays_to_device() { - if (lock_array_copied == 0) { - KOKKOS_IMPL_CUDA_SAFE_CALL(cudaMemcpyToSymbol(g_device_cuda_lock_arrays, - &g_host_cuda_lock_arrays, - sizeof(CudaLockArrays))); - } - lock_array_copied = 1; -} - -#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE -inline void ensure_cuda_lock_arrays_on_device() {} -#else -// Still Need COPY_CUDA_LOCK_ARRAYS for team scratch etc. 
-inline static void ensure_cuda_lock_arrays_on_device() { - copy_cuda_lock_arrays_to_device(); - desul::ensure_cuda_lock_arrays_on_device(); -} -#endif - -} // namespace Impl -} // namespace Kokkos - -#endif /* defined( KOKKOS_ENABLE_CUDA ) */ - -#endif /* #ifndef KOKKOS_CUDA_LOCKS_HPP */ diff --git a/core/src/HIP/Kokkos_HIP_Locks.cpp b/core/src/HIP/Kokkos_HIP_Locks.cpp deleted file mode 100644 index 3547286236..0000000000 --- a/core/src/HIP/Kokkos_HIP_Locks.cpp +++ /dev/null @@ -1,84 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE -#define KOKKOS_IMPL_PUBLIC_INCLUDE -#endif - -#include - -#include -#include -#include -#include - -#include - -#include - -namespace Kokkos { - -#ifdef KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE -namespace Impl { -__device__ __constant__ HIPLockArrays g_device_hip_lock_arrays = {nullptr, 0}; -} -#endif - -namespace { - -__global__ void init_lock_array_kernel_atomic() { - unsigned i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < KOKKOS_IMPL_HIP_SPACE_ATOMIC_MASK + 1) { - Kokkos::Impl::g_device_hip_lock_arrays.atomic[i] = 0; - } -} - -} // namespace - -namespace Impl { - -HIPLockArrays g_host_hip_lock_arrays = {nullptr, 0}; - -void initialize_host_hip_lock_arrays() { - desul::Impl::init_lock_arrays(); - desul::ensure_hip_lock_arrays_on_device(); - - if (g_host_hip_lock_arrays.atomic != nullptr) return; - KOKKOS_IMPL_HIP_SAFE_CALL(hipMalloc( - &g_host_hip_lock_arrays.atomic, - sizeof(std::int32_t) * 
(KOKKOS_IMPL_HIP_SPACE_ATOMIC_MASK + 1))); - - g_host_hip_lock_arrays.n = HIPInternal::concurrency(); - copy_hip_lock_arrays_to_device(); - init_lock_array_kernel_atomic<<< - (KOKKOS_IMPL_HIP_SPACE_ATOMIC_MASK + 1 + 255) / 256, 256, 0, nullptr>>>(); -} - -void finalize_host_hip_lock_arrays() { - desul::Impl::finalize_lock_arrays(); - - if (g_host_hip_lock_arrays.atomic == nullptr) return; - KOKKOS_IMPL_HIP_SAFE_CALL(hipFree(g_host_hip_lock_arrays.atomic)); - g_host_hip_lock_arrays.atomic = nullptr; - g_host_hip_lock_arrays.n = 0; -#ifdef KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE - copy_hip_lock_arrays_to_device(); -#endif -} - -} // namespace Impl - -} // namespace Kokkos diff --git a/core/src/HIP/Kokkos_HIP_Locks.hpp b/core/src/HIP/Kokkos_HIP_Locks.hpp deleted file mode 100644 index e2ea06c11f..0000000000 --- a/core/src/HIP/Kokkos_HIP_Locks.hpp +++ /dev/null @@ -1,144 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOS_HIP_LOCKS_HPP -#define KOKKOS_HIP_LOCKS_HPP - -#include - -#include - -#include - -// FIXME do not include private headers -#include - -namespace Kokkos { -namespace Impl { - -struct HIPLockArrays { - std::int32_t* atomic; - std::int32_t n; -}; - -/// \brief This global variable in Host space is the central definition -/// of these arrays. -extern HIPLockArrays g_host_hip_lock_arrays; - -/// \brief After this call, the g_host_hip_lock_arrays variable has -/// valid, initialized arrays. -/// -/// This call is idempotent. 
-void initialize_host_hip_lock_arrays(); - -/// \brief After this call, the g_host_hip_lock_arrays variable has -/// all null pointers, and all array memory has been freed. -/// -/// This call is idempotent. -void finalize_host_hip_lock_arrays(); - -#if defined(__HIPCC__) - -/// \brief This global variable in HIP space is what kernels use -/// to get access to the lock arrays. -/// -/// When relocatable device code is enabled, there can be one single -/// instance of this global variable for the entire executable, -/// whose definition will be in Kokkos_HIP_Locks.cpp (and whose declaration -/// here must then be extern). -/// This one instance will be initialized by initialize_host_hip_lock_arrays -/// and need not be modified afterwards. -/// -/// When relocatable device code is disabled, an instance of this variable -/// will be created in every translation unit that sees this header file. -/// Since the Kokkos_HIP_Locks.cpp translation unit cannot initialize the -/// instances in other translation units, we must update this HIP global -/// variable based on the Host global variable prior to running any kernels -/// that will use it. -/// That is the purpose of the ensure_hip_lock_arrays_on_device function. -__device__ -#ifdef KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE - __constant__ extern -#endif - HIPLockArrays g_device_hip_lock_arrays; - -#define KOKKOS_IMPL_HIP_SPACE_ATOMIC_MASK 0x1FFFF - -/// \brief Acquire a lock for the address -/// -/// This function tries to acquire the lock for the hash value derived -/// from the provided ptr. If the lock is successfully acquired the -/// function returns true. Otherwise it returns false. 
-__device__ inline bool lock_address_hip_space(void* ptr) { - auto offset = reinterpret_cast(ptr); - offset = offset >> 2; - offset = offset & KOKKOS_IMPL_HIP_SPACE_ATOMIC_MASK; - return (0 == atomicCAS(&g_device_hip_lock_arrays.atomic[offset], 0, 1)); -} - -/// \brief Release lock for the address -/// -/// This function releases the lock for the hash value derived -/// from the provided ptr. This function should only be called -/// after previously successfully acquiring a lock with -/// lock_address. -__device__ inline void unlock_address_hip_space(void* ptr) { - auto offset = reinterpret_cast(ptr); - offset = offset >> 2; - offset = offset & KOKKOS_IMPL_HIP_SPACE_ATOMIC_MASK; - atomicExch(&g_device_hip_lock_arrays.atomic[offset], 0); -} - -} // namespace Impl -} // namespace Kokkos - -// Make lock_array_copied an explicit translation unit scope thingy -namespace Kokkos { -namespace Impl { -namespace { -static int lock_array_copied = 0; -} // namespace - -#ifdef KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE -inline -#else -inline static -#endif - void - copy_hip_lock_arrays_to_device() { - if (lock_array_copied == 0) { - KOKKOS_IMPL_HIP_SAFE_CALL( - hipMemcpyToSymbol(HIP_SYMBOL(g_device_hip_lock_arrays), - &g_host_hip_lock_arrays, sizeof(HIPLockArrays))); - } - lock_array_copied = 1; -} - -#ifdef KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE -inline void ensure_hip_lock_arrays_on_device() {} -#else -inline static void ensure_hip_lock_arrays_on_device() { - copy_hip_lock_arrays_to_device(); - desul::ensure_hip_lock_arrays_on_device(); -} -#endif - -} // namespace Impl -} // namespace Kokkos - -#endif /* defined( __HIPCC__ ) */ - -#endif /* #ifndef KOKKOS_HIP_LOCKS_HPP */ From eabd0e445f0b909269d46c1c6cd03839762d30a3 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Fri, 27 Jan 2023 10:37:36 -0500 Subject: [PATCH 122/496] Disable global device variables in SYCL+Cuda CI --- .jenkins | 1 + 1 file changed, 1 insertion(+) diff --git a/.jenkins b/.jenkins index 
b9ece2dce2..20dfae53df 100644 --- a/.jenkins +++ b/.jenkins @@ -107,6 +107,7 @@ pipeline { -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \ -DCMAKE_CXX_COMPILER=clang++ \ -DCMAKE_CXX_FLAGS="-fsycl-device-code-split=per_kernel -Werror -Wno-gnu-zero-variadic-macro-arguments -Wno-linker-warnings" \ + -DKOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED=0 \ -DKokkos_ARCH_NATIVE=ON \ -DKokkos_ARCH_VOLTA70=ON \ -DKokkos_ENABLE_COMPILER_WARNINGS=ON \ From 9d7257ad98339580e0d0d647b3d888c94d0325c1 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Sun, 29 Jan 2023 08:35:45 -0500 Subject: [PATCH 123/496] Fixup turns out Tpetra "abs max" operation does not preserve the sign --- core/unit_test/TestAtomics.hpp | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/core/unit_test/TestAtomics.hpp b/core/unit_test/TestAtomics.hpp index 0d87bc3b79..4491893c4c 100644 --- a/core/unit_test/TestAtomics.hpp +++ b/core/unit_test/TestAtomics.hpp @@ -542,19 +542,20 @@ TEST(TEST_CATEGORY, atomics) { // see https://github.com/trilinos/Trilinos/pull/11506 struct TpetraUseCase { template - struct WrapScalarAndCompareAbsMax { + struct AbsMaxHelper { Scalar value; - private: - friend KOKKOS_FUNCTION bool operator<( - WrapScalarAndCompareAbsMax const& lhs, - WrapScalarAndCompareAbsMax const& rhs) { - return Kokkos::abs(lhs.value) < Kokkos::abs(rhs.value); + KOKKOS_FUNCTION AbsMaxHelper& operator+=(AbsMaxHelper const& rhs) { + Scalar lhs_abs_value = Kokkos::abs(value); + Scalar rhs_abs_value = Kokkos::abs(rhs.value); + value = lhs_abs_value > rhs_abs_value ? 
lhs_abs_value : rhs_abs_value; + return *this; } - friend KOKKOS_FUNCTION bool operator>( - WrapScalarAndCompareAbsMax const& lhs, - WrapScalarAndCompareAbsMax const& rhs) { - return Kokkos::abs(lhs.value) > Kokkos::abs(rhs.value); + + KOKKOS_FUNCTION AbsMaxHelper operator+(AbsMaxHelper const& rhs) const { + AbsMaxHelper ret = *this; + ret += rhs; + return ret; } }; @@ -564,8 +565,8 @@ struct TpetraUseCase { // 0, -1, 2, -3, ... auto v_i = static_cast(i); if (i % 2 == 1) v_i = -v_i; - Kokkos::atomic_max(reinterpret_cast*>(&d_()), - WrapScalarAndCompareAbsMax{v_i}); + Kokkos::atomic_add(reinterpret_cast*>(&d_()), + AbsMaxHelper{v_i}); } TpetraUseCase() { Kokkos::parallel_for(10, *this); } @@ -573,7 +574,7 @@ struct TpetraUseCase { void check() { T v; Kokkos::deep_copy(v, d_); - ASSERT_EQ(v, -9); + ASSERT_EQ(v, 9); } }; From ecd23e4ac1a3b0558a3df8a6930eac960bf88943 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Thu, 26 Jan 2023 16:16:36 -0500 Subject: [PATCH 124/496] Spell out Kokkos::ALL_t to avoid deprecation warnings --- core/src/Kokkos_CopyViews.hpp | 66 +++++++++++++++------------- core/src/impl/Kokkos_ViewMapping.hpp | 12 ++--- 2 files changed, 43 insertions(+), 35 deletions(-) diff --git a/core/src/Kokkos_CopyViews.hpp b/core/src/Kokkos_CopyViews.hpp index e02cbee589..98a28646c4 100644 --- a/core/src/Kokkos_CopyViews.hpp +++ b/core/src/Kokkos_CopyViews.hpp @@ -874,7 +874,7 @@ struct ViewRemap { } else { p_type ext1(0, std::min(dst.extent(1), src.extent(1))); using sv_adapter_type = - CommonSubview; + CommonSubview; sv_adapter_type common_subview(dst, src, Kokkos::ALL, ext1); view_copy(exec_space..., common_subview.dst_sub, common_subview.src_sub); @@ -883,7 +883,7 @@ struct ViewRemap { if (dst.extent(1) == src.extent(1)) { p_type ext0(0, std::min(dst.extent(0), src.extent(0))); using sv_adapter_type = - CommonSubview; + CommonSubview; sv_adapter_type common_subview(dst, src, ext0, Kokkos::ALL); view_copy(exec_space..., common_subview.dst_sub, 
common_subview.src_sub); @@ -915,7 +915,8 @@ struct ViewRemap { if (dst.extent(2) == src.extent(2)) { p_type ext1(0, std::min(dst.extent(1), src.extent(1))); using sv_adapter_type = - CommonSubview; + CommonSubview; sv_adapter_type common_subview(dst, src, Kokkos::ALL, ext1, Kokkos::ALL); view_copy(exec_space..., common_subview.dst_sub, @@ -924,7 +925,7 @@ struct ViewRemap { p_type ext1(0, std::min(dst.extent(1), src.extent(1))); p_type ext2(0, std::min(dst.extent(2), src.extent(2))); using sv_adapter_type = - CommonSubview; + CommonSubview; sv_adapter_type common_subview(dst, src, Kokkos::ALL, ext1, ext2); view_copy(exec_space..., common_subview.dst_sub, common_subview.src_sub); @@ -934,7 +935,7 @@ struct ViewRemap { p_type ext0(0, std::min(dst.extent(0), src.extent(0))); p_type ext1(0, std::min(dst.extent(1), src.extent(1))); using sv_adapter_type = - CommonSubview; + CommonSubview; sv_adapter_type common_subview(dst, src, ext0, ext1, Kokkos::ALL); view_copy(exec_space..., common_subview.dst_sub, common_subview.src_sub); @@ -968,7 +969,8 @@ struct ViewRemap { p_type ext1(0, std::min(dst.extent(1), src.extent(1))); p_type ext2(0, std::min(dst.extent(2), src.extent(2))); using sv_adapter_type = - CommonSubview; + CommonSubview; sv_adapter_type common_subview(dst, src, Kokkos::ALL, ext1, ext2, Kokkos::ALL); view_copy(exec_space..., common_subview.dst_sub, @@ -978,7 +980,8 @@ struct ViewRemap { p_type ext2(0, std::min(dst.extent(2), src.extent(2))); p_type ext3(0, std::min(dst.extent(3), src.extent(3))); using sv_adapter_type = - CommonSubview; + CommonSubview; sv_adapter_type common_subview(dst, src, Kokkos::ALL, ext1, ext2, ext3); view_copy(exec_space..., common_subview.dst_sub, common_subview.src_sub); @@ -988,8 +991,8 @@ struct ViewRemap { p_type ext0(0, std::min(dst.extent(0), src.extent(0))); p_type ext1(0, std::min(dst.extent(1), src.extent(1))); p_type ext2(0, std::min(dst.extent(2), src.extent(2))); - using sv_adapter_type = - CommonSubview; + using 
sv_adapter_type = CommonSubview; sv_adapter_type common_subview(dst, src, ext0, ext1, ext2, Kokkos::ALL); view_copy(exec_space..., common_subview.dst_sub, common_subview.src_sub); @@ -1024,8 +1027,9 @@ struct ViewRemap { p_type ext1(0, std::min(dst.extent(1), src.extent(1))); p_type ext2(0, std::min(dst.extent(2), src.extent(2))); p_type ext3(0, std::min(dst.extent(3), src.extent(3))); - using sv_adapter_type = CommonSubview; + using sv_adapter_type = + CommonSubview; sv_adapter_type common_subview(dst, src, Kokkos::ALL, ext1, ext2, ext3, Kokkos::ALL); view_copy(exec_space..., common_subview.dst_sub, @@ -1035,8 +1039,9 @@ struct ViewRemap { p_type ext2(0, std::min(dst.extent(2), src.extent(2))); p_type ext3(0, std::min(dst.extent(3), src.extent(3))); p_type ext4(0, std::min(dst.extent(4), src.extent(4))); - using sv_adapter_type = CommonSubview; + using sv_adapter_type = + CommonSubview; sv_adapter_type common_subview(dst, src, Kokkos::ALL, ext1, ext2, ext3, ext4); view_copy(exec_space..., common_subview.dst_sub, @@ -1048,8 +1053,9 @@ struct ViewRemap { p_type ext1(0, std::min(dst.extent(1), src.extent(1))); p_type ext2(0, std::min(dst.extent(2), src.extent(2))); p_type ext3(0, std::min(dst.extent(3), src.extent(3))); - using sv_adapter_type = CommonSubview; + using sv_adapter_type = + CommonSubview; sv_adapter_type common_subview(dst, src, ext0, ext1, ext2, ext3, Kokkos::ALL); view_copy(exec_space..., common_subview.dst_sub, @@ -1087,8 +1093,8 @@ struct ViewRemap { p_type ext3(0, std::min(dst.extent(3), src.extent(3))); p_type ext4(0, std::min(dst.extent(4), src.extent(4))); using sv_adapter_type = - CommonSubview; + CommonSubview; sv_adapter_type common_subview(dst, src, Kokkos::ALL, ext1, ext2, ext3, ext4, Kokkos::ALL); view_copy(exec_space..., common_subview.dst_sub, @@ -1100,8 +1106,8 @@ struct ViewRemap { p_type ext4(0, std::min(dst.extent(4), src.extent(4))); p_type ext5(0, std::min(dst.extent(5), src.extent(5))); using sv_adapter_type = - CommonSubview; + 
CommonSubview; sv_adapter_type common_subview(dst, src, Kokkos::ALL, ext1, ext2, ext3, ext4, ext5); view_copy(exec_space..., common_subview.dst_sub, @@ -1117,7 +1123,7 @@ struct ViewRemap { using sv_adapter_type = CommonSubview; + p_type, Kokkos::ALL_t>; sv_adapter_type common_subview(dst, src, ext0, ext1, ext2, ext3, ext4, Kokkos::ALL); view_copy(exec_space..., common_subview.dst_sub, @@ -1161,8 +1167,8 @@ struct ViewRemap { p_type ext4(0, std::min(dst.extent(4), src.extent(4))); p_type ext5(0, std::min(dst.extent(5), src.extent(5))); using sv_adapter_type = - CommonSubview; + CommonSubview; sv_adapter_type common_subview(dst, src, Kokkos::ALL, ext1, ext2, ext3, ext4, ext5, Kokkos::ALL); view_copy(exec_space..., common_subview.dst_sub, @@ -1175,8 +1181,8 @@ struct ViewRemap { p_type ext5(0, std::min(dst.extent(5), src.extent(5))); p_type ext6(0, std::min(dst.extent(6), src.extent(6))); using sv_adapter_type = - CommonSubview; + CommonSubview; sv_adapter_type common_subview(dst, src, Kokkos::ALL, ext1, ext2, ext3, ext4, ext5, ext6); view_copy(exec_space..., common_subview.dst_sub, @@ -1192,7 +1198,7 @@ struct ViewRemap { p_type ext5(0, std::min(dst.extent(5), src.extent(5))); using sv_adapter_type = CommonSubview; + p_type, p_type, Kokkos::ALL_t>; sv_adapter_type common_subview(dst, src, ext0, ext1, ext2, ext3, ext4, ext5, Kokkos::ALL); view_copy(exec_space..., common_subview.dst_sub, @@ -1237,8 +1243,8 @@ struct ViewRemap { p_type ext5(0, std::min(dst.extent(5), src.extent(5))); p_type ext6(0, std::min(dst.extent(6), src.extent(6))); using sv_adapter_type = - CommonSubview; + CommonSubview; sv_adapter_type common_subview(dst, src, Kokkos::ALL, ext1, ext2, ext3, ext4, ext5, ext6, Kokkos::ALL); view_copy(exec_space..., common_subview.dst_sub, @@ -1252,8 +1258,8 @@ struct ViewRemap { p_type ext6(0, std::min(dst.extent(6), src.extent(6))); p_type ext7(0, std::min(dst.extent(7), src.extent(7))); using sv_adapter_type = - CommonSubview; + CommonSubview; sv_adapter_type 
common_subview(dst, src, Kokkos::ALL, ext1, ext2, ext3, ext4, ext5, ext6, ext7); view_copy(exec_space..., common_subview.dst_sub, @@ -1270,7 +1276,7 @@ struct ViewRemap { p_type ext6(0, std::min(dst.extent(6), src.extent(6))); using sv_adapter_type = CommonSubview; + p_type, p_type, p_type, Kokkos::ALL_t>; sv_adapter_type common_subview(dst, src, ext0, ext1, ext2, ext3, ext4, ext5, ext6, Kokkos::ALL); view_copy(exec_space..., common_subview.dst_sub, diff --git a/core/src/impl/Kokkos_ViewMapping.hpp b/core/src/impl/Kokkos_ViewMapping.hpp index be67f64f09..28b25eff68 100644 --- a/core/src/impl/Kokkos_ViewMapping.hpp +++ b/core/src/impl/Kokkos_ViewMapping.hpp @@ -308,7 +308,7 @@ namespace Impl { template struct is_integral_extent_type { - enum : bool { value = std::is_same::value ? 1 : 0 }; + enum : bool { value = std::is_same::value ? 1 : 0 }; }; template @@ -357,7 +357,8 @@ struct SubviewLegalArgsCompileTime::value)) || ((CurrentArg >= RankDest) && (std::is_integral::value)) || - ((CurrentArg < RankDest) && (std::is_same::value)) || + ((CurrentArg < RankDest) && + (std::is_same::value)) || ((CurrentArg == 0) && (Kokkos::Impl::is_integral_extent_type::value))) && (SubviewLegalArgsCompileTime::value)) || ((CurrentArg >= RankSrc - RankDest) && - (std::is_same::value))) && + (std::is_same::value))) && (SubviewLegalArgsCompileTime::value) @@ -399,7 +400,8 @@ template struct SubviewLegalArgsCompileTime { enum { - value = ((CurrentArg == RankSrc - 1) && (std::is_same::value)) + value = ((CurrentArg == RankSrc - 1) && + (std::is_same::value)) }; }; @@ -465,7 +467,7 @@ struct SubviewExtents { KOKKOS_FORCEINLINE_FUNCTION bool set(unsigned domain_rank, unsigned range_rank, const ViewDimension& dim, - ALL_t, Args... args) { + Kokkos::ALL_t, Args... 
args) { m_begin[domain_rank] = 0; m_length[range_rank] = dim.extent(domain_rank); m_index[range_rank] = domain_rank; From e8381d8cee56bd02f05f7bb8decffb01cba6e309 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Mon, 30 Jan 2023 09:22:58 -0500 Subject: [PATCH 125/496] Add TODO comment to replace fully-qualified name when possible --- core/src/impl/Kokkos_ViewMapping.hpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/core/src/impl/Kokkos_ViewMapping.hpp b/core/src/impl/Kokkos_ViewMapping.hpp index 28b25eff68..9dc83437e9 100644 --- a/core/src/impl/Kokkos_ViewMapping.hpp +++ b/core/src/impl/Kokkos_ViewMapping.hpp @@ -297,6 +297,9 @@ struct ALL_t { #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 namespace Impl { +// TODO This alias declaration forces us to fully qualify ALL_t inside the +// Kokkos::Impl namespace to avoid deprecation warnings. Replace the +// fully-qualified name when we remove Kokkos::Impl::ALL_t. using ALL_t KOKKOS_DEPRECATED_WITH_COMMENT("Use Kokkos::ALL_t instead!") = Kokkos::ALL_t; } // namespace Impl From a564953aa275970bbd30d992ee3c7ba2f939920f Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Mon, 30 Jan 2023 14:33:42 -0500 Subject: [PATCH 126/496] Desul atomics: let pointer to the device lock arrays (HIP and CUDA) be in constant memory without RDC as well --- tpls/desul/include/desul/atomics/Lock_Array_CUDA.hpp | 10 ++++------ tpls/desul/include/desul/atomics/Lock_Array_HIP.hpp | 10 ++++------ 2 files changed, 8 insertions(+), 12 deletions(-) diff --git a/tpls/desul/include/desul/atomics/Lock_Array_CUDA.hpp b/tpls/desul/include/desul/atomics/Lock_Array_CUDA.hpp index 6984ae34a7..e514061ed0 100644 --- a/tpls/desul/include/desul/atomics/Lock_Array_CUDA.hpp +++ b/tpls/desul/include/desul/atomics/Lock_Array_CUDA.hpp @@ -69,17 +69,15 @@ namespace Impl { /// variable based on the Host global variable prior to running any kernels /// that will use it. /// That is the purpose of the ensure_cuda_lock_arrays_on_device function. 
-__device__ #ifdef __CUDACC_RDC__ - __constant__ extern +extern #endif - int32_t* CUDA_SPACE_ATOMIC_LOCKS_DEVICE; + __device__ __constant__ int32_t* CUDA_SPACE_ATOMIC_LOCKS_DEVICE; -__device__ #ifdef __CUDACC_RDC__ - __constant__ extern +extern #endif - int32_t* CUDA_SPACE_ATOMIC_LOCKS_NODE; + __device__ __constant__ int32_t* CUDA_SPACE_ATOMIC_LOCKS_NODE; #define CUDA_SPACE_ATOMIC_MASK 0x1FFFF diff --git a/tpls/desul/include/desul/atomics/Lock_Array_HIP.hpp b/tpls/desul/include/desul/atomics/Lock_Array_HIP.hpp index 47c0e8c680..33450e32ec 100644 --- a/tpls/desul/include/desul/atomics/Lock_Array_HIP.hpp +++ b/tpls/desul/include/desul/atomics/Lock_Array_HIP.hpp @@ -69,17 +69,15 @@ namespace Impl { * will use it. That is the purpose of the * ensure_hip_lock_arrays_on_device function. */ -__device__ #ifdef DESUL_HIP_RDC - __constant__ extern +extern #endif - int32_t* HIP_SPACE_ATOMIC_LOCKS_DEVICE; + __device__ __constant__ int32_t* HIP_SPACE_ATOMIC_LOCKS_DEVICE; -__device__ #ifdef DESUL_HIP_RDC - __constant__ extern +extern #endif - int32_t* HIP_SPACE_ATOMIC_LOCKS_NODE; + __device__ __constant__ int32_t* HIP_SPACE_ATOMIC_LOCKS_NODE; #define HIP_SPACE_ATOMIC_MASK 0x1FFFF From 04e3437f7b2227fd511908271ba9bafaf71c534f Mon Sep 17 00:00:00 2001 From: Evan Weinberg Date: Mon, 30 Jan 2023 15:26:11 -0500 Subject: [PATCH 127/496] Further update to CUDA occupancy calculation (#5739) * Update the occupancy calculation to reflect the maximum number of registers per SM, subject to the warp allocation granularity of 4 warps per SM * Addressed PR comments * Propagated warp allocation constraints to occupancy control functions * Reduce occupancy-related code-reuse in CUDA Parallel MDRange * Comment cleanup * clang-format * Check per-kernel perf, not end-to-end... 
* Update core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp Co-authored-by: Damien L-G * Apply suggestions from code review Co-authored-by: Damien L-G * Clarified comments, fixed compile error in suggestion that I missed * clang-format --------- Co-authored-by: Damien L-G --- .../Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp | 66 ++++++++++++++++++- core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp | 14 +++- .../src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp | 53 ++++++++------- 3 files changed, 106 insertions(+), 27 deletions(-) diff --git a/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp b/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp index a471cd380c..75c1686dc2 100644 --- a/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp +++ b/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp @@ -25,17 +25,67 @@ namespace Kokkos { namespace Impl { +inline int cuda_warp_per_sm_allocation_granularity( + cudaDeviceProp const& properties) { + // Allocation granularity of warps in each sm + switch (properties.major) { + case 3: + case 5: + case 7: + case 8: + case 9: return 4; + case 6: return (properties.minor == 0 ? 
2 : 4); + default: + throw_runtime_exception( + "Unknown device in cuda warp per sm allocation granularity"); + return 0; + } +} + +inline int cuda_max_warps_per_sm_registers( + cudaDeviceProp const& properties, cudaFuncAttributes const& attributes) { + // Maximum number of warps per sm as a function of register counts, + // subject to the constraint that warps are allocated with a fixed granularity + int const max_regs_per_block = properties.regsPerBlock; + int const regs_per_warp = attributes.numRegs * properties.warpSize; + int const warp_granularity = + cuda_warp_per_sm_allocation_granularity(properties); + // The granularity of register allocation is chunks of 256 registers per warp, + // which implies a need to over-allocate, so we round up + int const allocated_regs_per_warp = 256 * ((regs_per_warp + 256 - 1) / 256); + + // The maximum number of warps per SM is constrained from above by register + // allocation. To satisfy the constraint that warps per SM is allocated at a + // finite granularity, we need to round down. 
+ int const max_warps_per_sm = + warp_granularity * + (max_regs_per_block / (allocated_regs_per_warp * warp_granularity)); + + return max_warps_per_sm; +} + inline int cuda_max_active_blocks_per_sm(cudaDeviceProp const& properties, cudaFuncAttributes const& attributes, int block_size, size_t dynamic_shmem) { - // Limits due do registers/SM + // Limits due to registers/SM int const regs_per_sm = properties.regsPerMultiprocessor; int const regs_per_thread = attributes.numRegs; // The granularity of register allocation is chunks of 256 registers per warp // -> 8 registers per thread int const allocated_regs_per_thread = 8 * ((regs_per_thread + 8 - 1) / 8); - int const max_blocks_regs = - regs_per_sm / (allocated_regs_per_thread * block_size); + int max_blocks_regs = regs_per_sm / (allocated_regs_per_thread * block_size); + + // Compute the maximum number of warps as a function of the number of + // registers + int const max_warps_per_sm_registers = + cuda_max_warps_per_sm_registers(properties, attributes); + + // Correct the number of blocks to respect the maximum number of warps per + // SM, which is constrained to be a multiple of the warp allocation + // granularity defined in `cuda_warp_per_sm_allocation_granularity`. 
+ while ((max_blocks_regs * block_size / properties.warpSize) > + max_warps_per_sm_registers) + max_blocks_regs--; // Limits due to shared memory/SM size_t const shmem_per_sm = properties.sharedMemPerMultiprocessor; @@ -179,6 +229,16 @@ int cuda_get_opt_block_size(const CudaInternal* cuda_instance, LaunchBounds{}); } +// Thin version of cuda_get_opt_block_size for cases where there is no shared +// memory +template +int cuda_get_opt_block_size_no_shmem(const cudaDeviceProp& prop, + const cudaFuncAttributes& attr, + LaunchBounds) { + return cuda_deduce_block_size( + false, prop, attr, [](int /*block_size*/) { return 0; }, LaunchBounds{}); +} + } // namespace Impl } // namespace Kokkos diff --git a/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp b/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp index 45cc6f5d2f..5afad7a6a3 100644 --- a/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp +++ b/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp @@ -160,9 +160,21 @@ inline void configure_shmem_preference(const KernelFuncPtr& func, // The granularity of register allocation is chunks of 256 registers per warp // -> 8 registers per thread const size_t allocated_regs_per_thread = 8 * ((regs_per_thread + 8 - 1) / 8); - const size_t max_blocks_regs = + size_t max_blocks_regs = regs_per_sm / (allocated_regs_per_thread * block_size); + // Compute the maximum number of warps as a function of the number of + // registers + const size_t max_warps_per_sm_registers = + cuda_max_warps_per_sm_registers(device_props, func_attr); + + // Correct the number of blocks to respect the maximum number of warps per + // SM, which is constrained to be a multiple of the warp allocation + // granularity defined in `cuda_warp_per_sm_allocation_granularity`. 
+ while ((max_blocks_regs * block_size / device_props.warpSize) > + max_warps_per_sm_registers) + max_blocks_regs--; + // Compute how many threads per sm we actually want const size_t max_threads_per_sm = device_props.maxThreadsPerMultiProcessor; // only allocate multiples of warp size diff --git a/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp b/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp index 356f4b2cd1..0015d1ea14 100644 --- a/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp +++ b/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp @@ -39,6 +39,34 @@ namespace Kokkos { namespace Impl { +template +int max_tile_size_product_helper(const Policy& pol, const LaunchBounds&) { + cudaFuncAttributes attr = + CudaParallelLaunch::get_cuda_func_attributes(); + auto const& prop = pol.space().cuda_device_prop(); + + // Limits due to registers/SM, MDRange doesn't have + // shared memory constraints + int const optimal_block_size = + cuda_get_opt_block_size_no_shmem(prop, attr, LaunchBounds{}); + + // Compute how many blocks of this size we can launch, based on warp + // constraints + int const max_warps_per_sm_registers = + Kokkos::Impl::cuda_max_warps_per_sm_registers(prop, attr); + int const max_num_threads_from_warps = + max_warps_per_sm_registers * prop.warpSize; + int const max_num_blocks = max_num_threads_from_warps / optimal_block_size; + + // Compute the total number of threads + int const max_threads_per_sm = optimal_block_size * max_num_blocks; + + return std::min( + max_threads_per_sm, + static_cast(Kokkos::Impl::CudaTraits::MaxHierarchicalParallelism)); +} + template class ParallelFor, Kokkos::Cuda> { public: @@ -57,18 +85,7 @@ class ParallelFor, Kokkos::Cuda> { public: template static int max_tile_size_product(const Policy& pol, const Functor&) { - cudaFuncAttributes attr = - CudaParallelLaunch::get_cuda_func_attributes(); - auto const& prop = pol.space().cuda_device_prop(); - // Limits due to registers/SM, MDRange doesn't have - // shared memory constraints - int 
const regs_per_sm = prop.regsPerMultiprocessor; - int const regs_per_thread = attr.numRegs; - int const max_threads_per_sm = regs_per_sm / regs_per_thread; - return std::min( - max_threads_per_sm, - static_cast(Kokkos::Impl::CudaTraits::MaxHierarchicalParallelism)); + return max_tile_size_product_helper(pol, LaunchBounds{}); } Policy const& get_policy() const { return m_rp; } inline __device__ void operator()() const { @@ -230,17 +247,7 @@ class ParallelReduce, ReducerType, public: template static int max_tile_size_product(const Policy& pol, const Functor&) { - cudaFuncAttributes attr = - CudaParallelLaunch::get_cuda_func_attributes(); - auto const& prop = pol.space().cuda_device_prop(); - // Limits due do registers/SM - int const regs_per_sm = prop.regsPerMultiprocessor; - int const regs_per_thread = attr.numRegs; - int const max_threads_per_sm = regs_per_sm / regs_per_thread; - return std::min( - max_threads_per_sm, - static_cast(Kokkos::Impl::CudaTraits::MaxHierarchicalParallelism)); + return max_tile_size_product_helper(pol, LaunchBounds{}); } Policy const& get_policy() const { return m_policy; } inline __device__ void exec_range(reference_type update) const { From fb7d9f23f0ce52e80422fc3f34330afd445871f8 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Mon, 30 Jan 2023 17:56:43 -0500 Subject: [PATCH 128/496] SYCL: Pass Xsycl-target-backend* only to the linker (#5705) * SYCL: Pass Xsycl-target-backend* only to the linker * Revert changes for SYCL+Cuda --- cmake/kokkos_arch.cmake | 51 ++++++++++++++++++++++------------------- 1 file changed, 28 insertions(+), 23 deletions(-) diff --git a/cmake/kokkos_arch.cmake b/cmake/kokkos_arch.cmake index a74b851a4e..b60215e60a 100644 --- a/cmake/kokkos_arch.cmake +++ b/cmake/kokkos_arch.cmake @@ -742,30 +742,35 @@ IF (KOKKOS_ENABLE_SYCL) COMPILER_SPECIFIC_FLAGS( DEFAULT -fsycl-targets=spir64 ) - ELSEIF(KOKKOS_ARCH_INTEL_GEN9) - COMPILER_SPECIFIC_FLAGS( - DEFAULT -fsycl-targets=spir64_gen -Xsycl-target-backend "-device 
gen9" - ) - ELSEIF(KOKKOS_ARCH_INTEL_GEN11) - COMPILER_SPECIFIC_FLAGS( - DEFAULT -fsycl-targets=spir64_gen -Xsycl-target-backend "-device gen11" - ) - ELSEIF(KOKKOS_ARCH_INTEL_GEN12LP) - COMPILER_SPECIFIC_FLAGS( - DEFAULT -fsycl-targets=spir64_gen -Xsycl-target-backend "-device gen12lp" - ) - ELSEIF(KOKKOS_ARCH_INTEL_DG1) - COMPILER_SPECIFIC_FLAGS( - DEFAULT -fsycl-targets=spir64_gen -Xsycl-target-backend "-device dg1" - ) - ELSEIF(KOKKOS_ARCH_INTEL_XEHP) - COMPILER_SPECIFIC_FLAGS( - DEFAULT -fsycl-targets=spir64_gen -Xsycl-target-backend "-device 12.50.4" - ) - ELSEIF(KOKKOS_ARCH_INTEL_PVC) - COMPILER_SPECIFIC_FLAGS( - DEFAULT -fsycl-targets=spir64_gen -Xsycl-target-backend "-device 12.60.7" + ELSE() + COMPILER_SPECIFIC_OPTIONS( + DEFAULT -fsycl-targets=spir64_gen ) + IF(KOKKOS_ARCH_INTEL_GEN9) + COMPILER_SPECIFIC_LINK_OPTIONS( + DEFAULT -fsycl-targets=spir64_gen -Xsycl-target-backend "-device gen9" + ) + ELSEIF(KOKKOS_ARCH_INTEL_GEN11) + COMPILER_SPECIFIC_LINK_OPTIONS( + DEFAULT -fsycl-targets=spir64_gen -Xsycl-target-backend "-device gen11" + ) + ELSEIF(KOKKOS_ARCH_INTEL_GEN12LP) + COMPILER_SPECIFIC_LINK_OPTIONS( + DEFAULT -fsycl-targets=spir64_gen -Xsycl-target-backend "-device gen12lp" + ) + ELSEIF(KOKKOS_ARCH_INTEL_DG1) + COMPILER_SPECIFIC_LINK_OPTIONS( + DEFAULT -fsycl-targets=spir64_gen -Xsycl-target-backend "-device dg1" + ) + ELSEIF(KOKKOS_ARCH_INTEL_XEHP) + COMPILER_SPECIFIC_LINK_OPTIONS( + DEFAULT -fsycl-targets=spir64_gen -Xsycl-target-backend "-device 12.50.4" + ) + ELSEIF(KOKKOS_ARCH_INTEL_PVC) + COMPILER_SPECIFIC_LINK_OPTIONS( + DEFAULT -fsycl-targets=spir64_gen -Xsycl-target-backend "-device 12.60.7" + ) + ENDIF() ENDIF() ENDIF() From 8103d826369da9ec62c3c3ffaf530e6a095d2392 Mon Sep 17 00:00:00 2001 From: Dan Ibanez Date: Mon, 30 Jan 2023 16:07:20 -0700 Subject: [PATCH 129/496] SIMD backend of ARM NEON (#5775) * in the process of adding a NEON backend * more work on NEON * up to double where expressions * finished first draft! 
* working on compiling NEON backend * got NEON backend to compile * formatting * actually test NEON and test conversions conversions are missing right now * test and fix vsetq_lane_ usage * implement and test shifts * implement and test a mask conversion * test and implement i32 -> u64 * test and implement u64 -> i32 * test and fix final conversion * formatting * consolidate NEON mask types there are really only two implementations: masks for 64-bit and 32-bit value types. use a bit of CRTP to ensure return types of operators are correct * formatting * move converting constructors * add missing nodiscard * add unary negation for 32-bit signed integer * add 64-bit signed addition and move unary negation * replace all vdup with vmov as far as I can tell they're exactly identical, except that there are some vmov intrinsics that don't have vdup equivalents, so vmov seems to just be the better one to use * ensure all the condition methods are [[nodiscard]] * add subtraction and addition for 64bit uint * formatting --- simd/src/Kokkos_SIMD.hpp | 8 + simd/src/Kokkos_SIMD_Common.hpp | 28 + simd/src/Kokkos_SIMD_NEON.hpp | 995 ++++++++++++++++++++++++++++++++ simd/unit_tests/TestSIMD.cpp | 121 +++- 4 files changed, 1151 insertions(+), 1 deletion(-) create mode 100644 simd/src/Kokkos_SIMD_NEON.hpp diff --git a/simd/src/Kokkos_SIMD.hpp b/simd/src/Kokkos_SIMD.hpp index e5d54b0ff1..9280763407 100644 --- a/simd/src/Kokkos_SIMD.hpp +++ b/simd/src/Kokkos_SIMD.hpp @@ -29,6 +29,10 @@ #include #endif +#ifdef __ARM_NEON +#include +#endif + namespace Kokkos { namespace Experimental { @@ -40,6 +44,8 @@ namespace Impl { using host_native = avx512_fixed_size<8>; #elif defined(KOKKOS_ARCH_AVX2) using host_native = avx2_fixed_size<4>; +#elif defined(__ARM_NEON) +using host_native = neon_fixed_size<2>; #else using host_native = scalar; #endif @@ -134,6 +140,8 @@ class abi_set {}; using host_abi_set = abi_set>; #elif defined(KOKKOS_ARCH_AVX2) using host_abi_set = abi_set>; +#elif 
defined(__ARM_NEON) +using host_abi_set = abi_set>; #else using host_abi_set = abi_set; #endif diff --git a/simd/src/Kokkos_SIMD_Common.hpp b/simd/src/Kokkos_SIMD_Common.hpp index 9b2c0f81d7..c29d49fb3a 100644 --- a/simd/src/Kokkos_SIMD_Common.hpp +++ b/simd/src/Kokkos_SIMD_Common.hpp @@ -136,6 +136,34 @@ template return simd([&](std::size_t i) { return lhs[i] * rhs[i]; }); } +// fallback simd shift using generator constructor +// At the time of this writing, these fallbacks are only used +// to shift vectors of 64-bit unsigned integers for the NEON backend + +template +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd operator>>( + simd const& lhs, unsigned int rhs) { + return simd([&](std::size_t i) { return lhs[i] >> rhs; }); +} + +template +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd operator<<( + simd const& lhs, unsigned int rhs) { + return simd([&](std::size_t i) { return lhs[i] << rhs; }); +} + +template +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd operator>>( + simd const& lhs, simd const& rhs) { + return simd([&](std::size_t i) { return lhs[i] >> rhs[i]; }); +} + +template +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd operator<<( + simd const& lhs, simd const& rhs) { + return simd([&](std::size_t i) { return lhs[i] << rhs[i]; }); +} + // The code below provides: // operator@(simd, Arithmetic) // operator@(Arithmetic, simd) diff --git a/simd/src/Kokkos_SIMD_NEON.hpp b/simd/src/Kokkos_SIMD_NEON.hpp new file mode 100644 index 0000000000..2473004098 --- /dev/null +++ b/simd/src/Kokkos_SIMD_NEON.hpp @@ -0,0 +1,995 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_SIMD_NEON_HPP +#define KOKKOS_SIMD_NEON_HPP + +#include +#include + +#include + +#include + +namespace Kokkos { + +namespace Experimental { + +namespace simd_abi { + +template +class neon_fixed_size {}; + +} // namespace simd_abi + +namespace Impl { + +template +class neon_mask; + +template +class neon_mask { + uint64x2_t m_value; + + public: + class reference { + uint64x2_t& m_mask; + int m_lane; + + public: + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION reference(uint64x2_t& mask_arg, + int lane_arg) + : m_mask(mask_arg), m_lane(lane_arg) {} + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION reference + operator=(bool value) const { + // this switch statement is needed because the lane argument has to be a + // constant + switch (m_lane) { + case 0: + m_mask = vsetq_lane_u64(value ? 0xFFFFFFFFFFFFFFFFULL : 0, m_mask, 0); + break; + case 1: + m_mask = vsetq_lane_u64(value ? 0xFFFFFFFFFFFFFFFFULL : 0, m_mask, 1); + break; + } + return *this; + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION operator bool() const { + switch (m_lane) { + case 0: return vgetq_lane_u64(m_mask, 0) != 0; + case 1: return vgetq_lane_u64(m_mask, 1) != 0; + } + return false; + } + }; + using value_type = bool; + using abi_type = simd_abi::neon_fixed_size<2>; + using implementation_type = uint64x2_t; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION neon_mask() = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION explicit neon_mask(value_type value) + : m_value(vmovq_n_u64(value ? 
0xFFFFFFFFFFFFFFFFULL : 0)) {} + template + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION neon_mask( + neon_mask const& other) { + operator[](0) = bool(other[0]); + operator[](1) = bool(other[1]); + } + template + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION neon_mask(neon_mask const& other) + : neon_mask(static_cast(other)) {} + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION static constexpr std::size_t size() { + return 2; + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit neon_mask( + uint64x2_t const& value_in) + : m_value(value_in) {} + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator uint64x2_t() + const { + return m_value; + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION reference operator[](std::size_t i) { + return reference(m_value, int(i)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION value_type + operator[](std::size_t i) const { + return static_cast( + reference(const_cast(m_value), int(i))); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION Derived + operator||(neon_mask const& other) const { + return Derived(vorrq_u64(m_value, other.m_value)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION Derived + operator&&(neon_mask const& other) const { + return Derived(vandq_u64(m_value, other.m_value)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION Derived operator!() const { + auto const true_value = static_cast(neon_mask(true)); + return Derived(veorq_u64(m_value, true_value)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION bool operator==( + neon_mask const& other) const { + uint64x2_t const elementwise_equality = vceqq_u64(m_value, other.m_value); + uint32x2_t const narrow_elementwise_equality = + vqmovn_u64(elementwise_equality); + uint64x1_t const overall_equality_neon = + vreinterpret_u64_u32(narrow_elementwise_equality); + uint64_t const overall_equality = vget_lane_u64(overall_equality_neon, 0); + return overall_equality == 0xFFFFFFFFFFFFFFFFULL; + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION bool operator!=( + neon_mask const& other) const { + return 
!operator==(other); + } +}; + +template +class neon_mask { + uint32x2_t m_value; + + public: + class reference { + uint32x2_t& m_mask; + int m_lane; + + public: + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION reference(uint32x2_t& mask_arg, + int lane_arg) + : m_mask(mask_arg), m_lane(lane_arg) {} + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION reference + operator=(bool value) const { + switch (m_lane) { + case 0: + m_mask = vset_lane_u32(value ? 0xFFFFFFFFU : 0, m_mask, 0); + break; + case 1: + m_mask = vset_lane_u32(value ? 0xFFFFFFFFU : 0, m_mask, 1); + break; + } + return *this; + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION operator bool() const { + switch (m_lane) { + case 0: return vget_lane_u32(m_mask, 0) != 0; + case 1: return vget_lane_u32(m_mask, 1) != 0; + } + return false; + } + }; + using value_type = bool; + using abi_type = simd_abi::neon_fixed_size<2>; + using implementation_type = uint32x2_t; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION neon_mask() = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION explicit neon_mask(value_type value) + : m_value(vmov_n_u32(value ? 
0xFFFFFFFFU : 0)) {} + template + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION neon_mask(neon_mask const& other) + : m_value(vqmovn_u64(static_cast(other))) {} + template + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION neon_mask(neon_mask const& other) + : m_value(static_cast(other)) {} + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION static constexpr std::size_t size() { + return 2; + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit neon_mask( + uint32x2_t const& value_in) + : m_value(value_in) {} + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator uint32x2_t() + const { + return m_value; + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION reference operator[](std::size_t i) { + return reference(m_value, int(i)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION value_type + operator[](std::size_t i) const { + return static_cast( + reference(const_cast(m_value), int(i))); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION Derived + operator||(neon_mask const& other) const { + return Derived(vorr_u32(m_value, other.m_value)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION Derived + operator&&(neon_mask const& other) const { + return Derived(vand_u32(m_value, other.m_value)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION Derived operator!() const { + auto const true_value = static_cast(neon_mask(true)); + return Derived(veor_u32(m_value, true_value)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION bool operator==( + neon_mask const& other) const { + uint32x2_t const elementwise_equality = vceq_u32(m_value, other.m_value); + uint64x1_t const overall_equality_neon = + vreinterpret_u64_u32(elementwise_equality); + uint64_t const overall_equality = vget_lane_u64(overall_equality_neon, 0); + return overall_equality == 0xFFFFFFFFFFFFFFFFULL; + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION bool operator!=( + neon_mask const& other) const { + return !operator==(other); + } +}; + +} // namespace Impl + +template +class simd_mask> + : public Impl::neon_mask>, + sizeof(T) * 8> { + using base_type = 
Impl::neon_mask>, + sizeof(T) * 8>; + + public: + using implementation_type = typename base_type::implementation_type; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd_mask() = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION explicit simd_mask(bool value) + : base_type(value) {} + template + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd_mask( + simd_mask> const& other) + : base_type(other) {} + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit simd_mask( + implementation_type const& value) + : base_type(value) {} +}; + +template <> +class simd> { + float64x2_t m_value; + + public: + using value_type = double; + using abi_type = simd_abi::neon_fixed_size<2>; + using mask_type = simd_mask; + class reference { + float64x2_t& m_value; + int m_lane; + + public: + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION reference(float64x2_t& mask_arg, + int lane_arg) + : m_value(mask_arg), m_lane(lane_arg) {} + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION reference + operator=(double value) const { + switch (m_lane) { + case 0: m_value = vsetq_lane_f64(value, m_value, 0); break; + case 1: m_value = vsetq_lane_f64(value, m_value, 1); break; + } + return *this; + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION operator double() const { + switch (m_lane) { + case 0: return vgetq_lane_f64(m_value, 0); + case 1: return vgetq_lane_f64(m_value, 1); + } + return 0; + } + }; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd() = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(simd const&) = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(simd&&) = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd& operator=(simd const&) = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd& operator=(simd&&) = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION static constexpr std::size_t size() { + return 2; + } + template , + bool> = false> + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(U&& value) + : m_value(vmovq_n_f64(value_type(value))) {} + template ()); } + std::is_invocable_r_v>, + bool> = false> + 
KOKKOS_FORCEINLINE_FUNCTION simd(G&& gen) { + m_value = vsetq_lane_f64(gen(std::integral_constant()), + m_value, 0); + m_value = vsetq_lane_f64(gen(std::integral_constant()), + m_value, 1); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit simd( + float64x2_t const& value_in) + : m_value(value_in) {} + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION reference operator[](std::size_t i) { + return reference(m_value, int(i)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION value_type + operator[](std::size_t i) const { + return reference(const_cast(this)->m_value, int(i)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + element_aligned_tag) { + m_value = vld1q_f64(ptr); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to( + value_type* ptr, element_aligned_tag) const { + vst1q_f64(ptr, m_value); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit + operator float64x2_t() const { + return m_value; + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type + operator<(simd const& other) const { + return mask_type(vcltq_f64(m_value, other.m_value)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type + operator>(simd const& other) const { + return mask_type(vcgtq_f64(m_value, other.m_value)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type + operator<=(simd const& other) const { + return mask_type(vcleq_f64(m_value, other.m_value)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type + operator>=(simd const& other) const { + return mask_type(vcgeq_f64(m_value, other.m_value)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type + operator==(simd const& other) const { + return mask_type(vceqq_f64(m_value, other.m_value)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type + operator!=(simd const& other) const { + return !(operator==(other)); + } +}; + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + simd> + operator*(simd> const& lhs, + simd> const& rhs) { + return simd>( + vmulq_f64(static_cast(lhs), 
static_cast(rhs))); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + simd> + operator/(simd> const& lhs, + simd> const& rhs) { + return simd>( + vdivq_f64(static_cast(lhs), static_cast(rhs))); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + simd> + operator+(simd> const& lhs, + simd> const& rhs) { + return simd>( + vaddq_f64(static_cast(lhs), static_cast(rhs))); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + simd> + operator-(simd> const& lhs, + simd> const& rhs) { + return simd>( + vsubq_f64(static_cast(lhs), static_cast(rhs))); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + simd> + operator-(simd> const& a) { + return simd>( + vnegq_f64(static_cast(a))); +} + +KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION +simd> abs( + simd> const& a) { + return simd>( + vabsq_f64(static_cast(a))); +} + +KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION +simd> copysign( + simd> const& a, + simd> const& b) { + uint64x2_t const sign_mask = vreinterpretq_u64_f64(vmovq_n_f64(-0.0)); + return simd>(vreinterpretq_f64_u64( + vorrq_u64(vreinterpretq_u64_f64(static_cast(abs(a))), + vandq_u64(sign_mask, vreinterpretq_u64_f64( + static_cast(b)))))); +} + +KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION +simd> sqrt( + simd> const& a) { + return simd>( + vsqrtq_f64(static_cast(a))); +} + +KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION +simd> fma( + simd> const& a, + simd> const& b, + simd> const& c) { + return simd>( + vfmaq_f64(static_cast(c), static_cast(b), + static_cast(a))); +} + +KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION +simd> max( + simd> const& a, + simd> const& b) { + return simd>( + vmaxq_f64(static_cast(a), static_cast(b))); +} + +KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION +simd> min( + simd> const& a, + simd> const& b) { + return simd>( + vminq_f64(static_cast(a), static_cast(b))); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + simd> + condition(simd_mask> const& a, + simd> const& b, + simd> const& c) { + return simd>( + vbslq_f64(static_cast(a), static_cast(b), + 
static_cast(c))); +} + +template <> +class simd> { + int32x2_t m_value; + + public: + using value_type = std::int32_t; + using abi_type = simd_abi::neon_fixed_size<2>; + using mask_type = simd_mask; + class reference { + int32x2_t& m_value; + int m_lane; + + public: + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION reference(int32x2_t& value_arg, + int lane_arg) + : m_value(value_arg), m_lane(lane_arg) {} + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION reference + operator=(std::int32_t value) const { + switch (m_lane) { + case 0: m_value = vset_lane_s32(value, m_value, 0); break; + case 1: m_value = vset_lane_s32(value, m_value, 1); break; + } + return *this; + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION operator std::int32_t() const { + switch (m_lane) { + case 0: return vget_lane_s32(m_value, 0); + case 1: return vget_lane_s32(m_value, 1); + } + return 0; + } + }; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd() = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(simd const&) = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(simd&&) = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd& operator=(simd const&) = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd& operator=(simd&&) = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION static constexpr std::size_t size() { + return 2; + } + template , + bool> = false> + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(U&& value) + : m_value(vmov_n_s32(value_type(value))) {} + template >, + bool> = false> + KOKKOS_FORCEINLINE_FUNCTION simd(G&& gen) { + m_value = vset_lane_s32(gen(std::integral_constant()), + m_value, 0); + m_value = vset_lane_s32(gen(std::integral_constant()), + m_value, 1); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit simd( + int32x2_t const& value_in) + : m_value(value_in) {} + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION explicit simd( + simd const& other); + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION reference operator[](std::size_t i) { + return reference(m_value, int(i)); + } + 
KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION value_type + operator[](std::size_t i) const { + return reference(const_cast(this)->m_value, int(i)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + element_aligned_tag) { + m_value = vld1_s32(ptr); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to( + value_type* ptr, element_aligned_tag) const { + vst1_s32(ptr, m_value); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator int32x2_t() + const { + return m_value; + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type + operator==(simd const& other) const { + return mask_type(vceq_s32(m_value, other.m_value)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type + operator>(simd const& other) const { + return mask_type(vcgt_s32(m_value, other.m_value)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type + operator<(simd const& other) const { + return mask_type(vclt_s32(m_value, other.m_value)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type + operator<=(simd const& other) const { + return mask_type(vcle_s32(m_value, other.m_value)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type + operator>=(simd const& other) const { + return mask_type(vcge_s32(m_value, other.m_value)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type + operator!=(simd const& other) const { + return !((*this) == other); + } +}; + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + simd> + operator-(simd> const& a) { + return simd>( + vneg_s32(static_cast(a))); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + simd> + operator-(simd> const& lhs, + simd> const& rhs) { + return simd>( + vsub_s32(static_cast(lhs), static_cast(rhs))); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + simd> + operator+(simd> const& lhs, + simd> const& rhs) { + return simd>( + vadd_s32(static_cast(lhs), static_cast(rhs))); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + simd> + condition(simd_mask> const& a, + simd> 
const& b, + simd> const& c) { + return simd>( + vbsl_s32(static_cast(a), static_cast(b), + static_cast(c))); +} + +template <> +class simd> { + int64x2_t m_value; + + public: + using value_type = std::int64_t; + using abi_type = simd_abi::neon_fixed_size<2>; + using mask_type = simd_mask; + class reference { + int64x2_t& m_value; + int m_lane; + + public: + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION reference(int64x2_t& value_arg, + int lane_arg) + : m_value(value_arg), m_lane(lane_arg) {} + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION reference + operator=(std::int64_t value) const { + switch (m_lane) { + case 0: m_value = vsetq_lane_s64(value, m_value, 0); break; + case 1: m_value = vsetq_lane_s64(value, m_value, 1); break; + } + return *this; + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION operator std::int64_t() const { + switch (m_lane) { + case 0: return vgetq_lane_s64(m_value, 0); + case 1: return vgetq_lane_s64(m_value, 1); + } + return 0; + } + }; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd() = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(simd const&) = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(simd&&) = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd& operator=(simd const&) = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd& operator=(simd&&) = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION static constexpr std::size_t size() { + return 2; + } + template , + bool> = false> + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(U&& value) + : m_value(vmovq_n_s64(value_type(value))) {} + template >, + bool> = false> + KOKKOS_FORCEINLINE_FUNCTION simd(G&& gen) { + m_value = vsetq_lane_s64(gen(std::integral_constant()), + m_value, 0); + m_value = vsetq_lane_s64(gen(std::integral_constant()), + m_value, 1); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit simd( + int64x2_t const& value_in) + : m_value(value_in) {} + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION explicit simd( + simd const&); + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION reference 
operator[](std::size_t i) { + return reference(m_value, int(i)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION value_type + operator[](std::size_t i) const { + return reference(const_cast(this)->m_value, int(i)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + element_aligned_tag) { + m_value = vld1q_s64(ptr); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to( + value_type* ptr, element_aligned_tag) const { + vst1q_s64(ptr, m_value); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator int64x2_t() + const { + return m_value; + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type + operator==(simd const& other) const { + return mask_type(vceqq_s64(m_value, other.m_value)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type + operator>(simd const& other) const { + return mask_type(vcgtq_s64(m_value, other.m_value)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type + operator<(simd const& other) const { + return mask_type(vcltq_s64(m_value, other.m_value)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type + operator<=(simd const& other) const { + return mask_type(vcleq_s64(m_value, other.m_value)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type + operator>=(simd const& other) const { + return mask_type(vcgeq_s64(m_value, other.m_value)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type + operator!=(simd const& other) const { + return !((*this) == other); + } +}; + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + simd> + operator-(simd> const& a) { + return simd>( + vnegq_s64(static_cast(a))); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + simd> + operator-(simd> const& lhs, + simd> const& rhs) { + return simd>( + vsubq_s64(static_cast(lhs), static_cast(rhs))); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + simd> + operator+(simd> const& lhs, + simd> const& rhs) { + return simd>( + vaddq_s64(static_cast(lhs), static_cast(rhs))); +} + +[[nodiscard]] 
KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + simd> + condition(simd_mask> const& a, + simd> const& b, + simd> const& c) { + return simd>( + vbslq_s64(static_cast(a), static_cast(b), + static_cast(c))); +} + +template <> +class simd> { + uint64x2_t m_value; + + public: + using value_type = std::uint64_t; + using abi_type = simd_abi::neon_fixed_size<2>; + using mask_type = simd_mask; + class reference { + uint64x2_t& m_value; + int m_lane; + + public: + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION reference(uint64x2_t& value_arg, + int lane_arg) + : m_value(value_arg), m_lane(lane_arg) {} + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION reference + operator=(std::uint64_t value) const { + switch (m_lane) { + case 0: m_value = vsetq_lane_u64(value, m_value, 0); break; + case 1: m_value = vsetq_lane_u64(value, m_value, 1); break; + } + return *this; + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION operator std::uint64_t() const { + switch (m_lane) { + case 0: return vgetq_lane_u64(m_value, 0); + case 1: return vgetq_lane_u64(m_value, 1); + } + return 0; + } + }; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd() = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(simd const&) = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(simd&&) = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd& operator=(simd const&) = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd& operator=(simd&&) = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION static constexpr std::size_t size() { + return 2; + } + template , + bool> = false> + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(U&& value) + : m_value(vmovq_n_u64(value_type(value))) {} + template >, + bool> = false> + KOKKOS_FORCEINLINE_FUNCTION simd(G&& gen) { + m_value = vsetq_lane_u64(gen(std::integral_constant()), + m_value, 0); + m_value = vsetq_lane_u64(gen(std::integral_constant()), + m_value, 1); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit simd( + uint64x2_t const& value_in) + : m_value(value_in) {} + 
KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION explicit simd( + simd const& other) + : m_value( + vreinterpretq_u64_s64(vmovl_s32(static_cast(other)))) {} + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION reference operator[](std::size_t i) { + return reference(m_value, int(i)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION value_type + operator[](std::size_t i) const { + return reference(const_cast(this)->m_value, int(i)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd + operator&(simd const& other) const { + return simd(vandq_u64(m_value, other.m_value)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd + operator|(simd const& other) const { + return simd(vorrq_u64(m_value, other.m_value)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator uint64x2_t() + const { + return m_value; + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd + operator<<(unsigned int rhs) const { + return simd(vshlq_u64(m_value, vmovq_n_s64(std::int64_t(rhs)))); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd + operator>>(unsigned int rhs) const { + return simd(vshlq_u64(m_value, vmovq_n_s64(-std::int64_t(rhs)))); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type + operator==(simd const& other) const { + return mask_type(vceqq_u64(m_value, other.m_value)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type + operator!=(simd const& other) const { + return !((*this) == other); + } +}; + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + simd> + operator-(simd> const& lhs, + simd> const& rhs) { + return simd>( + vsubq_u64(static_cast(lhs), static_cast(rhs))); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + simd> + operator+(simd> const& lhs, + simd> const& rhs) { + return simd>( + vaddq_u64(static_cast(lhs), static_cast(rhs))); +} + +KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION +simd>::simd( + simd> const& other) + : m_value( + vmovn_s64(vreinterpretq_s64_u64(static_cast(other)))) {} + +KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION +simd>::simd( + simd> const& other) + : 
m_value(vreinterpretq_s64_u64(static_cast(other))) {} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + simd> + condition(simd_mask> const& a, + simd> const& b, + simd> const& c) { + return simd>( + vbslq_u64(static_cast(a), static_cast(b), + static_cast(c))); +} + +template <> +class const_where_expression>, + simd>> { + public: + using abi_type = simd_abi::neon_fixed_size<2>; + using value_type = simd; + using mask_type = simd_mask; + + protected: + value_type& m_value; + mask_type const& m_mask; + + public: + const_where_expression(mask_type const& mask_arg, value_type const& value_arg) + : m_value(const_cast(value_arg)), m_mask(mask_arg) {} + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr mask_type const& + mask() const { + return m_mask; + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr value_type const& + value() const { + return m_value; + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_to(double* mem, element_aligned_tag) const { + if (m_mask[0]) mem[0] = m_value[0]; + if (m_mask[1]) mem[1] = m_value[1]; + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void scatter_to( + double* mem, + simd> const& index) const { + if (m_mask[0]) mem[index[0]] = m_value[0]; + if (m_mask[1]) mem[index[1]] = m_value[1]; + } +}; + +template <> +class where_expression>, + simd>> + : public const_where_expression< + simd_mask>, + simd>> { + public: + where_expression( + simd_mask> const& mask_arg, + simd>& value_arg) + : const_where_expression(mask_arg, value_arg) {} + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_from(double const* mem, element_aligned_tag) { + if (m_mask[0]) m_value[0] = mem[0]; + if (m_mask[1]) m_value[1] = mem[1]; + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void gather_from( + double const* mem, + simd> const& index) { + if (m_mask[0]) m_value[0] = mem[index[0]]; + if (m_mask[1]) m_value[1] = mem[index[1]]; + } + template >>, + bool> = false> + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void operator=(U&& x) { + 
auto const x_as_value_type = + static_cast>>( + std::forward(x)); + m_value = static_cast>>( + vbslq_f64(static_cast(m_mask), + static_cast(x_as_value_type), + static_cast(m_value))); + } +}; + +template <> +class const_where_expression< + simd_mask>, + simd>> { + public: + using abi_type = simd_abi::neon_fixed_size<2>; + using value_type = simd; + using mask_type = simd_mask; + + protected: + value_type& m_value; + mask_type const& m_mask; + + public: + const_where_expression(mask_type const& mask_arg, value_type const& value_arg) + : m_value(const_cast(value_arg)), m_mask(mask_arg) {} + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr mask_type const& + mask() const { + return m_mask; + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr value_type const& + value() const { + return m_value; + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_to(std::int32_t* mem, element_aligned_tag) const { + if (m_mask[0]) mem[0] = m_value[0]; + if (m_mask[1]) mem[1] = m_value[1]; + } +}; + +template <> +class where_expression>, + simd>> + : public const_where_expression< + simd_mask>, + simd>> { + public: + where_expression( + simd_mask> const& mask_arg, + simd>& value_arg) + : const_where_expression(mask_arg, value_arg) {} + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_from(std::int32_t const* mem, element_aligned_tag) { + if (m_mask[0]) m_value[0] = mem[0]; + if (m_mask[1]) m_value[1] = mem[1]; + } +}; + +} // namespace Experimental +} // namespace Kokkos + +#endif diff --git a/simd/unit_tests/TestSIMD.cpp b/simd/unit_tests/TestSIMD.cpp index ad6ce9bac3..7a4ecf19ed 100644 --- a/simd/unit_tests/TestSIMD.cpp +++ b/simd/unit_tests/TestSIMD.cpp @@ -170,7 +170,7 @@ void host_check_binary_op_one_loader(BinaryOp binary_op, std::size_t n, simd_type expected_result; for (std::size_t lane = 0; lane < nlanes; ++lane) { expected_result[lane] = - binary_op.on_host(first_arg[lane], second_arg[lane]); + binary_op.on_host(T(first_arg[lane]), 
T(second_arg[lane])); } simd_type const computed_result = binary_op.on_host(first_arg, second_arg); host_check_equality(expected_result, computed_result, nlanes); @@ -298,6 +298,61 @@ inline void host_check_mask_ops() { EXPECT_FALSE(all_of(mask_type(false))); } +template +inline void host_check_conversions() { + { + auto a = Kokkos::Experimental::simd(1); + auto b = Kokkos::Experimental::simd(a); + EXPECT_TRUE(all_of(b == decltype(b)(1))); + } + { + auto a = Kokkos::Experimental::simd(1); + auto b = Kokkos::Experimental::simd(a); + EXPECT_TRUE(all_of(b == decltype(b)(1))); + } + { + auto a = Kokkos::Experimental::simd(1); + auto b = Kokkos::Experimental::simd(a); + EXPECT_TRUE(all_of(b == decltype(b)(1))); + } + { + auto a = Kokkos::Experimental::simd_mask(true); + auto b = Kokkos::Experimental::simd_mask(a); + EXPECT_TRUE(b == decltype(b)(true)); + } + { + auto a = Kokkos::Experimental::simd_mask(true); + auto b = Kokkos::Experimental::simd_mask(a); + EXPECT_TRUE(b == decltype(b)(true)); + } + { + auto a = Kokkos::Experimental::simd_mask(true); + auto b = Kokkos::Experimental::simd_mask(a); + EXPECT_TRUE(b == decltype(b)(true)); + } + { + auto a = Kokkos::Experimental::simd_mask(true); + auto b = Kokkos::Experimental::simd_mask(a); + EXPECT_TRUE(b == decltype(b)(true)); + } +} + +template +inline void host_check_shifts() { + auto a = Kokkos::Experimental::simd(8); + auto b = a >> 1; + EXPECT_TRUE(all_of(b == decltype(b)(4))); +} + +template +inline void host_check_condition() { + auto a = Kokkos::Experimental::condition( + Kokkos::Experimental::simd(1) > 0, + Kokkos::Experimental::simd(16), + Kokkos::Experimental::simd(20)); + EXPECT_TRUE(all_of(a == decltype(a)(16))); +} + template KOKKOS_INLINE_FUNCTION void device_check_math_ops() { std::size_t constexpr n = 11; @@ -321,16 +376,80 @@ KOKKOS_INLINE_FUNCTION void device_check_mask_ops() { checker.truth(!all_of(mask_type(false))); } +template +KOKKOS_INLINE_FUNCTION void device_check_conversions() { + 
kokkos_checker checker; + { + auto a = Kokkos::Experimental::simd(1); + auto b = Kokkos::Experimental::simd(a); + checker.truth(all_of(b == decltype(b)(1))); + } + { + auto a = Kokkos::Experimental::simd(1); + auto b = Kokkos::Experimental::simd(a); + checker.truth(all_of(b == decltype(b)(1))); + } + { + auto a = Kokkos::Experimental::simd(1); + auto b = Kokkos::Experimental::simd(a); + checker.truth(all_of(b == decltype(b)(1))); + } + { + auto a = Kokkos::Experimental::simd_mask(true); + auto b = Kokkos::Experimental::simd_mask(a); + checker.truth(b == decltype(b)(true)); + } + { + auto a = Kokkos::Experimental::simd_mask(true); + auto b = Kokkos::Experimental::simd_mask(a); + checker.truth(b == decltype(b)(true)); + } + { + auto a = Kokkos::Experimental::simd_mask(true); + auto b = Kokkos::Experimental::simd_mask(a); + checker.truth(b == decltype(b)(true)); + } + { + auto a = Kokkos::Experimental::simd_mask(true); + auto b = Kokkos::Experimental::simd_mask(a); + checker.truth(b == decltype(b)(true)); + } +} + +template +KOKKOS_INLINE_FUNCTION void device_check_shifts() { + kokkos_checker checker; + auto a = Kokkos::Experimental::simd(8); + auto b = a >> 1; + checker.truth(all_of(b == decltype(b)(4))); +} + +template +KOKKOS_INLINE_FUNCTION void device_check_condition() { + kokkos_checker checker; + auto a = Kokkos::Experimental::condition( + Kokkos::Experimental::simd(1) > 0, + Kokkos::Experimental::simd(16), + Kokkos::Experimental::simd(20)); + checker.truth(all_of(a == decltype(a)(16))); +} + template inline void host_check_abi() { host_check_math_ops(); host_check_mask_ops(); + host_check_conversions(); + host_check_shifts(); + host_check_condition(); } template KOKKOS_INLINE_FUNCTION void device_check_abi() { device_check_math_ops(); device_check_mask_ops(); + device_check_conversions(); + device_check_shifts(); + device_check_condition(); } inline void host_check_abis(Kokkos::Experimental::Impl::abi_set<>) {} From 59067d41e5a6265fef4547e0045edf8d276c0f38 Mon 
Sep 17 00:00:00 2001 From: Damien L-G Date: Tue, 31 Jan 2023 07:57:06 -0500 Subject: [PATCH 130/496] Use raw literal string to avoid having to escape characters in git commit message (#5823) * Use raw literal string to avoid having to escape characters (such as ") in git commit message * Fixup cxx standard for library with the git info * Use raw literal string got git branch as well Co-Authored-By: Phil Miller * Prefer cxx_raw_string_literals compile feature to setting CMAKE_CXX_STANDARD --------- Co-authored-by: Phil Miller --- cmake/Kokkos_Version_Info.cpp.in | 11 ++++++----- cmake/build_env_info.cmake | 1 + 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/cmake/Kokkos_Version_Info.cpp.in b/cmake/Kokkos_Version_Info.cpp.in index e9fabe8177..3665282e7b 100644 --- a/cmake/Kokkos_Version_Info.cpp.in +++ b/cmake/Kokkos_Version_Info.cpp.in @@ -19,11 +19,12 @@ namespace Kokkos { namespace Impl { -std::string GIT_BRANCH = "@GIT_BRANCH@"; -std::string GIT_COMMIT_HASH = "@GIT_COMMIT_HASH@"; -std::string GIT_CLEAN_STATUS = "@GIT_CLEAN_STATUS@"; -std::string GIT_COMMIT_DESCRIPTION = "@GIT_COMMIT_DESCRIPTION@"; -std::string GIT_COMMIT_DATE = "@GIT_COMMIT_DATE@"; +std::string GIT_BRANCH = R"branch(@GIT_BRANCH@)branch"; +std::string GIT_COMMIT_HASH = "@GIT_COMMIT_HASH@"; +std::string GIT_CLEAN_STATUS = "@GIT_CLEAN_STATUS@"; +std::string GIT_COMMIT_DESCRIPTION = + R"message(@GIT_COMMIT_DESCRIPTION@)message"; +std::string GIT_COMMIT_DATE = "@GIT_COMMIT_DATE@"; } // namespace Impl diff --git a/cmake/build_env_info.cmake b/cmake/build_env_info.cmake index 2cd169cba4..0eeb637245 100644 --- a/cmake/build_env_info.cmake +++ b/cmake/build_env_info.cmake @@ -110,6 +110,7 @@ FUNCTION(check_git_setup) add_library(impl_git_version ${CMAKE_BINARY_DIR}/generated/Kokkos_Version_Info.cpp) target_include_directories(impl_git_version PUBLIC ${CMAKE_BINARY_DIR}/generated) + target_compile_features(impl_git_version PRIVATE cxx_raw_string_literals) add_dependencies(impl_git_version 
AlwaysCheckGit) check_git_version() From 0130a3f2e240833c7ce9f36da84980f05be613a4 Mon Sep 17 00:00:00 2001 From: Seyong Lee Date: Tue, 31 Jan 2023 07:59:55 -0500 Subject: [PATCH 131/496] Initial OpenACC parallel_reduce implementation for Team policy (#5610) * Initial OpenACC parallel_reduce implementation for Team policy * Clang-format * Apply suggestions from code review Apply suggestions from code review (@masterleinad and @dalg24) Co-authored-by: Daniel Arndt Co-authored-by: Damien L-G * Minor modification as suggested by code review. * Change `always_false` to `always_true`. * Minor fix on parallel_reduce() implementation. * Apply suggestions from code review Co-authored-by: Damien L-G * Revert `std::enable_if_t::value>* = nullptr)` back to `std::enable_if_t::value, void*> = nullptr)` (No definition of is_view_v<> found). * Change `const FunctorType a_functor(m_functor);` to `auto const a_functor = m_functor;` * Rebase this branch and change is_view<> to is_view_v<> * Updated the copyright and rebased this branch. * Add a comment to the parallel-reduce() implementation * Comment out the hierarchical reduction implementations not used for now. * Re-factored the parallel_reduce construct with Team policy to support different reduction types. * Add a missing acc routine directive. * Re-factor parallel_reduce(team policy) constructs. * Set KOKKOS_OPENACC_FEATURE_LEVEL to 14 in core/unit_test/CMakeLists.txt. * Fix minor bugs in the OpenACC parallel_reduce(Team Policy) Versions with KOKKOS_ENABLE_OPENACC_COLLAPSE_HIERARCHICAL_CONSTRUCTS macro enabled and disabled should have different league loop implementations. 
* Change "auto const a_functor = m_functor;" back to "auto const a_functor(m_functor);" --------- Co-authored-by: Daniel Arndt Co-authored-by: Damien L-G --- .../Kokkos_OpenACC_ParallelFor_Team.hpp | 1 - .../Kokkos_OpenACC_ParallelReduce_Team.hpp | 428 ++++++++++++++++++ core/src/OpenACC/Kokkos_OpenACC_Team.hpp | 28 +- core/src/decl/Kokkos_Declare_OPENACC.hpp | 1 + core/unit_test/CMakeLists.txt | 2 +- .../incremental/Test12a_ThreadScratch.hpp | 4 + .../incremental/Test12b_TeamScratch.hpp | 4 + 7 files changed, 453 insertions(+), 15 deletions(-) create mode 100644 core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_Team.hpp diff --git a/core/src/OpenACC/Kokkos_OpenACC_ParallelFor_Team.hpp b/core/src/OpenACC/Kokkos_OpenACC_ParallelFor_Team.hpp index c08c15879c..1dc7b28912 100644 --- a/core/src/OpenACC/Kokkos_OpenACC_ParallelFor_Team.hpp +++ b/core/src/OpenACC/Kokkos_OpenACC_ParallelFor_Team.hpp @@ -17,7 +17,6 @@ #ifndef KOKKOS_OPENACC_PARALLEL_FOR_TEAM_HPP #define KOKKOS_OPENACC_PARALLEL_FOR_TEAM_HPP -#include #include #include diff --git a/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_Team.hpp b/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_Team.hpp new file mode 100644 index 0000000000..199a2786ee --- /dev/null +++ b/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_Team.hpp @@ -0,0 +1,428 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_OPENACC_PARALLEL_REDUCE_TEAM_HPP +#define KOKKOS_OPENACC_PARALLEL_REDUCE_TEAM_HPP + +#include +#include +#include + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- +// Hierarchical Parallelism -> Team level implementation +namespace Kokkos::Experimental::Impl { + +// primary template: catch-all non-implemented custom reducers +template > +struct OpenACCParallelReduceTeamHelper { + OpenACCParallelReduceTeamHelper(Functor const&, Reducer const&, + Policy const&) { + static_assert(!Kokkos::Impl::always_true::value, + "not implemented"); + } +}; + +} // namespace Kokkos::Experimental::Impl + +template +class Kokkos::Impl::ParallelReduce, + ReducerType, Kokkos::Experimental::OpenACC> { + private: + using Policy = + TeamPolicyInternal; + + using ReducerConditional = + Kokkos::Impl::if_c::value, + FunctorType, ReducerType>; + using ReducerTypeFwd = typename ReducerConditional::type; + using Analysis = + FunctorAnalysis; + using value_type = typename Analysis::value_type; + using pointer_type = typename Analysis::pointer_type; + + FunctorType m_functor; + Policy m_policy; + ReducerType m_reducer; + pointer_type m_result_ptr; + + public: + void execute() const { + auto league_size = m_policy.league_size(); + auto team_size = m_policy.team_size(); + auto vector_length = m_policy.impl_vector_length(); + + value_type tmp; + typename Analysis::Reducer final_reducer( + &ReducerConditional::select(m_functor, m_reducer)); + final_reducer.init(&tmp); + + Kokkos::Experimental::Impl::OpenACCParallelReduceTeamHelper( + Kokkos::Experimental::Impl::FunctorAdapter( + m_functor), + std::conditional_t, ReducerType, + Sum>(tmp), + m_policy); + + m_result_ptr[0] = tmp; + } + + template + ParallelReduce(const FunctorType& arg_functor, const Policy& arg_policy, + const ViewType& 
arg_result_view, + std::enable_if_t>* = nullptr) + : m_functor(arg_functor), + m_policy(arg_policy), + m_reducer(InvalidType()), + m_result_ptr(arg_result_view.data()) {} + + ParallelReduce(const FunctorType& arg_functor, const Policy& arg_policy, + const ReducerType& reducer) + : m_functor(arg_functor), + m_policy(arg_policy), + m_reducer(reducer), + m_result_ptr(reducer.view().data()) {} +}; + +namespace Kokkos { + +// Hierarchical Parallelism -> Team thread level implementation +// FIXME_OPENACC: custom reduction is not implemented. +template +KOKKOS_INLINE_FUNCTION void parallel_reduce( + const Impl::TeamThreadRangeBoundariesStruct& + loop_boundaries, + const Lambda& lambda, const JoinType& join, ValueType& init_result) { + static_assert(!Kokkos::Impl::always_true::value, + "custom reduction is not implemented"); +} + +// Hierarchical Parallelism -> Thread vector level implementation +// FIXME_OPENACC: custom reduction is not implemented. +template +KOKKOS_INLINE_FUNCTION void parallel_reduce( + const Impl::ThreadVectorRangeBoundariesStruct< + iType, Impl::OpenACCTeamMember>& loop_boundaries, + const Lambda& lambda, const JoinType& join, ValueType& init_result) { + static_assert(!Kokkos::Impl::always_true::value, + "custom reduction is not implemented"); +} + +} // namespace Kokkos + +#ifdef KOKKOS_ENABLE_OPENACC_COLLAPSE_HIERARCHICAL_CONSTRUCTS + +#define KOKKOS_IMPL_ACC_REDUCE_TEAM_PRAGMA \ + vector vector_length(team_size* vector_length) +#define KOKKOS_IMPL_ACC_REDUCE_TEAM_ITRS league_size* team_size* vector_length +#define KOKKOS_IMPL_ACC_REDUCE_TEAM_LEAGUE_ID_INIT \ + i / (team_size * vector_length) + +namespace Kokkos { + +// Hierarchical Parallelism -> Team thread level implementation +#pragma acc routine seq +template +KOKKOS_INLINE_FUNCTION std::enable_if_t> +parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct< + iType, Impl::OpenACCTeamMember>& loop_boundaries, + const Lambda& lambda, ValueType& result) { + ValueType tmp = ValueType(); + 
iType j_start = + loop_boundaries.team.team_rank() / loop_boundaries.team.vector_length(); + if (j_start == 0) { +#pragma acc loop seq + for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) + lambda(i, tmp); + result = tmp; + } +} + +#pragma acc routine seq +template +KOKKOS_INLINE_FUNCTION std::enable_if_t> +parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct< + iType, Impl::OpenACCTeamMember>& loop_boundaries, + const Lambda& lambda, const ReducerType& reducer) { + using ValueType = typename ReducerType::value_type; + ValueType tmp; + reducer.init(tmp); + iType j_start = + loop_boundaries.team.team_rank() / loop_boundaries.team.vector_length(); + if (j_start == 0) { +#pragma acc loop seq + for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) + lambda(i, tmp); + reducer.reference() = tmp; + } +} + +// Hierarchical Parallelism -> Thread vector level implementation +#pragma acc routine seq +template +KOKKOS_INLINE_FUNCTION std::enable_if_t> +parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct< + iType, Impl::OpenACCTeamMember>& loop_boundaries, + const Lambda& lambda, ValueType& result) { + ValueType tmp = ValueType(); + iType j_start = + loop_boundaries.team.team_rank() % loop_boundaries.team.vector_length(); + if (j_start == 0) { +#pragma acc loop seq + for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) { + lambda(i, tmp); + } + result = tmp; + } +} + +#pragma acc routine seq +template +KOKKOS_INLINE_FUNCTION std::enable_if_t> +parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct< + iType, Impl::OpenACCTeamMember>& loop_boundaries, + const Lambda& lambda, const ReducerType& reducer) { + using ValueType = typename ReducerType::value_type; + ValueType tmp; + reducer.init(tmp); + iType j_start = + loop_boundaries.team.team_rank() % loop_boundaries.team.vector_length(); + if (j_start == 0) { +#pragma acc loop seq + for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) { + lambda(i, 
tmp); + } + reducer.reference() = tmp; + } +} + +// Hierarchical Parallelism -> Team vector level implementation +#pragma acc routine seq +template +KOKKOS_INLINE_FUNCTION void parallel_reduce( + const Impl::TeamVectorRangeBoundariesStruct& + loop_boundaries, + const Lambda& lambda, ValueType& result) { + ValueType tmp = ValueType(); + iType j_start = + loop_boundaries.team.team_rank() % loop_boundaries.team.vector_length(); + if (j_start == 0) { +#pragma acc loop seq + for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) { + lambda(i, tmp); + } + result = tmp; + } +} + +} // namespace Kokkos + +#else /* #ifdef KOKKOS_ENABLE_OPENACC_COLLAPSE_HIERARCHICAL_CONSTRUCTS */ + +#define KOKKOS_IMPL_ACC_REDUCE_TEAM_PRAGMA \ + num_workers(team_size) vector_length(vector_length) +#define KOKKOS_IMPL_ACC_REDUCE_TEAM_ITRS league_size +#define KOKKOS_IMPL_ACC_REDUCE_TEAM_LEAGUE_ID_INIT i + +// FIXME_OPENACC: below implementation conforms to the OpenACC standard, but +// the NVHPC compiler (V22.11) fails due to the lack of support for lambda +// expressions containing parallel loops. 
+ +namespace Kokkos { + +// Hierarchical Parallelism -> Team thread level implementation +#pragma acc routine worker +template +KOKKOS_INLINE_FUNCTION std::enable_if_t> +parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct< + iType, Impl::OpenACCTeamMember>& loop_boundaries, + const Lambda& lambda, ValueType& result) { + ValueType tmp = ValueType(); +#pragma acc loop worker reduction(+ : tmp) + for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) + lambda(i, tmp); + result = tmp; +} + +#define KOKKOS_IMPL_OPENACC_PARALLEL_REDUCE_TEAM_THREAD(REDUCER, OPERATOR) \ + KOKKOS_IMPL_ACC_PRAGMA(routine worker) \ + template \ + KOKKOS_INLINE_FUNCTION \ + std::enable_if_t>> \ + parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct< \ + iType, Impl::OpenACCTeamMember>& loop_boundaries, \ + const Lambda& lambda, \ + const Kokkos::REDUCER& reducer) { \ + using ValueType = typename Kokkos::REDUCER::value_type; \ + ValueType tmp = ValueType(); \ + reducer.init(tmp); \ + KOKKOS_IMPL_ACC_PRAGMA(loop worker reduction(OPERATOR : tmp)) \ + for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) \ + lambda(i, tmp); \ + reducer.reference() = tmp; \ + } + +KOKKOS_IMPL_OPENACC_PARALLEL_REDUCE_TEAM_THREAD(Sum, +); +KOKKOS_IMPL_OPENACC_PARALLEL_REDUCE_TEAM_THREAD(Prod, *); +KOKKOS_IMPL_OPENACC_PARALLEL_REDUCE_TEAM_THREAD(Min, min); +KOKKOS_IMPL_OPENACC_PARALLEL_REDUCE_TEAM_THREAD(Max, max); +KOKKOS_IMPL_OPENACC_PARALLEL_REDUCE_TEAM_THREAD(LAnd, &&); +KOKKOS_IMPL_OPENACC_PARALLEL_REDUCE_TEAM_THREAD(LOr, ||); +KOKKOS_IMPL_OPENACC_PARALLEL_REDUCE_TEAM_THREAD(BAnd, &); +KOKKOS_IMPL_OPENACC_PARALLEL_REDUCE_TEAM_THREAD(BOr, |); + +// Hierarchical Parallelism -> Thread vector level implementation +#pragma acc routine vector +template +KOKKOS_INLINE_FUNCTION std::enable_if_t> +parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct< + iType, Impl::OpenACCTeamMember>& loop_boundaries, + const Lambda& lambda, ValueType& result) { + ValueType tmp = 
ValueType(); +#pragma acc loop vector reduction(+ : tmp) + for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) { + lambda(i, tmp); + } + result = tmp; +} + +#define KOKKOS_IMPL_OPENACC_PARALLEL_REDUCE_THREAD_VECTOR(REDUCER, OPERATOR) \ + KOKKOS_IMPL_ACC_PRAGMA(routine vector) \ + template \ + KOKKOS_INLINE_FUNCTION \ + std::enable_if_t>> \ + parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct< \ + iType, Impl::OpenACCTeamMember>& loop_boundaries, \ + const Lambda& lambda, \ + const Kokkos::REDUCER& reducer) { \ + using ValueType = typename Kokkos::REDUCER::value_type; \ + ValueType tmp; \ + reducer.init(tmp); \ + KOKKOS_IMPL_ACC_PRAGMA(loop vector reduction(OPERATOR : tmp)) \ + for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) { \ + lambda(i, tmp); \ + } \ + reducer.reference() = tmp; \ + } + +KOKKOS_IMPL_OPENACC_PARALLEL_REDUCE_THREAD_VECTOR(Sum, +); +KOKKOS_IMPL_OPENACC_PARALLEL_REDUCE_THREAD_VECTOR(Prod, *); +KOKKOS_IMPL_OPENACC_PARALLEL_REDUCE_THREAD_VECTOR(Min, min); +KOKKOS_IMPL_OPENACC_PARALLEL_REDUCE_THREAD_VECTOR(Max, max); +KOKKOS_IMPL_OPENACC_PARALLEL_REDUCE_THREAD_VECTOR(LAnd, &&); +KOKKOS_IMPL_OPENACC_PARALLEL_REDUCE_THREAD_VECTOR(LOr, ||); +KOKKOS_IMPL_OPENACC_PARALLEL_REDUCE_THREAD_VECTOR(BAnd, &); +KOKKOS_IMPL_OPENACC_PARALLEL_REDUCE_THREAD_VECTOR(BOr, |); + +// Hierarchical Parallelism -> Team vector level implementation +#pragma acc routine vector +template +KOKKOS_INLINE_FUNCTION void parallel_reduce( + const Impl::TeamVectorRangeBoundariesStruct& + loop_boundaries, + const Lambda& lambda, ValueType& result) { + ValueType tmp = ValueType(); +#pragma acc loop vector reduction(+ : tmp) + for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) { + lambda(i, tmp); + } + result = tmp; +} + +} // namespace Kokkos + +#undef KOKKOS_IMPL_OPENACC_PARALLEL_REDUCE_TEAM_THREAD +#undef KOKKOS_IMPL_OPENACC_PARALLEL_REDUCE_THREAD_VECTOR + +#endif /* #ifdef 
KOKKOS_ENABLE_OPENACC_COLLAPSE_HIERARCHICAL_CONSTRUCTS */ + +#define KOKKOS_IMPL_OPENACC_PARALLEL_REDUCE_DISPATCH_SCHEDULE(REDUCER, \ + OPERATOR) \ + namespace Kokkos::Experimental::Impl { \ + template \ + void OpenACCParallelReduceTeam##REDUCER(Policy const policy, \ + ValueType& aval, \ + Functor const& afunctor, \ + int async_arg) { \ + auto const functor = afunctor; \ + auto val = aval; \ + auto const league_size = policy.league_size(); \ + auto const team_size = policy.team_size(); \ + auto const vector_length = policy.impl_vector_length(); \ + /* clang-format off */ \ + KOKKOS_IMPL_ACC_PRAGMA(parallel loop gang num_gangs(league_size) KOKKOS_IMPL_ACC_REDUCE_TEAM_PRAGMA reduction(OPERATOR : val) copyin(functor) async(async_arg)) \ + /* clang-format on */ \ + for (int i = 0; i < KOKKOS_IMPL_ACC_REDUCE_TEAM_ITRS; i++) { \ + int league_id = KOKKOS_IMPL_ACC_REDUCE_TEAM_LEAGUE_ID_INIT; \ + typename Policy::member_type team(league_id, league_size, team_size, \ + vector_length); \ + functor(team, val); \ + } \ + aval = val; \ + } \ + } // namespace Kokkos::Experimental::Impl + +#define KOKKOS_IMPL_OPENACC_PARALLEL_REDUCE_TEAM_HELPER(REDUCER, OPERATOR) \ + KOKKOS_IMPL_OPENACC_PARALLEL_REDUCE_DISPATCH_SCHEDULE(REDUCER, OPERATOR) \ + \ + template \ + struct Kokkos::Experimental::Impl::OpenACCParallelReduceTeamHelper< \ + Functor, Kokkos::REDUCER, \ + Kokkos::Impl::TeamPolicyInternal, true> { \ + using Policy = Kokkos::Impl::TeamPolicyInternal; \ + using Reducer = REDUCER; \ + using ValueType = typename Reducer::value_type; \ + \ + OpenACCParallelReduceTeamHelper(Functor const& functor, \ + Reducer const& reducer, \ + Policy const& policy) { \ + auto league_size = policy.league_size(); \ + auto team_size = policy.team_size(); \ + auto vector_length = policy.impl_vector_length(); \ + \ + if (league_size <= 0) { \ + return; \ + } \ + \ + ValueType val; \ + reducer.init(val); \ + \ + int const async_arg = policy.space().acc_async_queue(); \ + \ + 
OpenACCParallelReduceTeam##REDUCER(policy, val, functor, async_arg); \ + \ + reducer.reference() = val; \ + } \ + } + +KOKKOS_IMPL_OPENACC_PARALLEL_REDUCE_TEAM_HELPER(Sum, +); +KOKKOS_IMPL_OPENACC_PARALLEL_REDUCE_TEAM_HELPER(Prod, *); +KOKKOS_IMPL_OPENACC_PARALLEL_REDUCE_TEAM_HELPER(Min, min); +KOKKOS_IMPL_OPENACC_PARALLEL_REDUCE_TEAM_HELPER(Max, max); +KOKKOS_IMPL_OPENACC_PARALLEL_REDUCE_TEAM_HELPER(LAnd, &&); +KOKKOS_IMPL_OPENACC_PARALLEL_REDUCE_TEAM_HELPER(LOr, ||); +KOKKOS_IMPL_OPENACC_PARALLEL_REDUCE_TEAM_HELPER(BAnd, &); +KOKKOS_IMPL_OPENACC_PARALLEL_REDUCE_TEAM_HELPER(BOr, |); + +#undef KOKKOS_IMPL_OPENACC_PARALLEL_REDUCE_TEAM_HELPER +#undef KOKKOS_IMPL_OPENACC_PARALLEL_REDUCE_DISPATCH_SCHEDULE +#undef KOKKOS_IMPL_ACC_REDUCE_TEAM_PRAGMA +#undef KOKKOS_IMPL_ACC_REDUCE_TEAM_ITRS +#undef KOKKOS_IMPL_ACC_REDUCE_TEAM_LEAGUE_ID_INIT + +#endif /* #ifndef KOKKOS_OPENACC_PARALLEL_REDUCE_TEAM_HPP */ diff --git a/core/src/OpenACC/Kokkos_OpenACC_Team.hpp b/core/src/OpenACC/Kokkos_OpenACC_Team.hpp index 3a46f2c483..4ec71f56ef 100644 --- a/core/src/OpenACC/Kokkos_OpenACC_Team.hpp +++ b/core/src/OpenACC/Kokkos_OpenACC_Team.hpp @@ -82,7 +82,7 @@ class OpenACCTeamMember { // FIXME_OPENACC: team_broadcast() is not implemented. 
template KOKKOS_FUNCTION void team_broadcast(ValueType& value, int thread_id) const { - static_assert(Kokkos::Impl::always_false::value, + static_assert(!Kokkos::Impl::always_true::value, "Kokkos Error: team_broadcast() is not implemented for the " "OpenACC backend"); return ValueType(); @@ -99,7 +99,7 @@ class OpenACCTeamMember { template KOKKOS_FUNCTION ValueType team_reduce(const ValueType& value, const JoinOp& op_in) const { - static_assert(Kokkos::Impl::always_false::value, + static_assert(!Kokkos::Impl::always_true::value, "Kokkos Error: team_reduce() is not implemented for the " "OpenACC backend"); return ValueType(); @@ -110,7 +110,7 @@ class OpenACCTeamMember { KOKKOS_FUNCTION ArgType team_scan(const ArgType& /*value*/, ArgType* const /*global_accum*/) const { static_assert( - Kokkos::Impl::always_false::value, + !Kokkos::Impl::always_true::value, "Kokkos Error: team_scan() is not implemented for the OpenACC backend"); return ArgType(); } @@ -163,37 +163,37 @@ class TeamPolicyInternal // implementations. template static int team_size_max(const FunctorType&, const ParallelForTag&) { - return DEFAULT_TEAM_SIZE_MAX; + return default_team_size_max; } template static int team_size_max(const FunctorType&, const ParallelReduceTag&) { - return DEFAULT_TEAM_SIZE_MAX; + return default_team_size_max; } template static int team_size_max(const FunctorType&, const ReducerType&, const ParallelReduceTag&) { - return DEFAULT_TEAM_SIZE_MAX; + return default_team_size_max; } // FIXME_OPENACC: update team_size_recommended() APIs with realistic // implementations. 
template static int team_size_recommended(const FunctorType&, const ParallelForTag&) { - return DEFAULT_TEAM_SIZE_REC; + return default_team_size; } template static int team_size_recommended(const FunctorType&, const ParallelReduceTag&) { - return DEFAULT_TEAM_SIZE_REC; + return default_team_size; } template static int team_size_recommended(const FunctorType&, const ReducerType&, const ParallelReduceTag&) { - return DEFAULT_TEAM_SIZE_REC; + return default_team_size; } //---------------------------------------- @@ -208,7 +208,9 @@ class TeamPolicyInternal std::array m_thread_scratch_size; bool m_tune_team_size; bool m_tune_vector_length; - constexpr static const size_t default_team_size = + constexpr static int default_team_size_max = + OpenACCTeamMember::DEFAULT_TEAM_SIZE_MAX; + constexpr static int default_team_size = OpenACCTeamMember::DEFAULT_TEAM_SIZE_REC; int m_chunk_size; @@ -226,8 +228,8 @@ class TeamPolicyInternal public: bool impl_auto_team_size() const { return m_tune_team_size; } bool impl_auto_vector_length() const { return m_tune_vector_length; } - void impl_set_team_size(const size_t size) { m_team_size = size; } - void impl_set_vector_length(const size_t length) { + void impl_set_team_size(const int size) { m_team_size = size; } + void impl_set_vector_length(const int length) { m_tune_vector_length = length; } int impl_vector_length() const { return m_vector_length; } @@ -348,7 +350,7 @@ class TeamPolicyInternal m_chunk_size(0) { init(league_size_request, team_size_request, 1); } - static size_t vector_length_max() { + static int vector_length_max() { return 32; /* TODO: this is bad. 
Need logic that is compiler and backend aware */ } diff --git a/core/src/decl/Kokkos_Declare_OPENACC.hpp b/core/src/decl/Kokkos_Declare_OPENACC.hpp index 40c29104bf..177af9b23d 100644 --- a/core/src/decl/Kokkos_Declare_OPENACC.hpp +++ b/core/src/decl/Kokkos_Declare_OPENACC.hpp @@ -28,6 +28,7 @@ #include #include #include +#include #endif #endif diff --git a/core/unit_test/CMakeLists.txt b/core/unit_test/CMakeLists.txt index e32bf09fd4..74be49266f 100644 --- a/core/unit_test/CMakeLists.txt +++ b/core/unit_test/CMakeLists.txt @@ -58,7 +58,7 @@ SET(KOKKOS_SYCL_FEATURE_LEVEL 999) SET(KOKKOS_SYCL_NAME Experimental::SYCL) SET(KOKKOS_THREADS_FEATURE_LEVEL 999) SET(KOKKOS_THREADS_NAME Threads) -SET(KOKKOS_OPENACC_FEATURE_LEVEL 11) +SET(KOKKOS_OPENACC_FEATURE_LEVEL 14) SET(KOKKOS_OPENACC_NAME Experimental::OpenACC) diff --git a/core/unit_test/incremental/Test12a_ThreadScratch.hpp b/core/unit_test/incremental/Test12a_ThreadScratch.hpp index a4cd4fc56f..8c97043f30 100644 --- a/core/unit_test/incremental/Test12a_ThreadScratch.hpp +++ b/core/unit_test/incremental/Test12a_ThreadScratch.hpp @@ -98,6 +98,10 @@ struct ThreadScratch { TEST(TEST_CATEGORY, IncrTest_12a_ThreadScratch) { ThreadScratch test; +#ifdef KOKKOS_ENABLE_OPENACC // FIXME_OPENACC + GTEST_SKIP() << "skipping since scratch memory is not yet implemented in the " + "OpenACC backend"; +#endif // FIXME_OPENMPTARGET - team_size has to be a multiple of 32 for the tests to // pass in the Release and RelWithDebInfo builds. Does not need the team_size // to be a multiple of 32 for the Debug builds. 
diff --git a/core/unit_test/incremental/Test12b_TeamScratch.hpp b/core/unit_test/incremental/Test12b_TeamScratch.hpp index 9b27e35dfe..0ebb5c50fb 100644 --- a/core/unit_test/incremental/Test12b_TeamScratch.hpp +++ b/core/unit_test/incremental/Test12b_TeamScratch.hpp @@ -88,6 +88,10 @@ struct TeamScratch { TEST(TEST_CATEGORY, IncrTest_12b_TeamScratch) { TeamScratch test; +#ifdef KOKKOS_ENABLE_OPENACC // FIXME_OPENACC + GTEST_SKIP() << "skipping since scratch memory is not yet implemented in the " + "OpenACC backend"; +#endif // FIXME_OPENMPTARGET - team_size has to be a multiple of 32 for the tests to // pass in the Release and RelWithDebInfo builds. Does not need the team_size // to be a multiple of 32 for the Debug builds. From ba4ebc4032ba8929c4dbcec30c8fe2fc0de7fd58 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Tue, 31 Jan 2023 20:43:35 +0000 Subject: [PATCH 132/496] Restrict KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED feature macro detection to static libraries --- cmake/kokkos_arch.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/kokkos_arch.cmake b/cmake/kokkos_arch.cmake index b60215e60a..7f50b58895 100644 --- a/cmake/kokkos_arch.cmake +++ b/cmake/kokkos_arch.cmake @@ -526,7 +526,7 @@ ENDIF() # Check support for device_global variables # FIXME_SYCL Once the feature test macro SYCL_EXT_ONEAPI_DEVICE_GLOBAL is # available, use that instead. 
-IF(KOKKOS_ENABLE_SYCL) +IF(KOKKOS_ENABLE_SYCL AND NOT BUILD_SHARED_LIBS) INCLUDE(CheckCXXSourceCompiles) STRING(REPLACE ";" " " CMAKE_REQUIRED_FLAGS "${KOKKOS_COMPILE_OPTIONS}") CHECK_CXX_SOURCE_COMPILES(" From 71e0ecaf47b07864689a120fba75835c7196bc8b Mon Sep 17 00:00:00 2001 From: Phil Miller Date: Tue, 31 Jan 2023 12:53:40 -0700 Subject: [PATCH 133/496] CRS: Use Kokkos device function macros rather than duplicating code when compiling for GPU targets --- core/src/Kokkos_Crs.hpp | 40 ++-------------------------------------- 1 file changed, 2 insertions(+), 38 deletions(-) diff --git a/core/src/Kokkos_Crs.hpp b/core/src/Kokkos_Crs.hpp index 1f298a70cb..92931b5849 100644 --- a/core/src/Kokkos_Crs.hpp +++ b/core/src/Kokkos_Crs.hpp @@ -304,11 +304,11 @@ struct CountAndFillBase { Functor m_functor; counts_type m_counts; struct Count {}; - inline void operator()(Count, size_type i) const { + KOKKOS_FUNCTION void operator()(Count, size_type i) const { m_counts(i) = m_functor(i, nullptr); } struct Fill {}; - inline void operator()(Fill, size_type i) const { + KOKKOS_FUNCTION void operator()(Fill, size_type i) const { auto j = m_crs.row_map(i); /* we don't want to access entries(entries.size()), even if its just to get its address and never use it. 
this can happen when row (i) is empty and @@ -323,42 +323,6 @@ struct CountAndFillBase { CountAndFillBase(CrsType& crs, Functor const& f) : m_crs(crs), m_functor(f) {} }; -#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) -#if defined(KOKKOS_ENABLE_CUDA) -#define EXEC_SPACE Kokkos::Cuda -#elif defined(KOKKOS_ENABLE_HIP) -#define EXEC_SPACE Kokkos::HIP -#endif -template -struct CountAndFillBase { - using data_type = typename CrsType::data_type; - using size_type = typename CrsType::size_type; - using row_map_type = typename CrsType::row_map_type; - using counts_type = row_map_type; - CrsType m_crs; - Functor m_functor; - counts_type m_counts; - struct Count {}; - __device__ inline void operator()(Count, size_type i) const { - m_counts(i) = m_functor(i, nullptr); - } - struct Fill {}; - __device__ inline void operator()(Fill, size_type i) const { - auto j = m_crs.row_map(i); - /* we don't want to access entries(entries.size()), even if its just to get - its address and never use it. this can happen when row (i) is empty and - all rows after it are also empty. we could compare to row_map(i + 1), but - that is a read from global memory, whereas dimension_0() should be part - of the View in registers (or constant memory) */ - data_type* fill = (j == static_cast(m_crs.entries.extent(0))) - ? 
nullptr - : (&(m_crs.entries(j))); - m_functor(i, fill); - } - CountAndFillBase(CrsType& crs, Functor const& f) : m_crs(crs), m_functor(f) {} -}; -#endif - template struct CountAndFill : public CountAndFillBase { using base_type = CountAndFillBase; From fbceafdd0842fd020721b82e19b9b4f824fd7ccc Mon Sep 17 00:00:00 2001 From: Phil Miller Date: Tue, 31 Jan 2023 13:18:03 -0700 Subject: [PATCH 134/496] CUDA: Convert simple value macro to constexpr --- core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp b/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp index 572de89061..bee22bd2c7 100644 --- a/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp +++ b/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp @@ -62,31 +62,31 @@ template __device__ inline void cuda_inter_warp_reduction( ValueType& value, const ReducerType& reducer, const int max_active_thread = blockDim.y) { -#define STEP_WIDTH 4 + constexpr int step_width = 4; // Depending on the ValueType _shared__ memory must be aligned up to 8byte // boundaries The reason not to use ValueType directly is that for types with // constructors it could lead to race conditions alignas(alignof(ValueType) > alignof(double) ? alignof(ValueType) : alignof(double)) - __shared__ double sh_result[(sizeof(ValueType) + 7) / 8 * STEP_WIDTH]; + __shared__ double sh_result[(sizeof(ValueType) + 7) / 8 * step_width]; ValueType* result = (ValueType*)&sh_result; const int step = 32 / blockDim.x; - int shift = STEP_WIDTH; + int shift = step_width; const int id = threadIdx.y % step == 0 ? 
threadIdx.y / step : 65000; - if (id < STEP_WIDTH) { + if (id < step_width) { result[id] = value; } __syncthreads(); while (shift <= max_active_thread / step) { - if (shift <= id && shift + STEP_WIDTH > id && threadIdx.x == 0) { - reducer.join(&result[id % STEP_WIDTH], &value); + if (shift <= id && shift + step_width > id && threadIdx.x == 0) { + reducer.join(&result[id % step_width], &value); } __syncthreads(); - shift += STEP_WIDTH; + shift += step_width; } value = result[0]; - for (int i = 1; (i * step < max_active_thread) && i < STEP_WIDTH; i++) + for (int i = 1; (i * step < max_active_thread) && i < step_width; i++) reducer.join(&value, &result[i]); __syncthreads(); } From ae585b7f03d68656fb62b858ef47199eb1786053 Mon Sep 17 00:00:00 2001 From: Phil Miller Date: Tue, 31 Jan 2023 14:03:01 -0700 Subject: [PATCH 135/496] CUDA: Fix up comment --- core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp b/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp index bee22bd2c7..59fdd13513 100644 --- a/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp +++ b/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp @@ -63,8 +63,8 @@ __device__ inline void cuda_inter_warp_reduction( ValueType& value, const ReducerType& reducer, const int max_active_thread = blockDim.y) { constexpr int step_width = 4; - // Depending on the ValueType _shared__ memory must be aligned up to 8byte - // boundaries The reason not to use ValueType directly is that for types with + // Depending on the ValueType, __shared__ memory must be aligned up to 8byte + // boundaries. The reason not to use ValueType directly is that for types with // constructors it could lead to race conditions alignas(alignof(ValueType) > alignof(double) ? 
alignof(ValueType) : alignof(double)) From f4c8f8d289f3b9d09f394bd2408ba83b501c09e4 Mon Sep 17 00:00:00 2001 From: Phil Miller Date: Tue, 31 Jan 2023 13:25:30 -0700 Subject: [PATCH 136/496] OpenMPTarget: Be scrupulous about macro naming and undefining --- core/src/Kokkos_OpenMPTargetSpace.hpp | 36 +++++++++---------- .../Kokkos_OpenMPTarget_Error.hpp | 2 +- .../OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp | 6 ++-- .../Kokkos_OpenMPTarget_Instance.cpp | 6 ++-- .../Kokkos_OpenMPTarget_Parallel.hpp | 22 ++++++------ 5 files changed, 36 insertions(+), 36 deletions(-) diff --git a/core/src/Kokkos_OpenMPTargetSpace.hpp b/core/src/Kokkos_OpenMPTargetSpace.hpp index 97f74eb5ff..ca015da379 100644 --- a/core/src/Kokkos_OpenMPTargetSpace.hpp +++ b/core/src/Kokkos_OpenMPTargetSpace.hpp @@ -236,9 +236,9 @@ struct DeepCopy 0) - OMPT_SAFE_CALL(omp_target_memcpy(dst, const_cast(src), n, 0, 0, - omp_get_default_device(), - omp_get_default_device())); + KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy( + dst, const_cast(src), n, 0, 0, omp_get_default_device(), + omp_get_default_device())); } DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, size_t n) { exec.fence( @@ -246,9 +246,9 @@ struct DeepCopy 0) - OMPT_SAFE_CALL(omp_target_memcpy(dst, const_cast(src), n, 0, 0, - omp_get_default_device(), - omp_get_default_device())); + KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy( + dst, const_cast(src), n, 0, 0, omp_get_default_device(), + omp_get_default_device())); } }; @@ -257,18 +257,18 @@ struct DeepCopy { DeepCopy(void* dst, const void* src, size_t n) { if (n > 0) - OMPT_SAFE_CALL(omp_target_memcpy(dst, const_cast(src), n, 0, 0, - omp_get_default_device(), - omp_get_initial_device())); + KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy( + dst, const_cast(src), n, 0, 0, omp_get_default_device(), + omp_get_initial_device())); } DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, size_t n) { exec.fence( "Kokkos::Impl::DeepCopy: fence before " "copy"); if (n > 0) - 
OMPT_SAFE_CALL(omp_target_memcpy(dst, const_cast(src), n, 0, 0, - omp_get_default_device(), - omp_get_initial_device())); + KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy( + dst, const_cast(src), n, 0, 0, omp_get_default_device(), + omp_get_initial_device())); } }; @@ -277,18 +277,18 @@ struct DeepCopy { DeepCopy(void* dst, const void* src, size_t n) { if (n > 0) - OMPT_SAFE_CALL(omp_target_memcpy(dst, const_cast(src), n, 0, 0, - omp_get_initial_device(), - omp_get_default_device())); + KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy( + dst, const_cast(src), n, 0, 0, omp_get_initial_device(), + omp_get_default_device())); } DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, size_t n) { exec.fence( "Kokkos::Impl::DeepCopy: fence before " "copy"); if (n > 0) - OMPT_SAFE_CALL(omp_target_memcpy(dst, const_cast(src), n, 0, 0, - omp_get_initial_device(), - omp_get_default_device())); + KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy( + dst, const_cast(src), n, 0, 0, omp_get_initial_device(), + omp_get_default_device())); } }; diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Error.hpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Error.hpp index 78c4af7d8b..fd0b47f151 100644 --- a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Error.hpp +++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Error.hpp @@ -36,7 +36,7 @@ inline void ompt_internal_safe_call(int e, const char* name, } } -#define OMPT_SAFE_CALL(call) \ +#define KOKKOS_IMPL_OMPT_SAFE_CALL(call) \ Kokkos::Impl::ompt_internal_safe_call(call, #call, __FILE__, __LINE__) } // namespace Impl diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp index c75f99134e..40da73ebc6 100644 --- a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp +++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp @@ -131,9 +131,9 @@ int* OpenMPTargetExec::get_lock_array(int num_teams) { for (int i = 0; i < lock_array_elem; ++i) h_lock_array[i] = 0; - 
OMPT_SAFE_CALL(omp_target_memcpy(m_lock_array, h_lock_array, m_lock_size, 0, - 0, omp_get_default_device(), - omp_get_initial_device())); + KOKKOS_IMPL_OMPT_SAFE_CALL( + omp_target_memcpy(m_lock_array, h_lock_array, m_lock_size, 0, 0, + omp_get_default_device(), omp_get_initial_device())); omp_target_free(h_lock_array, omp_get_initial_device()); } diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp index b4efd8bddb..4a33961205 100644 --- a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp +++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp @@ -182,9 +182,9 @@ UniqueToken( "Kokkos::OpenMPTarget::m_uniquetoken_ptr", size)); std::vector h_buf(count, 0); - OMPT_SAFE_CALL(omp_target_memcpy(ptr, h_buf.data(), size, 0, 0, - omp_get_default_device(), - omp_get_initial_device())); + KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy(ptr, h_buf.data(), size, 0, 0, + omp_get_default_device(), + omp_get_initial_device())); Kokkos::Impl::OpenMPTargetExec::m_uniquetoken_ptr = ptr; } diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp index 1d01f2995c..71ce4b18f2 100644 --- a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp +++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp @@ -84,9 +84,9 @@ struct ParallelReduceCommon { static void memcpy_result(PointerType dest, PointerType src, size_t size, bool ptr_on_device) { if (ptr_on_device) { - OMPT_SAFE_CALL(omp_target_memcpy(dest, src, size, 0, 0, - omp_get_default_device(), - omp_get_initial_device())); + KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy(dest, src, size, 0, 0, + omp_get_default_device(), + omp_get_initial_device())); } else { *dest = *src; } @@ -291,11 +291,11 @@ struct ParallelReduceSpecialize, // If there is no work to be done, copy back the initialized values and // exit. 
if (!ptr_on_device) - OMPT_SAFE_CALL(omp_target_memcpy( + KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy( ptr, scratch_ptr, value_count * sizeof(ValueType), 0, 0, omp_get_initial_device(), omp_get_default_device())); else - OMPT_SAFE_CALL(omp_target_memcpy( + KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy( ptr, scratch_ptr, value_count * sizeof(ValueType), 0, 0, omp_get_default_device(), omp_get_default_device())); @@ -372,11 +372,11 @@ struct ParallelReduceSpecialize, // If the result view is on the host, copy back the values via memcpy. if (!ptr_on_device) - OMPT_SAFE_CALL(omp_target_memcpy( + KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy( ptr, scratch_ptr, value_count * sizeof(ValueType), 0, 0, omp_get_initial_device(), omp_get_default_device())); else - OMPT_SAFE_CALL(omp_target_memcpy( + KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy( ptr, scratch_ptr, value_count * sizeof(ValueType), 0, 0, omp_get_default_device(), omp_get_default_device())); } @@ -1081,11 +1081,11 @@ struct ParallelReduceSpecialize, // If there is no work to be done, copy back the initialized values and // exit. if (!ptr_on_device) - OMPT_SAFE_CALL(omp_target_memcpy( + KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy( ptr, scratch_ptr, value_count * sizeof(ValueType), 0, 0, omp_get_initial_device(), omp_get_default_device())); else - OMPT_SAFE_CALL(omp_target_memcpy( + KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy( ptr, scratch_ptr, value_count * sizeof(ValueType), 0, 0, omp_get_default_device(), omp_get_default_device())); @@ -1145,11 +1145,11 @@ struct ParallelReduceSpecialize, // If the result view is on the host, copy back the values via memcpy. 
if (!ptr_on_device) - OMPT_SAFE_CALL(omp_target_memcpy( + KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy( ptr, scratch_ptr, value_count * sizeof(ValueType), 0, 0, omp_get_initial_device(), omp_get_default_device())); else - OMPT_SAFE_CALL(omp_target_memcpy( + KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy( ptr, scratch_ptr, value_count * sizeof(ValueType), 0, 0, omp_get_default_device(), omp_get_default_device())); } From 87535d8c70a33204ee9424b89812568c209b2bca Mon Sep 17 00:00:00 2001 From: Phil Miller Date: Tue, 31 Jan 2023 13:39:03 -0700 Subject: [PATCH 137/496] ViewLayoutTiled: Be scrupulous about macro naming and undefining --- core/src/impl/Kokkos_ViewLayoutTiled.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/core/src/impl/Kokkos_ViewLayoutTiled.hpp b/core/src/impl/Kokkos_ViewLayoutTiled.hpp index 40d8b3fde6..957717f973 100644 --- a/core/src/impl/Kokkos_ViewLayoutTiled.hpp +++ b/core/src/impl/Kokkos_ViewLayoutTiled.hpp @@ -156,7 +156,7 @@ struct ViewOffset< //---------------------------------------- -#define DEBUG_OUTPUT_CHECK 0 +#define KOKKOS_IMPL_DEBUG_OUTPUT_CHECK 0 // Rank 2 template @@ -173,7 +173,7 @@ struct ViewOffset< : (((i0 & MASK_0) << SHIFT_1) + (i1 & MASK_1)); // ( tile_dim[1] * li0 + li1 ) -#if DEBUG_OUTPUT_CHECK +#if KOKKOS_IMPL_DEBUG_OUTPUT_CHECK std::cout << "Am I Outer Left? " << (outer_pattern == (Kokkos::Iterate::Left)) << std::endl; std::cout << "Am I Inner Left? " @@ -207,7 +207,7 @@ struct ViewOffset< : (((i0 & MASK_0) << (SHIFT_2 + SHIFT_1)) + ((i1 & MASK_1) << (SHIFT_2)) + (i2 & MASK_2)); -#if DEBUG_OUTPUT_CHECK +#if KOKKOS_IMPL_DEBUG_OUTPUT_CHECK std::cout << "Am I Outer Left? " << (outer_pattern == (Kokkos::Iterate::Left)) << std::endl; std::cout << "Am I Inner Left? 
" From d41a6df179c3db3f0aa7bfe8d36449f267502c9b Mon Sep 17 00:00:00 2001 From: Phil Miller Date: Tue, 31 Jan 2023 13:43:40 -0700 Subject: [PATCH 138/496] HIP: Drop obsolete macro definition --- core/src/setup/Kokkos_Setup_HIP.hpp | 1 - 1 file changed, 1 deletion(-) diff --git a/core/src/setup/Kokkos_Setup_HIP.hpp b/core/src/setup/Kokkos_Setup_HIP.hpp index e4676ff65a..7b01866107 100644 --- a/core/src/setup/Kokkos_Setup_HIP.hpp +++ b/core/src/setup/Kokkos_Setup_HIP.hpp @@ -21,7 +21,6 @@ #define KOKKOS_IMPL_HIP_CLANG_WORKAROUND -#define HIP_ENABLE_PRINTF #include #include From 073ce8b9f4785a305f8fff9d77f1087d585fb0a7 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Wed, 1 Feb 2023 10:51:05 -0500 Subject: [PATCH 139/496] Try using oneAPI 2023.0.0 in SYCL+Cuda CI (#5813) * Try using oneAPI 2023.0.0 in SYCL+Cuda CI * Run apt-get clean also for an earlier step again --- .jenkins | 7 ++-- scripts/docker/Dockerfile.sycl | 66 +++++++--------------------------- 2 files changed, 17 insertions(+), 56 deletions(-) diff --git a/.jenkins b/.jenkins index 20dfae53df..3a1dfaab7d 100644 --- a/.jenkins +++ b/.jenkins @@ -101,12 +101,13 @@ pipeline { } steps { sh 'ccache --zero-stats' - sh '''rm -rf build && mkdir -p build && cd build && \ + sh '''. 
/opt/intel/oneapi/setvars.sh --include-intel-llvm && \ + rm -rf build && mkdir -p build && cd build && \ cmake \ -DCMAKE_BUILD_TYPE=Release \ -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \ - -DCMAKE_CXX_COMPILER=clang++ \ - -DCMAKE_CXX_FLAGS="-fsycl-device-code-split=per_kernel -Werror -Wno-gnu-zero-variadic-macro-arguments -Wno-linker-warnings" \ + -DCMAKE_CXX_COMPILER=/opt/intel/oneapi/compiler/2023.0.0/linux/bin-llvm/clang++ \ + -DCMAKE_CXX_FLAGS="-fsycl-device-code-split=per_kernel -Werror -Wno-gnu-zero-variadic-macro-arguments -Wno-unknown-cuda-version -Wno-sycl-target" \ -DKOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED=0 \ -DKokkos_ARCH_NATIVE=ON \ -DKokkos_ARCH_VOLTA70=ON \ diff --git a/scripts/docker/Dockerfile.sycl b/scripts/docker/Dockerfile.sycl index 1b93199918..bda1197fc6 100644 --- a/scripts/docker/Dockerfile.sycl +++ b/scripts/docker/Dockerfile.sycl @@ -1,4 +1,4 @@ -ARG BASE=nvidia/cuda:10.2-devel +ARG BASE=nvidia/cuda:11.7.0-devel-ubuntu22.04 FROM $BASE RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub @@ -14,15 +14,6 @@ RUN apt-get update && apt-get install -y \ apt-get clean && \ rm -rf /var/lib/apt/lists/* -# unbuntu18.04-based images have libstdc++ that is lacking filesystem support -RUN apt-get update && \ - apt-get install -y software-properties-common && \ - add-apt-repository ppa:ubuntu-toolchain-r/test -y && \ - apt-get update && \ - apt-get install -y g++-9 && \ - apt-get clean && \ - rm -rf /var/lib/apt/lists/* - RUN KEYDUMP_URL=https://cloud.cees.ornl.gov/download && \ KEYDUMP_FILE=keydump && \ wget --quiet ${KEYDUMP_URL}/${KEYDUMP_FILE} && \ @@ -46,46 +37,15 @@ RUN CMAKE_URL=https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSIO rm cmake* ENV PATH=${CMAKE_DIR}/bin:$PATH -ENV SYCL_DIR=/opt/sycl -RUN SYCL_VERSION=20221201 && \ - SYCL_URL=https://github.com/intel/llvm/archive/sycl-nightly && \ - SYCL_ARCHIVE=${SYCL_VERSION}.tar.gz && \ - SCRATCH_DIR=/scratch && mkdir -p 
${SCRATCH_DIR} && cd ${SCRATCH_DIR} && \ - wget --quiet ${SYCL_URL}/${SYCL_ARCHIVE} && \ - mkdir llvm && \ - tar -xf ${SYCL_ARCHIVE} -C llvm --strip-components=1 && \ - cd llvm && \ - mkdir build && \ - cd build && \ - cmake -G Ninja \ - -DCMAKE_BUILD_TYPE=Release \ - -DLLVM_ENABLE_ASSERTIONS=ON \ - -DLLVM_TARGETS_TO_BUILD="X86;NVPTX" \ - -DLLVM_EXTERNAL_PROJECTS="sycl;llvm-spirv;opencl;xpti;xptifw;libdevice" \ - -DLLVM_EXTERNAL_SYCL_SOURCE_DIR=/scratch/llvm/sycl \ - -DLLVM_EXTERNAL_LLVM_SPIRV_SOURCE_DIR=/scratch/llvm/llvm-spirv \ - -DLLVM_EXTERNAL_XPTI_SOURCE_DIR=/scratch/llvm/xpti \ - -DXPTI_SOURCE_DIR=/scratch/llvm/xpti \ - -DLLVM_EXTERNAL_XPTIFW_SOURCE_DIR=/scratch/llvm/xptifw \ - -DLLVM_EXTERNAL_LIBDEVICE_SOURCE_DIR=/scratch/llvm/libdevice \ - -DLLVM_ENABLE_PROJECTS="clang;sycl;llvm-spirv;opencl;xpti;xptifw;libdevice;libclc" \ - -DLIBCLC_TARGETS_TO_BUILD=";nvptx64--;nvptx64--nvidiacl" \ - -DLIBCLC_GENERATE_REMANGLED_VARIANTS=ON \ - -DLLVM_BUILD_TOOLS=OFF \ - -DSYCL_ENABLE_WERROR=OFF \ - -DCMAKE_INSTALL_PREFIX=${SYCL_DIR} \ - -DSYCL_INCLUDE_TESTS=OFF \ - -DLLVM_ENABLE_DOXYGEN=OFF \ - -DLLVM_ENABLE_SPHINX=OFF \ - -DBUILD_SHARED_LIBS=OFF \ - -DSYCL_ENABLE_XPTI_TRACING=ON \ - -DLLVM_ENABLE_LLD=OFF \ - -DXPTI_ENABLE_WERROR=OFF \ - -DSYCL_ENABLE_PLUGINS="opencl;cuda" \ - /scratch/llvm/llvm && \ - ninja -j8 deploy-sycl-toolchain && \ - ninja -j8 install && \ - cp bin/* ${SYCL_DIR}/bin && \ - echo "${SYCL_DIR}/lib" > /etc/ld.so.conf.d/sycl.conf && ldconfig && \ - rm -rf ${SCRATCH_DIR} -ENV PATH=${SYCL_DIR}/bin:$PATH +RUN wget https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS-2023.PUB && \ + apt-key add GPG-PUB-KEY-INTEL-SW-PRODUCTS-2023.PUB && \ + echo "deb https://apt.repos.intel.com/oneapi all main" | tee /etc/apt/sources.list.d/oneAPI.list && \ + apt-get update -o Dir::Etc::sourcelist="sources.list.d/oneAPI.list" -o APT::Get::List-Cleanup="0" && \ + apt-get install -y intel-oneapi-compiler-dpcpp-cpp-and-cpp-classic-2023.0.0 && \ + apt-get 
clean && \ + rm -rf /var/lib/apt/lists/* + +RUN wget https://cloud.cees.ornl.gov/download/oneapi-for-nvidia-gpus-2023.0.0-linux.sh && \ + chmod +x oneapi-for-nvidia-gpus-2023.0.0-linux.sh && \ + ./oneapi-for-nvidia-gpus-2023.0.0-linux.sh -y && \ + rm oneapi-for-nvidia-gpus-2023.0.0-linux.sh From dac21c753ec3e4c7663c0aff5992636a4f5a88e0 Mon Sep 17 00:00:00 2001 From: Sebastian Eibl Date: Wed, 1 Feb 2023 16:56:36 +0100 Subject: [PATCH 140/496] Add non-standard `rsqrt` math function (#5644) * adding rsqrt * introduce non-standard math section and SYCL * fix division by zero * fix long double literal * move special functions to own group * test if device functions also work on host * fix namespace issue * only nvcc is providing host compatible versions of math functions * rearrangement --- core/src/Kokkos_MathematicalFunctions.hpp | 32 +++++++++++++ core/unit_test/TestMathematicalFunctions.hpp | 50 ++++++++++++++++++++ 2 files changed, 82 insertions(+) diff --git a/core/src/Kokkos_MathematicalFunctions.hpp b/core/src/Kokkos_MathematicalFunctions.hpp index 5016249edc..a795f700fa 100644 --- a/core/src/Kokkos_MathematicalFunctions.hpp +++ b/core/src/Kokkos_MathematicalFunctions.hpp @@ -485,6 +485,38 @@ KOKKOS_IMPL_MATH_UNARY_PREDICATE(signbit) #undef KOKKOS_IMPL_MATH_BINARY_FUNCTION #undef KOKKOS_IMPL_MATH_TERNARY_FUNCTION +// non-standard math functions provided by CUDA/HIP/SYCL +KOKKOS_INLINE_FUNCTION float rsqrt(float val) { +#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) + KOKKOS_IF_ON_DEVICE(return ::rsqrtf(val);) + KOKKOS_IF_ON_HOST(return 1.0f / Kokkos::sqrt(val);) +#elif defined(KOKKOS_ENABLE_SYCL) + KOKKOS_IF_ON_DEVICE(return sycl::rsqrt(val);) + KOKKOS_IF_ON_HOST(return 1.0f / Kokkos::sqrt(val);) +#else + return 1.0f / Kokkos::sqrt(val); +#endif +} +KOKKOS_INLINE_FUNCTION double rsqrt(double val) { +#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) + KOKKOS_IF_ON_DEVICE(return ::rsqrt(val);) + KOKKOS_IF_ON_HOST(return 1.0 / 
Kokkos::sqrt(val);) +#elif defined(KOKKOS_ENABLE_SYCL) + KOKKOS_IF_ON_DEVICE(return sycl::rsqrt(val);) + KOKKOS_IF_ON_HOST(return 1.0 / Kokkos::sqrt(val);) +#else + return 1.0 / Kokkos::sqrt(val); +#endif +} +inline long double rsqrt(long double val) { return 1.0l / Kokkos::sqrt(val); } +KOKKOS_INLINE_FUNCTION float rsqrtf(float x) { return Kokkos::rsqrt(x); } +inline long double rsqrtl(long double x) { return Kokkos::rsqrt(x); } +template +KOKKOS_INLINE_FUNCTION std::enable_if_t, double> rsqrt( + T x) { + return Kokkos::rsqrt(static_cast(x)); +} + } // namespace Kokkos #ifdef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_MATHFUNCTIONS diff --git a/core/unit_test/TestMathematicalFunctions.hpp b/core/unit_test/TestMathematicalFunctions.hpp index eda880d1de..dfcd2340a2 100644 --- a/core/unit_test/TestMathematicalFunctions.hpp +++ b/core/unit_test/TestMathematicalFunctions.hpp @@ -286,6 +286,31 @@ struct math_function_name; }; \ constexpr char math_function_name::name[] +#define DEFINE_UNARY_FUNCTION_EVAL_CUSTOM(FUNC, ULP_FACTOR, REF_FUNC) \ + struct MathUnaryFunction_##FUNC { \ + template \ + static KOKKOS_FUNCTION auto eval(T x) { \ + static_assert( \ + std::is_same>::value); \ + return Kokkos::FUNC(x); \ + } \ + template \ + static auto eval_std(T x) { \ + static_assert( \ + std::is_same>::value); \ + return REF_FUNC; \ + } \ + static KOKKOS_FUNCTION double ulp_factor() { return ULP_FACTOR; } \ + }; \ + using kk_##FUNC = MathUnaryFunction_##FUNC; \ + template <> \ + struct math_function_name { \ + static constexpr char name[] = #FUNC; \ + }; \ + constexpr char math_function_name::name[] + #ifndef KOKKOS_MATHEMATICAL_FUNCTIONS_SKIP_1 // Generally the expected ULP error should come from here: // https://www.gnu.org/software/libc/manual/html_node/Errors-in-Math-Functions.html @@ -319,6 +344,10 @@ DEFINE_UNARY_FUNCTION_EVAL(tanh, 2); DEFINE_UNARY_FUNCTION_EVAL(asinh, 4); DEFINE_UNARY_FUNCTION_EVAL(acosh, 2); DEFINE_UNARY_FUNCTION_EVAL(atanh, 2); + +// non-standard math 
functions +DEFINE_UNARY_FUNCTION_EVAL_CUSTOM(rsqrt, 2, + decltype(std::sqrt(x))(1) / std::sqrt(x)); #endif #ifndef KOKKOS_MATHEMATICAL_FUNCTIONS_SKIP_2 @@ -468,6 +497,13 @@ template void do_test_math_unary_function(const Arg (&x)[N]) { (void)std::initializer_list{ (TestMathUnaryFunction(x), 0)...}; + + // test if potentially device specific math functions also work on host + if constexpr (!std::is_same_v) + (void)std::initializer_list{ + (TestMathUnaryFunction( + x), + 0)...}; } #define TEST_MATH_FUNCTION(FUNC) \ @@ -859,6 +895,20 @@ TEST(TEST_CATEGORY, mathematical_functions_hyperbolic_functions) { TEST_MATH_FUNCTION(atanh)({-.97l, .86l, -.53l, .42l, -.1l, 0.l}); #endif } + +TEST(TEST_CATEGORY, mathematical_functions_non_standard) { + TEST_MATH_FUNCTION(rsqrt)({1, 2, 3, 5, 7, 11}); + TEST_MATH_FUNCTION(rsqrt)({1l, 2l, 3l, 5l, 7l, 11l}); + TEST_MATH_FUNCTION(rsqrt)({1ll, 2ll, 3ll, 5ll, 7ll, 11ll}); + TEST_MATH_FUNCTION(rsqrt)({1u, 2u, 3u, 5u, 7u}); + TEST_MATH_FUNCTION(rsqrt)({1ul, 2ul, 3ul, 5ul, 7ul}); + TEST_MATH_FUNCTION(rsqrt)({1ull, 2ull, 3ull, 5ull, 7ull}); + TEST_MATH_FUNCTION(rsqrt)({10.f, 20.f, 30.f, 40.f}); + TEST_MATH_FUNCTION(rsqrt)({11.1, 22.2, 33.3, 44.4}); +#ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS + TEST_MATH_FUNCTION(rsqrt)({10.l, 20.l, 30.l, 40.l}); +#endif +} #endif #ifndef KOKKOS_MATHEMATICAL_FUNCTIONS_SKIP_2 From f3d9efbd59fcf648a07fdebc78479f9d5992647c Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Tue, 31 Jan 2023 16:17:13 -0500 Subject: [PATCH 141/496] Fix unprefixed macros on KokkosExp_Host_IterateTile.hpp --- core/src/impl/KokkosExp_Host_IterateTile.hpp | 1943 ++++++++++-------- 1 file changed, 1113 insertions(+), 830 deletions(-) diff --git a/core/src/impl/KokkosExp_Host_IterateTile.hpp b/core/src/impl/KokkosExp_Host_IterateTile.hpp index e2b606004f..82604a24c2 100644 --- a/core/src/impl/KokkosExp_Host_IterateTile.hpp +++ b/core/src/impl/KokkosExp_Host_IterateTile.hpp @@ -37,1167 +37,1278 @@ namespace Impl { // 
Temporary, for testing new loop macros #define KOKKOS_ENABLE_NEW_LOOP_MACROS 1 -#define LOOP_1L(type, tile) \ - KOKKOS_ENABLE_IVDEP_MDRANGE \ +#define KOKKOS_IMPL_LOOP_1L(type, tile) \ + KOKKOS_ENABLE_IVDEP_MDRANGE \ for (type i0 = 0; i0 < static_cast(tile[0]); ++i0) -#define LOOP_2L(type, tile) \ - for (type i1 = 0; i1 < static_cast(tile[1]); ++i1) LOOP_1L(type, tile) +#define KOKKOS_IMPL_LOOP_2L(type, tile) \ + for (type i1 = 0; i1 < static_cast(tile[1]); ++i1) \ + KOKKOS_IMPL_LOOP_1L(type, tile) -#define LOOP_3L(type, tile) \ - for (type i2 = 0; i2 < static_cast(tile[2]); ++i2) LOOP_2L(type, tile) +#define KOKKOS_IMPL_LOOP_3L(type, tile) \ + for (type i2 = 0; i2 < static_cast(tile[2]); ++i2) \ + KOKKOS_IMPL_LOOP_2L(type, tile) -#define LOOP_4L(type, tile) \ - for (type i3 = 0; i3 < static_cast(tile[3]); ++i3) LOOP_3L(type, tile) +#define KOKKOS_IMPL_LOOP_4L(type, tile) \ + for (type i3 = 0; i3 < static_cast(tile[3]); ++i3) \ + KOKKOS_IMPL_LOOP_3L(type, tile) -#define LOOP_5L(type, tile) \ - for (type i4 = 0; i4 < static_cast(tile[4]); ++i4) LOOP_4L(type, tile) +#define KOKKOS_IMPL_LOOP_5L(type, tile) \ + for (type i4 = 0; i4 < static_cast(tile[4]); ++i4) \ + KOKKOS_IMPL_LOOP_4L(type, tile) -#define LOOP_6L(type, tile) \ - for (type i5 = 0; i5 < static_cast(tile[5]); ++i5) LOOP_5L(type, tile) +#define KOKKOS_IMPL_LOOP_6L(type, tile) \ + for (type i5 = 0; i5 < static_cast(tile[5]); ++i5) \ + KOKKOS_IMPL_LOOP_5L(type, tile) -#define LOOP_7L(type, tile) \ - for (type i6 = 0; i6 < static_cast(tile[6]); ++i6) LOOP_6L(type, tile) +#define KOKKOS_IMPL_LOOP_7L(type, tile) \ + for (type i6 = 0; i6 < static_cast(tile[6]); ++i6) \ + KOKKOS_IMPL_LOOP_6L(type, tile) -#define LOOP_8L(type, tile) \ - for (type i7 = 0; i7 < static_cast(tile[7]); ++i7) LOOP_7L(type, tile) +#define KOKKOS_IMPL_LOOP_8L(type, tile) \ + for (type i7 = 0; i7 < static_cast(tile[7]); ++i7) \ + KOKKOS_IMPL_LOOP_7L(type, tile) -#define LOOP_1R(type, tile) \ - KOKKOS_ENABLE_IVDEP_MDRANGE \ +#define 
KOKKOS_IMPL_LOOP_1R(type, tile) \ + KOKKOS_ENABLE_IVDEP_MDRANGE \ for (type i0 = 0; i0 < static_cast(tile[0]); ++i0) -#define LOOP_2R(type, tile) \ - LOOP_1R(type, tile) \ +#define KOKKOS_IMPL_LOOP_2R(type, tile) \ + KOKKOS_IMPL_LOOP_1R(type, tile) \ for (type i1 = 0; i1 < static_cast(tile[1]); ++i1) -#define LOOP_3R(type, tile) \ - LOOP_2R(type, tile) \ +#define KOKKOS_IMPL_LOOP_3R(type, tile) \ + KOKKOS_IMPL_LOOP_2R(type, tile) \ for (type i2 = 0; i2 < static_cast(tile[2]); ++i2) -#define LOOP_4R(type, tile) \ - LOOP_3R(type, tile) \ +#define KOKKOS_IMPL_LOOP_4R(type, tile) \ + KOKKOS_IMPL_LOOP_3R(type, tile) \ for (type i3 = 0; i3 < static_cast(tile[3]); ++i3) -#define LOOP_5R(type, tile) \ - LOOP_4R(type, tile) \ +#define KOKKOS_IMPL_LOOP_5R(type, tile) \ + KOKKOS_IMPL_LOOP_4R(type, tile) \ for (type i4 = 0; i4 < static_cast(tile[4]); ++i4) -#define LOOP_6R(type, tile) \ - LOOP_5R(type, tile) \ +#define KOKKOS_IMPL_LOOP_6R(type, tile) \ + KOKKOS_IMPL_LOOP_5R(type, tile) \ for (type i5 = 0; i5 < static_cast(tile[5]); ++i5) -#define LOOP_7R(type, tile) \ - LOOP_6R(type, tile) \ +#define KOKKOS_IMPL_LOOP_7R(type, tile) \ + KOKKOS_IMPL_LOOP_6R(type, tile) \ for (type i6 = 0; i6 < static_cast(tile[6]); ++i6) -#define LOOP_8R(type, tile) \ - LOOP_7R(type, tile) \ +#define KOKKOS_IMPL_LOOP_8R(type, tile) \ + KOKKOS_IMPL_LOOP_7R(type, tile) \ for (type i7 = 0; i7 < static_cast(tile[7]); ++i7) -#define LOOP_ARGS_1 i0 + m_offset[0] -#define LOOP_ARGS_2 LOOP_ARGS_1, i1 + m_offset[1] -#define LOOP_ARGS_3 LOOP_ARGS_2, i2 + m_offset[2] -#define LOOP_ARGS_4 LOOP_ARGS_3, i3 + m_offset[3] -#define LOOP_ARGS_5 LOOP_ARGS_4, i4 + m_offset[4] -#define LOOP_ARGS_6 LOOP_ARGS_5, i5 + m_offset[5] -#define LOOP_ARGS_7 LOOP_ARGS_6, i6 + m_offset[6] -#define LOOP_ARGS_8 LOOP_ARGS_7, i7 + m_offset[7] +#define KOKKOS_IMPL_LOOP_ARGS_1 i0 + m_offset[0] +#define KOKKOS_IMPL_LOOP_ARGS_2 KOKKOS_IMPL_LOOP_ARGS_1, i1 + m_offset[1] +#define KOKKOS_IMPL_LOOP_ARGS_3 KOKKOS_IMPL_LOOP_ARGS_2, i2 + 
m_offset[2] +#define KOKKOS_IMPL_LOOP_ARGS_4 KOKKOS_IMPL_LOOP_ARGS_3, i3 + m_offset[3] +#define KOKKOS_IMPL_LOOP_ARGS_5 KOKKOS_IMPL_LOOP_ARGS_4, i4 + m_offset[4] +#define KOKKOS_IMPL_LOOP_ARGS_6 KOKKOS_IMPL_LOOP_ARGS_5, i5 + m_offset[5] +#define KOKKOS_IMPL_LOOP_ARGS_7 KOKKOS_IMPL_LOOP_ARGS_6, i6 + m_offset[6] +#define KOKKOS_IMPL_LOOP_ARGS_8 KOKKOS_IMPL_LOOP_ARGS_7, i7 + m_offset[7] // New Loop Macros... // parallel_for, non-tagged -#define APPLY(func, ...) func(__VA_ARGS__); +#define KOKKOS_IMPL_APPLY(func, ...) func(__VA_ARGS__); // LayoutRight // d = 0 to start -#define LOOP_R_1(func, type, m_offset, extent, d, ...) \ +#define KOKKOS_IMPL_LOOP_R_1(func, type, m_offset, extent, d, ...) \ KOKKOS_ENABLE_IVDEP_MDRANGE \ for (type i0 = (type)0; i0 < static_cast(extent[d]); ++i0) { \ - APPLY(func, __VA_ARGS__, i0 + m_offset[d]) \ + KOKKOS_IMPL_APPLY(func, __VA_ARGS__, i0 + m_offset[d]) \ } -#define LOOP_R_2(func, type, m_offset, extent, d, ...) \ - for (type i1 = (type)0; i1 < static_cast(extent[d]); ++i1) { \ - LOOP_R_1(func, type, m_offset, extent, d + 1, __VA_ARGS__, \ - i1 + m_offset[d]) \ +#define KOKKOS_IMPL_LOOP_R_2(func, type, m_offset, extent, d, ...) \ + for (type i1 = (type)0; i1 < static_cast(extent[d]); ++i1) { \ + KOKKOS_IMPL_LOOP_R_1(func, type, m_offset, extent, d + 1, __VA_ARGS__, \ + i1 + m_offset[d]) \ } -#define LOOP_R_3(func, type, m_offset, extent, d, ...) \ - for (type i2 = (type)0; i2 < static_cast(extent[d]); ++i2) { \ - LOOP_R_2(func, type, m_offset, extent, d + 1, __VA_ARGS__, \ - i2 + m_offset[d]) \ +#define KOKKOS_IMPL_LOOP_R_3(func, type, m_offset, extent, d, ...) \ + for (type i2 = (type)0; i2 < static_cast(extent[d]); ++i2) { \ + KOKKOS_IMPL_LOOP_R_2(func, type, m_offset, extent, d + 1, __VA_ARGS__, \ + i2 + m_offset[d]) \ } -#define LOOP_R_4(func, type, m_offset, extent, d, ...) 
\ - for (type i3 = (type)0; i3 < static_cast(extent[d]); ++i3) { \ - LOOP_R_3(func, type, m_offset, extent, d + 1, __VA_ARGS__, \ - i3 + m_offset[d]) \ +#define KOKKOS_IMPL_LOOP_R_4(func, type, m_offset, extent, d, ...) \ + for (type i3 = (type)0; i3 < static_cast(extent[d]); ++i3) { \ + KOKKOS_IMPL_LOOP_R_3(func, type, m_offset, extent, d + 1, __VA_ARGS__, \ + i3 + m_offset[d]) \ } -#define LOOP_R_5(func, type, m_offset, extent, d, ...) \ - for (type i4 = (type)0; i4 < static_cast(extent[d]); ++i4) { \ - LOOP_R_4(func, type, m_offset, extent, d + 1, __VA_ARGS__, \ - i4 + m_offset[d]) \ +#define KOKKOS_IMPL_LOOP_R_5(func, type, m_offset, extent, d, ...) \ + for (type i4 = (type)0; i4 < static_cast(extent[d]); ++i4) { \ + KOKKOS_IMPL_LOOP_R_4(func, type, m_offset, extent, d + 1, __VA_ARGS__, \ + i4 + m_offset[d]) \ } -#define LOOP_R_6(func, type, m_offset, extent, d, ...) \ - for (type i5 = (type)0; i5 < static_cast(extent[d]); ++i5) { \ - LOOP_R_5(func, type, m_offset, extent, d + 1, __VA_ARGS__, \ - i5 + m_offset[d]) \ +#define KOKKOS_IMPL_LOOP_R_6(func, type, m_offset, extent, d, ...) \ + for (type i5 = (type)0; i5 < static_cast(extent[d]); ++i5) { \ + KOKKOS_IMPL_LOOP_R_5(func, type, m_offset, extent, d + 1, __VA_ARGS__, \ + i5 + m_offset[d]) \ } -#define LOOP_R_7(func, type, m_offset, extent, d, ...) \ - for (type i6 = (type)0; i6 < static_cast(extent[d]); ++i6) { \ - LOOP_R_6(func, type, m_offset, extent, d + 1, __VA_ARGS__, \ - i6 + m_offset[d]) \ +#define KOKKOS_IMPL_LOOP_R_7(func, type, m_offset, extent, d, ...) \ + for (type i6 = (type)0; i6 < static_cast(extent[d]); ++i6) { \ + KOKKOS_IMPL_LOOP_R_6(func, type, m_offset, extent, d + 1, __VA_ARGS__, \ + i6 + m_offset[d]) \ } -#define LOOP_R_8(func, type, m_offset, extent, d, ...) \ - for (type i7 = (type)0; i7 < static_cast(extent[d]); ++i7) { \ - LOOP_R_7(func, type, m_offset, extent, d + 1, __VA_ARGS__, \ - i7 + m_offset[d]) \ +#define KOKKOS_IMPL_LOOP_R_8(func, type, m_offset, extent, d, ...) 
\ + for (type i7 = (type)0; i7 < static_cast(extent[d]); ++i7) { \ + KOKKOS_IMPL_LOOP_R_7(func, type, m_offset, extent, d + 1, __VA_ARGS__, \ + i7 + m_offset[d]) \ } // LayoutLeft // d = rank-1 to start -#define LOOP_L_1(func, type, m_offset, extent, d, ...) \ +#define KOKKOS_IMPL_LOOP_L_1(func, type, m_offset, extent, d, ...) \ KOKKOS_ENABLE_IVDEP_MDRANGE \ for (type i0 = (type)0; i0 < static_cast(extent[d]); ++i0) { \ - APPLY(func, i0 + m_offset[d], __VA_ARGS__) \ + KOKKOS_IMPL_APPLY(func, i0 + m_offset[d], __VA_ARGS__) \ } -#define LOOP_L_2(func, type, m_offset, extent, d, ...) \ +#define KOKKOS_IMPL_LOOP_L_2(func, type, m_offset, extent, d, ...) \ for (type i1 = (type)0; i1 < static_cast(extent[d]); ++i1) { \ - LOOP_L_1(func, type, m_offset, extent, d - 1, i1 + m_offset[d], \ - __VA_ARGS__) \ + KOKKOS_IMPL_LOOP_L_1(func, type, m_offset, extent, d - 1, \ + i1 + m_offset[d], __VA_ARGS__) \ } -#define LOOP_L_3(func, type, m_offset, extent, d, ...) \ +#define KOKKOS_IMPL_LOOP_L_3(func, type, m_offset, extent, d, ...) \ for (type i2 = (type)0; i2 < static_cast(extent[d]); ++i2) { \ - LOOP_L_2(func, type, m_offset, extent, d - 1, i2 + m_offset[d], \ - __VA_ARGS__) \ + KOKKOS_IMPL_LOOP_L_2(func, type, m_offset, extent, d - 1, \ + i2 + m_offset[d], __VA_ARGS__) \ } -#define LOOP_L_4(func, type, m_offset, extent, d, ...) \ +#define KOKKOS_IMPL_LOOP_L_4(func, type, m_offset, extent, d, ...) \ for (type i3 = (type)0; i3 < static_cast(extent[d]); ++i3) { \ - LOOP_L_3(func, type, m_offset, extent, d - 1, i3 + m_offset[d], \ - __VA_ARGS__) \ + KOKKOS_IMPL_LOOP_L_3(func, type, m_offset, extent, d - 1, \ + i3 + m_offset[d], __VA_ARGS__) \ } -#define LOOP_L_5(func, type, m_offset, extent, d, ...) \ +#define KOKKOS_IMPL_LOOP_L_5(func, type, m_offset, extent, d, ...) 
\ for (type i4 = (type)0; i4 < static_cast(extent[d]); ++i4) { \ - LOOP_L_4(func, type, m_offset, extent, d - 1, i4 + m_offset[d], \ - __VA_ARGS__) \ + KOKKOS_IMPL_LOOP_L_4(func, type, m_offset, extent, d - 1, \ + i4 + m_offset[d], __VA_ARGS__) \ } -#define LOOP_L_6(func, type, m_offset, extent, d, ...) \ +#define KOKKOS_IMPL_LOOP_L_6(func, type, m_offset, extent, d, ...) \ for (type i5 = (type)0; i5 < static_cast(extent[d]); ++i5) { \ - LOOP_L_5(func, type, m_offset, extent, d - 1, i5 + m_offset[d], \ - __VA_ARGS__) \ + KOKKOS_IMPL_LOOP_L_5(func, type, m_offset, extent, d - 1, \ + i5 + m_offset[d], __VA_ARGS__) \ } -#define LOOP_L_7(func, type, m_offset, extent, d, ...) \ +#define KOKKOS_IMPL_LOOP_L_7(func, type, m_offset, extent, d, ...) \ for (type i6 = (type)0; i6 < static_cast(extent[d]); ++i6) { \ - LOOP_L_6(func, type, m_offset, extent, d - 1, i6 + m_offset[d], \ - __VA_ARGS__) \ + KOKKOS_IMPL_LOOP_L_6(func, type, m_offset, extent, d - 1, \ + i6 + m_offset[d], __VA_ARGS__) \ } -#define LOOP_L_8(func, type, m_offset, extent, d, ...) \ +#define KOKKOS_IMPL_LOOP_L_8(func, type, m_offset, extent, d, ...) 
\ for (type i7 = (type)0; i7 < static_cast(extent[d]); ++i7) { \ - LOOP_L_7(func, type, m_offset, extent, d - 1, i7 + m_offset[d], \ - __VA_ARGS__) \ + KOKKOS_IMPL_LOOP_L_7(func, type, m_offset, extent, d - 1, \ + i7 + m_offset[d], __VA_ARGS__) \ } // Left vs Right // TODO: rank not necessary to pass through, can hardcode the values -#define LOOP_LAYOUT_1(func, type, is_left, m_offset, extent, rank) \ - KOKKOS_ENABLE_IVDEP_MDRANGE \ - for (type i0 = (type)0; i0 < static_cast(extent[0]); ++i0) { \ - APPLY(func, i0 + m_offset[0]) \ +#define KOKKOS_IMPL_LOOP_LAYOUT_1(func, type, is_left, m_offset, extent, rank) \ + KOKKOS_ENABLE_IVDEP_MDRANGE \ + for (type i0 = (type)0; i0 < static_cast(extent[0]); ++i0) { \ + KOKKOS_IMPL_APPLY(func, i0 + m_offset[0]) \ } -#define LOOP_LAYOUT_2(func, type, is_left, m_offset, extent, rank) \ - if (is_left) { \ - for (type i1 = (type)0; i1 < static_cast(extent[rank - 1]); ++i1) { \ - LOOP_L_1(func, type, m_offset, extent, rank - 2, \ - i1 + m_offset[rank - 1]) \ - } \ - } else { \ - for (type i1 = (type)0; i1 < static_cast(extent[0]); ++i1) { \ - LOOP_R_1(func, type, m_offset, extent, 1, i1 + m_offset[0]) \ - } \ +#define KOKKOS_IMPL_LOOP_LAYOUT_2(func, type, is_left, m_offset, extent, rank) \ + if (is_left) { \ + for (type i1 = (type)0; i1 < static_cast(extent[rank - 1]); ++i1) { \ + KOKKOS_IMPL_LOOP_L_1(func, type, m_offset, extent, rank - 2, \ + i1 + m_offset[rank - 1]) \ + } \ + } else { \ + for (type i1 = (type)0; i1 < static_cast(extent[0]); ++i1) { \ + KOKKOS_IMPL_LOOP_R_1(func, type, m_offset, extent, 1, i1 + m_offset[0]) \ + } \ } -#define LOOP_LAYOUT_3(func, type, is_left, m_offset, extent, rank) \ - if (is_left) { \ - for (type i2 = (type)0; i2 < static_cast(extent[rank - 1]); ++i2) { \ - LOOP_L_2(func, type, m_offset, extent, rank - 2, \ - i2 + m_offset[rank - 1]) \ - } \ - } else { \ - for (type i2 = (type)0; i2 < static_cast(extent[0]); ++i2) { \ - LOOP_R_2(func, type, m_offset, extent, 1, i2 + m_offset[0]) \ - } \ 
+#define KOKKOS_IMPL_LOOP_LAYOUT_3(func, type, is_left, m_offset, extent, rank) \ + if (is_left) { \ + for (type i2 = (type)0; i2 < static_cast(extent[rank - 1]); ++i2) { \ + KOKKOS_IMPL_LOOP_L_2(func, type, m_offset, extent, rank - 2, \ + i2 + m_offset[rank - 1]) \ + } \ + } else { \ + for (type i2 = (type)0; i2 < static_cast(extent[0]); ++i2) { \ + KOKKOS_IMPL_LOOP_R_2(func, type, m_offset, extent, 1, i2 + m_offset[0]) \ + } \ } -#define LOOP_LAYOUT_4(func, type, is_left, m_offset, extent, rank) \ - if (is_left) { \ - for (type i3 = (type)0; i3 < static_cast(extent[rank - 1]); ++i3) { \ - LOOP_L_3(func, type, m_offset, extent, rank - 2, \ - i3 + m_offset[rank - 1]) \ - } \ - } else { \ - for (type i3 = (type)0; i3 < static_cast(extent[0]); ++i3) { \ - LOOP_R_3(func, type, m_offset, extent, 1, i3 + m_offset[0]) \ - } \ +#define KOKKOS_IMPL_LOOP_LAYOUT_4(func, type, is_left, m_offset, extent, rank) \ + if (is_left) { \ + for (type i3 = (type)0; i3 < static_cast(extent[rank - 1]); ++i3) { \ + KOKKOS_IMPL_LOOP_L_3(func, type, m_offset, extent, rank - 2, \ + i3 + m_offset[rank - 1]) \ + } \ + } else { \ + for (type i3 = (type)0; i3 < static_cast(extent[0]); ++i3) { \ + KOKKOS_IMPL_LOOP_R_3(func, type, m_offset, extent, 1, i3 + m_offset[0]) \ + } \ } -#define LOOP_LAYOUT_5(func, type, is_left, m_offset, extent, rank) \ - if (is_left) { \ - for (type i4 = (type)0; i4 < static_cast(extent[rank - 1]); ++i4) { \ - LOOP_L_4(func, type, m_offset, extent, rank - 2, \ - i4 + m_offset[rank - 1]) \ - } \ - } else { \ - for (type i4 = (type)0; i4 < static_cast(extent[0]); ++i4) { \ - LOOP_R_4(func, type, m_offset, extent, 1, i4 + m_offset[0]) \ - } \ +#define KOKKOS_IMPL_LOOP_LAYOUT_5(func, type, is_left, m_offset, extent, rank) \ + if (is_left) { \ + for (type i4 = (type)0; i4 < static_cast(extent[rank - 1]); ++i4) { \ + KOKKOS_IMPL_LOOP_L_4(func, type, m_offset, extent, rank - 2, \ + i4 + m_offset[rank - 1]) \ + } \ + } else { \ + for (type i4 = (type)0; i4 < 
static_cast(extent[0]); ++i4) { \ + KOKKOS_IMPL_LOOP_R_4(func, type, m_offset, extent, 1, i4 + m_offset[0]) \ + } \ } -#define LOOP_LAYOUT_6(func, type, is_left, m_offset, extent, rank) \ - if (is_left) { \ - for (type i5 = (type)0; i5 < static_cast(extent[rank - 1]); ++i5) { \ - LOOP_L_5(func, type, m_offset, extent, rank - 2, \ - i5 + m_offset[rank - 1]) \ - } \ - } else { \ - for (type i5 = (type)0; i5 < static_cast(extent[0]); ++i5) { \ - LOOP_R_5(func, type, m_offset, extent, 1, i5 + m_offset[0]) \ - } \ +#define KOKKOS_IMPL_LOOP_LAYOUT_6(func, type, is_left, m_offset, extent, rank) \ + if (is_left) { \ + for (type i5 = (type)0; i5 < static_cast(extent[rank - 1]); ++i5) { \ + KOKKOS_IMPL_LOOP_L_5(func, type, m_offset, extent, rank - 2, \ + i5 + m_offset[rank - 1]) \ + } \ + } else { \ + for (type i5 = (type)0; i5 < static_cast(extent[0]); ++i5) { \ + KOKKOS_IMPL_LOOP_R_5(func, type, m_offset, extent, 1, i5 + m_offset[0]) \ + } \ } -#define LOOP_LAYOUT_7(func, type, is_left, m_offset, extent, rank) \ - if (is_left) { \ - for (type i6 = (type)0; i6 < static_cast(extent[rank - 1]); ++i6) { \ - LOOP_L_6(func, type, m_offset, extent, rank - 2, \ - i6 + m_offset[rank - 1]) \ - } \ - } else { \ - for (type i6 = (type)0; i6 < static_cast(extent[0]); ++i6) { \ - LOOP_R_6(func, type, m_offset, extent, 1, i6 + m_offset[0]) \ - } \ +#define KOKKOS_IMPL_LOOP_LAYOUT_7(func, type, is_left, m_offset, extent, rank) \ + if (is_left) { \ + for (type i6 = (type)0; i6 < static_cast(extent[rank - 1]); ++i6) { \ + KOKKOS_IMPL_LOOP_L_6(func, type, m_offset, extent, rank - 2, \ + i6 + m_offset[rank - 1]) \ + } \ + } else { \ + for (type i6 = (type)0; i6 < static_cast(extent[0]); ++i6) { \ + KOKKOS_IMPL_LOOP_R_6(func, type, m_offset, extent, 1, i6 + m_offset[0]) \ + } \ } -#define LOOP_LAYOUT_8(func, type, is_left, m_offset, extent, rank) \ - if (is_left) { \ - for (type i7 = (type)0; i7 < static_cast(extent[rank - 1]); ++i7) { \ - LOOP_L_7(func, type, m_offset, extent, rank - 2, \ - 
i7 + m_offset[rank - 1]) \ - } \ - } else { \ - for (type i7 = (type)0; i7 < static_cast(extent[0]); ++i7) { \ - LOOP_R_7(func, type, m_offset, extent, 1, i7 + m_offset[0]) \ - } \ +#define KOKKOS_IMPL_LOOP_LAYOUT_8(func, type, is_left, m_offset, extent, rank) \ + if (is_left) { \ + for (type i7 = (type)0; i7 < static_cast(extent[rank - 1]); ++i7) { \ + KOKKOS_IMPL_LOOP_L_7(func, type, m_offset, extent, rank - 2, \ + i7 + m_offset[rank - 1]) \ + } \ + } else { \ + for (type i7 = (type)0; i7 < static_cast(extent[0]); ++i7) { \ + KOKKOS_IMPL_LOOP_R_7(func, type, m_offset, extent, 1, i7 + m_offset[0]) \ + } \ } // Partial vs Full Tile -#define TILE_LOOP_1(func, type, is_left, cond, m_offset, extent_full, \ - extent_partial, rank) \ - if (cond) { \ - LOOP_LAYOUT_1(func, type, is_left, m_offset, extent_full, rank) \ - } else { \ - LOOP_LAYOUT_1(func, type, is_left, m_offset, extent_partial, rank) \ +#define KOKKOS_IMPL_TILE_LOOP_1(func, type, is_left, cond, m_offset, \ + extent_full, extent_partial, rank) \ + if (cond) { \ + KOKKOS_IMPL_LOOP_LAYOUT_1(func, type, is_left, m_offset, extent_full, \ + rank) \ + } else { \ + KOKKOS_IMPL_LOOP_LAYOUT_1(func, type, is_left, m_offset, extent_partial, \ + rank) \ } -#define TILE_LOOP_2(func, type, is_left, cond, m_offset, extent_full, \ - extent_partial, rank) \ - if (cond) { \ - LOOP_LAYOUT_2(func, type, is_left, m_offset, extent_full, rank) \ - } else { \ - LOOP_LAYOUT_2(func, type, is_left, m_offset, extent_partial, rank) \ +#define KOKKOS_IMPL_TILE_LOOP_2(func, type, is_left, cond, m_offset, \ + extent_full, extent_partial, rank) \ + if (cond) { \ + KOKKOS_IMPL_LOOP_LAYOUT_2(func, type, is_left, m_offset, extent_full, \ + rank) \ + } else { \ + KOKKOS_IMPL_LOOP_LAYOUT_2(func, type, is_left, m_offset, extent_partial, \ + rank) \ } -#define TILE_LOOP_3(func, type, is_left, cond, m_offset, extent_full, \ - extent_partial, rank) \ - if (cond) { \ - LOOP_LAYOUT_3(func, type, is_left, m_offset, extent_full, rank) \ - } else { \ - 
LOOP_LAYOUT_3(func, type, is_left, m_offset, extent_partial, rank) \ +#define KOKKOS_IMPL_TILE_LOOP_3(func, type, is_left, cond, m_offset, \ + extent_full, extent_partial, rank) \ + if (cond) { \ + KOKKOS_IMPL_LOOP_LAYOUT_3(func, type, is_left, m_offset, extent_full, \ + rank) \ + } else { \ + KOKKOS_IMPL_LOOP_LAYOUT_3(func, type, is_left, m_offset, extent_partial, \ + rank) \ } -#define TILE_LOOP_4(func, type, is_left, cond, m_offset, extent_full, \ - extent_partial, rank) \ - if (cond) { \ - LOOP_LAYOUT_4(func, type, is_left, m_offset, extent_full, rank) \ - } else { \ - LOOP_LAYOUT_4(func, type, is_left, m_offset, extent_partial, rank) \ +#define KOKKOS_IMPL_TILE_LOOP_4(func, type, is_left, cond, m_offset, \ + extent_full, extent_partial, rank) \ + if (cond) { \ + KOKKOS_IMPL_LOOP_LAYOUT_4(func, type, is_left, m_offset, extent_full, \ + rank) \ + } else { \ + KOKKOS_IMPL_LOOP_LAYOUT_4(func, type, is_left, m_offset, extent_partial, \ + rank) \ } -#define TILE_LOOP_5(func, type, is_left, cond, m_offset, extent_full, \ - extent_partial, rank) \ - if (cond) { \ - LOOP_LAYOUT_5(func, type, is_left, m_offset, extent_full, rank) \ - } else { \ - LOOP_LAYOUT_5(func, type, is_left, m_offset, extent_partial, rank) \ +#define KOKKOS_IMPL_TILE_LOOP_5(func, type, is_left, cond, m_offset, \ + extent_full, extent_partial, rank) \ + if (cond) { \ + KOKKOS_IMPL_LOOP_LAYOUT_5(func, type, is_left, m_offset, extent_full, \ + rank) \ + } else { \ + KOKKOS_IMPL_LOOP_LAYOUT_5(func, type, is_left, m_offset, extent_partial, \ + rank) \ } -#define TILE_LOOP_6(func, type, is_left, cond, m_offset, extent_full, \ - extent_partial, rank) \ - if (cond) { \ - LOOP_LAYOUT_6(func, type, is_left, m_offset, extent_full, rank) \ - } else { \ - LOOP_LAYOUT_6(func, type, is_left, m_offset, extent_partial, rank) \ +#define KOKKOS_IMPL_TILE_LOOP_6(func, type, is_left, cond, m_offset, \ + extent_full, extent_partial, rank) \ + if (cond) { \ + KOKKOS_IMPL_LOOP_LAYOUT_6(func, type, is_left, m_offset, 
extent_full, \ + rank) \ + } else { \ + KOKKOS_IMPL_LOOP_LAYOUT_6(func, type, is_left, m_offset, extent_partial, \ + rank) \ } -#define TILE_LOOP_7(func, type, is_left, cond, m_offset, extent_full, \ - extent_partial, rank) \ - if (cond) { \ - LOOP_LAYOUT_7(func, type, is_left, m_offset, extent_full, rank) \ - } else { \ - LOOP_LAYOUT_7(func, type, is_left, m_offset, extent_partial, rank) \ +#define KOKKOS_IMPL_TILE_LOOP_7(func, type, is_left, cond, m_offset, \ + extent_full, extent_partial, rank) \ + if (cond) { \ + KOKKOS_IMPL_LOOP_LAYOUT_7(func, type, is_left, m_offset, extent_full, \ + rank) \ + } else { \ + KOKKOS_IMPL_LOOP_LAYOUT_7(func, type, is_left, m_offset, extent_partial, \ + rank) \ } -#define TILE_LOOP_8(func, type, is_left, cond, m_offset, extent_full, \ - extent_partial, rank) \ - if (cond) { \ - LOOP_LAYOUT_8(func, type, is_left, m_offset, extent_full, rank) \ - } else { \ - LOOP_LAYOUT_8(func, type, is_left, m_offset, extent_partial, rank) \ +#define KOKKOS_IMPL_TILE_LOOP_8(func, type, is_left, cond, m_offset, \ + extent_full, extent_partial, rank) \ + if (cond) { \ + KOKKOS_IMPL_LOOP_LAYOUT_8(func, type, is_left, m_offset, extent_full, \ + rank) \ + } else { \ + KOKKOS_IMPL_LOOP_LAYOUT_8(func, type, is_left, m_offset, extent_partial, \ + rank) \ } // parallel_reduce, non-tagged // Reduction version -#define APPLY_REDUX(val, func, ...) func(__VA_ARGS__, val); +#define KOKKOS_IMPL_APPLY_REDUX(val, func, ...) func(__VA_ARGS__, val); // LayoutRight // d = 0 to start -#define LOOP_R_1_REDUX(val, func, type, m_offset, extent, d, ...) \ - KOKKOS_ENABLE_IVDEP_MDRANGE \ - for (type i0 = (type)0; i0 < static_cast(extent[d]); ++i0) { \ - APPLY_REDUX(val, func, __VA_ARGS__, i0 + m_offset[d]) \ +#define KOKKOS_IMPL_LOOP_R_1_REDUX(val, func, type, m_offset, extent, d, ...) 
\ + KOKKOS_ENABLE_IVDEP_MDRANGE \ + for (type i0 = (type)0; i0 < static_cast(extent[d]); ++i0) { \ + KOKKOS_IMPL_APPLY_REDUX(val, func, __VA_ARGS__, i0 + m_offset[d]) \ } -#define LOOP_R_2_REDUX(val, func, type, m_offset, extent, d, ...) \ - for (type i1 = (type)0; i1 < static_cast(extent[d]); ++i1) { \ - LOOP_R_1_REDUX(val, func, type, m_offset, extent, d + 1, __VA_ARGS__, \ - i1 + m_offset[d]) \ +#define KOKKOS_IMPL_LOOP_R_2_REDUX(val, func, type, m_offset, extent, d, ...) \ + for (type i1 = (type)0; i1 < static_cast(extent[d]); ++i1) { \ + KOKKOS_IMPL_LOOP_R_1_REDUX(val, func, type, m_offset, extent, d + 1, \ + __VA_ARGS__, i1 + m_offset[d]) \ } -#define LOOP_R_3_REDUX(val, func, type, m_offset, extent, d, ...) \ - for (type i2 = (type)0; i2 < static_cast(extent[d]); ++i2) { \ - LOOP_R_2_REDUX(val, func, type, m_offset, extent, d + 1, __VA_ARGS__, \ - i2 + m_offset[d]) \ +#define KOKKOS_IMPL_LOOP_R_3_REDUX(val, func, type, m_offset, extent, d, ...) \ + for (type i2 = (type)0; i2 < static_cast(extent[d]); ++i2) { \ + KOKKOS_IMPL_LOOP_R_2_REDUX(val, func, type, m_offset, extent, d + 1, \ + __VA_ARGS__, i2 + m_offset[d]) \ } -#define LOOP_R_4_REDUX(val, func, type, m_offset, extent, d, ...) \ - for (type i3 = (type)0; i3 < static_cast(extent[d]); ++i3) { \ - LOOP_R_3_REDUX(val, func, type, m_offset, extent, d + 1, __VA_ARGS__, \ - i3 + m_offset[d]) \ +#define KOKKOS_IMPL_LOOP_R_4_REDUX(val, func, type, m_offset, extent, d, ...) \ + for (type i3 = (type)0; i3 < static_cast(extent[d]); ++i3) { \ + KOKKOS_IMPL_LOOP_R_3_REDUX(val, func, type, m_offset, extent, d + 1, \ + __VA_ARGS__, i3 + m_offset[d]) \ } -#define LOOP_R_5_REDUX(val, func, type, m_offset, extent, d, ...) \ - for (type i4 = (type)0; i4 < static_cast(extent[d]); ++i4) { \ - LOOP_R_4_REDUX(val, func, type, m_offset, extent, d + 1, __VA_ARGS__, \ - i4 + m_offset[d]) \ +#define KOKKOS_IMPL_LOOP_R_5_REDUX(val, func, type, m_offset, extent, d, ...) 
\ + for (type i4 = (type)0; i4 < static_cast(extent[d]); ++i4) { \ + KOKKOS_IMPL_LOOP_R_4_REDUX(val, func, type, m_offset, extent, d + 1, \ + __VA_ARGS__, i4 + m_offset[d]) \ } -#define LOOP_R_6_REDUX(val, func, type, m_offset, extent, d, ...) \ - for (type i5 = (type)0; i5 < static_cast(extent[d]); ++i5) { \ - LOOP_R_5_REDUX(val, func, type, m_offset, extent, d + 1, __VA_ARGS__, \ - i5 + m_offset[d]) \ +#define KOKKOS_IMPL_LOOP_R_6_REDUX(val, func, type, m_offset, extent, d, ...) \ + for (type i5 = (type)0; i5 < static_cast(extent[d]); ++i5) { \ + KOKKOS_IMPL_LOOP_R_5_REDUX(val, func, type, m_offset, extent, d + 1, \ + __VA_ARGS__, i5 + m_offset[d]) \ } -#define LOOP_R_7_REDUX(val, func, type, m_offset, extent, d, ...) \ - for (type i6 = (type)0; i6 < static_cast(extent[d]); ++i6) { \ - LOOP_R_6_REDUX(val, func, type, m_offset, extent, d + 1, __VA_ARGS__, \ - i6 + m_offset[d]) \ +#define KOKKOS_IMPL_LOOP_R_7_REDUX(val, func, type, m_offset, extent, d, ...) \ + for (type i6 = (type)0; i6 < static_cast(extent[d]); ++i6) { \ + KOKKOS_IMPL_LOOP_R_6_REDUX(val, func, type, m_offset, extent, d + 1, \ + __VA_ARGS__, i6 + m_offset[d]) \ } -#define LOOP_R_8_REDUX(val, func, type, m_offset, extent, d, ...) \ - for (type i7 = (type)0; i7 < static_cast(extent[d]); ++i7) { \ - LOOP_R_7_REDUX(val, func, type, m_offset, extent, d + 1, __VA_ARGS__, \ - i7 + m_offset[d]) \ +#define KOKKOS_IMPL_LOOP_R_8_REDUX(val, func, type, m_offset, extent, d, ...) \ + for (type i7 = (type)0; i7 < static_cast(extent[d]); ++i7) { \ + KOKKOS_IMPL_LOOP_R_7_REDUX(val, func, type, m_offset, extent, d + 1, \ + __VA_ARGS__, i7 + m_offset[d]) \ } // LayoutLeft // d = rank-1 to start -#define LOOP_L_1_REDUX(val, func, type, m_offset, extent, d, ...) \ - KOKKOS_ENABLE_IVDEP_MDRANGE \ - for (type i0 = (type)0; i0 < static_cast(extent[d]); ++i0) { \ - APPLY_REDUX(val, func, i0 + m_offset[d], __VA_ARGS__) \ +#define KOKKOS_IMPL_LOOP_L_1_REDUX(val, func, type, m_offset, extent, d, ...) 
\ + KOKKOS_ENABLE_IVDEP_MDRANGE \ + for (type i0 = (type)0; i0 < static_cast(extent[d]); ++i0) { \ + KOKKOS_IMPL_APPLY_REDUX(val, func, i0 + m_offset[d], __VA_ARGS__) \ } -#define LOOP_L_2_REDUX(val, func, type, m_offset, extent, d, ...) \ - for (type i1 = (type)0; i1 < static_cast(extent[d]); ++i1) { \ - LOOP_L_1_REDUX(val, func, type, m_offset, extent, d - 1, i1 + m_offset[d], \ - __VA_ARGS__) \ +#define KOKKOS_IMPL_LOOP_L_2_REDUX(val, func, type, m_offset, extent, d, ...) \ + for (type i1 = (type)0; i1 < static_cast(extent[d]); ++i1) { \ + KOKKOS_IMPL_LOOP_L_1_REDUX(val, func, type, m_offset, extent, d - 1, \ + i1 + m_offset[d], __VA_ARGS__) \ } -#define LOOP_L_3_REDUX(val, func, type, m_offset, extent, d, ...) \ - for (type i2 = (type)0; i2 < static_cast(extent[d]); ++i2) { \ - LOOP_L_2_REDUX(val, func, type, m_offset, extent, d - 1, i2 + m_offset[d], \ - __VA_ARGS__) \ +#define KOKKOS_IMPL_LOOP_L_3_REDUX(val, func, type, m_offset, extent, d, ...) \ + for (type i2 = (type)0; i2 < static_cast(extent[d]); ++i2) { \ + KOKKOS_IMPL_LOOP_L_2_REDUX(val, func, type, m_offset, extent, d - 1, \ + i2 + m_offset[d], __VA_ARGS__) \ } -#define LOOP_L_4_REDUX(val, func, type, m_offset, extent, d, ...) \ - for (type i3 = (type)0; i3 < static_cast(extent[d]); ++i3) { \ - LOOP_L_3_REDUX(val, func, type, m_offset, extent, d - 1, i3 + m_offset[d], \ - __VA_ARGS__) \ +#define KOKKOS_IMPL_LOOP_L_4_REDUX(val, func, type, m_offset, extent, d, ...) \ + for (type i3 = (type)0; i3 < static_cast(extent[d]); ++i3) { \ + KOKKOS_IMPL_LOOP_L_3_REDUX(val, func, type, m_offset, extent, d - 1, \ + i3 + m_offset[d], __VA_ARGS__) \ } -#define LOOP_L_5_REDUX(val, func, type, m_offset, extent, d, ...) \ - for (type i4 = (type)0; i4 < static_cast(extent[d]); ++i4) { \ - LOOP_L_4_REDUX(val, func, type, m_offset, extent, d - 1, i4 + m_offset[d], \ - __VA_ARGS__) \ +#define KOKKOS_IMPL_LOOP_L_5_REDUX(val, func, type, m_offset, extent, d, ...) 
\ + for (type i4 = (type)0; i4 < static_cast(extent[d]); ++i4) { \ + KOKKOS_IMPL_LOOP_L_4_REDUX(val, func, type, m_offset, extent, d - 1, \ + i4 + m_offset[d], __VA_ARGS__) \ } -#define LOOP_L_6_REDUX(val, func, type, m_offset, extent, d, ...) \ - for (type i5 = (type)0; i5 < static_cast(extent[d]); ++i5) { \ - LOOP_L_5_REDUX(val, func, type, m_offset, extent, d - 1, i5 + m_offset[d], \ - __VA_ARGS__) \ +#define KOKKOS_IMPL_LOOP_L_6_REDUX(val, func, type, m_offset, extent, d, ...) \ + for (type i5 = (type)0; i5 < static_cast(extent[d]); ++i5) { \ + KOKKOS_IMPL_LOOP_L_5_REDUX(val, func, type, m_offset, extent, d - 1, \ + i5 + m_offset[d], __VA_ARGS__) \ } -#define LOOP_L_7_REDUX(val, func, type, m_offset, extent, d, ...) \ - for (type i6 = (type)0; i6 < static_cast(extent[d]); ++i6) { \ - LOOP_L_6_REDUX(val, func, type, m_offset, extent, d - 1, i6 + m_offset[d], \ - __VA_ARGS__) \ +#define KOKKOS_IMPL_LOOP_L_7_REDUX(val, func, type, m_offset, extent, d, ...) \ + for (type i6 = (type)0; i6 < static_cast(extent[d]); ++i6) { \ + KOKKOS_IMPL_LOOP_L_6_REDUX(val, func, type, m_offset, extent, d - 1, \ + i6 + m_offset[d], __VA_ARGS__) \ } -#define LOOP_L_8_REDUX(val, func, type, m_offset, extent, d, ...) \ - for (type i7 = (type)0; i7 < static_cast(extent[d]); ++i7) { \ - LOOP_L_7_REDUX(val, func, type, m_offset, extent, d - 1, i7 + m_offset[d], \ - __VA_ARGS__) \ +#define KOKKOS_IMPL_LOOP_L_8_REDUX(val, func, type, m_offset, extent, d, ...) 
\ + for (type i7 = (type)0; i7 < static_cast(extent[d]); ++i7) { \ + KOKKOS_IMPL_LOOP_L_7_REDUX(val, func, type, m_offset, extent, d - 1, \ + i7 + m_offset[d], __VA_ARGS__) \ } // Left vs Right -#define LOOP_LAYOUT_1_REDUX(val, func, type, is_left, m_offset, extent, rank) \ - KOKKOS_ENABLE_IVDEP_MDRANGE \ - for (type i0 = (type)0; i0 < static_cast(extent[0]); ++i0) { \ - APPLY_REDUX(val, func, i0 + m_offset[0]) \ +#define KOKKOS_IMPL_LOOP_LAYOUT_1_REDUX(val, func, type, is_left, m_offset, \ + extent, rank) \ + KOKKOS_ENABLE_IVDEP_MDRANGE \ + for (type i0 = (type)0; i0 < static_cast(extent[0]); ++i0) { \ + KOKKOS_IMPL_APPLY_REDUX(val, func, i0 + m_offset[0]) \ } -#define LOOP_LAYOUT_2_REDUX(val, func, type, is_left, m_offset, extent, rank) \ +#define KOKKOS_IMPL_LOOP_LAYOUT_2_REDUX(val, func, type, is_left, m_offset, \ + extent, rank) \ if (is_left) { \ for (type i1 = (type)0; i1 < static_cast(extent[rank - 1]); ++i1) { \ - LOOP_L_1_REDUX(val, func, type, m_offset, extent, rank - 2, \ - i1 + m_offset[rank - 1]) \ + KOKKOS_IMPL_LOOP_L_1_REDUX(val, func, type, m_offset, extent, rank - 2, \ + i1 + m_offset[rank - 1]) \ } \ } else { \ for (type i1 = (type)0; i1 < static_cast(extent[0]); ++i1) { \ - LOOP_R_1_REDUX(val, func, type, m_offset, extent, 1, i1 + m_offset[0]) \ + KOKKOS_IMPL_LOOP_R_1_REDUX(val, func, type, m_offset, extent, 1, \ + i1 + m_offset[0]) \ } \ } -#define LOOP_LAYOUT_3_REDUX(val, func, type, is_left, m_offset, extent, rank) \ +#define KOKKOS_IMPL_LOOP_LAYOUT_3_REDUX(val, func, type, is_left, m_offset, \ + extent, rank) \ if (is_left) { \ for (type i2 = (type)0; i2 < static_cast(extent[rank - 1]); ++i2) { \ - LOOP_L_2_REDUX(val, func, type, m_offset, extent, rank - 2, \ - i2 + m_offset[rank - 1]) \ + KOKKOS_IMPL_LOOP_L_2_REDUX(val, func, type, m_offset, extent, rank - 2, \ + i2 + m_offset[rank - 1]) \ } \ } else { \ for (type i2 = (type)0; i2 < static_cast(extent[0]); ++i2) { \ - LOOP_R_2_REDUX(val, func, type, m_offset, extent, 1, i2 + m_offset[0]) \ 
+ KOKKOS_IMPL_LOOP_R_2_REDUX(val, func, type, m_offset, extent, 1, \ + i2 + m_offset[0]) \ } \ } -#define LOOP_LAYOUT_4_REDUX(val, func, type, is_left, m_offset, extent, rank) \ +#define KOKKOS_IMPL_LOOP_LAYOUT_4_REDUX(val, func, type, is_left, m_offset, \ + extent, rank) \ if (is_left) { \ for (type i3 = (type)0; i3 < static_cast(extent[rank - 1]); ++i3) { \ - LOOP_L_3_REDUX(val, func, type, m_offset, extent, rank - 2, \ - i3 + m_offset[rank - 1]) \ + KOKKOS_IMPL_LOOP_L_3_REDUX(val, func, type, m_offset, extent, rank - 2, \ + i3 + m_offset[rank - 1]) \ } \ } else { \ for (type i3 = (type)0; i3 < static_cast(extent[0]); ++i3) { \ - LOOP_R_3_REDUX(val, func, type, m_offset, extent, 1, i3 + m_offset[0]) \ + KOKKOS_IMPL_LOOP_R_3_REDUX(val, func, type, m_offset, extent, 1, \ + i3 + m_offset[0]) \ } \ } -#define LOOP_LAYOUT_5_REDUX(val, func, type, is_left, m_offset, extent, rank) \ +#define KOKKOS_IMPL_LOOP_LAYOUT_5_REDUX(val, func, type, is_left, m_offset, \ + extent, rank) \ if (is_left) { \ for (type i4 = (type)0; i4 < static_cast(extent[rank - 1]); ++i4) { \ - LOOP_L_4_REDUX(val, func, type, m_offset, extent, rank - 2, \ - i4 + m_offset[rank - 1]) \ + KOKKOS_IMPL_LOOP_L_4_REDUX(val, func, type, m_offset, extent, rank - 2, \ + i4 + m_offset[rank - 1]) \ } \ } else { \ for (type i4 = (type)0; i4 < static_cast(extent[0]); ++i4) { \ - LOOP_R_4_REDUX(val, func, type, m_offset, extent, 1, i4 + m_offset[0]) \ + KOKKOS_IMPL_LOOP_R_4_REDUX(val, func, type, m_offset, extent, 1, \ + i4 + m_offset[0]) \ } \ } -#define LOOP_LAYOUT_6_REDUX(val, func, type, is_left, m_offset, extent, rank) \ +#define KOKKOS_IMPL_LOOP_LAYOUT_6_REDUX(val, func, type, is_left, m_offset, \ + extent, rank) \ if (is_left) { \ for (type i5 = (type)0; i5 < static_cast(extent[rank - 1]); ++i5) { \ - LOOP_L_5_REDUX(val, func, type, m_offset, extent, rank - 2, \ - i5 + m_offset[rank - 1]) \ + KOKKOS_IMPL_LOOP_L_5_REDUX(val, func, type, m_offset, extent, rank - 2, \ + i5 + m_offset[rank - 1]) \ } \ } else { 
\ for (type i5 = (type)0; i5 < static_cast(extent[0]); ++i5) { \ - LOOP_R_5_REDUX(val, func, type, m_offset, extent, 1, i5 + m_offset[0]) \ + KOKKOS_IMPL_LOOP_R_5_REDUX(val, func, type, m_offset, extent, 1, \ + i5 + m_offset[0]) \ } \ } -#define LOOP_LAYOUT_7_REDUX(val, func, type, is_left, m_offset, extent, rank) \ +#define KOKKOS_IMPL_LOOP_LAYOUT_7_REDUX(val, func, type, is_left, m_offset, \ + extent, rank) \ if (is_left) { \ for (type i6 = (type)0; i6 < static_cast(extent[rank - 1]); ++i6) { \ - LOOP_L_6_REDUX(val, func, type, m_offset, extent, rank - 2, \ - i6 + m_offset[rank - 1]) \ + KOKKOS_IMPL_LOOP_L_6_REDUX(val, func, type, m_offset, extent, rank - 2, \ + i6 + m_offset[rank - 1]) \ } \ } else { \ for (type i6 = (type)0; i6 < static_cast(extent[0]); ++i6) { \ - LOOP_R_6_REDUX(val, func, type, m_offset, extent, 1, i6 + m_offset[0]) \ + KOKKOS_IMPL_LOOP_R_6_REDUX(val, func, type, m_offset, extent, 1, \ + i6 + m_offset[0]) \ } \ } -#define LOOP_LAYOUT_8_REDUX(val, func, type, is_left, m_offset, extent, rank) \ +#define KOKKOS_IMPL_LOOP_LAYOUT_8_REDUX(val, func, type, is_left, m_offset, \ + extent, rank) \ if (is_left) { \ for (type i7 = (type)0; i7 < static_cast(extent[rank - 1]); ++i7) { \ - LOOP_L_7_REDUX(val, func, type, m_offset, extent, rank - 2, \ - i7 + m_offset[rank - 1]) \ + KOKKOS_IMPL_LOOP_L_7_REDUX(val, func, type, m_offset, extent, rank - 2, \ + i7 + m_offset[rank - 1]) \ } \ } else { \ for (type i7 = (type)0; i7 < static_cast(extent[0]); ++i7) { \ - LOOP_R_7_REDUX(val, func, type, m_offset, extent, 1, i7 + m_offset[0]) \ + KOKKOS_IMPL_LOOP_R_7_REDUX(val, func, type, m_offset, extent, 1, \ + i7 + m_offset[0]) \ } \ } // Partial vs Full Tile -#define TILE_LOOP_1_REDUX(val, func, type, is_left, cond, m_offset, \ - extent_full, extent_partial, rank) \ - if (cond) { \ - LOOP_LAYOUT_1_REDUX(val, func, type, is_left, m_offset, extent_full, rank) \ - } else { \ - LOOP_LAYOUT_1_REDUX(val, func, type, is_left, m_offset, extent_partial, \ - rank) \ +#define 
KOKKOS_IMPL_TILE_LOOP_1_REDUX(val, func, type, is_left, cond, \ + m_offset, extent_full, extent_partial, \ + rank) \ + if (cond) { \ + KOKKOS_IMPL_LOOP_LAYOUT_1_REDUX(val, func, type, is_left, m_offset, \ + extent_full, rank) \ + } else { \ + KOKKOS_IMPL_LOOP_LAYOUT_1_REDUX(val, func, type, is_left, m_offset, \ + extent_partial, rank) \ } -#define TILE_LOOP_2_REDUX(val, func, type, is_left, cond, m_offset, \ - extent_full, extent_partial, rank) \ - if (cond) { \ - LOOP_LAYOUT_2_REDUX(val, func, type, is_left, m_offset, extent_full, rank) \ - } else { \ - LOOP_LAYOUT_2_REDUX(val, func, type, is_left, m_offset, extent_partial, \ - rank) \ +#define KOKKOS_IMPL_TILE_LOOP_2_REDUX(val, func, type, is_left, cond, \ + m_offset, extent_full, extent_partial, \ + rank) \ + if (cond) { \ + KOKKOS_IMPL_LOOP_LAYOUT_2_REDUX(val, func, type, is_left, m_offset, \ + extent_full, rank) \ + } else { \ + KOKKOS_IMPL_LOOP_LAYOUT_2_REDUX(val, func, type, is_left, m_offset, \ + extent_partial, rank) \ } -#define TILE_LOOP_3_REDUX(val, func, type, is_left, cond, m_offset, \ - extent_full, extent_partial, rank) \ - if (cond) { \ - LOOP_LAYOUT_3_REDUX(val, func, type, is_left, m_offset, extent_full, rank) \ - } else { \ - LOOP_LAYOUT_3_REDUX(val, func, type, is_left, m_offset, extent_partial, \ - rank) \ +#define KOKKOS_IMPL_TILE_LOOP_3_REDUX(val, func, type, is_left, cond, \ + m_offset, extent_full, extent_partial, \ + rank) \ + if (cond) { \ + KOKKOS_IMPL_LOOP_LAYOUT_3_REDUX(val, func, type, is_left, m_offset, \ + extent_full, rank) \ + } else { \ + KOKKOS_IMPL_LOOP_LAYOUT_3_REDUX(val, func, type, is_left, m_offset, \ + extent_partial, rank) \ } -#define TILE_LOOP_4_REDUX(val, func, type, is_left, cond, m_offset, \ - extent_full, extent_partial, rank) \ - if (cond) { \ - LOOP_LAYOUT_4_REDUX(val, func, type, is_left, m_offset, extent_full, rank) \ - } else { \ - LOOP_LAYOUT_4_REDUX(val, func, type, is_left, m_offset, extent_partial, \ - rank) \ +#define KOKKOS_IMPL_TILE_LOOP_4_REDUX(val, 
func, type, is_left, cond, \ + m_offset, extent_full, extent_partial, \ + rank) \ + if (cond) { \ + KOKKOS_IMPL_LOOP_LAYOUT_4_REDUX(val, func, type, is_left, m_offset, \ + extent_full, rank) \ + } else { \ + KOKKOS_IMPL_LOOP_LAYOUT_4_REDUX(val, func, type, is_left, m_offset, \ + extent_partial, rank) \ } -#define TILE_LOOP_5_REDUX(val, func, type, is_left, cond, m_offset, \ - extent_full, extent_partial, rank) \ - if (cond) { \ - LOOP_LAYOUT_5_REDUX(val, func, type, is_left, m_offset, extent_full, rank) \ - } else { \ - LOOP_LAYOUT_5_REDUX(val, func, type, is_left, m_offset, extent_partial, \ - rank) \ +#define KOKKOS_IMPL_TILE_LOOP_5_REDUX(val, func, type, is_left, cond, \ + m_offset, extent_full, extent_partial, \ + rank) \ + if (cond) { \ + KOKKOS_IMPL_LOOP_LAYOUT_5_REDUX(val, func, type, is_left, m_offset, \ + extent_full, rank) \ + } else { \ + KOKKOS_IMPL_LOOP_LAYOUT_5_REDUX(val, func, type, is_left, m_offset, \ + extent_partial, rank) \ } -#define TILE_LOOP_6_REDUX(val, func, type, is_left, cond, m_offset, \ - extent_full, extent_partial, rank) \ - if (cond) { \ - LOOP_LAYOUT_6_REDUX(val, func, type, is_left, m_offset, extent_full, rank) \ - } else { \ - LOOP_LAYOUT_6_REDUX(val, func, type, is_left, m_offset, extent_partial, \ - rank) \ +#define KOKKOS_IMPL_TILE_LOOP_6_REDUX(val, func, type, is_left, cond, \ + m_offset, extent_full, extent_partial, \ + rank) \ + if (cond) { \ + KOKKOS_IMPL_LOOP_LAYOUT_6_REDUX(val, func, type, is_left, m_offset, \ + extent_full, rank) \ + } else { \ + KOKKOS_IMPL_LOOP_LAYOUT_6_REDUX(val, func, type, is_left, m_offset, \ + extent_partial, rank) \ } -#define TILE_LOOP_7_REDUX(val, func, type, is_left, cond, m_offset, \ - extent_full, extent_partial, rank) \ - if (cond) { \ - LOOP_LAYOUT_7_REDUX(val, func, type, is_left, m_offset, extent_full, rank) \ - } else { \ - LOOP_LAYOUT_7_REDUX(val, func, type, is_left, m_offset, extent_partial, \ - rank) \ +#define KOKKOS_IMPL_TILE_LOOP_7_REDUX(val, func, type, is_left, cond, \ + 
m_offset, extent_full, extent_partial, \ + rank) \ + if (cond) { \ + KOKKOS_IMPL_LOOP_LAYOUT_7_REDUX(val, func, type, is_left, m_offset, \ + extent_full, rank) \ + } else { \ + KOKKOS_IMPL_LOOP_LAYOUT_7_REDUX(val, func, type, is_left, m_offset, \ + extent_partial, rank) \ } -#define TILE_LOOP_8_REDUX(val, func, type, is_left, cond, m_offset, \ - extent_full, extent_partial, rank) \ - if (cond) { \ - LOOP_LAYOUT_8_REDUX(val, func, type, is_left, m_offset, extent_full, rank) \ - } else { \ - LOOP_LAYOUT_8_REDUX(val, func, type, is_left, m_offset, extent_partial, \ - rank) \ +#define KOKKOS_IMPL_TILE_LOOP_8_REDUX(val, func, type, is_left, cond, \ + m_offset, extent_full, extent_partial, \ + rank) \ + if (cond) { \ + KOKKOS_IMPL_LOOP_LAYOUT_8_REDUX(val, func, type, is_left, m_offset, \ + extent_full, rank) \ + } else { \ + KOKKOS_IMPL_LOOP_LAYOUT_8_REDUX(val, func, type, is_left, m_offset, \ + extent_partial, rank) \ } // end New Loop Macros // tagged macros -#define TAGGED_APPLY(tag, func, ...) func(tag, __VA_ARGS__); +#define KOKKOS_IMPL_TAGGED_APPLY(tag, func, ...) func(tag, __VA_ARGS__); // LayoutRight // d = 0 to start -#define TAGGED_LOOP_R_1(tag, func, type, m_offset, extent, d, ...) \ - KOKKOS_ENABLE_IVDEP_MDRANGE \ - for (type i0 = (type)0; i0 < static_cast(extent[d]); ++i0) { \ - TAGGED_APPLY(tag, func, __VA_ARGS__, i0 + m_offset[d]) \ +#define KOKKOS_IMPL_TAGGED_LOOP_R_1(tag, func, type, m_offset, extent, d, ...) \ + KOKKOS_ENABLE_IVDEP_MDRANGE \ + for (type i0 = (type)0; i0 < static_cast(extent[d]); ++i0) { \ + KOKKOS_IMPL_TAGGED_APPLY(tag, func, __VA_ARGS__, i0 + m_offset[d]) \ } -#define TAGGED_LOOP_R_2(tag, func, type, m_offset, extent, d, ...) \ - for (type i1 = (type)0; i1 < static_cast(extent[d]); ++i1) { \ - TAGGED_LOOP_R_1(tag, func, type, m_offset, extent, d + 1, __VA_ARGS__, \ - i1 + m_offset[d]) \ +#define KOKKOS_IMPL_TAGGED_LOOP_R_2(tag, func, type, m_offset, extent, d, ...) 
\ + for (type i1 = (type)0; i1 < static_cast(extent[d]); ++i1) { \ + KOKKOS_IMPL_TAGGED_LOOP_R_1(tag, func, type, m_offset, extent, d + 1, \ + __VA_ARGS__, i1 + m_offset[d]) \ } -#define TAGGED_LOOP_R_3(tag, func, type, m_offset, extent, d, ...) \ - for (type i2 = (type)0; i2 < static_cast(extent[d]); ++i2) { \ - TAGGED_LOOP_R_2(tag, func, type, m_offset, extent, d + 1, __VA_ARGS__, \ - i2 + m_offset[d]) \ +#define KOKKOS_IMPL_TAGGED_LOOP_R_3(tag, func, type, m_offset, extent, d, ...) \ + for (type i2 = (type)0; i2 < static_cast(extent[d]); ++i2) { \ + KOKKOS_IMPL_TAGGED_LOOP_R_2(tag, func, type, m_offset, extent, d + 1, \ + __VA_ARGS__, i2 + m_offset[d]) \ } -#define TAGGED_LOOP_R_4(tag, func, type, m_offset, extent, d, ...) \ - for (type i3 = (type)0; i3 < static_cast(extent[d]); ++i3) { \ - TAGGED_LOOP_R_3(tag, func, type, m_offset, extent, d + 1, __VA_ARGS__, \ - i3 + m_offset[d]) \ +#define KOKKOS_IMPL_TAGGED_LOOP_R_4(tag, func, type, m_offset, extent, d, ...) \ + for (type i3 = (type)0; i3 < static_cast(extent[d]); ++i3) { \ + KOKKOS_IMPL_TAGGED_LOOP_R_3(tag, func, type, m_offset, extent, d + 1, \ + __VA_ARGS__, i3 + m_offset[d]) \ } -#define TAGGED_LOOP_R_5(tag, func, type, m_offset, extent, d, ...) \ - for (type i4 = (type)0; i4 < static_cast(extent[d]); ++i4) { \ - TAGGED_LOOP_R_4(tag, func, type, m_offset, extent, d + 1, __VA_ARGS__, \ - i4 + m_offset[d]) \ +#define KOKKOS_IMPL_TAGGED_LOOP_R_5(tag, func, type, m_offset, extent, d, ...) \ + for (type i4 = (type)0; i4 < static_cast(extent[d]); ++i4) { \ + KOKKOS_IMPL_TAGGED_LOOP_R_4(tag, func, type, m_offset, extent, d + 1, \ + __VA_ARGS__, i4 + m_offset[d]) \ } -#define TAGGED_LOOP_R_6(tag, func, type, m_offset, extent, d, ...) \ - for (type i5 = (type)0; i5 < static_cast(extent[d]); ++i5) { \ - TAGGED_LOOP_R_5(tag, func, type, m_offset, extent, d + 1, __VA_ARGS__, \ - i5 + m_offset[d]) \ +#define KOKKOS_IMPL_TAGGED_LOOP_R_6(tag, func, type, m_offset, extent, d, ...) 
\ + for (type i5 = (type)0; i5 < static_cast(extent[d]); ++i5) { \ + KOKKOS_IMPL_TAGGED_LOOP_R_5(tag, func, type, m_offset, extent, d + 1, \ + __VA_ARGS__, i5 + m_offset[d]) \ } -#define TAGGED_LOOP_R_7(tag, func, type, m_offset, extent, d, ...) \ - for (type i6 = (type)0; i6 < static_cast(extent[d]); ++i6) { \ - TAGGED_LOOP_R_6(tag, func, type, m_offset, extent, d + 1, __VA_ARGS__, \ - i6 + m_offset[d]) \ +#define KOKKOS_IMPL_TAGGED_LOOP_R_7(tag, func, type, m_offset, extent, d, ...) \ + for (type i6 = (type)0; i6 < static_cast(extent[d]); ++i6) { \ + KOKKOS_IMPL_TAGGED_LOOP_R_6(tag, func, type, m_offset, extent, d + 1, \ + __VA_ARGS__, i6 + m_offset[d]) \ } -#define TAGGED_LOOP_R_8(tag, func, type, m_offset, extent, d, ...) \ - for (type i7 = (type)0; i7 < static_cast(extent[d]); ++i7) { \ - TAGGED_LOOP_R_7(tag, func, type, m_offset, extent, d + 1, __VA_ARGS__, \ - i7 + m_offset[d]) \ +#define KOKKOS_IMPL_TAGGED_LOOP_R_8(tag, func, type, m_offset, extent, d, ...) \ + for (type i7 = (type)0; i7 < static_cast(extent[d]); ++i7) { \ + KOKKOS_IMPL_TAGGED_LOOP_R_7(tag, func, type, m_offset, extent, d + 1, \ + __VA_ARGS__, i7 + m_offset[d]) \ } // LayoutLeft // d = rank-1 to start -#define TAGGED_LOOP_L_1(tag, func, type, m_offset, extent, d, ...) \ - KOKKOS_ENABLE_IVDEP_MDRANGE \ - for (type i0 = (type)0; i0 < static_cast(extent[d]); ++i0) { \ - TAGGED_APPLY(tag, func, i0 + m_offset[d], __VA_ARGS__) \ +#define KOKKOS_IMPL_TAGGED_LOOP_L_1(tag, func, type, m_offset, extent, d, ...) \ + KOKKOS_ENABLE_IVDEP_MDRANGE \ + for (type i0 = (type)0; i0 < static_cast(extent[d]); ++i0) { \ + KOKKOS_IMPL_TAGGED_APPLY(tag, func, i0 + m_offset[d], __VA_ARGS__) \ } -#define TAGGED_LOOP_L_2(tag, func, type, m_offset, extent, d, ...) \ - for (type i1 = (type)0; i1 < static_cast(extent[d]); ++i1) { \ - TAGGED_LOOP_L_1(tag, func, type, m_offset, extent, d - 1, \ - i1 + m_offset[d], __VA_ARGS__) \ +#define KOKKOS_IMPL_TAGGED_LOOP_L_2(tag, func, type, m_offset, extent, d, ...) 
\ + for (type i1 = (type)0; i1 < static_cast(extent[d]); ++i1) { \ + KOKKOS_IMPL_TAGGED_LOOP_L_1(tag, func, type, m_offset, extent, d - 1, \ + i1 + m_offset[d], __VA_ARGS__) \ } -#define TAGGED_LOOP_L_3(tag, func, type, m_offset, extent, d, ...) \ - for (type i2 = (type)0; i2 < static_cast(extent[d]); ++i2) { \ - TAGGED_LOOP_L_2(tag, func, type, m_offset, extent, d - 1, \ - i2 + m_offset[d], __VA_ARGS__) \ +#define KOKKOS_IMPL_TAGGED_LOOP_L_3(tag, func, type, m_offset, extent, d, ...) \ + for (type i2 = (type)0; i2 < static_cast(extent[d]); ++i2) { \ + KOKKOS_IMPL_TAGGED_LOOP_L_2(tag, func, type, m_offset, extent, d - 1, \ + i2 + m_offset[d], __VA_ARGS__) \ } -#define TAGGED_LOOP_L_4(tag, func, type, m_offset, extent, d, ...) \ - for (type i3 = (type)0; i3 < static_cast(extent[d]); ++i3) { \ - TAGGED_LOOP_L_3(tag, func, type, m_offset, extent, d - 1, \ - i3 + m_offset[d], __VA_ARGS__) \ +#define KOKKOS_IMPL_TAGGED_LOOP_L_4(tag, func, type, m_offset, extent, d, ...) \ + for (type i3 = (type)0; i3 < static_cast(extent[d]); ++i3) { \ + KOKKOS_IMPL_TAGGED_LOOP_L_3(tag, func, type, m_offset, extent, d - 1, \ + i3 + m_offset[d], __VA_ARGS__) \ } -#define TAGGED_LOOP_L_5(tag, func, type, m_offset, extent, d, ...) \ - for (type i4 = (type)0; i4 < static_cast(extent[d]); ++i4) { \ - TAGGED_LOOP_L_4(tag, func, type, m_offset, extent, d - 1, \ - i4 + m_offset[d], __VA_ARGS__) \ +#define KOKKOS_IMPL_TAGGED_LOOP_L_5(tag, func, type, m_offset, extent, d, ...) \ + for (type i4 = (type)0; i4 < static_cast(extent[d]); ++i4) { \ + KOKKOS_IMPL_TAGGED_LOOP_L_4(tag, func, type, m_offset, extent, d - 1, \ + i4 + m_offset[d], __VA_ARGS__) \ } -#define TAGGED_LOOP_L_6(tag, func, type, m_offset, extent, d, ...) \ - for (type i5 = (type)0; i5 < static_cast(extent[d]); ++i5) { \ - TAGGED_LOOP_L_5(tag, func, type, m_offset, extent, d - 1, \ - i5 + m_offset[d], __VA_ARGS__) \ +#define KOKKOS_IMPL_TAGGED_LOOP_L_6(tag, func, type, m_offset, extent, d, ...) 
\ + for (type i5 = (type)0; i5 < static_cast(extent[d]); ++i5) { \ + KOKKOS_IMPL_TAGGED_LOOP_L_5(tag, func, type, m_offset, extent, d - 1, \ + i5 + m_offset[d], __VA_ARGS__) \ } -#define TAGGED_LOOP_L_7(tag, func, type, m_offset, extent, d, ...) \ - for (type i6 = (type)0; i6 < static_cast(extent[d]); ++i6) { \ - TAGGED_LOOP_L_6(tag, func, type, m_offset, extent, d - 1, \ - i6 + m_offset[d], __VA_ARGS__) \ +#define KOKKOS_IMPL_TAGGED_LOOP_L_7(tag, func, type, m_offset, extent, d, ...) \ + for (type i6 = (type)0; i6 < static_cast(extent[d]); ++i6) { \ + KOKKOS_IMPL_TAGGED_LOOP_L_6(tag, func, type, m_offset, extent, d - 1, \ + i6 + m_offset[d], __VA_ARGS__) \ } -#define TAGGED_LOOP_L_8(tag, func, type, m_offset, extent, d, ...) \ - for (type i7 = (type)0; i7 < static_cast(extent[d]); ++i7) { \ - TAGGED_LOOP_L_7(tag, func, type, m_offset, extent, d - 1, \ - i7 + m_offset[d], __VA_ARGS__) \ +#define KOKKOS_IMPL_TAGGED_LOOP_L_8(tag, func, type, m_offset, extent, d, ...) \ + for (type i7 = (type)0; i7 < static_cast(extent[d]); ++i7) { \ + KOKKOS_IMPL_TAGGED_LOOP_L_7(tag, func, type, m_offset, extent, d - 1, \ + i7 + m_offset[d], __VA_ARGS__) \ } // Left vs Right // TODO: rank not necessary to pass through, can hardcode the values -#define TAGGED_LOOP_LAYOUT_1(tag, func, type, is_left, m_offset, extent, rank) \ - KOKKOS_ENABLE_IVDEP_MDRANGE \ - for (type i0 = (type)0; i0 < static_cast(extent[0]); ++i0) { \ - TAGGED_APPLY(tag, func, i0 + m_offset[0]) \ +#define KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_1(tag, func, type, is_left, m_offset, \ + extent, rank) \ + KOKKOS_ENABLE_IVDEP_MDRANGE \ + for (type i0 = (type)0; i0 < static_cast(extent[0]); ++i0) { \ + KOKKOS_IMPL_TAGGED_APPLY(tag, func, i0 + m_offset[0]) \ } -#define TAGGED_LOOP_LAYOUT_2(tag, func, type, is_left, m_offset, extent, rank) \ +#define KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_2(tag, func, type, is_left, m_offset, \ + extent, rank) \ if (is_left) { \ for (type i1 = (type)0; i1 < static_cast(extent[rank - 1]); ++i1) { \ - 
TAGGED_LOOP_L_1(tag, func, type, m_offset, extent, rank - 2, \ - i1 + m_offset[rank - 1]) \ + KOKKOS_IMPL_TAGGED_LOOP_L_1(tag, func, type, m_offset, extent, rank - 2, \ + i1 + m_offset[rank - 1]) \ } \ } else { \ for (type i1 = (type)0; i1 < static_cast(extent[0]); ++i1) { \ - TAGGED_LOOP_R_1(tag, func, type, m_offset, extent, 1, i1 + m_offset[0]) \ + KOKKOS_IMPL_TAGGED_LOOP_R_1(tag, func, type, m_offset, extent, 1, \ + i1 + m_offset[0]) \ } \ } -#define TAGGED_LOOP_LAYOUT_3(tag, func, type, is_left, m_offset, extent, rank) \ +#define KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_3(tag, func, type, is_left, m_offset, \ + extent, rank) \ if (is_left) { \ for (type i2 = (type)0; i2 < static_cast(extent[rank - 1]); ++i2) { \ - TAGGED_LOOP_L_2(tag, func, type, m_offset, extent, rank - 2, \ - i2 + m_offset[rank - 1]) \ + KOKKOS_IMPL_TAGGED_LOOP_L_2(tag, func, type, m_offset, extent, rank - 2, \ + i2 + m_offset[rank - 1]) \ } \ } else { \ for (type i2 = (type)0; i2 < static_cast(extent[0]); ++i2) { \ - TAGGED_LOOP_R_2(tag, func, type, m_offset, extent, 1, i2 + m_offset[0]) \ + KOKKOS_IMPL_TAGGED_LOOP_R_2(tag, func, type, m_offset, extent, 1, \ + i2 + m_offset[0]) \ } \ } -#define TAGGED_LOOP_LAYOUT_4(tag, func, type, is_left, m_offset, extent, rank) \ +#define KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_4(tag, func, type, is_left, m_offset, \ + extent, rank) \ if (is_left) { \ for (type i3 = (type)0; i3 < static_cast(extent[rank - 1]); ++i3) { \ - TAGGED_LOOP_L_3(tag, func, type, m_offset, extent, rank - 2, \ - i3 + m_offset[rank - 1]) \ + KOKKOS_IMPL_TAGGED_LOOP_L_3(tag, func, type, m_offset, extent, rank - 2, \ + i3 + m_offset[rank - 1]) \ } \ } else { \ for (type i3 = (type)0; i3 < static_cast(extent[0]); ++i3) { \ - TAGGED_LOOP_R_3(tag, func, type, m_offset, extent, 1, i3 + m_offset[0]) \ + KOKKOS_IMPL_TAGGED_LOOP_R_3(tag, func, type, m_offset, extent, 1, \ + i3 + m_offset[0]) \ } \ } -#define TAGGED_LOOP_LAYOUT_5(tag, func, type, is_left, m_offset, extent, rank) \ +#define 
KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_5(tag, func, type, is_left, m_offset, \ + extent, rank) \ if (is_left) { \ for (type i4 = (type)0; i4 < static_cast(extent[rank - 1]); ++i4) { \ - TAGGED_LOOP_L_4(tag, func, type, m_offset, extent, rank - 2, \ - i4 + m_offset[rank - 1]) \ + KOKKOS_IMPL_TAGGED_LOOP_L_4(tag, func, type, m_offset, extent, rank - 2, \ + i4 + m_offset[rank - 1]) \ } \ } else { \ for (type i4 = (type)0; i4 < static_cast(extent[0]); ++i4) { \ - TAGGED_LOOP_R_4(tag, func, type, m_offset, extent, 1, i4 + m_offset[0]) \ + KOKKOS_IMPL_TAGGED_LOOP_R_4(tag, func, type, m_offset, extent, 1, \ + i4 + m_offset[0]) \ } \ } -#define TAGGED_LOOP_LAYOUT_6(tag, func, type, is_left, m_offset, extent, rank) \ +#define KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_6(tag, func, type, is_left, m_offset, \ + extent, rank) \ if (is_left) { \ for (type i5 = (type)0; i5 < static_cast(extent[rank - 1]); ++i5) { \ - TAGGED_LOOP_L_5(tag, func, type, m_offset, extent, rank - 2, \ - i5 + m_offset[rank - 1]) \ + KOKKOS_IMPL_TAGGED_LOOP_L_5(tag, func, type, m_offset, extent, rank - 2, \ + i5 + m_offset[rank - 1]) \ } \ } else { \ for (type i5 = (type)0; i5 < static_cast(extent[0]); ++i5) { \ - TAGGED_LOOP_R_5(tag, func, type, m_offset, extent, 1, i5 + m_offset[0]) \ + KOKKOS_IMPL_TAGGED_LOOP_R_5(tag, func, type, m_offset, extent, 1, \ + i5 + m_offset[0]) \ } \ } -#define TAGGED_LOOP_LAYOUT_7(tag, func, type, is_left, m_offset, extent, rank) \ +#define KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_7(tag, func, type, is_left, m_offset, \ + extent, rank) \ if (is_left) { \ for (type i6 = (type)0; i6 < static_cast(extent[rank - 1]); ++i6) { \ - TAGGED_LOOP_L_6(tag, func, type, m_offset, extent, rank - 2, \ - i6 + m_offset[rank - 1]) \ + KOKKOS_IMPL_TAGGED_LOOP_L_6(tag, func, type, m_offset, extent, rank - 2, \ + i6 + m_offset[rank - 1]) \ } \ } else { \ for (type i6 = (type)0; i6 < static_cast(extent[0]); ++i6) { \ - TAGGED_LOOP_R_6(tag, func, type, m_offset, extent, 1, i6 + m_offset[0]) \ + 
KOKKOS_IMPL_TAGGED_LOOP_R_6(tag, func, type, m_offset, extent, 1, \ + i6 + m_offset[0]) \ } \ } -#define TAGGED_LOOP_LAYOUT_8(tag, func, type, is_left, m_offset, extent, rank) \ +#define KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_8(tag, func, type, is_left, m_offset, \ + extent, rank) \ if (is_left) { \ for (type i7 = (type)0; i7 < static_cast(extent[rank - 1]); ++i7) { \ - TAGGED_LOOP_L_7(tag, func, type, m_offset, extent, rank - 2, \ - i7 + m_offset[rank - 1]) \ + KOKKOS_IMPL_TAGGED_LOOP_L_7(tag, func, type, m_offset, extent, rank - 2, \ + i7 + m_offset[rank - 1]) \ } \ } else { \ for (type i7 = (type)0; i7 < static_cast(extent[0]); ++i7) { \ - TAGGED_LOOP_R_7(tag, func, type, m_offset, extent, 1, i7 + m_offset[0]) \ + KOKKOS_IMPL_TAGGED_LOOP_R_7(tag, func, type, m_offset, extent, 1, \ + i7 + m_offset[0]) \ } \ } // Partial vs Full Tile -#define TAGGED_TILE_LOOP_1(tag, func, type, is_left, cond, m_offset, \ - extent_full, extent_partial, rank) \ - if (cond) { \ - TAGGED_LOOP_LAYOUT_1(tag, func, type, is_left, m_offset, extent_full, \ - rank) \ - } else { \ - TAGGED_LOOP_LAYOUT_1(tag, func, type, is_left, m_offset, extent_partial, \ - rank) \ +#define KOKKOS_IMPL_TAGGED_TILE_LOOP_1(tag, func, type, is_left, cond, \ + m_offset, extent_full, extent_partial, \ + rank) \ + if (cond) { \ + KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_1(tag, func, type, is_left, m_offset, \ + extent_full, rank) \ + } else { \ + KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_1(tag, func, type, is_left, m_offset, \ + extent_partial, rank) \ } -#define TAGGED_TILE_LOOP_2(tag, func, type, is_left, cond, m_offset, \ - extent_full, extent_partial, rank) \ - if (cond) { \ - TAGGED_LOOP_LAYOUT_2(tag, func, type, is_left, m_offset, extent_full, \ - rank) \ - } else { \ - TAGGED_LOOP_LAYOUT_2(tag, func, type, is_left, m_offset, extent_partial, \ - rank) \ +#define KOKKOS_IMPL_TAGGED_TILE_LOOP_2(tag, func, type, is_left, cond, \ + m_offset, extent_full, extent_partial, \ + rank) \ + if (cond) { \ + KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_2(tag, 
func, type, is_left, m_offset, \ + extent_full, rank) \ + } else { \ + KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_2(tag, func, type, is_left, m_offset, \ + extent_partial, rank) \ } -#define TAGGED_TILE_LOOP_3(tag, func, type, is_left, cond, m_offset, \ - extent_full, extent_partial, rank) \ - if (cond) { \ - TAGGED_LOOP_LAYOUT_3(tag, func, type, is_left, m_offset, extent_full, \ - rank) \ - } else { \ - TAGGED_LOOP_LAYOUT_3(tag, func, type, is_left, m_offset, extent_partial, \ - rank) \ +#define KOKKOS_IMPL_TAGGED_TILE_LOOP_3(tag, func, type, is_left, cond, \ + m_offset, extent_full, extent_partial, \ + rank) \ + if (cond) { \ + KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_3(tag, func, type, is_left, m_offset, \ + extent_full, rank) \ + } else { \ + KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_3(tag, func, type, is_left, m_offset, \ + extent_partial, rank) \ } -#define TAGGED_TILE_LOOP_4(tag, func, type, is_left, cond, m_offset, \ - extent_full, extent_partial, rank) \ - if (cond) { \ - TAGGED_LOOP_LAYOUT_4(tag, func, type, is_left, m_offset, extent_full, \ - rank) \ - } else { \ - TAGGED_LOOP_LAYOUT_4(tag, func, type, is_left, m_offset, extent_partial, \ - rank) \ +#define KOKKOS_IMPL_TAGGED_TILE_LOOP_4(tag, func, type, is_left, cond, \ + m_offset, extent_full, extent_partial, \ + rank) \ + if (cond) { \ + KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_4(tag, func, type, is_left, m_offset, \ + extent_full, rank) \ + } else { \ + KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_4(tag, func, type, is_left, m_offset, \ + extent_partial, rank) \ } -#define TAGGED_TILE_LOOP_5(tag, func, type, is_left, cond, m_offset, \ - extent_full, extent_partial, rank) \ - if (cond) { \ - TAGGED_LOOP_LAYOUT_5(tag, func, type, is_left, m_offset, extent_full, \ - rank) \ - } else { \ - TAGGED_LOOP_LAYOUT_5(tag, func, type, is_left, m_offset, extent_partial, \ - rank) \ +#define KOKKOS_IMPL_TAGGED_TILE_LOOP_5(tag, func, type, is_left, cond, \ + m_offset, extent_full, extent_partial, \ + rank) \ + if (cond) { \ + KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_5(tag, func, 
type, is_left, m_offset, \ + extent_full, rank) \ + } else { \ + KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_5(tag, func, type, is_left, m_offset, \ + extent_partial, rank) \ } -#define TAGGED_TILE_LOOP_6(tag, func, type, is_left, cond, m_offset, \ - extent_full, extent_partial, rank) \ - if (cond) { \ - TAGGED_LOOP_LAYOUT_6(tag, func, type, is_left, m_offset, extent_full, \ - rank) \ - } else { \ - TAGGED_LOOP_LAYOUT_6(tag, func, type, is_left, m_offset, extent_partial, \ - rank) \ +#define KOKKOS_IMPL_TAGGED_TILE_LOOP_6(tag, func, type, is_left, cond, \ + m_offset, extent_full, extent_partial, \ + rank) \ + if (cond) { \ + KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_6(tag, func, type, is_left, m_offset, \ + extent_full, rank) \ + } else { \ + KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_6(tag, func, type, is_left, m_offset, \ + extent_partial, rank) \ } -#define TAGGED_TILE_LOOP_7(tag, func, type, is_left, cond, m_offset, \ - extent_full, extent_partial, rank) \ - if (cond) { \ - TAGGED_LOOP_LAYOUT_7(tag, func, type, is_left, m_offset, extent_full, \ - rank) \ - } else { \ - TAGGED_LOOP_LAYOUT_7(tag, func, type, is_left, m_offset, extent_partial, \ - rank) \ +#define KOKKOS_IMPL_TAGGED_TILE_LOOP_7(tag, func, type, is_left, cond, \ + m_offset, extent_full, extent_partial, \ + rank) \ + if (cond) { \ + KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_7(tag, func, type, is_left, m_offset, \ + extent_full, rank) \ + } else { \ + KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_7(tag, func, type, is_left, m_offset, \ + extent_partial, rank) \ } -#define TAGGED_TILE_LOOP_8(tag, func, type, is_left, cond, m_offset, \ - extent_full, extent_partial, rank) \ - if (cond) { \ - TAGGED_LOOP_LAYOUT_8(tag, func, type, is_left, m_offset, extent_full, \ - rank) \ - } else { \ - TAGGED_LOOP_LAYOUT_8(tag, func, type, is_left, m_offset, extent_partial, \ - rank) \ +#define KOKKOS_IMPL_TAGGED_TILE_LOOP_8(tag, func, type, is_left, cond, \ + m_offset, extent_full, extent_partial, \ + rank) \ + if (cond) { \ + KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_8(tag, func, type, 
is_left, m_offset, \ + extent_full, rank) \ + } else { \ + KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_8(tag, func, type, is_left, m_offset, \ + extent_partial, rank) \ } // parallel_reduce, tagged // Reduction version -#define TAGGED_APPLY_REDUX(val, tag, func, ...) func(tag, __VA_ARGS__, val); +#define KOKKOS_IMPL_TAGGED_APPLY_REDUX(val, tag, func, ...) \ + func(tag, __VA_ARGS__, val); // LayoutRight // d = 0 to start -#define TAGGED_LOOP_R_1_REDUX(val, tag, func, type, m_offset, extent, d, ...) \ - KOKKOS_ENABLE_IVDEP_MDRANGE \ - for (type i0 = (type)0; i0 < static_cast(extent[d]); ++i0) { \ - TAGGED_APPLY_REDUX(val, tag, func, __VA_ARGS__, i0 + m_offset[d]) \ +#define KOKKOS_IMPL_TAGGED_LOOP_R_1_REDUX(val, tag, func, type, m_offset, \ + extent, d, ...) \ + KOKKOS_ENABLE_IVDEP_MDRANGE \ + for (type i0 = (type)0; i0 < static_cast(extent[d]); ++i0) { \ + KOKKOS_IMPL_TAGGED_APPLY_REDUX(val, tag, func, __VA_ARGS__, \ + i0 + m_offset[d]) \ } -#define TAGGED_LOOP_R_2_REDUX(val, tag, func, type, m_offset, extent, d, ...) \ +#define KOKKOS_IMPL_TAGGED_LOOP_R_2_REDUX(val, tag, func, type, m_offset, \ + extent, d, ...) \ for (type i1 = (type)0; i1 < static_cast(extent[d]); ++i1) { \ - TAGGED_LOOP_R_1_REDUX(val, tag, func, type, m_offset, extent, d + 1, \ - __VA_ARGS__, i1 + m_offset[d]) \ + KOKKOS_IMPL_TAGGED_LOOP_R_1_REDUX(val, tag, func, type, m_offset, extent, \ + d + 1, __VA_ARGS__, i1 + m_offset[d]) \ } -#define TAGGED_LOOP_R_3_REDUX(val, tag, func, type, m_offset, extent, d, ...) \ +#define KOKKOS_IMPL_TAGGED_LOOP_R_3_REDUX(val, tag, func, type, m_offset, \ + extent, d, ...) \ for (type i2 = (type)0; i2 < static_cast(extent[d]); ++i2) { \ - TAGGED_LOOP_R_2_REDUX(val, tag, func, type, m_offset, extent, d + 1, \ - __VA_ARGS__, i2 + m_offset[d]) \ + KOKKOS_IMPL_TAGGED_LOOP_R_2_REDUX(val, tag, func, type, m_offset, extent, \ + d + 1, __VA_ARGS__, i2 + m_offset[d]) \ } -#define TAGGED_LOOP_R_4_REDUX(val, tag, func, type, m_offset, extent, d, ...) 
\ +#define KOKKOS_IMPL_TAGGED_LOOP_R_4_REDUX(val, tag, func, type, m_offset, \ + extent, d, ...) \ for (type i3 = (type)0; i3 < static_cast(extent[d]); ++i3) { \ - TAGGED_LOOP_R_3_REDUX(val, tag, func, type, m_offset, extent, d + 1, \ - __VA_ARGS__, i3 + m_offset[d]) \ + KOKKOS_IMPL_TAGGED_LOOP_R_3_REDUX(val, tag, func, type, m_offset, extent, \ + d + 1, __VA_ARGS__, i3 + m_offset[d]) \ } -#define TAGGED_LOOP_R_5_REDUX(val, tag, func, type, m_offset, extent, d, ...) \ +#define KOKKOS_IMPL_TAGGED_LOOP_R_5_REDUX(val, tag, func, type, m_offset, \ + extent, d, ...) \ for (type i4 = (type)0; i4 < static_cast(extent[d]); ++i4) { \ - TAGGED_LOOP_R_4_REDUX(val, tag, func, type, m_offset, extent, d + 1, \ - __VA_ARGS__, i4 + m_offset[d]) \ + KOKKOS_IMPL_TAGGED_LOOP_R_4_REDUX(val, tag, func, type, m_offset, extent, \ + d + 1, __VA_ARGS__, i4 + m_offset[d]) \ } -#define TAGGED_LOOP_R_6_REDUX(val, tag, func, type, m_offset, extent, d, ...) \ +#define KOKKOS_IMPL_TAGGED_LOOP_R_6_REDUX(val, tag, func, type, m_offset, \ + extent, d, ...) \ for (type i5 = (type)0; i5 < static_cast(extent[d]); ++i5) { \ - TAGGED_LOOP_R_5_REDUX(val, tag, func, type, m_offset, extent, d + 1, \ - __VA_ARGS__, i5 + m_offset[d]) \ + KOKKOS_IMPL_TAGGED_LOOP_R_5_REDUX(val, tag, func, type, m_offset, extent, \ + d + 1, __VA_ARGS__, i5 + m_offset[d]) \ } -#define TAGGED_LOOP_R_7_REDUX(val, tag, func, type, m_offset, extent, d, ...) \ +#define KOKKOS_IMPL_TAGGED_LOOP_R_7_REDUX(val, tag, func, type, m_offset, \ + extent, d, ...) \ for (type i6 = (type)0; i6 < static_cast(extent[d]); ++i6) { \ - TAGGED_LOOP_R_6_REDUX(val, tag, func, type, m_offset, extent, d + 1, \ - __VA_ARGS__, i6 + m_offset[d]) \ + KOKKOS_IMPL_TAGGED_LOOP_R_6_REDUX(val, tag, func, type, m_offset, extent, \ + d + 1, __VA_ARGS__, i6 + m_offset[d]) \ } -#define TAGGED_LOOP_R_8_REDUX(val, tag, func, type, m_offset, extent, d, ...) \ +#define KOKKOS_IMPL_TAGGED_LOOP_R_8_REDUX(val, tag, func, type, m_offset, \ + extent, d, ...) 
\ for (type i7 = (type)0; i7 < static_cast(extent[d]); ++i7) { \ - TAGGED_LOOP_R_7_REDUX(val, tag, func, type, m_offset, extent, d + 1, \ - __VA_ARGS__, i7 + m_offset[d]) \ + KOKKOS_IMPL_TAGGED_LOOP_R_7_REDUX(val, tag, func, type, m_offset, extent, \ + d + 1, __VA_ARGS__, i7 + m_offset[d]) \ } // LayoutLeft // d = rank-1 to start -#define TAGGED_LOOP_L_1_REDUX(val, tag, func, type, m_offset, extent, d, ...) \ - KOKKOS_ENABLE_IVDEP_MDRANGE \ - for (type i0 = (type)0; i0 < static_cast(extent[d]); ++i0) { \ - TAGGED_APPLY_REDUX(val, tag, func, i0 + m_offset[d], __VA_ARGS__) \ +#define KOKKOS_IMPL_TAGGED_LOOP_L_1_REDUX(val, tag, func, type, m_offset, \ + extent, d, ...) \ + KOKKOS_ENABLE_IVDEP_MDRANGE \ + for (type i0 = (type)0; i0 < static_cast(extent[d]); ++i0) { \ + KOKKOS_IMPL_TAGGED_APPLY_REDUX(val, tag, func, i0 + m_offset[d], \ + __VA_ARGS__) \ } -#define TAGGED_LOOP_L_2_REDUX(val, tag, func, type, m_offset, extent, d, ...) \ +#define KOKKOS_IMPL_TAGGED_LOOP_L_2_REDUX(val, tag, func, type, m_offset, \ + extent, d, ...) \ for (type i1 = (type)0; i1 < static_cast(extent[d]); ++i1) { \ - TAGGED_LOOP_L_1_REDUX(val, tag, func, type, m_offset, extent, d - 1, \ - i1 + m_offset[d], __VA_ARGS__) \ + KOKKOS_IMPL_TAGGED_LOOP_L_1_REDUX(val, tag, func, type, m_offset, extent, \ + d - 1, i1 + m_offset[d], __VA_ARGS__) \ } -#define TAGGED_LOOP_L_3_REDUX(val, tag, func, type, m_offset, extent, d, ...) \ +#define KOKKOS_IMPL_TAGGED_LOOP_L_3_REDUX(val, tag, func, type, m_offset, \ + extent, d, ...) \ for (type i2 = (type)0; i2 < static_cast(extent[d]); ++i2) { \ - TAGGED_LOOP_L_2_REDUX(val, tag, func, type, m_offset, extent, d - 1, \ - i2 + m_offset[d], __VA_ARGS__) \ + KOKKOS_IMPL_TAGGED_LOOP_L_2_REDUX(val, tag, func, type, m_offset, extent, \ + d - 1, i2 + m_offset[d], __VA_ARGS__) \ } -#define TAGGED_LOOP_L_4_REDUX(val, tag, func, type, m_offset, extent, d, ...) \ +#define KOKKOS_IMPL_TAGGED_LOOP_L_4_REDUX(val, tag, func, type, m_offset, \ + extent, d, ...) 
\ for (type i3 = (type)0; i3 < static_cast(extent[d]); ++i3) { \ - TAGGED_LOOP_L_3_REDUX(val, tag, func, type, m_offset, extent, d - 1, \ - i3 + m_offset[d], __VA_ARGS__) \ + KOKKOS_IMPL_TAGGED_LOOP_L_3_REDUX(val, tag, func, type, m_offset, extent, \ + d - 1, i3 + m_offset[d], __VA_ARGS__) \ } -#define TAGGED_LOOP_L_5_REDUX(val, tag, func, type, m_offset, extent, d, ...) \ +#define KOKKOS_IMPL_TAGGED_LOOP_L_5_REDUX(val, tag, func, type, m_offset, \ + extent, d, ...) \ for (type i4 = (type)0; i4 < static_cast(extent[d]); ++i4) { \ - TAGGED_LOOP_L_4_REDUX(val, tag, func, type, m_offset, extent, d - 1, \ - i4 + m_offset[d], __VA_ARGS__) \ + KOKKOS_IMPL_TAGGED_LOOP_L_4_REDUX(val, tag, func, type, m_offset, extent, \ + d - 1, i4 + m_offset[d], __VA_ARGS__) \ } -#define TAGGED_LOOP_L_6_REDUX(val, tag, func, type, m_offset, extent, d, ...) \ +#define KOKKOS_IMPL_TAGGED_LOOP_L_6_REDUX(val, tag, func, type, m_offset, \ + extent, d, ...) \ for (type i5 = (type)0; i5 < static_cast(extent[d]); ++i5) { \ - TAGGED_LOOP_L_5_REDUX(val, tag, func, type, m_offset, extent, d - 1, \ - i5 + m_offset[d], __VA_ARGS__) \ + KOKKOS_IMPL_TAGGED_LOOP_L_5_REDUX(val, tag, func, type, m_offset, extent, \ + d - 1, i5 + m_offset[d], __VA_ARGS__) \ } -#define TAGGED_LOOP_L_7_REDUX(val, tag, func, type, m_offset, extent, d, ...) \ +#define KOKKOS_IMPL_TAGGED_LOOP_L_7_REDUX(val, tag, func, type, m_offset, \ + extent, d, ...) \ for (type i6 = (type)0; i6 < static_cast(extent[d]); ++i6) { \ - TAGGED_LOOP_L_6_REDUX(val, tag, func, type, m_offset, extent, d - 1, \ - i6 + m_offset[d], __VA_ARGS__) \ + KOKKOS_IMPL_TAGGED_LOOP_L_6_REDUX(val, tag, func, type, m_offset, extent, \ + d - 1, i6 + m_offset[d], __VA_ARGS__) \ } -#define TAGGED_LOOP_L_8_REDUX(val, tag, func, type, m_offset, extent, d, ...) \ +#define KOKKOS_IMPL_TAGGED_LOOP_L_8_REDUX(val, tag, func, type, m_offset, \ + extent, d, ...) 
\ for (type i7 = (type)0; i7 < static_cast(extent[d]); ++i7) { \ - TAGGED_LOOP_L_7_REDUX(val, tag, func, type, m_offset, extent, d - 1, \ - i7 + m_offset[d], __VA_ARGS__) \ + KOKKOS_IMPL_TAGGED_LOOP_L_7_REDUX(val, tag, func, type, m_offset, extent, \ + d - 1, i7 + m_offset[d], __VA_ARGS__) \ } // Left vs Right -#define TAGGED_LOOP_LAYOUT_1_REDUX(val, tag, func, type, is_left, m_offset, \ - extent, rank) \ - KOKKOS_ENABLE_IVDEP_MDRANGE \ - for (type i0 = (type)0; i0 < static_cast(extent[0]); ++i0) { \ - TAGGED_APPLY_REDUX(val, tag, func, i0 + m_offset[0]) \ +#define KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_1_REDUX(val, tag, func, type, is_left, \ + m_offset, extent, rank) \ + KOKKOS_ENABLE_IVDEP_MDRANGE \ + for (type i0 = (type)0; i0 < static_cast(extent[0]); ++i0) { \ + KOKKOS_IMPL_TAGGED_APPLY_REDUX(val, tag, func, i0 + m_offset[0]) \ } -#define TAGGED_LOOP_LAYOUT_2_REDUX(val, tag, func, type, is_left, m_offset, \ - extent, rank) \ +#define KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_2_REDUX(val, tag, func, type, is_left, \ + m_offset, extent, rank) \ if (is_left) { \ for (type i1 = (type)0; i1 < static_cast(extent[rank - 1]); ++i1) { \ - TAGGED_LOOP_L_1_REDUX(val, tag, func, type, m_offset, extent, rank - 2, \ - i1 + m_offset[rank - 1]) \ + KOKKOS_IMPL_TAGGED_LOOP_L_1_REDUX(val, tag, func, type, m_offset, \ + extent, rank - 2, \ + i1 + m_offset[rank - 1]) \ } \ } else { \ for (type i1 = (type)0; i1 < static_cast(extent[0]); ++i1) { \ - TAGGED_LOOP_R_1_REDUX(val, tag, func, type, m_offset, extent, 1, \ - i1 + m_offset[0]) \ + KOKKOS_IMPL_TAGGED_LOOP_R_1_REDUX(val, tag, func, type, m_offset, \ + extent, 1, i1 + m_offset[0]) \ } \ } -#define TAGGED_LOOP_LAYOUT_3_REDUX(val, tag, func, type, is_left, m_offset, \ - extent, rank) \ +#define KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_3_REDUX(val, tag, func, type, is_left, \ + m_offset, extent, rank) \ if (is_left) { \ for (type i2 = (type)0; i2 < static_cast(extent[rank - 1]); ++i2) { \ - TAGGED_LOOP_L_2_REDUX(val, tag, func, type, m_offset, extent, rank 
- 2, \ - i2 + m_offset[rank - 1]) \ + KOKKOS_IMPL_TAGGED_LOOP_L_2_REDUX(val, tag, func, type, m_offset, \ + extent, rank - 2, \ + i2 + m_offset[rank - 1]) \ } \ } else { \ for (type i2 = (type)0; i2 < static_cast(extent[0]); ++i2) { \ - TAGGED_LOOP_R_2_REDUX(val, tag, func, type, m_offset, extent, 1, \ - i2 + m_offset[0]) \ + KOKKOS_IMPL_TAGGED_LOOP_R_2_REDUX(val, tag, func, type, m_offset, \ + extent, 1, i2 + m_offset[0]) \ } \ } -#define TAGGED_LOOP_LAYOUT_4_REDUX(val, tag, func, type, is_left, m_offset, \ - extent, rank) \ +#define KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_4_REDUX(val, tag, func, type, is_left, \ + m_offset, extent, rank) \ if (is_left) { \ for (type i3 = (type)0; i3 < static_cast(extent[rank - 1]); ++i3) { \ - TAGGED_LOOP_L_3_REDUX(val, tag, func, type, m_offset, extent, rank - 2, \ - i3 + m_offset[rank - 1]) \ + KOKKOS_IMPL_TAGGED_LOOP_L_3_REDUX(val, tag, func, type, m_offset, \ + extent, rank - 2, \ + i3 + m_offset[rank - 1]) \ } \ } else { \ for (type i3 = (type)0; i3 < static_cast(extent[0]); ++i3) { \ - TAGGED_LOOP_R_3_REDUX(val, tag, func, type, m_offset, extent, 1, \ - i3 + m_offset[0]) \ + KOKKOS_IMPL_TAGGED_LOOP_R_3_REDUX(val, tag, func, type, m_offset, \ + extent, 1, i3 + m_offset[0]) \ } \ } -#define TAGGED_LOOP_LAYOUT_5_REDUX(val, tag, func, type, is_left, m_offset, \ - extent, rank) \ +#define KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_5_REDUX(val, tag, func, type, is_left, \ + m_offset, extent, rank) \ if (is_left) { \ for (type i4 = (type)0; i4 < static_cast(extent[rank - 1]); ++i4) { \ - TAGGED_LOOP_L_4_REDUX(val, tag, func, type, m_offset, extent, rank - 2, \ - i4 + m_offset[rank - 1]) \ + KOKKOS_IMPL_TAGGED_LOOP_L_4_REDUX(val, tag, func, type, m_offset, \ + extent, rank - 2, \ + i4 + m_offset[rank - 1]) \ } \ } else { \ for (type i4 = (type)0; i4 < static_cast(extent[0]); ++i4) { \ - TAGGED_LOOP_R_4_REDUX(val, tag, func, type, m_offset, extent, 1, \ - i4 + m_offset[0]) \ + KOKKOS_IMPL_TAGGED_LOOP_R_4_REDUX(val, tag, func, type, m_offset, \ + extent, 
1, i4 + m_offset[0]) \ } \ } -#define TAGGED_LOOP_LAYOUT_6_REDUX(val, tag, func, type, is_left, m_offset, \ - extent, rank) \ +#define KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_6_REDUX(val, tag, func, type, is_left, \ + m_offset, extent, rank) \ if (is_left) { \ for (type i5 = (type)0; i5 < static_cast(extent[rank - 1]); ++i5) { \ - TAGGED_LOOP_L_5_REDUX(val, tag, func, type, m_offset, extent, rank - 2, \ - i5 + m_offset[rank - 1]) \ + KOKKOS_IMPL_TAGGED_LOOP_L_5_REDUX(val, tag, func, type, m_offset, \ + extent, rank - 2, \ + i5 + m_offset[rank - 1]) \ } \ } else { \ for (type i5 = (type)0; i5 < static_cast(extent[0]); ++i5) { \ - TAGGED_LOOP_R_5_REDUX(val, tag, func, type, m_offset, extent, 1, \ - i5 + m_offset[0]) \ + KOKKOS_IMPL_TAGGED_LOOP_R_5_REDUX(val, tag, func, type, m_offset, \ + extent, 1, i5 + m_offset[0]) \ } \ } -#define TAGGED_LOOP_LAYOUT_7_REDUX(val, tag, func, type, is_left, m_offset, \ - extent, rank) \ +#define KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_7_REDUX(val, tag, func, type, is_left, \ + m_offset, extent, rank) \ if (is_left) { \ for (type i6 = (type)0; i6 < static_cast(extent[rank - 1]); ++i6) { \ - TAGGED_LOOP_L_6_REDUX(val, tag, func, type, m_offset, extent, rank - 2, \ - i6 + m_offset[rank - 1]) \ + KOKKOS_IMPL_TAGGED_LOOP_L_6_REDUX(val, tag, func, type, m_offset, \ + extent, rank - 2, \ + i6 + m_offset[rank - 1]) \ } \ } else { \ for (type i6 = (type)0; i6 < static_cast(extent[0]); ++i6) { \ - TAGGED_LOOP_R_6_REDUX(val, tag, func, type, m_offset, extent, 1, \ - i6 + m_offset[0]) \ + KOKKOS_IMPL_TAGGED_LOOP_R_6_REDUX(val, tag, func, type, m_offset, \ + extent, 1, i6 + m_offset[0]) \ } \ } -#define TAGGED_LOOP_LAYOUT_8_REDUX(val, tag, func, type, is_left, m_offset, \ - extent, rank) \ +#define KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_8_REDUX(val, tag, func, type, is_left, \ + m_offset, extent, rank) \ if (is_left) { \ for (type i7 = (type)0; i7 < static_cast(extent[rank - 1]); ++i7) { \ - TAGGED_LOOP_L_7_REDUX(val, tag, func, type, m_offset, extent, rank - 2, \ - i7 + 
m_offset[rank - 1]) \ + KOKKOS_IMPL_TAGGED_LOOP_L_7_REDUX(val, tag, func, type, m_offset, \ + extent, rank - 2, \ + i7 + m_offset[rank - 1]) \ } \ } else { \ for (type i7 = (type)0; i7 < static_cast(extent[0]); ++i7) { \ - TAGGED_LOOP_R_7_REDUX(val, tag, func, type, m_offset, extent, 1, \ - i7 + m_offset[0]) \ + KOKKOS_IMPL_TAGGED_LOOP_R_7_REDUX(val, tag, func, type, m_offset, \ + extent, 1, i7 + m_offset[0]) \ } \ } // Partial vs Full Tile -#define TAGGED_TILE_LOOP_1_REDUX(val, tag, func, type, is_left, cond, \ - m_offset, extent_full, extent_partial, rank) \ - if (cond) { \ - TAGGED_LOOP_LAYOUT_1_REDUX(val, tag, func, type, is_left, m_offset, \ - extent_full, rank) \ - } else { \ - TAGGED_LOOP_LAYOUT_1_REDUX(val, tag, func, type, is_left, m_offset, \ - extent_partial, rank) \ - } - -#define TAGGED_TILE_LOOP_2_REDUX(val, tag, func, type, is_left, cond, \ - m_offset, extent_full, extent_partial, rank) \ - if (cond) { \ - TAGGED_LOOP_LAYOUT_2_REDUX(val, tag, func, type, is_left, m_offset, \ - extent_full, rank) \ - } else { \ - TAGGED_LOOP_LAYOUT_2_REDUX(val, tag, func, type, is_left, m_offset, \ - extent_partial, rank) \ - } - -#define TAGGED_TILE_LOOP_3_REDUX(val, tag, func, type, is_left, cond, \ - m_offset, extent_full, extent_partial, rank) \ - if (cond) { \ - TAGGED_LOOP_LAYOUT_3_REDUX(val, tag, func, type, is_left, m_offset, \ - extent_full, rank) \ - } else { \ - TAGGED_LOOP_LAYOUT_3_REDUX(val, tag, func, type, is_left, m_offset, \ - extent_partial, rank) \ - } - -#define TAGGED_TILE_LOOP_4_REDUX(val, tag, func, type, is_left, cond, \ - m_offset, extent_full, extent_partial, rank) \ - if (cond) { \ - TAGGED_LOOP_LAYOUT_4_REDUX(val, tag, func, type, is_left, m_offset, \ - extent_full, rank) \ - } else { \ - TAGGED_LOOP_LAYOUT_4_REDUX(val, tag, func, type, is_left, m_offset, \ - extent_partial, rank) \ - } - -#define TAGGED_TILE_LOOP_5_REDUX(val, tag, func, type, is_left, cond, \ - m_offset, extent_full, extent_partial, rank) \ - if (cond) { \ - 
TAGGED_LOOP_LAYOUT_5_REDUX(val, tag, func, type, is_left, m_offset, \ - extent_full, rank) \ - } else { \ - TAGGED_LOOP_LAYOUT_5_REDUX(val, tag, func, type, is_left, m_offset, \ - extent_partial, rank) \ - } - -#define TAGGED_TILE_LOOP_6_REDUX(val, tag, func, type, is_left, cond, \ - m_offset, extent_full, extent_partial, rank) \ - if (cond) { \ - TAGGED_LOOP_LAYOUT_6_REDUX(val, tag, func, type, is_left, m_offset, \ - extent_full, rank) \ - } else { \ - TAGGED_LOOP_LAYOUT_6_REDUX(val, tag, func, type, is_left, m_offset, \ - extent_partial, rank) \ - } - -#define TAGGED_TILE_LOOP_7_REDUX(val, tag, func, type, is_left, cond, \ - m_offset, extent_full, extent_partial, rank) \ - if (cond) { \ - TAGGED_LOOP_LAYOUT_7_REDUX(val, tag, func, type, is_left, m_offset, \ - extent_full, rank) \ - } else { \ - TAGGED_LOOP_LAYOUT_7_REDUX(val, tag, func, type, is_left, m_offset, \ - extent_partial, rank) \ - } - -#define TAGGED_TILE_LOOP_8_REDUX(val, tag, func, type, is_left, cond, \ - m_offset, extent_full, extent_partial, rank) \ - if (cond) { \ - TAGGED_LOOP_LAYOUT_8_REDUX(val, tag, func, type, is_left, m_offset, \ - extent_full, rank) \ - } else { \ - TAGGED_LOOP_LAYOUT_8_REDUX(val, tag, func, type, is_left, m_offset, \ - extent_partial, rank) \ +#define KOKKOS_IMPL_TAGGED_TILE_LOOP_1_REDUX(val, tag, func, type, is_left, \ + cond, m_offset, extent_full, \ + extent_partial, rank) \ + if (cond) { \ + KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_1_REDUX(val, tag, func, type, is_left, \ + m_offset, extent_full, rank) \ + } else { \ + KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_1_REDUX(val, tag, func, type, is_left, \ + m_offset, extent_partial, rank) \ + } + +#define KOKKOS_IMPL_TAGGED_TILE_LOOP_2_REDUX(val, tag, func, type, is_left, \ + cond, m_offset, extent_full, \ + extent_partial, rank) \ + if (cond) { \ + KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_2_REDUX(val, tag, func, type, is_left, \ + m_offset, extent_full, rank) \ + } else { \ + KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_2_REDUX(val, tag, func, type, is_left, \ + 
m_offset, extent_partial, rank) \ + } + +#define KOKKOS_IMPL_TAGGED_TILE_LOOP_3_REDUX(val, tag, func, type, is_left, \ + cond, m_offset, extent_full, \ + extent_partial, rank) \ + if (cond) { \ + KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_3_REDUX(val, tag, func, type, is_left, \ + m_offset, extent_full, rank) \ + } else { \ + KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_3_REDUX(val, tag, func, type, is_left, \ + m_offset, extent_partial, rank) \ + } + +#define KOKKOS_IMPL_TAGGED_TILE_LOOP_4_REDUX(val, tag, func, type, is_left, \ + cond, m_offset, extent_full, \ + extent_partial, rank) \ + if (cond) { \ + KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_4_REDUX(val, tag, func, type, is_left, \ + m_offset, extent_full, rank) \ + } else { \ + KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_4_REDUX(val, tag, func, type, is_left, \ + m_offset, extent_partial, rank) \ + } + +#define KOKKOS_IMPL_TAGGED_TILE_LOOP_5_REDUX(val, tag, func, type, is_left, \ + cond, m_offset, extent_full, \ + extent_partial, rank) \ + if (cond) { \ + KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_5_REDUX(val, tag, func, type, is_left, \ + m_offset, extent_full, rank) \ + } else { \ + KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_5_REDUX(val, tag, func, type, is_left, \ + m_offset, extent_partial, rank) \ + } + +#define KOKKOS_IMPL_TAGGED_TILE_LOOP_6_REDUX(val, tag, func, type, is_left, \ + cond, m_offset, extent_full, \ + extent_partial, rank) \ + if (cond) { \ + KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_6_REDUX(val, tag, func, type, is_left, \ + m_offset, extent_full, rank) \ + } else { \ + KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_6_REDUX(val, tag, func, type, is_left, \ + m_offset, extent_partial, rank) \ + } + +#define KOKKOS_IMPL_TAGGED_TILE_LOOP_7_REDUX(val, tag, func, type, is_left, \ + cond, m_offset, extent_full, \ + extent_partial, rank) \ + if (cond) { \ + KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_7_REDUX(val, tag, func, type, is_left, \ + m_offset, extent_full, rank) \ + } else { \ + KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_7_REDUX(val, tag, func, type, is_left, \ + m_offset, extent_partial, rank) \ + } + +#define 
KOKKOS_IMPL_TAGGED_TILE_LOOP_8_REDUX(val, tag, func, type, is_left, \ + cond, m_offset, extent_full, \ + extent_partial, rank) \ + if (cond) { \ + KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_8_REDUX(val, tag, func, type, is_left, \ + m_offset, extent_full, rank) \ + } else { \ + KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_8_REDUX(val, tag, func, type, is_left, \ + m_offset, extent_partial, rank) \ } // end tagged macros @@ -1212,14 +1323,15 @@ struct Tile_Loop_Type<1, IsLeft, IType, void, void> { template static void apply(Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b) { - TILE_LOOP_1(func, IType, IsLeft, cond, offset, a, b, 1); + KOKKOS_IMPL_TILE_LOOP_1(func, IType, IsLeft, cond, offset, a, b, 1); } template static void apply(ValType& value, Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b) { - TILE_LOOP_1_REDUX(value, func, IType, IsLeft, cond, offset, a, b, 1); + KOKKOS_IMPL_TILE_LOOP_1_REDUX(value, func, IType, IsLeft, cond, offset, a, + b, 1); } }; @@ -1228,14 +1340,15 @@ struct Tile_Loop_Type<2, IsLeft, IType, void, void> { template static void apply(Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b) { - TILE_LOOP_2(func, IType, IsLeft, cond, offset, a, b, 2); + KOKKOS_IMPL_TILE_LOOP_2(func, IType, IsLeft, cond, offset, a, b, 2); } template static void apply(ValType& value, Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b) { - TILE_LOOP_2_REDUX(value, func, IType, IsLeft, cond, offset, a, b, 2); + KOKKOS_IMPL_TILE_LOOP_2_REDUX(value, func, IType, IsLeft, cond, offset, a, + b, 2); } }; @@ -1244,14 +1357,15 @@ struct Tile_Loop_Type<3, IsLeft, IType, void, void> { template static void apply(Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b) { - TILE_LOOP_3(func, IType, IsLeft, cond, offset, a, b, 3); + KOKKOS_IMPL_TILE_LOOP_3(func, IType, IsLeft, cond, offset, a, b, 3); } template static void 
apply(ValType& value, Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b) { - TILE_LOOP_3_REDUX(value, func, IType, IsLeft, cond, offset, a, b, 3); + KOKKOS_IMPL_TILE_LOOP_3_REDUX(value, func, IType, IsLeft, cond, offset, a, + b, 3); } }; @@ -1260,14 +1374,15 @@ struct Tile_Loop_Type<4, IsLeft, IType, void, void> { template static void apply(Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b) { - TILE_LOOP_4(func, IType, IsLeft, cond, offset, a, b, 4); + KOKKOS_IMPL_TILE_LOOP_4(func, IType, IsLeft, cond, offset, a, b, 4); } template static void apply(ValType& value, Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b) { - TILE_LOOP_4_REDUX(value, func, IType, IsLeft, cond, offset, a, b, 4); + KOKKOS_IMPL_TILE_LOOP_4_REDUX(value, func, IType, IsLeft, cond, offset, a, + b, 4); } }; @@ -1276,14 +1391,15 @@ struct Tile_Loop_Type<5, IsLeft, IType, void, void> { template static void apply(Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b) { - TILE_LOOP_5(func, IType, IsLeft, cond, offset, a, b, 5); + KOKKOS_IMPL_TILE_LOOP_5(func, IType, IsLeft, cond, offset, a, b, 5); } template static void apply(ValType& value, Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b) { - TILE_LOOP_5_REDUX(value, func, IType, IsLeft, cond, offset, a, b, 5); + KOKKOS_IMPL_TILE_LOOP_5_REDUX(value, func, IType, IsLeft, cond, offset, a, + b, 5); } }; @@ -1292,14 +1408,15 @@ struct Tile_Loop_Type<6, IsLeft, IType, void, void> { template static void apply(Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b) { - TILE_LOOP_6(func, IType, IsLeft, cond, offset, a, b, 6); + KOKKOS_IMPL_TILE_LOOP_6(func, IType, IsLeft, cond, offset, a, b, 6); } template static void apply(ValType& value, Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b) { - 
TILE_LOOP_6_REDUX(value, func, IType, IsLeft, cond, offset, a, b, 6); + KOKKOS_IMPL_TILE_LOOP_6_REDUX(value, func, IType, IsLeft, cond, offset, a, + b, 6); } }; @@ -1308,14 +1425,15 @@ struct Tile_Loop_Type<7, IsLeft, IType, void, void> { template static void apply(Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b) { - TILE_LOOP_7(func, IType, IsLeft, cond, offset, a, b, 7); + KOKKOS_IMPL_TILE_LOOP_7(func, IType, IsLeft, cond, offset, a, b, 7); } template static void apply(ValType& value, Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b) { - TILE_LOOP_7_REDUX(value, func, IType, IsLeft, cond, offset, a, b, 7); + KOKKOS_IMPL_TILE_LOOP_7_REDUX(value, func, IType, IsLeft, cond, offset, a, + b, 7); } }; @@ -1324,14 +1442,15 @@ struct Tile_Loop_Type<8, IsLeft, IType, void, void> { template static void apply(Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b) { - TILE_LOOP_8(func, IType, IsLeft, cond, offset, a, b, 8); + KOKKOS_IMPL_TILE_LOOP_8(func, IType, IsLeft, cond, offset, a, b, 8); } template static void apply(ValType& value, Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b) { - TILE_LOOP_8_REDUX(value, func, IType, IsLeft, cond, offset, a, b, 8); + KOKKOS_IMPL_TILE_LOOP_8_REDUX(value, func, IType, IsLeft, cond, offset, a, + b, 8); } }; @@ -1343,15 +1462,16 @@ struct Tile_Loop_Type<1, IsLeft, IType, Tagged, template static void apply(Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b) { - TAGGED_TILE_LOOP_1(Tagged(), func, IType, IsLeft, cond, offset, a, b, 1); + KOKKOS_IMPL_TAGGED_TILE_LOOP_1(Tagged(), func, IType, IsLeft, cond, offset, + a, b, 1); } template static void apply(ValType& value, Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b) { - TAGGED_TILE_LOOP_1_REDUX(value, Tagged(), func, IType, IsLeft, cond, offset, - a, b, 1); + 
KOKKOS_IMPL_TAGGED_TILE_LOOP_1_REDUX(value, Tagged(), func, IType, IsLeft, + cond, offset, a, b, 1); } }; @@ -1361,15 +1481,16 @@ struct Tile_Loop_Type<2, IsLeft, IType, Tagged, template static void apply(Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b) { - TAGGED_TILE_LOOP_2(Tagged(), func, IType, IsLeft, cond, offset, a, b, 2); + KOKKOS_IMPL_TAGGED_TILE_LOOP_2(Tagged(), func, IType, IsLeft, cond, offset, + a, b, 2); } template static void apply(ValType& value, Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b) { - TAGGED_TILE_LOOP_2_REDUX(value, Tagged(), func, IType, IsLeft, cond, offset, - a, b, 2); + KOKKOS_IMPL_TAGGED_TILE_LOOP_2_REDUX(value, Tagged(), func, IType, IsLeft, + cond, offset, a, b, 2); } }; @@ -1379,15 +1500,16 @@ struct Tile_Loop_Type<3, IsLeft, IType, Tagged, template static void apply(Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b) { - TAGGED_TILE_LOOP_3(Tagged(), func, IType, IsLeft, cond, offset, a, b, 3); + KOKKOS_IMPL_TAGGED_TILE_LOOP_3(Tagged(), func, IType, IsLeft, cond, offset, + a, b, 3); } template static void apply(ValType& value, Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b) { - TAGGED_TILE_LOOP_3_REDUX(value, Tagged(), func, IType, IsLeft, cond, offset, - a, b, 3); + KOKKOS_IMPL_TAGGED_TILE_LOOP_3_REDUX(value, Tagged(), func, IType, IsLeft, + cond, offset, a, b, 3); } }; @@ -1397,15 +1519,16 @@ struct Tile_Loop_Type<4, IsLeft, IType, Tagged, template static void apply(Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b) { - TAGGED_TILE_LOOP_4(Tagged(), func, IType, IsLeft, cond, offset, a, b, 4); + KOKKOS_IMPL_TAGGED_TILE_LOOP_4(Tagged(), func, IType, IsLeft, cond, offset, + a, b, 4); } template static void apply(ValType& value, Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b) { - 
TAGGED_TILE_LOOP_4_REDUX(value, Tagged(), func, IType, IsLeft, cond, offset, - a, b, 4); + KOKKOS_IMPL_TAGGED_TILE_LOOP_4_REDUX(value, Tagged(), func, IType, IsLeft, + cond, offset, a, b, 4); } }; @@ -1415,15 +1538,16 @@ struct Tile_Loop_Type<5, IsLeft, IType, Tagged, template static void apply(Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b) { - TAGGED_TILE_LOOP_5(Tagged(), func, IType, IsLeft, cond, offset, a, b, 5); + KOKKOS_IMPL_TAGGED_TILE_LOOP_5(Tagged(), func, IType, IsLeft, cond, offset, + a, b, 5); } template static void apply(ValType& value, Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b) { - TAGGED_TILE_LOOP_5_REDUX(value, Tagged(), func, IType, IsLeft, cond, offset, - a, b, 5); + KOKKOS_IMPL_TAGGED_TILE_LOOP_5_REDUX(value, Tagged(), func, IType, IsLeft, + cond, offset, a, b, 5); } }; @@ -1433,15 +1557,16 @@ struct Tile_Loop_Type<6, IsLeft, IType, Tagged, template static void apply(Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b) { - TAGGED_TILE_LOOP_6(Tagged(), func, IType, IsLeft, cond, offset, a, b, 6); + KOKKOS_IMPL_TAGGED_TILE_LOOP_6(Tagged(), func, IType, IsLeft, cond, offset, + a, b, 6); } template static void apply(ValType& value, Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b) { - TAGGED_TILE_LOOP_6_REDUX(value, Tagged(), func, IType, IsLeft, cond, offset, - a, b, 6); + KOKKOS_IMPL_TAGGED_TILE_LOOP_6_REDUX(value, Tagged(), func, IType, IsLeft, + cond, offset, a, b, 6); } }; @@ -1451,15 +1576,16 @@ struct Tile_Loop_Type<7, IsLeft, IType, Tagged, template static void apply(Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b) { - TAGGED_TILE_LOOP_7(Tagged(), func, IType, IsLeft, cond, offset, a, b, 7); + KOKKOS_IMPL_TAGGED_TILE_LOOP_7(Tagged(), func, IType, IsLeft, cond, offset, + a, b, 7); } template static void apply(ValType& value, Func const& func, bool cond, 
Offset const& offset, ExtentA const& a, ExtentB const& b) { - TAGGED_TILE_LOOP_7_REDUX(value, Tagged(), func, IType, IsLeft, cond, offset, - a, b, 7); + KOKKOS_IMPL_TAGGED_TILE_LOOP_7_REDUX(value, Tagged(), func, IType, IsLeft, + cond, offset, a, b, 7); } }; @@ -1469,15 +1595,16 @@ struct Tile_Loop_Type<8, IsLeft, IType, Tagged, template static void apply(Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b) { - TAGGED_TILE_LOOP_8(Tagged(), func, IType, IsLeft, cond, offset, a, b, 8); + KOKKOS_IMPL_TAGGED_TILE_LOOP_8(Tagged(), func, IType, IsLeft, cond, offset, + a, b, 8); } template static void apply(ValType& value, Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b) { - TAGGED_TILE_LOOP_8_REDUX(value, Tagged(), func, IType, IsLeft, cond, offset, - a, b, 8); + KOKKOS_IMPL_TAGGED_TILE_LOOP_8_REDUX(value, Tagged(), func, IType, IsLeft, + cond, offset, a, b, 8); } }; // end Structs for calling loops @@ -1589,19 +1716,19 @@ struct HostIterateTile Date: Wed, 1 Feb 2023 09:39:06 -0800 Subject: [PATCH 142/496] Refactor OpenMPTarget backend (#5726) * OpenMPTarget: Refactored the current code into individual files that implement individual constructs. * OpenMPTarget: Block unit tests that fail with llvm/16. * OpenMPTarget: Adding the ParallelScan for hierarchical parallelism in a different file. * OpenMPTarget: Add ParallelScan for TeamPolicy in a separate file. * OpenMPTarget: Edits to include file names where needed. * OpenMPTarget: Removed the diff_files file. * OpenMPTarget: Rolled back the changes in the CMakeList for unit test. * OpenMPTarget: diff_files reverted as of the develop branch. Moved Kokkos_OpenMPTargetSpace file into OpenMPTarget dir. * OpenMPTarget: Fixing diff_files. * OpenMPTarget: Delete empty lines at the top before license information. * OpenMPTarget: Adding the OpenMPTargetSpace file which was removed from the dir above. 
* OpenMPTarget: Moved the Kokkos_OpenMPTarget.hpp file inside OpenMPTarget dir. --------- Co-authored-by: Rahulkumar Gayatri Co-authored-by: Daniel Arndt --- .../Kokkos_OpenMPTarget.hpp | 3 +- .../OpenMPTarget/Kokkos_OpenMPTargetSpace.cpp | 4 +- .../Kokkos_OpenMPTargetSpace.hpp | 0 .../OpenMPTarget/Kokkos_OpenMPTarget_Exec.hpp | 1929 ----------------- .../Kokkos_OpenMPTarget_Instance.cpp | 2 +- .../Kokkos_OpenMPTarget_Parallel.hpp | 1746 +++++---------- .../Kokkos_OpenMPTarget_ParallelFor_Range.hpp | 72 + .../Kokkos_OpenMPTarget_ParallelFor_Team.hpp | 170 ++ ...kkos_OpenMPTarget_ParallelReduce_Range.hpp | 133 ++ ...okkos_OpenMPTarget_ParallelReduce_Team.hpp | 551 +++++ ...Kokkos_OpenMPTarget_ParallelScan_Range.hpp | 252 +++ .../Kokkos_OpenMPTarget_ParallelScan_Team.hpp | 129 ++ .../Kokkos_OpenMPTarget_Parallel_Common.hpp | 675 ++++++ .../Kokkos_OpenMPTarget_Parallel_MDRange.hpp | 25 +- .../Kokkos_OpenMPTarget_Reducer.hpp | 694 ++++++ .../Kokkos_OpenMPTarget_UniqueToken.hpp | 2 +- core/src/decl/Kokkos_Declare_OPENMPTARGET.hpp | 11 +- 17 files changed, 3293 insertions(+), 3105 deletions(-) rename core/src/{ => OpenMPTarget}/Kokkos_OpenMPTarget.hpp (98%) rename core/src/{ => OpenMPTarget}/Kokkos_OpenMPTargetSpace.hpp (100%) delete mode 100644 core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.hpp create mode 100644 core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_Range.hpp create mode 100644 core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_Team.hpp create mode 100644 core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Range.hpp create mode 100644 core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Team.hpp create mode 100644 core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Range.hpp create mode 100644 core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Team.hpp create mode 100644 core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_Common.hpp create mode 100644 core/src/OpenMPTarget/Kokkos_OpenMPTarget_Reducer.hpp diff --git 
a/core/src/Kokkos_OpenMPTarget.hpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget.hpp similarity index 98% rename from core/src/Kokkos_OpenMPTarget.hpp rename to core/src/OpenMPTarget/Kokkos_OpenMPTarget.hpp index 4bcfed90e3..adf972dd08 100644 --- a/core/src/Kokkos_OpenMPTarget.hpp +++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget.hpp @@ -30,7 +30,7 @@ static_assert(false, #include #include -#include +#include #include #include #include @@ -141,7 +141,6 @@ struct DeviceTypeTraits<::Kokkos::Experimental::OpenMPTarget> { /*--------------------------------------------------------------------------*/ /*--------------------------------------------------------------------------*/ -#include #include #include #include diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.cpp b/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.cpp index f30abb0c87..de8e629831 100644 --- a/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.cpp +++ b/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.cpp @@ -33,8 +33,8 @@ #include #include -#include -#include +#include +#include #include #include #include diff --git a/core/src/Kokkos_OpenMPTargetSpace.hpp b/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.hpp similarity index 100% rename from core/src/Kokkos_OpenMPTargetSpace.hpp rename to core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.hpp diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.hpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.hpp deleted file mode 100644 index 6d62a3c7e4..0000000000 --- a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.hpp +++ /dev/null @@ -1,1929 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. 
-// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOS_OPENMPTARGETEXEC_HPP -#define KOKKOS_OPENMPTARGETEXEC_HPP - -#include -#include - -#include -#include "Kokkos_OpenMPTarget_Abort.hpp" - -// FIXME_OPENMPTARGET - Using this macro to implement a workaround for -// hierarchical reducers. It avoids hitting the code path which we wanted to -// write but doesn't work. undef'ed at the end. -// Intel compilers prefer the non-workaround version. -#ifndef KOKKOS_ARCH_INTEL_GPU -#define KOKKOS_IMPL_HIERARCHICAL_REDUCERS_WORKAROUND -#endif - -// FIXME_OPENMPTARGET - Using this macro to implement a workaround for -// hierarchical scan. It avoids hitting the code path which we wanted to -// write but doesn't work. undef'ed at the end. -#ifndef KOKKOS_ARCH_INTEL_GPU -#define KOKKOS_IMPL_TEAM_SCAN_WORKAROUND -#endif - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - -template -struct OpenMPTargetReducerWrapper { - using value_type = typename Reducer::value_type; - - // Using a generic unknown Reducer for the OpenMPTarget backend is not - // implemented. 
- KOKKOS_INLINE_FUNCTION - static void join(value_type&, const value_type&) = delete; - - KOKKOS_INLINE_FUNCTION - static void join(volatile value_type&, const volatile value_type&) = delete; - - KOKKOS_INLINE_FUNCTION - static void init(value_type&) = delete; -}; - -template -struct OpenMPTargetReducerWrapper> { - public: - // Required - using value_type = std::remove_cv_t; - - // Required - KOKKOS_INLINE_FUNCTION - static void join(value_type& dest, const value_type& src) { dest += src; } - - KOKKOS_INLINE_FUNCTION - static void join(volatile value_type& dest, const volatile value_type& src) { - dest += src; - } - - KOKKOS_INLINE_FUNCTION - static void init(value_type& val) { - val = reduction_identity::sum(); - } -}; - -template -struct OpenMPTargetReducerWrapper> { - public: - // Required - using value_type = std::remove_cv_t; - - // Required - KOKKOS_INLINE_FUNCTION - static void join(value_type& dest, const value_type& src) { dest *= src; } - - KOKKOS_INLINE_FUNCTION - static void join(volatile value_type& dest, const volatile value_type& src) { - dest *= src; - } - - KOKKOS_INLINE_FUNCTION - static void init(value_type& val) { - val = reduction_identity::prod(); - } -}; - -template -struct OpenMPTargetReducerWrapper> { - public: - // Required - using value_type = std::remove_cv_t; - - // Required - KOKKOS_INLINE_FUNCTION - static void join(value_type& dest, const value_type& src) { - if (src < dest) dest = src; - } - - KOKKOS_INLINE_FUNCTION - static void join(volatile value_type& dest, const volatile value_type& src) { - if (src < dest) dest = src; - } - - KOKKOS_INLINE_FUNCTION - static void init(value_type& val) { - val = reduction_identity::min(); - } -}; - -template -struct OpenMPTargetReducerWrapper> { - public: - // Required - using value_type = std::remove_cv_t; - - // Required - KOKKOS_INLINE_FUNCTION - static void join(value_type& dest, const value_type& src) { - if (src > dest) dest = src; - } - - KOKKOS_INLINE_FUNCTION - static void join(volatile 
value_type& dest, const volatile value_type& src) { - if (src > dest) dest = src; - } - - // Required - KOKKOS_INLINE_FUNCTION - static void init(value_type& val) { - val = reduction_identity::max(); - } -}; - -template -struct OpenMPTargetReducerWrapper> { - public: - // Required - using value_type = std::remove_cv_t; - - KOKKOS_INLINE_FUNCTION - static void join(value_type& dest, const value_type& src) { - dest = dest && src; - } - - KOKKOS_INLINE_FUNCTION - static void join(volatile value_type& dest, const volatile value_type& src) { - dest = dest && src; - } - - KOKKOS_INLINE_FUNCTION - static void init(value_type& val) { - val = reduction_identity::land(); - } -}; - -template -struct OpenMPTargetReducerWrapper> { - public: - // Required - using value_type = std::remove_cv_t; - - using result_view_type = Kokkos::View; - - // Required - KOKKOS_INLINE_FUNCTION - static void join(value_type& dest, const value_type& src) { - dest = dest || src; - } - - KOKKOS_INLINE_FUNCTION - static void join(volatile value_type& dest, const volatile value_type& src) { - dest = dest || src; - } - - KOKKOS_INLINE_FUNCTION - static void init(value_type& val) { - val = reduction_identity::lor(); - } -}; - -template -struct OpenMPTargetReducerWrapper> { - public: - // Required - using value_type = std::remove_cv_t; - - // Required - KOKKOS_INLINE_FUNCTION - static void join(value_type& dest, const value_type& src) { - dest = dest & src; - } - - KOKKOS_INLINE_FUNCTION - static void join(volatile value_type& dest, const volatile value_type& src) { - dest = dest & src; - } - - KOKKOS_INLINE_FUNCTION - static void init(value_type& val) { - val = reduction_identity::band(); - } -}; - -template -struct OpenMPTargetReducerWrapper> { - public: - // Required - using value_type = std::remove_cv_t; - - // Required - KOKKOS_INLINE_FUNCTION - static void join(value_type& dest, const value_type& src) { - dest = dest | src; - } - - KOKKOS_INLINE_FUNCTION - static void join(volatile value_type& dest, 
const volatile value_type& src) { - dest = dest | src; - } - - KOKKOS_INLINE_FUNCTION - static void init(value_type& val) { - val = reduction_identity::bor(); - } -}; - -template -struct OpenMPTargetReducerWrapper> { - private: - using scalar_type = std::remove_cv_t; - using index_type = std::remove_cv_t; - - public: - // Required - using value_type = ValLocScalar; - - // Required - KOKKOS_INLINE_FUNCTION - static void join(value_type& dest, const value_type& src) { - if (src.val < dest.val) dest = src; - } - - KOKKOS_INLINE_FUNCTION - static void join(volatile value_type& dest, const volatile value_type& src) { - if (src.val < dest.val) dest = src; - } - - KOKKOS_INLINE_FUNCTION - static void init(value_type& val) { - val.val = reduction_identity::min(); - val.loc = reduction_identity::min(); - } -}; - -template -struct OpenMPTargetReducerWrapper> { - private: - using scalar_type = std::remove_cv_t; - using index_type = std::remove_cv_t; - - public: - // Required - using value_type = ValLocScalar; - - KOKKOS_INLINE_FUNCTION - static void join(value_type& dest, const value_type& src) { - if (src.val > dest.val) dest = src; - } - - KOKKOS_INLINE_FUNCTION - static void join(volatile value_type& dest, const volatile value_type& src) { - if (src.val > dest.val) dest = src; - } - - KOKKOS_INLINE_FUNCTION - static void init(value_type& val) { - val.val = reduction_identity::max(); - val.loc = reduction_identity::min(); - } -}; - -template -struct OpenMPTargetReducerWrapper> { - private: - using scalar_type = std::remove_cv_t; - - public: - // Required - using value_type = MinMaxScalar; - - // Required - KOKKOS_INLINE_FUNCTION - static void join(value_type& dest, const value_type& src) { - if (src.min_val < dest.min_val) { - dest.min_val = src.min_val; - } - if (src.max_val > dest.max_val) { - dest.max_val = src.max_val; - } - } - - KOKKOS_INLINE_FUNCTION - static void join(volatile value_type& dest, const volatile value_type& src) { - if (src.min_val < dest.min_val) { - 
dest.min_val = src.min_val; - } - if (src.max_val > dest.max_val) { - dest.max_val = src.max_val; - } - } - - KOKKOS_INLINE_FUNCTION - static void init(value_type& val) { - val.max_val = reduction_identity::max(); - val.min_val = reduction_identity::min(); - } -}; - -template -struct OpenMPTargetReducerWrapper> { - private: - using scalar_type = std::remove_cv_t; - using index_type = std::remove_cv_t; - - public: - // Required - using value_type = MinMaxLocScalar; - - // Required - KOKKOS_INLINE_FUNCTION - static void join(value_type& dest, const value_type& src) { - if (src.min_val < dest.min_val) { - dest.min_val = src.min_val; - dest.min_loc = src.min_loc; - } - if (src.max_val > dest.max_val) { - dest.max_val = src.max_val; - dest.max_loc = src.max_loc; - } - } - - KOKKOS_INLINE_FUNCTION - static void join(volatile value_type& dest, const volatile value_type& src) { - if (src.min_val < dest.min_val) { - dest.min_val = src.min_val; - dest.min_loc = src.min_loc; - } - if (src.max_val > dest.max_val) { - dest.max_val = src.max_val; - dest.max_loc = src.max_loc; - } - } - - KOKKOS_INLINE_FUNCTION - static void init(value_type& val) { - val.max_val = reduction_identity::max(); - val.min_val = reduction_identity::min(); - val.max_loc = reduction_identity::min(); - val.min_loc = reduction_identity::min(); - } -}; - -// -// specialize for MaxFirstLoc -// -template -struct OpenMPTargetReducerWrapper> { - private: - using scalar_type = std::remove_cv_t; - using index_type = std::remove_cv_t; - - public: - // Required - using value_type = ValLocScalar; - -// WORKAROUND OPENMPTARGET -// This pragma omp declare target should not be necessary, but Intel compiler -// fails without it -#pragma omp declare target - KOKKOS_INLINE_FUNCTION - static void join(value_type& dest, const value_type& src) { - if (dest.val < src.val) { - dest = src; - } else if (!(src.val < dest.val)) { - dest.loc = (src.loc < dest.loc) ? 
src.loc : dest.loc; - } - } - - KOKKOS_INLINE_FUNCTION - static void join(volatile value_type& dest, const volatile value_type& src) { - if (dest.val < src.val) { - dest = src; - } else if (!(src.val < dest.val)) { - dest.loc = (src.loc < dest.loc) ? src.loc : dest.loc; - } - } - - KOKKOS_INLINE_FUNCTION - static void init(value_type& val) { - val.val = reduction_identity::max(); - val.loc = reduction_identity::min(); - } -#pragma omp end declare target -}; - -// -// specialize for MinFirstLoc -// -template -struct OpenMPTargetReducerWrapper> { - private: - using scalar_type = std::remove_cv_t; - using index_type = std::remove_cv_t; - - public: - // Required - using value_type = ValLocScalar; - -// WORKAROUND OPENMPTARGET -// This pragma omp declare target should not be necessary, but Intel compiler -// fails without it -#pragma omp declare target - KOKKOS_INLINE_FUNCTION - static void join(value_type& dest, const value_type& src) { - if (src.val < dest.val) { - dest = src; - } else if (!(dest.val < src.val)) { - dest.loc = (src.loc < dest.loc) ? src.loc : dest.loc; - } - } - - KOKKOS_INLINE_FUNCTION - static void join(volatile value_type& dest, const volatile value_type& src) { - if (src.val < dest.val) { - dest = src; - } else if (!(dest.val < src.val)) { - dest.loc = (src.loc < dest.loc) ? 
src.loc : dest.loc; - } - } - - KOKKOS_INLINE_FUNCTION - static void init(value_type& val) { - val.val = reduction_identity::min(); - val.loc = reduction_identity::min(); - } -#pragma omp end declare target -}; - -// -// specialize for MinMaxFirstLastLoc -// -template -struct OpenMPTargetReducerWrapper> { - private: - using scalar_type = std::remove_cv_t; - using index_type = std::remove_cv_t; - - public: - // Required - using value_type = MinMaxLocScalar; - -// WORKAROUND OPENMPTARGET -// This pragma omp declare target should not be necessary, but Intel compiler -// fails without it -#pragma omp declare target - // Required - KOKKOS_INLINE_FUNCTION - static void join(value_type& dest, const value_type& src) { - if (src.min_val < dest.min_val) { - dest.min_val = src.min_val; - dest.min_loc = src.min_loc; - } else if (!(dest.min_val < src.min_val)) { - dest.min_loc = (src.min_loc < dest.min_loc) ? src.min_loc : dest.min_loc; - } - - if (dest.max_val < src.max_val) { - dest.max_val = src.max_val; - dest.max_loc = src.max_loc; - } else if (!(src.max_val < dest.max_val)) { - dest.max_loc = (src.max_loc > dest.max_loc) ? src.max_loc : dest.max_loc; - } - } - - KOKKOS_INLINE_FUNCTION - static void join(volatile value_type& dest, const volatile value_type& src) { - if (src.min_val < dest.min_val) { - dest.min_val = src.min_val; - dest.min_loc = src.min_loc; - } else if (!(dest.min_val < src.min_val)) { - dest.min_loc = (src.min_loc < dest.min_loc) ? src.min_loc : dest.min_loc; - } - - if (dest.max_val < src.max_val) { - dest.max_val = src.max_val; - dest.max_loc = src.max_loc; - } else if (!(src.max_val < dest.max_val)) { - dest.max_loc = (src.max_loc > dest.max_loc) ? 
src.max_loc : dest.max_loc; - } - } - - KOKKOS_INLINE_FUNCTION - static void init(value_type& val) { - val.max_val = reduction_identity::max(); - val.min_val = reduction_identity::min(); - val.max_loc = reduction_identity::max(); - val.min_loc = reduction_identity::min(); - } -#pragma omp end declare target -}; - -// -// specialize for FirstLoc -// -template -struct OpenMPTargetReducerWrapper> { - private: - using index_type = std::remove_cv_t; - - public: - // Required - using value_type = FirstLocScalar; - -// WORKAROUND OPENMPTARGET -// This pragma omp declare target should not be necessary, but Intel compiler -// fails without it -#pragma omp declare target - // Required - KOKKOS_INLINE_FUNCTION - static void join(value_type& dest, const value_type& src) { - dest.min_loc_true = (src.min_loc_true < dest.min_loc_true) - ? src.min_loc_true - : dest.min_loc_true; - } - - KOKKOS_INLINE_FUNCTION - static void join(volatile value_type& dest, const volatile value_type& src) { - dest.min_loc_true = (src.min_loc_true < dest.min_loc_true) - ? src.min_loc_true - : dest.min_loc_true; - } - - KOKKOS_INLINE_FUNCTION - static void init(value_type& val) { - val.min_loc_true = reduction_identity::min(); - } -#pragma omp end declare target -}; - -// -// specialize for LastLoc -// -template -struct OpenMPTargetReducerWrapper> { - private: - using index_type = std::remove_cv_t; - - public: - // Required - using value_type = LastLocScalar; - -// WORKAROUND OPENMPTARGET -// This pragma omp declare target should not be necessary, but Intel compiler -// fails without it -#pragma omp declare target - // Required - KOKKOS_INLINE_FUNCTION - static void join(value_type& dest, const value_type& src) { - dest.max_loc_true = (src.max_loc_true > dest.max_loc_true) - ? src.max_loc_true - : dest.max_loc_true; - } - - KOKKOS_INLINE_FUNCTION - static void join(volatile value_type& dest, const volatile value_type& src) { - dest.max_loc_true = (src.max_loc_true > dest.max_loc_true) - ? 
src.max_loc_true - : dest.max_loc_true; - } - - KOKKOS_INLINE_FUNCTION - static void init(value_type& val) { - val.max_loc_true = reduction_identity::max(); - } -#pragma omp end declare target -}; - -// -// specialize for StdIsPartitioned -// -template -struct OpenMPTargetReducerWrapper> { - private: - using index_type = std::remove_cv_t; - - public: - // Required - using value_type = StdIsPartScalar; - -// WORKAROUND OPENMPTARGET -// This pragma omp declare target should not be necessary, but Intel compiler -// fails without it -#pragma omp declare target - // Required - KOKKOS_INLINE_FUNCTION - static void join(value_type& dest, const value_type& src) { - dest.max_loc_true = (dest.max_loc_true < src.max_loc_true) - ? src.max_loc_true - : dest.max_loc_true; - - dest.min_loc_false = (dest.min_loc_false < src.min_loc_false) - ? dest.min_loc_false - : src.min_loc_false; - } - - KOKKOS_INLINE_FUNCTION - static void join(volatile value_type& dest, const volatile value_type& src) { - dest.max_loc_true = (dest.max_loc_true < src.max_loc_true) - ? src.max_loc_true - : dest.max_loc_true; - - dest.min_loc_false = (dest.min_loc_false < src.min_loc_false) - ? dest.min_loc_false - : src.min_loc_false; - } - - KOKKOS_INLINE_FUNCTION - static void init(value_type& val) { - val.max_loc_true = ::Kokkos::reduction_identity::max(); - val.min_loc_false = ::Kokkos::reduction_identity::min(); - } -#pragma omp end declare target -}; - -// -// specialize for StdPartitionPoint -// -template -struct OpenMPTargetReducerWrapper> { - private: - using index_type = std::remove_cv_t; - - public: - // Required - using value_type = StdPartPointScalar; - -// WORKAROUND OPENMPTARGET -// This pragma omp declare target should not be necessary, but Intel compiler -// fails without it -#pragma omp declare target - // Required - KOKKOS_INLINE_FUNCTION - static void join(value_type& dest, const value_type& src) { - dest.min_loc_false = (dest.min_loc_false < src.min_loc_false) - ? 
dest.min_loc_false - : src.min_loc_false; - } - - KOKKOS_INLINE_FUNCTION - static void join(volatile value_type& dest, const volatile value_type& src) { - dest.min_loc_false = (dest.min_loc_false < src.min_loc_false) - ? dest.min_loc_false - : src.min_loc_false; - } - - KOKKOS_INLINE_FUNCTION - static void init(value_type& val) { - val.min_loc_false = ::Kokkos::reduction_identity::min(); - } -#pragma omp end declare target -}; - -/* -template -class OpenMPTargetReducerWrapper { - public: - const ReducerType& reducer; - using value_type = typename ReducerType::value_type; - value_type& value; - - KOKKOS_INLINE_FUNCTION - void join(const value_type& upd) { - reducer.join(value,upd); - } - - KOKKOS_INLINE_FUNCTION - void init(const value_type& upd) { - reducer.init(value,upd); - } -};*/ - -} // namespace Impl -} // namespace Kokkos - -namespace Kokkos { -namespace Impl { - -//---------------------------------------------------------------------------- -/** \brief Data for OpenMPTarget thread execution */ - -class OpenMPTargetExec { - public: - // FIXME_OPENMPTARGET - Currently the maximum number of - // teams possible is calculated based on NVIDIA's Volta GPU. In - // future this value should be based on the chosen architecture for the - // OpenMPTarget backend. 
- static constexpr int MAX_ACTIVE_THREADS = 2080 * 80; - static constexpr int MAX_ACTIVE_TEAMS = MAX_ACTIVE_THREADS / 32; - - private: - static void* scratch_ptr; - - public: - static void verify_is_process(const char* const); - static void verify_initialized(const char* const); - - static int* get_lock_array(int num_teams); - static void* get_scratch_ptr(); - static void clear_scratch(); - static void clear_lock_array(); - static void resize_scratch(int64_t team_reduce_bytes, - int64_t team_shared_bytes, - int64_t thread_local_bytes, int64_t league_size); - - static void* m_scratch_ptr; - static int64_t m_scratch_size; - static int* m_lock_array; - static int64_t m_lock_size; - static uint32_t* m_uniquetoken_ptr; -}; - -} // namespace Impl -} // namespace Kokkos - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - -class OpenMPTargetExecTeamMember { - public: - static constexpr int TEAM_REDUCE_SIZE = 512; - - using execution_space = Kokkos::Experimental::OpenMPTarget; - using scratch_memory_space = execution_space::scratch_memory_space; - using team_handle = OpenMPTargetExecTeamMember; - - scratch_memory_space m_team_shared; - size_t m_team_scratch_size[2]; - int m_team_rank; - int m_team_size; - int m_league_rank; - int m_league_size; - int m_vector_length; - int m_vector_lane; - int m_shmem_block_index; - void* m_glb_scratch; - void* m_reduce_scratch; - - public: - KOKKOS_INLINE_FUNCTION - const execution_space::scratch_memory_space& team_shmem() const { - return m_team_shared.set_team_thread_mode(0, 1, 0); - } - - // set_team_thread_mode routine parameters for future understanding: - // first parameter - scratch level. - // second parameter - size multiplier for advancing scratch ptr after a - // request was serviced. 
third parameter - offset size multiplier from current - // scratch ptr when returning a ptr for a request. - KOKKOS_INLINE_FUNCTION - const execution_space::scratch_memory_space& team_scratch(int level) const { - return m_team_shared.set_team_thread_mode(level, 1, 0); - } - - KOKKOS_INLINE_FUNCTION - const execution_space::scratch_memory_space& thread_scratch(int level) const { - return m_team_shared.set_team_thread_mode(level, team_size(), team_rank()); - } - - KOKKOS_INLINE_FUNCTION int league_rank() const { return m_league_rank; } - KOKKOS_INLINE_FUNCTION int league_size() const { return m_league_size; } - KOKKOS_INLINE_FUNCTION int team_rank() const { return m_team_rank; } - KOKKOS_INLINE_FUNCTION int team_size() const { return m_team_size; } - KOKKOS_INLINE_FUNCTION void* impl_reduce_scratch() const { - return m_reduce_scratch; - } - - KOKKOS_INLINE_FUNCTION void team_barrier() const { -#pragma omp barrier - } - - template - KOKKOS_INLINE_FUNCTION void team_broadcast(ValueType& value, - int thread_id) const { - // Make sure there is enough scratch space: - using type = std::conditional_t<(sizeof(ValueType) < TEAM_REDUCE_SIZE), - ValueType, void>; - type* team_scratch = - reinterpret_cast(static_cast(m_glb_scratch) + - TEAM_REDUCE_SIZE * omp_get_team_num()); -#pragma omp barrier - if (team_rank() == thread_id) *team_scratch = value; -#pragma omp barrier - value = *team_scratch; - } - - template - KOKKOS_INLINE_FUNCTION void team_broadcast(const Closure& f, ValueType& value, - const int& thread_id) const { - f(value); - team_broadcast(value, thread_id); - } - - // FIXME_OPENMPTARGET this function has the wrong interface and currently - // ignores the reducer passed. 
- template - KOKKOS_INLINE_FUNCTION ValueType team_reduce(const ValueType& value, - const JoinOp&) const { -#pragma omp barrier - - using value_type = ValueType; - // const JoinLambdaAdapter op(op_in); - - // Make sure there is enough scratch space: - using type = std::conditional_t<(sizeof(value_type) < TEAM_REDUCE_SIZE), - value_type, void>; - - const int n_values = TEAM_REDUCE_SIZE / sizeof(value_type); - type* team_scratch = - reinterpret_cast(static_cast(m_glb_scratch) + - TEAM_REDUCE_SIZE * omp_get_team_num()); - for (int i = m_team_rank; i < n_values; i += m_team_size) { - team_scratch[i] = value_type(); - } - -#pragma omp barrier - - for (int k = 0; k < m_team_size; k += n_values) { - if ((k <= m_team_rank) && (k + n_values > m_team_rank)) - team_scratch[m_team_rank % n_values] += value; -#pragma omp barrier - } - - for (int d = 1; d < n_values; d *= 2) { - if ((m_team_rank + d < n_values) && (m_team_rank % (2 * d) == 0)) { - team_scratch[m_team_rank] += team_scratch[m_team_rank + d]; - } -#pragma omp barrier - } - return team_scratch[0]; - } - /** \brief Intra-team exclusive prefix sum with team_rank() ordering - * with intra-team non-deterministic ordering accumulation. - * - * The global inter-team accumulation value will, at the end of the - * league's parallel execution, be the scan's total. - * Parallel execution ordering of the league's teams is non-deterministic. - * As such the base value for each team's scan operation is similarly - * non-deterministic. 
- */ - template - KOKKOS_INLINE_FUNCTION ArgType - team_scan(const ArgType& /*value*/, ArgType* const /*global_accum*/) const { - // FIXME_OPENMPTARGET - /* // Make sure there is enough scratch space: - using type = - std::conditional_t<(sizeof(ArgType) < TEAM_REDUCE_SIZE), ArgType, void>; - - volatile type * const work_value = ((type*) m_exec.scratch_thread()); - - *work_value = value ; - - memory_fence(); - - if ( team_fan_in() ) { - // The last thread to synchronize returns true, all other threads wait - for team_fan_out() - // m_team_base[0] == highest ranking team member - // m_team_base[ m_team_size - 1 ] == lowest ranking team member - // - // 1) copy from lower to higher rank, initialize lowest rank to zero - // 2) prefix sum from lowest to highest rank, skipping lowest rank - - type accum = 0 ; - - if ( global_accum ) { - for ( int i = m_team_size ; i-- ; ) { - type & val = *((type*) m_exec.pool_rev( m_team_base_rev + i - )->scratch_thread()); accum += val ; - } - accum = atomic_fetch_add( global_accum , accum ); - } - - for ( int i = m_team_size ; i-- ; ) { - type & val = *((type*) m_exec.pool_rev( m_team_base_rev + i - )->scratch_thread()); const type offset = accum ; accum += val ; val = - offset ; - } - - memory_fence(); - } - - team_fan_out(); - - return *work_value ;*/ - return ArgType(); - } - - /** \brief Intra-team exclusive prefix sum with team_rank() ordering. - * - * The highest rank thread can compute the reduction total as - * reduction_total = dev.team_scan( value ) + value ; - */ - template - KOKKOS_INLINE_FUNCTION Type team_scan(const Type& value) const { - return this->template team_scan(value, 0); - } - - //---------------------------------------- - // Private for the driver - - private: - using space = execution_space::scratch_memory_space; - - public: - // FIXME_OPENMPTARGET - 512(16*32) bytes at the begining of the scratch space - // for each league is saved for reduction. 
It should actually be based on the - // ValueType of the reduction variable. - inline OpenMPTargetExecTeamMember( - const int league_rank, const int league_size, const int team_size, - const int vector_length // const TeamPolicyInternal< OpenMPTarget, - // Properties ...> & team - , - void* const glb_scratch, const int shmem_block_index, - const size_t shmem_size_L0, const size_t shmem_size_L1) - : m_team_scratch_size{shmem_size_L0, shmem_size_L1}, - m_team_rank(0), - m_team_size(team_size), - m_league_rank(league_rank), - m_league_size(league_size), - m_vector_length(vector_length), - m_shmem_block_index(shmem_block_index), - m_glb_scratch(glb_scratch) { - const int omp_tid = omp_get_thread_num(); - - // The scratch memory allocated is a sum of TEAM_REDUCE_SIZE, L0 shmem size - // and L1 shmem size. TEAM_REDUCE_SIZE = 512 bytes saved per team for - // hierarchical reduction. There is an additional 10% of the requested - // scratch memory allocated per team as padding. Hence the product with 0.1. - const int reduce_offset = - m_shmem_block_index * - (shmem_size_L0 + shmem_size_L1 + - ((shmem_size_L0 + shmem_size_L1) * 0.1) + TEAM_REDUCE_SIZE); - const int l0_offset = reduce_offset + TEAM_REDUCE_SIZE; - const int l1_offset = l0_offset + shmem_size_L0; - m_team_shared = scratch_memory_space( - (static_cast(glb_scratch) + l0_offset), shmem_size_L0, - static_cast(glb_scratch) + l1_offset, shmem_size_L1); - m_reduce_scratch = static_cast(glb_scratch) + reduce_offset; - m_league_rank = league_rank; - m_team_rank = omp_tid; - m_vector_lane = 0; - } - - static inline int team_reduce_size() { return TEAM_REDUCE_SIZE; } -}; - -template -class TeamPolicyInternal - : public PolicyTraits { - public: - //! 
Tag this class as a kokkos execution policy - using execution_policy = TeamPolicyInternal; - - using traits = PolicyTraits; - - //---------------------------------------- - - template - inline static int team_size_max(const FunctorType&, const ParallelForTag&) { - return 256; - } - - template - inline static int team_size_max(const FunctorType&, - const ParallelReduceTag&) { - return 256; - } - - template - inline static int team_size_max(const FunctorType&, const ReducerType&, - const ParallelReduceTag&) { - return 256; - } - - template - inline static int team_size_recommended(const FunctorType&, - const ParallelForTag&) { - return 128; - } - - template - inline static int team_size_recommended(const FunctorType&, - const ParallelReduceTag&) { - return 128; - } - - template - inline static int team_size_recommended(const FunctorType&, - const ReducerType&, - const ParallelReduceTag&) { - return 128; - } - - //---------------------------------------- - - private: - int m_league_size; - int m_team_size; - int m_vector_length; - int m_team_alloc; - int m_team_iter; - std::array m_team_scratch_size; - std::array m_thread_scratch_size; - bool m_tune_team_size; - bool m_tune_vector_length; - constexpr const static size_t default_team_size = 256; - int m_chunk_size; - - inline void init(const int league_size_request, const int team_size_request, - const int vector_length_request) { - m_league_size = league_size_request; - - // Minimum team size should be 32 for OpenMPTarget backend. - if (team_size_request < 32) { - Kokkos::Impl::OpenMPTarget_abort( - "OpenMPTarget backend requires a minimum of 32 threads per team.\n"); - } else - m_team_size = team_size_request; - - m_vector_length = vector_length_request; - set_auto_chunk_size(); - } - - template - friend class TeamPolicyInternal; - - public: - // FIXME_OPENMPTARGET : Currently this routine is a copy of the Cuda - // implementation, but this has to be tailored to be architecture specific. 
- inline static int scratch_size_max(int level) { - return ( - level == 0 ? 1024 * 40 : // 48kB is the max for CUDA, but we need some - // for team_member.reduce etc. - 20 * 1024 * - 1024); // arbitrarily setting this to 20MB, for a Volta V100 - // that would give us about 3.2GB for 2 teams per SM - } - inline bool impl_auto_team_size() const { return m_tune_team_size; } - inline bool impl_auto_vector_length() const { return m_tune_vector_length; } - inline void impl_set_team_size(const size_t size) { m_team_size = size; } - inline void impl_set_vector_length(const size_t length) { - m_tune_vector_length = length; - } - inline int impl_vector_length() const { return m_vector_length; } - inline int team_size() const { return m_team_size; } - inline int league_size() const { return m_league_size; } - inline size_t scratch_size(const int& level, int team_size_ = -1) const { - if (team_size_ < 0) team_size_ = m_team_size; - return m_team_scratch_size[level] + - team_size_ * m_thread_scratch_size[level]; - } - - inline Kokkos::Experimental::OpenMPTarget space() const { - return Kokkos::Experimental::OpenMPTarget(); - } - - template - TeamPolicyInternal(const TeamPolicyInternal& p) - : m_league_size(p.m_league_size), - m_team_size(p.m_team_size), - m_vector_length(p.m_vector_length), - m_team_alloc(p.m_team_alloc), - m_team_iter(p.m_team_iter), - m_team_scratch_size(p.m_team_scratch_size), - m_thread_scratch_size(p.m_thread_scratch_size), - m_tune_team_size(p.m_tune_team_size), - m_tune_vector_length(p.m_tune_vector_length), - m_chunk_size(p.m_chunk_size) {} - - /** \brief Specify league size, request team size */ - TeamPolicyInternal(const typename traits::execution_space&, - int league_size_request, int team_size_request, - int vector_length_request = 1) - : m_team_scratch_size{0, 0}, - m_thread_scratch_size{0, 0}, - m_tune_team_size(false), - m_tune_vector_length(false), - m_chunk_size(0) { - init(league_size_request, team_size_request, vector_length_request); - } - - 
TeamPolicyInternal(const typename traits::execution_space&, - int league_size_request, - const Kokkos::AUTO_t& /* team_size_request */ - , - int vector_length_request = 1) - : m_team_scratch_size{0, 0}, - m_thread_scratch_size{0, 0}, - m_tune_team_size(true), - m_tune_vector_length(false), - m_chunk_size(0) { - init(league_size_request, default_team_size / vector_length_request, - vector_length_request); - } - - TeamPolicyInternal(const typename traits::execution_space&, - int league_size_request, - const Kokkos::AUTO_t& /* team_size_request */ - , - const Kokkos::AUTO_t& /* vector_length_request */) - : m_team_scratch_size{0, 0}, - m_thread_scratch_size{0, 0}, - m_tune_team_size(true), - m_tune_vector_length(true), - m_chunk_size(0) { - init(league_size_request, default_team_size, 1); - } - TeamPolicyInternal(const typename traits::execution_space&, - int league_size_request, int team_size_request, - const Kokkos::AUTO_t& /* vector_length_request */) - : m_team_scratch_size{0, 0}, - m_thread_scratch_size{0, 0}, - m_tune_team_size(false), - m_tune_vector_length(true), - m_chunk_size(0) { - init(league_size_request, team_size_request, 1); - } - - TeamPolicyInternal(int league_size_request, int team_size_request, - int vector_length_request = 1) - : m_team_scratch_size{0, 0}, - m_thread_scratch_size{0, 0}, - m_tune_team_size(false), - m_tune_vector_length(false), - m_chunk_size(0) { - init(league_size_request, team_size_request, vector_length_request); - } - - TeamPolicyInternal(int league_size_request, - const Kokkos::AUTO_t& /* team_size_request */ - , - int vector_length_request = 1) - : m_team_scratch_size{0, 0}, - m_thread_scratch_size{0, 0}, - m_tune_team_size(true), - m_tune_vector_length(false), - m_chunk_size(0) { - init(league_size_request, default_team_size / vector_length_request, - vector_length_request); - } - - TeamPolicyInternal(int league_size_request, - const Kokkos::AUTO_t& /* team_size_request */ - , - const Kokkos::AUTO_t& /* 
vector_length_request */) - : m_team_scratch_size{0, 0}, - m_thread_scratch_size{0, 0}, - m_tune_team_size(true), - m_tune_vector_length(true), - m_chunk_size(0) { - init(league_size_request, default_team_size, 1); - } - TeamPolicyInternal(int league_size_request, int team_size_request, - const Kokkos::AUTO_t& /* vector_length_request */) - : m_team_scratch_size{0, 0}, - m_thread_scratch_size{0, 0}, - m_tune_team_size(false), - m_tune_vector_length(true), - m_chunk_size(0) { - init(league_size_request, team_size_request, 1); - } - inline static size_t vector_length_max() { - return 32; /* TODO: this is bad. Need logic that is compiler and backend - aware */ - } - inline int team_alloc() const { return m_team_alloc; } - inline int team_iter() const { return m_team_iter; } - - inline int chunk_size() const { return m_chunk_size; } - - /** \brief set chunk_size to a discrete value*/ - inline TeamPolicyInternal& set_chunk_size( - typename traits::index_type chunk_size_) { - m_chunk_size = chunk_size_; - return *this; - } - - /** \brief set per team scratch size for a specific level of the scratch - * hierarchy */ - inline TeamPolicyInternal& set_scratch_size(const int& level, - const PerTeamValue& per_team) { - m_team_scratch_size[level] = per_team.value; - return *this; - } - - /** \brief set per thread scratch size for a specific level of the scratch - * hierarchy */ - inline TeamPolicyInternal& set_scratch_size( - const int& level, const PerThreadValue& per_thread) { - m_thread_scratch_size[level] = per_thread.value; - return *this; - } - - /** \brief set per thread and per team scratch size for a specific level of - * the scratch hierarchy */ - inline TeamPolicyInternal& set_scratch_size( - const int& level, const PerTeamValue& per_team, - const PerThreadValue& per_thread) { - m_team_scratch_size[level] = per_team.value; - m_thread_scratch_size[level] = per_thread.value; - return *this; - } - - private: - /** \brief finalize chunk_size if it was set to AUTO*/ - 
inline void set_auto_chunk_size() { - int concurrency = 2048 * 128; - - if (concurrency == 0) concurrency = 1; - - if (m_chunk_size > 0) { - if (!Impl::is_integral_power_of_two(m_chunk_size)) - Kokkos::abort("TeamPolicy blocking granularity must be power of two"); - } - - int new_chunk_size = 1; - while (new_chunk_size * 100 * concurrency < m_league_size) - new_chunk_size *= 2; - if (new_chunk_size < 128) { - new_chunk_size = 1; - while ((new_chunk_size * 40 * concurrency < m_league_size) && - (new_chunk_size < 128)) - new_chunk_size *= 2; - } - m_chunk_size = new_chunk_size; - } - - public: - using member_type = Impl::OpenMPTargetExecTeamMember; -}; -} // namespace Impl - -} // namespace Kokkos - -namespace Kokkos { - -template -KOKKOS_INLINE_FUNCTION Impl::TeamThreadRangeBoundariesStruct< - iType, Impl::OpenMPTargetExecTeamMember> -TeamThreadRange(const Impl::OpenMPTargetExecTeamMember& thread, - const iType& count) { - return Impl::TeamThreadRangeBoundariesStruct< - iType, Impl::OpenMPTargetExecTeamMember>(thread, count); -} - -template -KOKKOS_INLINE_FUNCTION Impl::TeamThreadRangeBoundariesStruct< - std::common_type_t, Impl::OpenMPTargetExecTeamMember> -TeamThreadRange(const Impl::OpenMPTargetExecTeamMember& thread, - const iType1& begin, const iType2& end) { - using iType = std::common_type_t; - return Impl::TeamThreadRangeBoundariesStruct< - iType, Impl::OpenMPTargetExecTeamMember>(thread, iType(begin), - iType(end)); -} - -template -KOKKOS_INLINE_FUNCTION Impl::ThreadVectorRangeBoundariesStruct< - iType, Impl::OpenMPTargetExecTeamMember> -ThreadVectorRange(const Impl::OpenMPTargetExecTeamMember& thread, - const iType& count) { - return Impl::ThreadVectorRangeBoundariesStruct< - iType, Impl::OpenMPTargetExecTeamMember>(thread, count); -} - -template -KOKKOS_INLINE_FUNCTION Impl::ThreadVectorRangeBoundariesStruct< - std::common_type_t, Impl::OpenMPTargetExecTeamMember> -ThreadVectorRange(const Impl::OpenMPTargetExecTeamMember& thread, - const iType1& 
arg_begin, const iType2& arg_end) { - using iType = std::common_type_t; - return Impl::ThreadVectorRangeBoundariesStruct< - iType, Impl::OpenMPTargetExecTeamMember>(thread, iType(arg_begin), - iType(arg_end)); -} - -template -KOKKOS_INLINE_FUNCTION Impl::TeamVectorRangeBoundariesStruct< - iType, Impl::OpenMPTargetExecTeamMember> -TeamVectorRange(const Impl::OpenMPTargetExecTeamMember& thread, - const iType& count) { - return Impl::TeamVectorRangeBoundariesStruct< - iType, Impl::OpenMPTargetExecTeamMember>(thread, count); -} - -template -KOKKOS_INLINE_FUNCTION Impl::TeamVectorRangeBoundariesStruct< - std::common_type_t, Impl::OpenMPTargetExecTeamMember> -TeamVectorRange(const Impl::OpenMPTargetExecTeamMember& thread, - const iType1& arg_begin, const iType2& arg_end) { - using iType = std::common_type_t; - return Impl::TeamVectorRangeBoundariesStruct< - iType, Impl::OpenMPTargetExecTeamMember>(thread, iType(arg_begin), - iType(arg_end)); -} - -KOKKOS_INLINE_FUNCTION -Impl::ThreadSingleStruct PerTeam( - const Impl::OpenMPTargetExecTeamMember& thread) { - return Impl::ThreadSingleStruct(thread); -} - -KOKKOS_INLINE_FUNCTION -Impl::VectorSingleStruct PerThread( - const Impl::OpenMPTargetExecTeamMember& thread) { - return Impl::VectorSingleStruct(thread); -} -} // namespace Kokkos - -namespace Kokkos { - -/** \brief Inter-thread parallel_for. Executes lambda(iType i) for each - * i=0..N-1. - * - * The range i=0..N-1 is mapped to all threads of the the calling thread team. - */ -template -KOKKOS_INLINE_FUNCTION void parallel_for( - const Impl::TeamThreadRangeBoundariesStruct< - iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries, - const Lambda& lambda) { -#pragma omp for nowait schedule(static, 1) - for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) lambda(i); -} - -/** \brief Inter-thread vector parallel_reduce. Executes lambda(iType i, - * ValueType & val) for each i=0..N-1. 
- * - * The range i=0..N-1 is mapped to all threads of the the calling thread team - * and a summation of val is performed and put into result. - */ - -template -KOKKOS_INLINE_FUNCTION std::enable_if_t::value> -parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct< - iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries, - const Lambda& lambda, ValueType& result) { - // FIXME_OPENMPTARGET - Make sure that if its an array reduction, number of - // elements in the array <= 32. For reduction we allocate, 16 bytes per - // element in the scratch space, hence, 16*32 = 512. - static_assert(sizeof(ValueType) <= - Impl::OpenMPTargetExecTeamMember::TEAM_REDUCE_SIZE); - - ValueType* TeamThread_scratch = - static_cast(loop_boundaries.team.impl_reduce_scratch()); - -#pragma omp barrier - TeamThread_scratch[0] = ValueType(); -#pragma omp barrier - - if constexpr (std::is_arithmetic::value) { -#pragma omp for reduction(+ : TeamThread_scratch[:1]) - for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) { - ValueType tmp = ValueType(); - lambda(i, tmp); - TeamThread_scratch[0] += tmp; - } - } else { -#pragma omp declare reduction(custom:ValueType : omp_out += omp_in) - -#pragma omp for reduction(custom : TeamThread_scratch[:1]) - for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) { - ValueType tmp = ValueType(); - lambda(i, tmp); - TeamThread_scratch[0] += tmp; - } - } - - result = TeamThread_scratch[0]; -} - -#if !defined(KOKKOS_IMPL_HIERARCHICAL_REDUCERS_WORKAROUND) -// For some reason the actual version we wanted to write doesn't work -// and crashes. 
We should try this with every new compiler -// This is the variant we actually wanted to write -template -KOKKOS_INLINE_FUNCTION std::enable_if_t::value> -parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct< - iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries, - const Lambda& lambda, ReducerType result) { - using ValueType = typename ReducerType::value_type; - -#pragma omp declare reduction( \ - custominner:ValueType \ - : Impl::OpenMPTargetReducerWrapper ::join(omp_out, omp_in)) \ - initializer( \ - Impl::OpenMPTargetReducerWrapper ::init(omp_priv)) - - // FIXME_OPENMPTARGET - Make sure that if its an array reduction, number of - // elements in the array <= 32. For reduction we allocate, 16 bytes per - // element in the scratch space, hence, 16*32 = 512. - static_assert(sizeof(ValueType) <= - Impl::OpenMPTargetExecTeamMember::TEAM_REDUCE_SIZE); - - ValueType* TeamThread_scratch = - static_cast(loop_boundaries.team.impl_reduce_scratch()); - -#pragma omp barrier - Impl::OpenMPTargetReducerWrapper::init(TeamThread_scratch[0]); -#pragma omp barrier - -#pragma omp for reduction(custominner : TeamThread_scratch[:1]) - for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) { - lambda(i, TeamThread_scratch[0]); - } - result.reference() = TeamThread_scratch[0]; -} -#else -template -KOKKOS_INLINE_FUNCTION std::enable_if_t::value> -parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct< - iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries, - const Lambda& lambda, ReducerType result) { - using ValueType = typename ReducerType::value_type; - - // FIXME_OPENMPTARGET - Make sure that if its an array reduction, number of - // elements in the array <= 32. For reduction we allocate, 16 bytes per - // element in the scratch space, hence, 16*32 = 512. 
- static_assert(sizeof(ValueType) <= - Impl::OpenMPTargetExecTeamMember::TEAM_REDUCE_SIZE); - - ValueType* TeamThread_scratch = - static_cast(loop_boundaries.team.impl_reduce_scratch()); - -#pragma omp declare reduction( \ - omp_red_teamthread_reducer:ValueType \ - : Impl::OpenMPTargetReducerWrapper ::join(omp_out, omp_in)) \ - initializer( \ - Impl::OpenMPTargetReducerWrapper ::init(omp_priv)) - -#pragma omp barrier - ValueType tmp; - result.init(tmp); - TeamThread_scratch[0] = tmp; -#pragma omp barrier - - iType team_size = iType(omp_get_num_threads()); -#pragma omp for reduction(omp_red_teamthread_reducer \ - : TeamThread_scratch[:1]) schedule(static, 1) - for (iType t = 0; t < team_size; t++) { - ValueType tmp2; - result.init(tmp2); - - for (iType i = loop_boundaries.start + t; i < loop_boundaries.end; - i += team_size) { - lambda(i, tmp2); - } - - // FIXME_OPENMPTARGET: Join should work but doesn't. Every threads gets a - // private TeamThread_scratch[0] and at the end of the for-loop the `join` - // operation is performed by OpenMP itself and hence the simple assignment - // works. - // result.join(TeamThread_scratch[0], tmp2); - TeamThread_scratch[0] = tmp2; - } - - result.reference() = TeamThread_scratch[0]; -} -#endif // KOKKOS_IMPL_HIERARCHICAL_REDUCERS_WORKAROUND - -/** \brief Intra-thread vector parallel_reduce. Executes lambda(iType i, - * ValueType & val) for each i=0..N-1. - * - * The range i=0..N-1 is mapped to all vector lanes of the the calling thread - * and a reduction of val is performed using JoinType(ValueType& val, const - * ValueType& update) and put into init_result. The input value of init_result - * is used as initializer for temporary variables of ValueType. Therefore the - * input value should be the neutral element with respect to the join operation - * (e.g. '0 for +-' or '1 for *'). 
- */ -template -KOKKOS_INLINE_FUNCTION void parallel_reduce( - const Impl::TeamThreadRangeBoundariesStruct< - iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries, - const Lambda& lambda, const JoinType& join, ValueType& init_result) { - ValueType* TeamThread_scratch = - static_cast(loop_boundaries.team.impl_reduce_scratch()); - - // FIXME_OPENMPTARGET - Make sure that if its an array reduction, number of - // elements in the array <= 32. For reduction we allocate, 16 bytes per - // element in the scratch space, hence, 16*32 = 512. - static_assert(sizeof(ValueType) <= - Impl::OpenMPTargetExecTeamMember::TEAM_REDUCE_SIZE); - - // FIXME_OPENMPTARGET: Still need to figure out how to get value_count here. - const int value_count = 1; - -#pragma omp barrier - TeamThread_scratch[0] = init_result; -#pragma omp barrier - -#pragma omp for - for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) { - lambda(i, TeamThread_scratch[omp_get_num_threads() * value_count]); - } - - // Reduce all partial results within a team. 
- const int team_size = omp_get_num_threads(); - int tree_neighbor_offset = 1; - do { -#pragma omp for - for (int i = 0; i < team_size - tree_neighbor_offset; - i += 2 * tree_neighbor_offset) { - const int neighbor = i + tree_neighbor_offset; - join(lambda, &TeamThread_scratch[i * value_count], - &TeamThread_scratch[neighbor * value_count]); - } - tree_neighbor_offset *= 2; - } while (tree_neighbor_offset < team_size); - init_result = TeamThread_scratch[0]; -} - -// This is largely the same code as in HIP and CUDA except for the member name -template -KOKKOS_INLINE_FUNCTION void parallel_scan( - const Impl::TeamThreadRangeBoundariesStruct< - iType, Impl::OpenMPTargetExecTeamMember>& loop_bounds, - const FunctorType& lambda) { - using Analysis = Impl::FunctorAnalysis, - FunctorType>; - using value_type = typename Analysis::value_type; - - const auto start = loop_bounds.start; - const auto end = loop_bounds.end; - // Note this thing is called .member in the CUDA specialization of - // TeamThreadRangeBoundariesStruct - auto& member = loop_bounds.team; - const auto team_rank = member.team_rank(); - -#if defined(KOKKOS_IMPL_TEAM_SCAN_WORKAROUND) - value_type scan_val = value_type(); - - if (team_rank == 0) { - for (iType i = start; i < end; ++i) { - lambda(i, scan_val, true); - } - } -#pragma omp barrier -#else - const auto team_size = member.team_size(); - const auto nchunk = (end - start + team_size - 1) / team_size; - value_type accum = 0; - // each team has to process one or - // more chunks of the prefix scan - for (iType i = 0; i < nchunk; ++i) { - auto ii = start + i * team_size + team_rank; - // local accumulation for this chunk - value_type local_accum = 0; - // user updates value with prefix value - if (ii < loop_bounds.end) lambda(ii, local_accum, false); - // perform team scan - local_accum = member.team_scan(local_accum); - // add this blocks accum to total accumulation - auto val = accum + local_accum; - // user updates their data with total accumulation - 
if (ii < loop_bounds.end) lambda(ii, val, true); - // the last value needs to be propogated to next chunk - if (team_rank == team_size - 1) accum = val; - // broadcast last value to rest of the team - member.team_broadcast(accum, team_size - 1); - } -#endif -} - -} // namespace Kokkos - -namespace Kokkos { -/** \brief Intra-thread vector parallel_for. Executes lambda(iType i) for each - * i=0..N-1. - * - * The range i=0..N-1 is mapped to all vector lanes of the the calling thread. - */ -template -KOKKOS_INLINE_FUNCTION void parallel_for( - const Impl::ThreadVectorRangeBoundariesStruct< - iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries, - const Lambda& lambda) { -#pragma omp simd - for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) lambda(i); -} - -/** \brief Intra-thread vector parallel_reduce. Executes lambda(iType i, - * ValueType & val) for each i=0..N-1. - * - * The range i=0..N-1 is mapped to all vector lanes of the the calling thread - * and a summation of val is performed and put into result. 
- */ -template -KOKKOS_INLINE_FUNCTION void parallel_reduce( - const Impl::ThreadVectorRangeBoundariesStruct< - iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries, - const Lambda& lambda, ValueType& result) { - ValueType vector_reduce = ValueType(); - - if constexpr (std::is_arithmetic::value) { -#pragma omp simd reduction(+ : vector_reduce) - for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) { - ValueType tmp = ValueType(); - lambda(i, tmp); - vector_reduce += tmp; - } - } else { -#pragma omp declare reduction(custom:ValueType : omp_out += omp_in) - -#pragma omp simd reduction(custom : vector_reduce) - for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) { - lambda(i, vector_reduce); - } - } - - result = vector_reduce; -} - -template -KOKKOS_INLINE_FUNCTION std::enable_if_t::value> -parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct< - iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries, - const Lambda& lambda, ReducerType const& result) { - using ValueType = typename ReducerType::value_type; - -#pragma omp declare reduction( \ - custom:ValueType \ - : Impl::OpenMPTargetReducerWrapper ::join(omp_out, omp_in)) \ - initializer( \ - Impl::OpenMPTargetReducerWrapper ::init(omp_priv)) - - ValueType vector_reduce; - Impl::OpenMPTargetReducerWrapper::init(vector_reduce); - -#pragma omp simd reduction(custom : vector_reduce) - for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) { - lambda(i, vector_reduce); - } - - result.reference() = vector_reduce; -} - -/** \brief Intra-thread vector parallel_reduce. Executes lambda(iType i, - * ValueType & val) for each i=0..N-1. - * - * The range i=0..N-1 is mapped to all vector lanes of the the calling thread - * and a reduction of val is performed using JoinType(ValueType& val, const - * ValueType& update) and put into init_result. The input value of init_result - * is used as initializer for temporary variables of ValueType. 
Therefore the - * input value should be the neutral element with respect to the join operation - * (e.g. '0 for +-' or '1 for *'). - */ -template -KOKKOS_INLINE_FUNCTION void parallel_reduce( - const Impl::ThreadVectorRangeBoundariesStruct< - iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries, - const Lambda& lambda, const JoinType& join, ValueType& init_result) { - ValueType result = init_result; - - // FIXME_OPENMPTARGET think about omp simd - // join does not work with omp reduction clause - for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) { - ValueType tmp = ValueType(); - lambda(i, tmp); - join(result, tmp); - } - - init_result = result; -} - -/** \brief Intra-thread vector parallel exclusive prefix sum. Executes - * lambda(iType i, ValueType & val, bool final) for each i=0..N-1. - * - * The range i=0..N-1 is mapped to all vector lanes in the thread and a scan - * operation is performed. Depending on the target execution space the operator - * might be called twice: once with final=false and once with final=true. When - * final==true val contains the prefix sum value. The contribution of this "i" - * needs to be added to val no matter whether final==true or not. In a serial - * execution (i.e. team_size==1) the operator is only called once with - * final==true. Scan_val will be set to the final sum value over all vector - * lanes. 
- */ -template -KOKKOS_INLINE_FUNCTION void parallel_scan( - const Impl::ThreadVectorRangeBoundariesStruct< - iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries, - const FunctorType& lambda) { - using Analysis = Impl::FunctorAnalysis, - FunctorType>; - using value_type = typename Analysis::value_type; - - value_type scan_val = value_type(); - -#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP -#pragma ivdep -#endif - for (iType i = loop_boundaries.start; i < loop_boundaries.end; ++i) { - lambda(i, scan_val, true); - } -} - -} // namespace Kokkos - -#ifdef KOKKOS_IMPL_TEAM_SCAN_WORKAROUND -#undef KOKKOS_IMPL_TEAM_SCAN_WORKAROUND -#endif - -namespace Kokkos { -/** \brief Intra-team vector parallel_for. Executes lambda(iType i) for each - * i=0..N-1. - * - * The range i=0..N-1 is mapped to all vector lanes of the the calling team. - */ -template -KOKKOS_INLINE_FUNCTION void parallel_for( - const Impl::TeamVectorRangeBoundariesStruct< - iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries, - const Lambda& lambda) { -#pragma omp for simd nowait schedule(static, 1) - for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) lambda(i); -} - -/** \brief Intra-team vector parallel_reduce. Executes lambda(iType i, - * ValueType & val) for each i=0..N-1. - * - * The range i=0..N-1 is mapped to all vector lanes of the the calling team - * and a summation of val is performed and put into result. - */ -template -KOKKOS_INLINE_FUNCTION void parallel_reduce( - const Impl::TeamVectorRangeBoundariesStruct< - iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries, - const Lambda& lambda, ValueType& result) { - // FIXME_OPENMPTARGET - Make sure that if its an array reduction, number of - // elements in the array <= 32. For reduction we allocate, 16 bytes per - // element in the scratch space, hence, 16*32 = 512. 
- static_assert(sizeof(ValueType) <= - Impl::OpenMPTargetExecTeamMember::TEAM_REDUCE_SIZE); - - ValueType* TeamVector_scratch = - static_cast(loop_boundaries.team.impl_reduce_scratch()); - -#pragma omp barrier - TeamVector_scratch[0] = ValueType(); -#pragma omp barrier - - if constexpr (std::is_arithmetic::value) { -#pragma omp for simd reduction(+ : TeamVector_scratch[:1]) - for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) { - ValueType tmp = ValueType(); - lambda(i, tmp); - TeamVector_scratch[0] += tmp; - } - } else { -#pragma omp declare reduction(custom:ValueType : omp_out += omp_in) - -#pragma omp for simd reduction(custom : TeamVector_scratch[:1]) - for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) { - ValueType tmp = ValueType(); - lambda(i, tmp); - TeamVector_scratch[0] += tmp; - } - } - - result = TeamVector_scratch[0]; -} - -#if !defined(KOKKOS_IMPL_HIERARCHICAL_REDUCERS_WORKAROUND) -template -KOKKOS_INLINE_FUNCTION std::enable_if_t::value> -parallel_reduce(const Impl::TeamVectorRangeBoundariesStruct< - iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries, - const Lambda& lambda, ReducerType const& result) { - using ValueType = typename ReducerType::value_type; - - // FIXME_OPENMPTARGET - Make sure that if its an array reduction, number of - // elements in the array <= 32. For reduction we allocate, 16 bytes per - // element in the scratch space, hence, 16*32 = 512. 
- static_assert(sizeof(ValueType) <= - Impl::OpenMPTargetExecTeamMember::TEAM_REDUCE_SIZE); - -#pragma omp declare reduction( \ - custom:ValueType \ - : Impl::OpenMPTargetReducerWrapper ::join(omp_out, omp_in)) \ - initializer( \ - Impl::OpenMPTargetReducerWrapper ::init(omp_priv)) - - ValueType* TeamVector_scratch = - static_cast(loop_boundaries.team.impl_reduce_scratch()); - -#pragma omp barrier - Impl::OpenMPTargetReducerWrapper::init(TeamVector_scratch[0]); -#pragma omp barrier - -#pragma omp for simd reduction(custom : TeamVector_scratch[:1]) - for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) { - lambda(i, TeamVector_scratch[0]); - } - - result.reference() = TeamVector_scratch[0]; -} -#else -template -KOKKOS_INLINE_FUNCTION std::enable_if_t::value> -parallel_reduce(const Impl::TeamVectorRangeBoundariesStruct< - iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries, - const Lambda& lambda, ReducerType const& result) { - using ValueType = typename ReducerType::value_type; - - // FIXME_OPENMPTARGET - Make sure that if its an array reduction, number of - // elements in the array <= 32. For reduction we allocate, 16 bytes per - // element in the scratch space, hence, 16*32 = 512. 
- static_assert(sizeof(ValueType) <= - Impl::OpenMPTargetExecTeamMember::TEAM_REDUCE_SIZE); - - ValueType* TeamVector_scratch = - static_cast(loop_boundaries.team.impl_reduce_scratch()); - -#pragma omp declare reduction( \ - omp_red_teamthread_reducer:ValueType \ - : Impl::OpenMPTargetReducerWrapper ::join(omp_out, omp_in)) \ - initializer( \ - Impl::OpenMPTargetReducerWrapper ::init(omp_priv)) - -#pragma omp barrier - ValueType tmp; - result.init(tmp); - TeamVector_scratch[0] = tmp; -#pragma omp barrier - - iType team_size = iType(omp_get_num_threads()); -#pragma omp for simd reduction(omp_red_teamthread_reducer \ - : TeamVector_scratch[:1]) schedule(static, 1) - for (iType t = 0; t < team_size; t++) { - ValueType tmp2; - result.init(tmp2); - - for (iType i = loop_boundaries.start + t; i < loop_boundaries.end; - i += team_size) { - lambda(i, tmp2); - } - TeamVector_scratch[0] = tmp2; - } - - result.reference() = TeamVector_scratch[0]; -} -#endif // KOKKOS_IMPL_HIERARCHICAL_REDUCERS_WORKAROUND -} // namespace Kokkos - -#ifdef KOKKOS_IMPL_HIERARCHICAL_REDUCERS_WORKAROUND -#undef KOKKOS_IMPL_HIERARCHICAL_REDUCERS_WORKAROUND -#endif - -namespace Kokkos { - -template -KOKKOS_INLINE_FUNCTION void single( - const Impl::VectorSingleStruct& - /*single_struct*/, - const FunctorType& lambda) { - lambda(); -} - -template -KOKKOS_INLINE_FUNCTION void single( - const Impl::ThreadSingleStruct& - single_struct, - const FunctorType& lambda) { - if (single_struct.team_member.team_rank() == 0) lambda(); -} - -template -KOKKOS_INLINE_FUNCTION void single( - const Impl::VectorSingleStruct& - /*single_struct*/, - const FunctorType& lambda, ValueType& val) { - lambda(val); -} - -template -KOKKOS_INLINE_FUNCTION void single( - const Impl::ThreadSingleStruct& - single_struct, - const FunctorType& lambda, ValueType& val) { - if (single_struct.team_member.team_rank() == 0) { - lambda(val); - } - single_struct.team_member.team_broadcast(val, 0); -} -} // namespace Kokkos - -#endif /* #ifndef 
KOKKOS_OPENMPTARGETEXEC_HPP */ diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp index 4a33961205..564f299ab5 100644 --- a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp +++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp @@ -27,7 +27,7 @@ // constructor. undef'ed at the end #define KOKKOS_IMPL_OPENMPTARGET_WORKAROUND -#include +#include #include #include #include diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp index 71ce4b18f2..5e898727f1 100644 --- a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp +++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp @@ -20,1253 +20,648 @@ #include #include #include -#include +#include +#include + +#include +#include "Kokkos_OpenMPTarget_Abort.hpp" + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- namespace Kokkos { namespace Impl { -template -class ParallelFor, - Kokkos::Experimental::OpenMPTarget> { - private: - using Policy = Kokkos::RangePolicy; - using WorkTag = typename Policy::work_tag; - using WorkRange = typename Policy::WorkRange; - using Member = typename Policy::member_type; - - const FunctorType m_functor; - const Policy m_policy; +class OpenMPTargetExecTeamMember { + public: + static constexpr int TEAM_REDUCE_SIZE = 512; + + using execution_space = Kokkos::Experimental::OpenMPTarget; + using scratch_memory_space = execution_space::scratch_memory_space; + using team_handle = OpenMPTargetExecTeamMember; + + scratch_memory_space m_team_shared; + size_t m_team_scratch_size[2]; + int m_team_rank; + int m_team_size; + int m_league_rank; + int m_league_size; + int m_vector_length; + int m_vector_lane; + int m_shmem_block_index; + void* m_glb_scratch; + void* m_reduce_scratch; public: - void execute() const { execute_impl(); } - - 
template - void execute_impl() const { - OpenMPTargetExec::verify_is_process( - "Kokkos::Experimental::OpenMPTarget parallel_for"); - OpenMPTargetExec::verify_initialized( - "Kokkos::Experimental::OpenMPTarget parallel_for"); - const auto begin = m_policy.begin(); - const auto end = m_policy.end(); - - if (end <= begin) return; - - FunctorType a_functor(m_functor); - -#pragma omp target teams distribute parallel for map(to : a_functor) - for (auto i = begin; i < end; ++i) { - if constexpr (std::is_void::value) { - a_functor(i); - } else { - a_functor(TagType(), i); - } - } + KOKKOS_INLINE_FUNCTION + const execution_space::scratch_memory_space& team_shmem() const { + return m_team_shared.set_team_thread_mode(0, 1, 0); } - ParallelFor(const FunctorType& arg_functor, Policy arg_policy) - : m_functor(arg_functor), m_policy(arg_policy) {} -}; + // set_team_thread_mode routine parameters for future understanding: + // first parameter - scratch level. + // second parameter - size multiplier for advancing scratch ptr after a + // request was serviced. third parameter - offset size multiplier from current + // scratch ptr when returning a ptr for a request. 
+ KOKKOS_INLINE_FUNCTION + const execution_space::scratch_memory_space& team_scratch(int level) const { + return m_team_shared.set_team_thread_mode(level, 1, 0); + } -} // namespace Impl -} // namespace Kokkos + KOKKOS_INLINE_FUNCTION + const execution_space::scratch_memory_space& thread_scratch(int level) const { + return m_team_shared.set_team_thread_mode(level, team_size(), team_rank()); + } -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- + KOKKOS_INLINE_FUNCTION int league_rank() const { return m_league_rank; } + KOKKOS_INLINE_FUNCTION int league_size() const { return m_league_size; } + KOKKOS_INLINE_FUNCTION int team_rank() const { return m_team_rank; } + KOKKOS_INLINE_FUNCTION int team_size() const { return m_team_size; } + KOKKOS_INLINE_FUNCTION void* impl_reduce_scratch() const { + return m_reduce_scratch; + } -namespace Kokkos { -namespace Impl { + KOKKOS_INLINE_FUNCTION void team_barrier() const { +#pragma omp barrier + } -// This class has the memcpy routine that is commonly used by ParallelReduce -// over RangePolicy and TeamPolicy. -template -struct ParallelReduceCommon { - // Copy the result back to device if the view is on the device. 
- static void memcpy_result(PointerType dest, PointerType src, size_t size, - bool ptr_on_device) { - if (ptr_on_device) { - KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy(dest, src, size, 0, 0, - omp_get_default_device(), - omp_get_initial_device())); - } else { - *dest = *src; - } + template + KOKKOS_INLINE_FUNCTION void team_broadcast(ValueType& value, + int thread_id) const { + // Make sure there is enough scratch space: + using type = std::conditional_t<(sizeof(ValueType) < TEAM_REDUCE_SIZE), + ValueType, void>; + type* team_scratch = + reinterpret_cast(static_cast(m_glb_scratch) + + TEAM_REDUCE_SIZE * omp_get_team_num()); +#pragma omp barrier + if (team_rank() == thread_id) *team_scratch = value; +#pragma omp barrier + value = *team_scratch; } -}; -template -struct ParallelReduceSpecialize { - inline static void execute(const FunctorType& /*f*/, const PolicyType& /*p*/, - PointerType /*result_ptr*/) { - constexpr int FunctorHasJoin = - Impl::FunctorAnalysis::has_join_member_function; - constexpr int UseReducerType = is_reducer::value; - - std::stringstream error_message; - error_message << "Error: Invalid Specialization " << FunctorHasJoin << ' ' - << UseReducerType << '\n'; - // FIXME_OPENMPTARGET - OpenMPTarget_abort(error_message.str().c_str()); + template + KOKKOS_INLINE_FUNCTION void team_broadcast(const Closure& f, ValueType& value, + const int& thread_id) const { + f(value); + team_broadcast(value, thread_id); } -}; -template -struct ParallelReduceSpecialize, - ReducerType, PointerType, ValueType> { - using PolicyType = Kokkos::RangePolicy; - using TagType = typename PolicyType::work_tag; - using ReducerTypeFwd = - std::conditional_t::value, - FunctorType, ReducerType>; - using Analysis = Impl::FunctorAnalysis; - using ReferenceType = typename Analysis::reference_type; - - using ParReduceCommon = ParallelReduceCommon; - - static void execute_reducer(const FunctorType& f, const PolicyType& p, - PointerType result_ptr, bool ptr_on_device) { - 
OpenMPTargetExec::verify_is_process( - "Kokkos::Experimental::OpenMPTarget parallel_for"); - OpenMPTargetExec::verify_initialized( - "Kokkos::Experimental::OpenMPTarget parallel_for"); - const auto begin = p.begin(); - const auto end = p.end(); - - ValueType result; - OpenMPTargetReducerWrapper::init(result); - - // Initialize and copy back the result even if it is a zero length - // reduction. - if (end <= begin) { - ParReduceCommon::memcpy_result(result_ptr, &result, sizeof(ValueType), - ptr_on_device); - return; - } + // FIXME_OPENMPTARGET this function has the wrong interface and currently + // ignores the reducer passed. + template + KOKKOS_INLINE_FUNCTION ValueType team_reduce(const ValueType& value, + const JoinOp&) const { +#pragma omp barrier -#pragma omp declare reduction( \ - custom:ValueType \ - : OpenMPTargetReducerWrapper ::join(omp_out, omp_in)) \ - initializer(OpenMPTargetReducerWrapper ::init(omp_priv)) - -#pragma omp target teams distribute parallel for map(to \ - : f) reduction(custom \ - : result) - for (auto i = begin; i < end; ++i) { - if constexpr (std::is_void::value) { - f(i, result); - } else { - f(TagType(), i, result); - } + using value_type = ValueType; + // const JoinLambdaAdapter op(op_in); + + // Make sure there is enough scratch space: + using type = std::conditional_t<(sizeof(value_type) < TEAM_REDUCE_SIZE), + value_type, void>; + + const int n_values = TEAM_REDUCE_SIZE / sizeof(value_type); + type* team_scratch = + reinterpret_cast(static_cast(m_glb_scratch) + + TEAM_REDUCE_SIZE * omp_get_team_num()); + for (int i = m_team_rank; i < n_values; i += m_team_size) { + team_scratch[i] = value_type(); } - ParReduceCommon::memcpy_result(result_ptr, &result, sizeof(ValueType), - ptr_on_device); - } +#pragma omp barrier - template - static void execute_array(const FunctorType& f, const PolicyType& p, - PointerType result_ptr, bool ptr_on_device) { - OpenMPTargetExec::verify_is_process( - "Kokkos::Experimental::OpenMPTarget parallel_for"); 
- OpenMPTargetExec::verify_initialized( - "Kokkos::Experimental::OpenMPTarget parallel_for"); - const auto begin = p.begin(); - const auto end = p.end(); - - // Enter the loop if the reduction is on a scalar type. - if constexpr (NumReductions == 1) { - ValueType result = ValueType(); - - // Initialize and copy back the result even if it is a zero length - // reduction. - if (end <= begin) { - ParReduceCommon::memcpy_result(result_ptr, &result, sizeof(ValueType), - ptr_on_device); - return; - } - // Case where reduction is on a native data type. - if constexpr (std::is_arithmetic::value) { -#pragma omp target teams distribute parallel for \ - map(to:f) reduction(+: result) - for (auto i = begin; i < end; ++i) - - if constexpr (std::is_void::value) { - f(i, result); - } else { - f(TagType(), i, result); - } - } else { -#pragma omp declare reduction(custom:ValueType : omp_out += omp_in) -#pragma omp target teams distribute parallel for map(to \ - : f) reduction(custom \ - : result) - for (auto i = begin; i < end; ++i) - - if constexpr (std::is_void::value) { - f(i, result); - } else { - f(TagType(), i, result); - } - } + for (int k = 0; k < m_team_size; k += n_values) { + if ((k <= m_team_rank) && (k + n_values > m_team_rank)) + team_scratch[m_team_rank % n_values] += value; +#pragma omp barrier + } - ParReduceCommon::memcpy_result(result_ptr, &result, sizeof(ValueType), - ptr_on_device); - } else { - ValueType result[NumReductions] = {}; - - // Initialize and copy back the result even if it is a zero length - // reduction. 
- if (end <= begin) { - ParReduceCommon::memcpy_result(result_ptr, result, - NumReductions * sizeof(ValueType), - ptr_on_device); - return; + for (int d = 1; d < n_values; d *= 2) { + if ((m_team_rank + d < n_values) && (m_team_rank % (2 * d) == 0)) { + team_scratch[m_team_rank] += team_scratch[m_team_rank + d]; } -#pragma omp target teams distribute parallel for map(to:f) reduction(+:result[:NumReductions]) - for (auto i = begin; i < end; ++i) { - if constexpr (std::is_void::value) { - f(i, result); - } else { - f(TagType(), i, result); - } - } - - ParReduceCommon::memcpy_result( - result_ptr, result, NumReductions * sizeof(ValueType), ptr_on_device); +#pragma omp barrier } + return team_scratch[0]; } + /** \brief Intra-team exclusive prefix sum with team_rank() ordering + * with intra-team non-deterministic ordering accumulation. + * + * The global inter-team accumulation value will, at the end of the + * league's parallel execution, be the scan's total. + * Parallel execution ordering of the league's teams is non-deterministic. + * As such the base value for each team's scan operation is similarly + * non-deterministic. + */ + template + KOKKOS_INLINE_FUNCTION ArgType + team_scan(const ArgType& /*value*/, ArgType* const /*global_accum*/) const { + // FIXME_OPENMPTARGET + /* // Make sure there is enough scratch space: + using type = + std::conditional_t<(sizeof(ArgType) < TEAM_REDUCE_SIZE), ArgType, void>; - static void execute_init_join(const FunctorType& f, const PolicyType& p, - PointerType ptr, const bool ptr_on_device) { - const auto begin = p.begin(); - const auto end = p.end(); - - using FunctorAnalysis = - Impl::FunctorAnalysis; - constexpr int HasInit = FunctorAnalysis::has_init_member_function; - - // Initialize the result pointer. - - const auto size = end - begin; - - // FIXME_OPENMPTARGET: The team size and MAX_ACTIVE_THREADS are currently - // based on NVIDIA-V100 and should be modifid to be based on the - // architecture in the future. 
- const int max_team_threads = 32; - const int max_teams = - OpenMPTargetExec::MAX_ACTIVE_THREADS / max_team_threads; - // Number of elements in the reduction - const auto value_count = FunctorAnalysis::value_count(f); - - // Allocate scratch per active thread. Achieved by setting the first - // parameter of `resize_scratch=1`. - OpenMPTargetExec::resize_scratch(1, 0, value_count * sizeof(ValueType), - std::numeric_limits::max()); - ValueType* scratch_ptr = - static_cast(OpenMPTargetExec::get_scratch_ptr()); - -#pragma omp target map(to : f) is_device_ptr(scratch_ptr) - { - typename FunctorAnalysis::Reducer final_reducer(&f); - // Enter this loop if the functor has an `init` - if constexpr (HasInit) { - // The `init` routine needs to be called on the device since it might - // need device members. - final_reducer.init(scratch_ptr); - final_reducer.final(scratch_ptr); - } else { - for (int i = 0; i < value_count; ++i) { - static_cast(scratch_ptr)[i] = ValueType(); - } + volatile type * const work_value = ((type*) m_exec.scratch_thread()); - final_reducer.final(scratch_ptr); - } - } + *work_value = value ; - if (end <= begin) { - // If there is no work to be done, copy back the initialized values and - // exit. 
- if (!ptr_on_device) - KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy( - ptr, scratch_ptr, value_count * sizeof(ValueType), 0, 0, - omp_get_initial_device(), omp_get_default_device())); - else - KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy( - ptr, scratch_ptr, value_count * sizeof(ValueType), 0, 0, - omp_get_default_device(), omp_get_default_device())); - - return; - } + memory_fence(); + + if ( team_fan_in() ) { + // The last thread to synchronize returns true, all other threads wait + for team_fan_out() + // m_team_base[0] == highest ranking team member + // m_team_base[ m_team_size - 1 ] == lowest ranking team member + // + // 1) copy from lower to higher rank, initialize lowest rank to zero + // 2) prefix sum from lowest to highest rank, skipping lowest rank + + type accum = 0 ; -#pragma omp target teams num_teams(max_teams) thread_limit(max_team_threads) \ - map(to \ - : f) is_device_ptr(scratch_ptr) - { - typename FunctorAnalysis::Reducer final_reducer(&f); -#pragma omp parallel - { - const int team_num = omp_get_team_num(); - const int num_teams = omp_get_num_teams(); - const auto chunk_size = size / num_teams; - const auto team_begin = begin + team_num * chunk_size; - const auto team_end = - (team_num == num_teams - 1) ? end : (team_begin + chunk_size); - ValueType* team_scratch = - scratch_ptr + team_num * max_team_threads * value_count; - ReferenceType result = final_reducer.init( - &team_scratch[omp_get_thread_num() * value_count]); - - // Accumulate partial results in thread specific storage. -#pragma omp for simd - for (auto i = team_begin; i < team_end; ++i) { - if constexpr (std::is_void::value) { - f(i, result); - } else { - f(TagType(), i, result); + if ( global_accum ) { + for ( int i = m_team_size ; i-- ; ) { + type & val = *((type*) m_exec.pool_rev( m_team_base_rev + i + )->scratch_thread()); accum += val ; } + accum = atomic_fetch_add( global_accum , accum ); } - // Reduce all paritial results within a team. 
- const int team_size = max_team_threads; - int tree_neighbor_offset = 1; - do { -#pragma omp for simd - for (int i = 0; i < team_size - tree_neighbor_offset; - i += 2 * tree_neighbor_offset) { - const int neighbor = i + tree_neighbor_offset; - final_reducer.join(&team_scratch[i * value_count], - &team_scratch[neighbor * value_count]); - } - tree_neighbor_offset *= 2; - } while (tree_neighbor_offset < team_size); - } // end parallel - } // end target - - int tree_neighbor_offset = 1; - do { -#pragma omp target teams distribute parallel for simd map(to \ - : f) \ - is_device_ptr(scratch_ptr) - for (int i = 0; i < max_teams - tree_neighbor_offset; - i += 2 * tree_neighbor_offset) { - typename FunctorAnalysis::Reducer final_reducer(&f); - ValueType* team_scratch = scratch_ptr; - const int team_offset = max_team_threads * value_count; - final_reducer.join( - &team_scratch[i * team_offset], - &team_scratch[(i + tree_neighbor_offset) * team_offset]); - - // If `final` is provided by the functor. - // Do the final only once at the end. - if (tree_neighbor_offset * 2 >= max_teams && omp_get_team_num() == 0 && - omp_get_thread_num() == 0) { - final_reducer.final(scratch_ptr); + for ( int i = m_team_size ; i-- ; ) { + type & val = *((type*) m_exec.pool_rev( m_team_base_rev + i + )->scratch_thread()); const type offset = accum ; accum += val ; val = + offset ; } + + memory_fence(); } - tree_neighbor_offset *= 2; - } while (tree_neighbor_offset < max_teams); - - // If the result view is on the host, copy back the values via memcpy. 
- if (!ptr_on_device) - KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy( - ptr, scratch_ptr, value_count * sizeof(ValueType), 0, 0, - omp_get_initial_device(), omp_get_default_device())); - else - KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy( - ptr, scratch_ptr, value_count * sizeof(ValueType), 0, 0, - omp_get_default_device(), omp_get_default_device())); - } -}; -template -class ParallelReduce, ReducerType, - Kokkos::Experimental::OpenMPTarget> { - private: - using Policy = Kokkos::RangePolicy; - - using WorkTag = typename Policy::work_tag; - using WorkRange = typename Policy::WorkRange; - - using ReducerTypeFwd = - std::conditional_t::value, - FunctorType, ReducerType>; - using Analysis = Impl::FunctorAnalysis; - - using pointer_type = typename Analysis::pointer_type; - using reference_type = typename Analysis::reference_type; - - static constexpr int HasJoin = - Impl::FunctorAnalysis::has_join_member_function; - static constexpr int UseReducer = is_reducer::value; - static constexpr int IsArray = std::is_pointer::value; - - using ParReduceSpecialize = - ParallelReduceSpecialize; - - const FunctorType m_functor; - const Policy m_policy; - const ReducerType m_reducer; - const pointer_type m_result_ptr; - bool m_result_ptr_on_device; - const int m_result_ptr_num_elems; - using TagType = typename Policy::work_tag; + team_fan_out(); - public: - void execute() const { - if constexpr (HasJoin) { - // Enter this loop if the Functor has a init-join. - ParReduceSpecialize::execute_init_join(m_functor, m_policy, m_result_ptr, - m_result_ptr_on_device); - } else if constexpr (UseReducer) { - // Enter this loop if the Functor is a reducer type. - ParReduceSpecialize::execute_reducer(m_functor, m_policy, m_result_ptr, - m_result_ptr_on_device); - } else if constexpr (IsArray) { - // Enter this loop if the reduction is on an array and the routine is - // templated over the size of the array. 
- if (m_result_ptr_num_elems <= 2) { - ParReduceSpecialize::template execute_array( - m_functor, m_policy, m_result_ptr, m_result_ptr_on_device); - } else if (m_result_ptr_num_elems <= 4) { - ParReduceSpecialize::template execute_array( - m_functor, m_policy, m_result_ptr, m_result_ptr_on_device); - } else if (m_result_ptr_num_elems <= 8) { - ParReduceSpecialize::template execute_array( - m_functor, m_policy, m_result_ptr, m_result_ptr_on_device); - } else if (m_result_ptr_num_elems <= 16) { - ParReduceSpecialize::template execute_array( - m_functor, m_policy, m_result_ptr, m_result_ptr_on_device); - } else if (m_result_ptr_num_elems <= 32) { - ParReduceSpecialize::template execute_array( - m_functor, m_policy, m_result_ptr, m_result_ptr_on_device); - } else { - Kokkos::abort("array reduction length must be <= 32"); - } - } else { - // This loop handles the basic scalar reduction. - ParReduceSpecialize::template execute_array( - m_functor, m_policy, m_result_ptr, m_result_ptr_on_device); - } + return *work_value ;*/ + return ArgType(); } - template - ParallelReduce(const FunctorType& arg_functor, Policy& arg_policy, - const ViewType& arg_result_view, - std::enable_if_t::value && - !Kokkos::is_reducer::value, - void*> = nullptr) - : m_functor(arg_functor), - m_policy(arg_policy), - m_reducer(InvalidType()), - m_result_ptr(arg_result_view.data()), - m_result_ptr_on_device( - MemorySpaceAccess::accessible), - m_result_ptr_num_elems(arg_result_view.size()) {} - - ParallelReduce(const FunctorType& arg_functor, Policy& arg_policy, - const ReducerType& reducer) - : m_functor(arg_functor), - m_policy(arg_policy), - m_reducer(reducer), - m_result_ptr(reducer.view().data()), - m_result_ptr_on_device( - MemorySpaceAccess::accessible), - m_result_ptr_num_elems(reducer.view().size()) {} -}; - -} // namespace Impl -} // namespace Kokkos + /** \brief Intra-team exclusive prefix sum with team_rank() ordering. 
+ * + * The highest rank thread can compute the reduction total as + * reduction_total = dev.team_scan( value ) + value ; + */ + template + KOKKOS_INLINE_FUNCTION Type team_scan(const Type& value) const { + return this->template team_scan(value, 0); + } -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- + //---------------------------------------- + // Private for the driver -namespace Kokkos { -namespace Impl { + private: + using space = execution_space::scratch_memory_space; -template -class ParallelScan, - Kokkos::Experimental::OpenMPTarget> { - protected: - using Policy = Kokkos::RangePolicy; + public: + // FIXME_OPENMPTARGET - 512(16*32) bytes at the begining of the scratch space + // for each league is saved for reduction. It should actually be based on the + // ValueType of the reduction variable. + inline OpenMPTargetExecTeamMember( + const int league_rank, const int league_size, const int team_size, + const int vector_length // const TeamPolicyInternal< OpenMPTarget, + // Properties ...> & team + , + void* const glb_scratch, const int shmem_block_index, + const size_t shmem_size_L0, const size_t shmem_size_L1) + : m_team_scratch_size{shmem_size_L0, shmem_size_L1}, + m_team_rank(0), + m_team_size(team_size), + m_league_rank(league_rank), + m_league_size(league_size), + m_vector_length(vector_length), + m_shmem_block_index(shmem_block_index), + m_glb_scratch(glb_scratch) { + const int omp_tid = omp_get_thread_num(); + + // The scratch memory allocated is a sum of TEAM_REDUCE_SIZE, L0 shmem size + // and L1 shmem size. TEAM_REDUCE_SIZE = 512 bytes saved per team for + // hierarchical reduction. There is an additional 10% of the requested + // scratch memory allocated per team as padding. Hence the product with 0.1. 
+ const int reduce_offset = + m_shmem_block_index * + (shmem_size_L0 + shmem_size_L1 + + ((shmem_size_L0 + shmem_size_L1) * 0.1) + TEAM_REDUCE_SIZE); + const int l0_offset = reduce_offset + TEAM_REDUCE_SIZE; + const int l1_offset = l0_offset + shmem_size_L0; + m_team_shared = scratch_memory_space( + (static_cast(glb_scratch) + l0_offset), shmem_size_L0, + static_cast(glb_scratch) + l1_offset, shmem_size_L1); + m_reduce_scratch = static_cast(glb_scratch) + reduce_offset; + m_league_rank = league_rank; + m_team_rank = omp_tid; + m_vector_lane = 0; + } - using WorkTag = typename Policy::work_tag; - using WorkRange = typename Policy::WorkRange; - using Member = typename Policy::member_type; - using idx_type = typename Policy::index_type; + static inline int team_reduce_size() { return TEAM_REDUCE_SIZE; } +}; - using Analysis = Impl::FunctorAnalysis; +template +class TeamPolicyInternal + : public PolicyTraits { + public: + //! Tag this class as a kokkos execution policy + using execution_policy = TeamPolicyInternal; - using value_type = typename Analysis::value_type; - using pointer_type = typename Analysis::pointer_type; - using reference_type = typename Analysis::reference_type; + using traits = PolicyTraits; - const FunctorType m_functor; - const Policy m_policy; + //---------------------------------------- - value_type* m_result_ptr; - const bool m_result_ptr_device_accessible; + template + inline static int team_size_max(const FunctorType&, const ParallelForTag&) { + return 256; + } - template - std::enable_if_t::value> call_with_tag( - const FunctorType& f, const idx_type& idx, value_type& val, - const bool& is_final) const { - f(idx, val, is_final); + template + inline static int team_size_max(const FunctorType&, + const ParallelReduceTag&) { + return 256; } - template - std::enable_if_t::value> call_with_tag( - const FunctorType& f, const idx_type& idx, value_type& val, - const bool& is_final) const { - f(WorkTag(), idx, val, is_final); + + template + inline 
static int team_size_max(const FunctorType&, const ReducerType&, + const ParallelReduceTag&) { + return 256; } - public: - void impl_execute( - Kokkos::View - element_values, - Kokkos::View - chunk_values, - Kokkos::View count) - const { - const idx_type N = m_policy.end() - m_policy.begin(); - const idx_type chunk_size = 128; - const idx_type n_chunks = (N + chunk_size - 1) / chunk_size; - idx_type nteams = n_chunks > 512 ? 512 : n_chunks; - idx_type team_size = 128; - - FunctorType a_functor(m_functor); -#pragma omp target teams distribute map(to \ - : a_functor) num_teams(nteams) \ - thread_limit(team_size) - for (idx_type team_id = 0; team_id < n_chunks; ++team_id) { - typename Analysis::Reducer final_reducer(&a_functor); -#pragma omp parallel num_threads(team_size) - { - const idx_type local_offset = team_id * chunk_size; - -#pragma omp for - for (idx_type i = 0; i < chunk_size; ++i) { - const idx_type idx = local_offset + i; - value_type val; - final_reducer.init(&val); - if (idx < N) call_with_tag(a_functor, idx, val, false); - element_values(team_id, i) = val; - } -#pragma omp barrier - if (omp_get_thread_num() == 0) { - value_type sum; - final_reducer.init(&sum); - for (idx_type i = 0; i < chunk_size; ++i) { - final_reducer.join(&sum, &element_values(team_id, i)); - element_values(team_id, i) = sum; - } - chunk_values(team_id) = sum; - } -#pragma omp barrier - if (omp_get_thread_num() == 0) { - if (Kokkos::atomic_fetch_add(&count(), 1) == n_chunks - 1) { - value_type sum; - final_reducer.init(&sum); - for (idx_type i = 0; i < n_chunks; ++i) { - final_reducer.join(&sum, &chunk_values(i)); - chunk_values(i) = sum; - } - } - } - } - } + template + inline static int team_size_recommended(const FunctorType&, + const ParallelForTag&) { + return 128; + } -#pragma omp target teams distribute map(to \ - : a_functor) num_teams(nteams) \ - thread_limit(team_size) - for (idx_type team_id = 0; team_id < n_chunks; ++team_id) { - typename Analysis::Reducer 
final_reducer(&a_functor); -#pragma omp parallel num_threads(team_size) - { - const idx_type local_offset = team_id * chunk_size; - value_type offset_value; - if (team_id > 0) - offset_value = chunk_values(team_id - 1); - else - final_reducer.init(&offset_value); - -#pragma omp for - for (idx_type i = 0; i < chunk_size; ++i) { - const idx_type idx = local_offset + i; - value_type local_offset_value; - if (i > 0) { - local_offset_value = element_values(team_id, i - 1); - // FIXME_OPENMPTARGET We seem to access memory illegaly on AMD GPUs -#ifdef KOKKOS_ARCH_VEGA - if constexpr (Analysis::has_join_member_function) { - if constexpr (std::is_void_v) - a_functor.join(local_offset_value, offset_value); - else - a_functor.join(WorkTag{}, local_offset_value, offset_value); - } else - local_offset_value += offset_value; -#else - final_reducer.join(&local_offset_value, &offset_value); -#endif - } else - local_offset_value = offset_value; - if (idx < N) - call_with_tag(a_functor, idx, local_offset_value, true); - if (idx == N - 1 && m_result_ptr_device_accessible) - *m_result_ptr = local_offset_value; - } - } - } + template + inline static int team_size_recommended(const FunctorType&, + const ParallelReduceTag&) { + return 128; } - void execute() const { - OpenMPTargetExec::verify_is_process( - "Kokkos::Experimental::OpenMPTarget parallel_for"); - OpenMPTargetExec::verify_initialized( - "Kokkos::Experimental::OpenMPTarget parallel_for"); - const idx_type N = m_policy.end() - m_policy.begin(); - const idx_type chunk_size = 128; - const idx_type n_chunks = (N + chunk_size - 1) / chunk_size; - - // This could be scratch memory per team - Kokkos::View - element_values("element_values", n_chunks, chunk_size); - Kokkos::View - chunk_values("chunk_values", n_chunks); - Kokkos::View count( - "Count"); - - impl_execute(element_values, chunk_values, count); + template + inline static int team_size_recommended(const FunctorType&, + const ReducerType&, + const ParallelReduceTag&) { + 
return 128; } //---------------------------------------- - ParallelScan(const FunctorType& arg_functor, const Policy& arg_policy, - pointer_type arg_result_ptr = nullptr, - bool arg_result_ptr_device_accessible = false) - : m_functor(arg_functor), - m_policy(arg_policy), - m_result_ptr(arg_result_ptr), - m_result_ptr_device_accessible(arg_result_ptr_device_accessible) {} - - //---------------------------------------- -}; + private: + int m_league_size; + int m_team_size; + int m_vector_length; + int m_team_alloc; + int m_team_iter; + std::array m_team_scratch_size; + std::array m_thread_scratch_size; + bool m_tune_team_size; + bool m_tune_vector_length; + constexpr const static size_t default_team_size = 256; + int m_chunk_size; + + inline void init(const int league_size_request, const int team_size_request, + const int vector_length_request) { + m_league_size = league_size_request; + + // Minimum team size should be 32 for OpenMPTarget backend. + if (team_size_request < 32) { + Kokkos::Impl::OpenMPTarget_abort( + "OpenMPTarget backend requires a minimum of 32 threads per team.\n"); + } else + m_team_size = team_size_request; + + m_vector_length = vector_length_request; + set_auto_chunk_size(); + } -template -class ParallelScanWithTotal, - ReturnType, Kokkos::Experimental::OpenMPTarget> - : public ParallelScan, - Kokkos::Experimental::OpenMPTarget> { - using base_t = ParallelScan, - Kokkos::Experimental::OpenMPTarget>; - using value_type = typename base_t::value_type; + template + friend class TeamPolicyInternal; public: - void execute() const { - OpenMPTargetExec::verify_is_process( - "Kokkos::Experimental::OpenMPTarget parallel_for"); - OpenMPTargetExec::verify_initialized( - "Kokkos::Experimental::OpenMPTarget parallel_for"); - const int64_t N = base_t::m_policy.end() - base_t::m_policy.begin(); - const int chunk_size = 128; - const int64_t n_chunks = (N + chunk_size - 1) / chunk_size; - - if (N > 0) { - // This could be scratch memory per team - Kokkos::View - 
element_values("element_values", n_chunks, chunk_size); - Kokkos::View - chunk_values("chunk_values", n_chunks); - Kokkos::View count( - "Count"); - - base_t::impl_execute(element_values, chunk_values, count); - - if (!base_t::m_result_ptr_device_accessible) { - const int size = base_t::Analysis::value_size(base_t::m_functor); - DeepCopy( - base_t::m_result_ptr, chunk_values.data() + (n_chunks - 1), size); - } - } else if (!base_t::m_result_ptr_device_accessible) { - *base_t::m_result_ptr = 0; - } + // FIXME_OPENMPTARGET : Currently this routine is a copy of the Cuda + // implementation, but this has to be tailored to be architecture specific. + inline static int scratch_size_max(int level) { + return ( + level == 0 ? 1024 * 40 : // 48kB is the max for CUDA, but we need some + // for team_member.reduce etc. + 20 * 1024 * + 1024); // arbitrarily setting this to 20MB, for a Volta V100 + // that would give us about 3.2GB for 2 teams per SM } - - template - ParallelScanWithTotal(const FunctorType& arg_functor, - const typename base_t::Policy& arg_policy, - const ViewType& arg_result_view) - : base_t(arg_functor, arg_policy, arg_result_view.data(), - MemorySpaceAccess::accessible) { + inline bool impl_auto_team_size() const { return m_tune_team_size; } + inline bool impl_auto_vector_length() const { return m_tune_vector_length; } + inline void impl_set_team_size(const size_t size) { m_team_size = size; } + inline void impl_set_vector_length(const size_t length) { + m_tune_vector_length = length; + } + inline int impl_vector_length() const { return m_vector_length; } + inline int team_size() const { return m_team_size; } + inline int league_size() const { return m_league_size; } + inline size_t scratch_size(const int& level, int team_size_ = -1) const { + if (team_size_ < 0) team_size_ = m_team_size; + return m_team_scratch_size[level] + + team_size_ * m_thread_scratch_size[level]; } -}; -} // namespace Impl -} // namespace Kokkos 
-//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- + inline Kokkos::Experimental::OpenMPTarget space() const { + return Kokkos::Experimental::OpenMPTarget(); + } -namespace Kokkos { -namespace Impl { + template + TeamPolicyInternal(const TeamPolicyInternal& p) + : m_league_size(p.m_league_size), + m_team_size(p.m_team_size), + m_vector_length(p.m_vector_length), + m_team_alloc(p.m_team_alloc), + m_team_iter(p.m_team_iter), + m_team_scratch_size(p.m_team_scratch_size), + m_thread_scratch_size(p.m_thread_scratch_size), + m_tune_team_size(p.m_tune_team_size), + m_tune_vector_length(p.m_tune_vector_length), + m_chunk_size(p.m_chunk_size) {} + + /** \brief Specify league size, request team size */ + TeamPolicyInternal(const typename traits::execution_space&, + int league_size_request, int team_size_request, + int vector_length_request = 1) + : m_team_scratch_size{0, 0}, + m_thread_scratch_size{0, 0}, + m_tune_team_size(false), + m_tune_vector_length(false), + m_chunk_size(0) { + init(league_size_request, team_size_request, vector_length_request); + } -template -class ParallelFor, - Kokkos::Experimental::OpenMPTarget> { - private: - using Policy = - Kokkos::Impl::TeamPolicyInternal; - using WorkTag = typename Policy::work_tag; - using Member = typename Policy::member_type; + TeamPolicyInternal(const typename traits::execution_space&, + int league_size_request, + const Kokkos::AUTO_t& /* team_size_request */ + , + int vector_length_request = 1) + : m_team_scratch_size{0, 0}, + m_thread_scratch_size{0, 0}, + m_tune_team_size(true), + m_tune_vector_length(false), + m_chunk_size(0) { + init(league_size_request, default_team_size / vector_length_request, + vector_length_request); + } - const FunctorType m_functor; - const Policy m_policy; - const size_t m_shmem_size; + TeamPolicyInternal(const typename traits::execution_space&, + int league_size_request, + const 
Kokkos::AUTO_t& /* team_size_request */ + , + const Kokkos::AUTO_t& /* vector_length_request */) + : m_team_scratch_size{0, 0}, + m_thread_scratch_size{0, 0}, + m_tune_team_size(true), + m_tune_vector_length(true), + m_chunk_size(0) { + init(league_size_request, default_team_size, 1); + } + TeamPolicyInternal(const typename traits::execution_space&, + int league_size_request, int team_size_request, + const Kokkos::AUTO_t& /* vector_length_request */) + : m_team_scratch_size{0, 0}, + m_thread_scratch_size{0, 0}, + m_tune_team_size(false), + m_tune_vector_length(true), + m_chunk_size(0) { + init(league_size_request, team_size_request, 1); + } - public: - void execute() const { - OpenMPTargetExec::verify_is_process( - "Kokkos::Experimental::OpenMPTarget parallel_for"); - OpenMPTargetExec::verify_initialized( - "Kokkos::Experimental::OpenMPTarget parallel_for"); - execute_impl(); + TeamPolicyInternal(int league_size_request, int team_size_request, + int vector_length_request = 1) + : m_team_scratch_size{0, 0}, + m_thread_scratch_size{0, 0}, + m_tune_team_size(false), + m_tune_vector_length(false), + m_chunk_size(0) { + init(league_size_request, team_size_request, vector_length_request); } - private: - template - void execute_impl() const { - OpenMPTargetExec::verify_is_process( - "Kokkos::Experimental::OpenMPTarget parallel_for"); - OpenMPTargetExec::verify_initialized( - "Kokkos::Experimental::OpenMPTarget parallel_for"); - const auto league_size = m_policy.league_size(); - const auto team_size = m_policy.team_size(); - const auto vector_length = m_policy.impl_vector_length(); - - const size_t shmem_size_L0 = m_policy.scratch_size(0, team_size); - const size_t shmem_size_L1 = m_policy.scratch_size(1, team_size); - OpenMPTargetExec::resize_scratch(team_size, shmem_size_L0, shmem_size_L1, - league_size); - - void* scratch_ptr = OpenMPTargetExec::get_scratch_ptr(); - FunctorType a_functor(m_functor); - - // FIXME_OPENMPTARGET - If the team_size is not a multiple of 32, 
the - // scratch implementation does not work in the Release or RelWithDebugInfo - // mode but works in the Debug mode. - - // Maximum active teams possible. - int max_active_teams = OpenMPTargetExec::MAX_ACTIVE_THREADS / team_size; - // nteams should not exceed the maximum in-flight teams possible. - const auto nteams = - league_size < max_active_teams ? league_size : max_active_teams; - - // If the league size is <=0, do not launch the kernel. - if (nteams <= 0) return; - -// Performing our own scheduling of teams to avoid separation of code between -// teams-distribute and parallel. Gave a 2x performance boost in test cases with -// the clang compiler. atomic_compare_exchange can be avoided since the standard -// guarantees that the number of teams specified in the `num_teams` clause is -// always less than or equal to the maximum concurrently running teams. -#pragma omp target teams num_teams(nteams) thread_limit(team_size) \ - map(to \ - : a_functor) is_device_ptr(scratch_ptr) -#pragma omp parallel - { - const int blockIdx = omp_get_team_num(); - const int gridDim = omp_get_num_teams(); - - // Iterate through the number of teams until league_size and assign the - // league_id accordingly - // Guarantee that the compilers respect the `num_teams` clause - if (gridDim <= nteams) { - for (int league_id = blockIdx; league_id < league_size; - league_id += gridDim) { - typename Policy::member_type team( - league_id, league_size, team_size, vector_length, scratch_ptr, - blockIdx, shmem_size_L0, shmem_size_L1); - if constexpr (std::is_void::value) - m_functor(team); - else - m_functor(TagType(), team); - } - } else - Kokkos::abort("`num_teams` clause was not respected.\n"); - } + TeamPolicyInternal(int league_size_request, + const Kokkos::AUTO_t& /* team_size_request */ + , + int vector_length_request = 1) + : m_team_scratch_size{0, 0}, + m_thread_scratch_size{0, 0}, + m_tune_team_size(true), + m_tune_vector_length(false), + m_chunk_size(0) { + 
init(league_size_request, default_team_size / vector_length_request, + vector_length_request); } - public: - ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy) - : m_functor(arg_functor), - m_policy(arg_policy), - m_shmem_size(arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + - FunctorTeamShmemSize::value( - arg_functor, arg_policy.team_size())) {} -}; + TeamPolicyInternal(int league_size_request, + const Kokkos::AUTO_t& /* team_size_request */ + , + const Kokkos::AUTO_t& /* vector_length_request */) + : m_team_scratch_size{0, 0}, + m_thread_scratch_size{0, 0}, + m_tune_team_size(true), + m_tune_vector_length(true), + m_chunk_size(0) { + init(league_size_request, default_team_size, 1); + } + TeamPolicyInternal(int league_size_request, int team_size_request, + const Kokkos::AUTO_t& /* vector_length_request */) + : m_team_scratch_size{0, 0}, + m_thread_scratch_size{0, 0}, + m_tune_team_size(false), + m_tune_vector_length(true), + m_chunk_size(0) { + init(league_size_request, team_size_request, 1); + } + inline static size_t vector_length_max() { + return 32; /* TODO: this is bad. 
Need logic that is compiler and backend + aware */ + } + inline int team_alloc() const { return m_team_alloc; } + inline int team_iter() const { return m_team_iter; } -template -struct ParallelReduceSpecialize, - ReducerType, PointerType, ValueType> { - using PolicyType = TeamPolicyInternal; - using TagType = typename PolicyType::work_tag; - using ReducerTypeFwd = - std::conditional_t::value, - FunctorType, ReducerType>; - using Analysis = Impl::FunctorAnalysis; - - using ReferenceType = typename Analysis::reference_type; - - using ParReduceCommon = ParallelReduceCommon; - - static void execute_reducer(const FunctorType& f, const PolicyType& p, - PointerType result_ptr, bool ptr_on_device) { - OpenMPTargetExec::verify_is_process( - "Kokkos::Experimental::OpenMPTarget parallel_for"); - OpenMPTargetExec::verify_initialized( - "Kokkos::Experimental::OpenMPTarget parallel_for"); - - const int league_size = p.league_size(); - const int team_size = p.team_size(); - const int vector_length = p.impl_vector_length(); - - const size_t shmem_size_L0 = p.scratch_size(0, team_size); - const size_t shmem_size_L1 = p.scratch_size(1, team_size); - OpenMPTargetExec::resize_scratch(PolicyType::member_type::TEAM_REDUCE_SIZE, - shmem_size_L0, shmem_size_L1, league_size); - void* scratch_ptr = OpenMPTargetExec::get_scratch_ptr(); - - ValueType result = ValueType(); - - // Maximum active teams possible. - int max_active_teams = OpenMPTargetExec::MAX_ACTIVE_THREADS / team_size; - const auto nteams = - league_size < max_active_teams ? league_size : max_active_teams; - - // If the league size is <=0, do not launch the kernel. 
- if (nteams <= 0) return; - -#pragma omp declare reduction( \ - custom:ValueType \ - : OpenMPTargetReducerWrapper ::join(omp_out, omp_in)) \ - initializer(OpenMPTargetReducerWrapper ::init(omp_priv)) - -#pragma omp target teams num_teams(nteams) thread_limit(team_size) map(to \ - : f) \ - is_device_ptr(scratch_ptr) reduction(custom \ - : result) -#pragma omp parallel reduction(custom : result) - { - const int blockIdx = omp_get_team_num(); - const int gridDim = omp_get_num_teams(); - - // Guarantee that the compilers respect the `num_teams` clause - if (gridDim <= nteams) { - for (int league_id = blockIdx; league_id < league_size; - league_id += gridDim) { - typename PolicyType::member_type team( - league_id, league_size, team_size, vector_length, scratch_ptr, - blockIdx, shmem_size_L0, shmem_size_L1); - if constexpr (std::is_void::value) - f(team, result); - else - f(TagType(), team, result); - } - } else - Kokkos::abort("`num_teams` clause was not respected.\n"); - } + inline int chunk_size() const { return m_chunk_size; } - // Copy results back to device if `parallel_reduce` is on a device view. 
- ParReduceCommon::memcpy_result(result_ptr, &result, sizeof(ValueType), - ptr_on_device); + /** \brief set chunk_size to a discrete value*/ + inline TeamPolicyInternal& set_chunk_size( + typename traits::index_type chunk_size_) { + m_chunk_size = chunk_size_; + return *this; } - template - static void execute_array(const FunctorType& f, const PolicyType& p, - PointerType result_ptr, bool ptr_on_device) { - OpenMPTargetExec::verify_is_process( - "Kokkos::Experimental::OpenMPTarget parallel_for"); - OpenMPTargetExec::verify_initialized( - "Kokkos::Experimental::OpenMPTarget parallel_for"); - - const int league_size = p.league_size(); - const int team_size = p.team_size(); - const int vector_length = p.impl_vector_length(); - - const size_t shmem_size_L0 = p.scratch_size(0, team_size); - const size_t shmem_size_L1 = p.scratch_size(1, team_size); - OpenMPTargetExec::resize_scratch(PolicyType::member_type::TEAM_REDUCE_SIZE, - shmem_size_L0, shmem_size_L1, league_size); - void* scratch_ptr = OpenMPTargetExec::get_scratch_ptr(); - - // Maximum active teams possible. - int max_active_teams = OpenMPTargetExec::MAX_ACTIVE_THREADS / team_size; - const auto nteams = - league_size < max_active_teams ? league_size : max_active_teams; - - // If the league size is <=0, do not launch the kernel. - if (nteams <= 0) return; - - // Case where the number of reduction items is 1. - if constexpr (NumReductions == 1) { - ValueType result = ValueType(); - - // Case where reduction is on a native data type. 
- if constexpr (std::is_arithmetic::value) { -#pragma omp target teams num_teams(nteams) thread_limit(team_size) map(to \ - : f) \ - is_device_ptr(scratch_ptr) reduction(+: result) -#pragma omp parallel reduction(+ : result) - { - const int blockIdx = omp_get_team_num(); - const int gridDim = omp_get_num_teams(); - - // Guarantee that the compilers respect the `num_teams` clause - if (gridDim <= nteams) { - for (int league_id = blockIdx; league_id < league_size; - league_id += gridDim) { - typename PolicyType::member_type team( - league_id, league_size, team_size, vector_length, scratch_ptr, - blockIdx, shmem_size_L0, shmem_size_L1); - if constexpr (std::is_void::value) - f(team, result); - else - f(TagType(), team, result); - } - } else - Kokkos::abort("`num_teams` clause was not respected.\n"); - } - } else { - // Case where the reduction is on a non-native data type. -#pragma omp declare reduction(custom:ValueType : omp_out += omp_in) -#pragma omp target teams num_teams(nteams) thread_limit(team_size) map(to \ - : f) \ - is_device_ptr(scratch_ptr) reduction(custom \ - : result) -#pragma omp parallel reduction(custom : result) - { - const int blockIdx = omp_get_team_num(); - const int gridDim = omp_get_num_teams(); - - // Guarantee that the compilers respect the `num_teams` clause - if (gridDim <= nteams) { - for (int league_id = blockIdx; league_id < league_size; - league_id += gridDim) { - typename PolicyType::member_type team( - league_id, league_size, team_size, vector_length, scratch_ptr, - blockIdx, shmem_size_L0, shmem_size_L1); - if constexpr (std::is_void::value) - f(team, result); - else - f(TagType(), team, result); - } - } else - Kokkos::abort("`num_teams` clause was not respected.\n"); - } - } + /** \brief set per team scratch size for a specific level of the scratch + * hierarchy */ + inline TeamPolicyInternal& set_scratch_size(const int& level, + const PerTeamValue& per_team) { + m_team_scratch_size[level] = per_team.value; + return *this; + } - // 
Copy results back to device if `parallel_reduce` is on a device view. - ParReduceCommon::memcpy_result(result_ptr, &result, sizeof(ValueType), - ptr_on_device); - } else { - ValueType result[NumReductions] = {}; - // Case where the reduction is on an array. -#pragma omp target teams num_teams(nteams) thread_limit(team_size) map(to \ - : f) \ - is_device_ptr(scratch_ptr) reduction(+ : result[:NumReductions]) -#pragma omp parallel reduction(+ : result[:NumReductions]) - { - const int blockIdx = omp_get_team_num(); - const int gridDim = omp_get_num_teams(); - - // Guarantee that the compilers respect the `num_teams` clause - if (gridDim <= nteams) { - for (int league_id = blockIdx; league_id < league_size; - league_id += gridDim) { - typename PolicyType::member_type team( - league_id, league_size, team_size, vector_length, scratch_ptr, - blockIdx, shmem_size_L0, shmem_size_L1); - if constexpr (std::is_void::value) - f(team, result); - else - f(TagType(), team, result); - } - } else - Kokkos::abort("`num_teams` clause was not respected.\n"); - } + /** \brief set per thread scratch size for a specific level of the scratch + * hierarchy */ + inline TeamPolicyInternal& set_scratch_size( + const int& level, const PerThreadValue& per_thread) { + m_thread_scratch_size[level] = per_thread.value; + return *this; + } - // Copy results back to device if `parallel_reduce` is on a device view. - ParReduceCommon::memcpy_result( - result_ptr, result, NumReductions * sizeof(ValueType), ptr_on_device); - } + /** \brief set per thread and per team scratch size for a specific level of + * the scratch hierarchy */ + inline TeamPolicyInternal& set_scratch_size( + const int& level, const PerTeamValue& per_team, + const PerThreadValue& per_thread) { + m_team_scratch_size[level] = per_team.value; + m_thread_scratch_size[level] = per_thread.value; + return *this; } - // FIXME_OPENMPTARGET : This routine is a copy from `parallel_reduce` over - // RangePolicy. Need a new implementation. 
- static void execute_init_join(const FunctorType& f, const PolicyType& p, - PointerType ptr, const bool ptr_on_device) { - using FunctorAnalysis = - Impl::FunctorAnalysis; - constexpr int HasInit = FunctorAnalysis::has_init_member_function; - - const int league_size = p.league_size(); - const int team_size = p.team_size(); - const int vector_length = p.impl_vector_length(); - - auto begin = 0; - auto end = league_size * team_size + team_size * vector_length; - - const size_t shmem_size_L0 = p.scratch_size(0, team_size); - const size_t shmem_size_L1 = p.scratch_size(1, team_size); - - // FIXME_OPENMPTARGET: This would oversubscribe scratch memory since we are - // already using the available scratch memory to create temporaries for each - // thread. - if ((shmem_size_L0 + shmem_size_L1) > 0) { - Kokkos::abort( - "OpenMPTarget: Scratch memory is not supported in `parallel_reduce` " - "over functors with init/join."); - } + private: + /** \brief finalize chunk_size if it was set to AUTO*/ + inline void set_auto_chunk_size() { + int concurrency = 2048 * 128; - const auto nteams = league_size; - - // Number of elements in the reduction - const auto value_count = FunctorAnalysis::value_count(f); - - // Allocate scratch per active thread. - OpenMPTargetExec::resize_scratch(1, 0, value_count * sizeof(ValueType), - league_size); - void* scratch_ptr = OpenMPTargetExec::get_scratch_ptr(); - - // Enter this loop if the functor has an `init` - if constexpr (HasInit) { - // The `init` routine needs to be called on the device since it might need - // device members. 
-#pragma omp target map(to : f) is_device_ptr(scratch_ptr) - { - typename FunctorAnalysis::Reducer final_reducer(&f); - final_reducer.init(scratch_ptr); - final_reducer.final(scratch_ptr); - } - } else { -#pragma omp target map(to : f) is_device_ptr(scratch_ptr) - { - for (int i = 0; i < value_count; ++i) { - static_cast(scratch_ptr)[i] = ValueType(); - } + if (concurrency == 0) concurrency = 1; - typename FunctorAnalysis::Reducer final_reducer(&f); - final_reducer.final(static_cast(scratch_ptr)); - } + if (m_chunk_size > 0) { + if (!Impl::is_integral_power_of_two(m_chunk_size)) + Kokkos::abort("TeamPolicy blocking granularity must be power of two"); } - if (end <= begin) { - // If there is no work to be done, copy back the initialized values and - // exit. - if (!ptr_on_device) - KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy( - ptr, scratch_ptr, value_count * sizeof(ValueType), 0, 0, - omp_get_initial_device(), omp_get_default_device())); - else - KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy( - ptr, scratch_ptr, value_count * sizeof(ValueType), 0, 0, - omp_get_default_device(), omp_get_default_device())); - - return; + int new_chunk_size = 1; + while (new_chunk_size * 100 * concurrency < m_league_size) + new_chunk_size *= 2; + if (new_chunk_size < 128) { + new_chunk_size = 1; + while ((new_chunk_size * 40 * concurrency < m_league_size) && + (new_chunk_size < 128)) + new_chunk_size *= 2; } - -#pragma omp target teams num_teams(nteams) thread_limit(team_size) map(to \ - : f) \ - is_device_ptr(scratch_ptr) - { -#pragma omp parallel - { - const int team_num = omp_get_team_num(); - const int num_teams = omp_get_num_teams(); - ValueType* team_scratch = static_cast(scratch_ptr) + - team_num * team_size * value_count; - typename FunctorAnalysis::Reducer final_reducer(&f); - ReferenceType result = final_reducer.init(&team_scratch[0]); - - for (int league_id = team_num; league_id < league_size; - league_id += num_teams) { - typename PolicyType::member_type team( - league_id, 
league_size, team_size, vector_length, scratch_ptr, - team_num, shmem_size_L0, shmem_size_L1); - if constexpr (std::is_void::value) { - f(team, result); - } else { - f(TagType(), team, result); - } - } - } // end parallel - } // end target - - int tree_neighbor_offset = 1; - do { -#pragma omp target teams distribute parallel for simd map(to \ - : f) \ - is_device_ptr(scratch_ptr) - for (int i = 0; i < nteams - tree_neighbor_offset; - i += 2 * tree_neighbor_offset) { - ValueType* team_scratch = static_cast(scratch_ptr); - const int team_offset = team_size * value_count; - typename FunctorAnalysis::Reducer final_reducer(&f); - final_reducer.join( - &team_scratch[i * team_offset], - &team_scratch[(i + tree_neighbor_offset) * team_offset]); - - // If `final` is provided by the functor. - // Do the final only once at the end. - if (tree_neighbor_offset * 2 >= nteams && omp_get_team_num() == 0 && - omp_get_thread_num() == 0) { - final_reducer.final(scratch_ptr); - } - } - tree_neighbor_offset *= 2; - } while (tree_neighbor_offset < nteams); - - // If the result view is on the host, copy back the values via memcpy. 
- if (!ptr_on_device) - KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy( - ptr, scratch_ptr, value_count * sizeof(ValueType), 0, 0, - omp_get_initial_device(), omp_get_default_device())); - else - KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy( - ptr, scratch_ptr, value_count * sizeof(ValueType), 0, 0, - omp_get_default_device(), omp_get_default_device())); + m_chunk_size = new_chunk_size; } -}; - -template -class ParallelReduce, - ReducerType, Kokkos::Experimental::OpenMPTarget> { - private: - using Policy = - Kokkos::Impl::TeamPolicyInternal; - - using WorkTag = typename Policy::work_tag; - using Member = typename Policy::member_type; - using ReducerTypeFwd = - std::conditional_t::value, - FunctorType, ReducerType>; - using WorkTagFwd = - std::conditional_t::value, WorkTag, - void>; - using Analysis = Impl::FunctorAnalysis; - - using pointer_type = typename Analysis::pointer_type; - using reference_type = typename Analysis::reference_type; - using value_type = typename Analysis::value_type; - - bool m_result_ptr_on_device; - const int m_result_ptr_num_elems; - - static constexpr int HasJoin = - Impl::FunctorAnalysis::has_join_member_function; - static constexpr int UseReducer = is_reducer::value; - static constexpr int IsArray = std::is_pointer::value; - - using ParReduceSpecialize = - ParallelReduceSpecialize; - - const FunctorType m_functor; - const Policy m_policy; - const ReducerType m_reducer; - const pointer_type m_result_ptr; - const size_t m_shmem_size; public: - void execute() const { - if constexpr (HasJoin) { - ParReduceSpecialize::execute_init_join(m_functor, m_policy, m_result_ptr, - m_result_ptr_on_device); - } else if constexpr (UseReducer) { - ParReduceSpecialize::execute_reducer(m_functor, m_policy, m_result_ptr, - m_result_ptr_on_device); - } else if constexpr (IsArray) { - if (m_result_ptr_num_elems <= 2) { - ParReduceSpecialize::template execute_array<2>( - m_functor, m_policy, m_result_ptr, m_result_ptr_on_device); - } else if 
(m_result_ptr_num_elems <= 4) { - ParReduceSpecialize::template execute_array<4>( - m_functor, m_policy, m_result_ptr, m_result_ptr_on_device); - } else if (m_result_ptr_num_elems <= 8) { - ParReduceSpecialize::template execute_array<8>( - m_functor, m_policy, m_result_ptr, m_result_ptr_on_device); - } else if (m_result_ptr_num_elems <= 16) { - ParReduceSpecialize::template execute_array<16>( - m_functor, m_policy, m_result_ptr, m_result_ptr_on_device); - } else if (m_result_ptr_num_elems <= 32) { - ParReduceSpecialize::template execute_array<32>( - m_functor, m_policy, m_result_ptr, m_result_ptr_on_device); - } else { - Kokkos::abort("array reduction length must be <= 32"); - } - } else { - ParReduceSpecialize::template execute_array<1>( - m_functor, m_policy, m_result_ptr, m_result_ptr_on_device); - } - } - - template - ParallelReduce(const FunctorType& arg_functor, const Policy& arg_policy, - const ViewType& arg_result, - std::enable_if_t::value && - !Kokkos::is_reducer::value, - void*> = nullptr) - : m_result_ptr_on_device( - MemorySpaceAccess::accessible), - m_result_ptr_num_elems(arg_result.size()), - m_functor(arg_functor), - m_policy(arg_policy), - m_reducer(InvalidType()), - m_result_ptr(arg_result.data()), - m_shmem_size(arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + - FunctorTeamShmemSize::value( - arg_functor, arg_policy.team_size())) {} - - ParallelReduce(const FunctorType& arg_functor, Policy& arg_policy, - const ReducerType& reducer) - : m_result_ptr_on_device( - MemorySpaceAccess::accessible), - m_result_ptr_num_elems(reducer.view().size()), - m_functor(arg_functor), - m_policy(arg_policy), - m_reducer(reducer), - m_result_ptr(reducer.view().data()), - m_shmem_size(arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + - FunctorTeamShmemSize::value( - arg_functor, arg_policy.team_size())) {} + using member_type = Impl::OpenMPTargetExecTeamMember; }; } // namespace Impl } // namespace Kokkos +namespace Kokkos { + +template 
+KOKKOS_INLINE_FUNCTION Impl::TeamThreadRangeBoundariesStruct< + iType, Impl::OpenMPTargetExecTeamMember> +TeamThreadRange(const Impl::OpenMPTargetExecTeamMember& thread, + const iType& count) { + return Impl::TeamThreadRangeBoundariesStruct< + iType, Impl::OpenMPTargetExecTeamMember>(thread, count); +} + +template +KOKKOS_INLINE_FUNCTION Impl::TeamThreadRangeBoundariesStruct< + std::common_type_t, Impl::OpenMPTargetExecTeamMember> +TeamThreadRange(const Impl::OpenMPTargetExecTeamMember& thread, + const iType1& begin, const iType2& end) { + using iType = std::common_type_t; + return Impl::TeamThreadRangeBoundariesStruct< + iType, Impl::OpenMPTargetExecTeamMember>(thread, iType(begin), + iType(end)); +} + +template +KOKKOS_INLINE_FUNCTION Impl::ThreadVectorRangeBoundariesStruct< + iType, Impl::OpenMPTargetExecTeamMember> +ThreadVectorRange(const Impl::OpenMPTargetExecTeamMember& thread, + const iType& count) { + return Impl::ThreadVectorRangeBoundariesStruct< + iType, Impl::OpenMPTargetExecTeamMember>(thread, count); +} + +template +KOKKOS_INLINE_FUNCTION Impl::ThreadVectorRangeBoundariesStruct< + std::common_type_t, Impl::OpenMPTargetExecTeamMember> +ThreadVectorRange(const Impl::OpenMPTargetExecTeamMember& thread, + const iType1& arg_begin, const iType2& arg_end) { + using iType = std::common_type_t; + return Impl::ThreadVectorRangeBoundariesStruct< + iType, Impl::OpenMPTargetExecTeamMember>(thread, iType(arg_begin), + iType(arg_end)); +} + +template +KOKKOS_INLINE_FUNCTION Impl::TeamVectorRangeBoundariesStruct< + iType, Impl::OpenMPTargetExecTeamMember> +TeamVectorRange(const Impl::OpenMPTargetExecTeamMember& thread, + const iType& count) { + return Impl::TeamVectorRangeBoundariesStruct< + iType, Impl::OpenMPTargetExecTeamMember>(thread, count); +} + +template +KOKKOS_INLINE_FUNCTION Impl::TeamVectorRangeBoundariesStruct< + std::common_type_t, Impl::OpenMPTargetExecTeamMember> +TeamVectorRange(const Impl::OpenMPTargetExecTeamMember& thread, + const iType1& 
arg_begin, const iType2& arg_end) { + using iType = std::common_type_t; + return Impl::TeamVectorRangeBoundariesStruct< + iType, Impl::OpenMPTargetExecTeamMember>(thread, iType(arg_begin), + iType(arg_end)); +} + +KOKKOS_INLINE_FUNCTION +Impl::ThreadSingleStruct PerTeam( + const Impl::OpenMPTargetExecTeamMember& thread) { + return Impl::ThreadSingleStruct(thread); +} + +KOKKOS_INLINE_FUNCTION +Impl::VectorSingleStruct PerThread( + const Impl::OpenMPTargetExecTeamMember& thread) { + return Impl::VectorSingleStruct(thread); +} +} // namespace Kokkos + +namespace Kokkos { + +template +KOKKOS_INLINE_FUNCTION void single( + const Impl::VectorSingleStruct& + /*single_struct*/, + const FunctorType& lambda) { + lambda(); +} + +template +KOKKOS_INLINE_FUNCTION void single( + const Impl::ThreadSingleStruct& + single_struct, + const FunctorType& lambda) { + if (single_struct.team_member.team_rank() == 0) lambda(); +} + +template +KOKKOS_INLINE_FUNCTION void single( + const Impl::VectorSingleStruct& + /*single_struct*/, + const FunctorType& lambda, ValueType& val) { + lambda(val); +} + +template +KOKKOS_INLINE_FUNCTION void single( + const Impl::ThreadSingleStruct& + single_struct, + const FunctorType& lambda, ValueType& val) { + if (single_struct.team_member.team_rank() == 0) { + lambda(val); + } + single_struct.team_member.team_broadcast(val, 0); +} +} // namespace Kokkos + namespace Kokkos { namespace Impl { @@ -1320,5 +715,44 @@ struct TeamVectorRangeBoundariesStruct { } // namespace Kokkos //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- +namespace Kokkos { +namespace Impl { + +//---------------------------------------------------------------------------- +/** \brief Data for OpenMPTarget thread execution */ + +class OpenMPTargetExec { + public: + // FIXME_OPENMPTARGET - Currently the maximum number of + // teams possible is calculated based on NVIDIA's Volta GPU. 
In + // future this value should be based on the chosen architecture for the + // OpenMPTarget backend. + static constexpr int MAX_ACTIVE_THREADS = 2080 * 80; + static constexpr int MAX_ACTIVE_TEAMS = MAX_ACTIVE_THREADS / 32; + + private: + static void* scratch_ptr; + + public: + static void verify_is_process(const char* const); + static void verify_initialized(const char* const); + + static int* get_lock_array(int num_teams); + static void* get_scratch_ptr(); + static void clear_scratch(); + static void clear_lock_array(); + static void resize_scratch(int64_t team_reduce_bytes, + int64_t team_shared_bytes, + int64_t thread_local_bytes, int64_t league_size); + + static void* m_scratch_ptr; + static int64_t m_scratch_size; + static int* m_lock_array; + static int64_t m_lock_size; + static uint32_t* m_uniquetoken_ptr; +}; + +} // namespace Impl +} // namespace Kokkos #endif /* KOKKOS_OPENMPTARGET_PARALLEL_HPP */ diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_Range.hpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_Range.hpp new file mode 100644 index 0000000000..fcf168e9c9 --- /dev/null +++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_Range.hpp @@ -0,0 +1,72 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_OPENMPTARGET_PARALLEL_FOR_RANGE_HPP +#define KOKKOS_OPENMPTARGET_PARALLEL_FOR_RANGE_HPP + +#include +#include +#include + +namespace Kokkos { +namespace Impl { + +template +class ParallelFor, + Kokkos::Experimental::OpenMPTarget> { + private: + using Policy = Kokkos::RangePolicy; + using WorkTag = typename Policy::work_tag; + using WorkRange = typename Policy::WorkRange; + using Member = typename Policy::member_type; + + const FunctorType m_functor; + const Policy m_policy; + + public: + void execute() const { execute_impl(); } + + template + void execute_impl() const { + OpenMPTargetExec::verify_is_process( + "Kokkos::Experimental::OpenMPTarget parallel_for"); + OpenMPTargetExec::verify_initialized( + "Kokkos::Experimental::OpenMPTarget parallel_for"); + const auto begin = m_policy.begin(); + const auto end = m_policy.end(); + + if (end <= begin) return; + + FunctorType a_functor(m_functor); + +#pragma omp target teams distribute parallel for map(to : a_functor) + for (auto i = begin; i < end; ++i) { + if constexpr (std::is_void::value) { + a_functor(i); + } else { + a_functor(TagType(), i); + } + } + } + + ParallelFor(const FunctorType& arg_functor, Policy arg_policy) + : m_functor(arg_functor), m_policy(arg_policy) {} +}; + +} // namespace Impl +} // namespace Kokkos + +#endif diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_Team.hpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_Team.hpp new file mode 100644 index 0000000000..12de3423f8 --- /dev/null +++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_Team.hpp @@ -0,0 +1,170 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. 
Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_OPENMPTARGET_PARALLEL_FOR_TEAM_HPP +#define KOKKOS_OPENMPTARGET_PARALLEL_FOR_TEAM_HPP + +#include +#include +#include +#include + +namespace Kokkos { + +/** \brief Inter-thread parallel_for. Executes lambda(iType i) for each + * i=0..N-1. + * + * The range i=0..N-1 is mapped to all threads of the the calling thread team. + */ +template +KOKKOS_INLINE_FUNCTION void parallel_for( + const Impl::TeamThreadRangeBoundariesStruct< + iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries, + const Lambda& lambda) { +#pragma omp for nowait schedule(static, 1) + for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) lambda(i); +} + +/** \brief Intra-thread vector parallel_for. Executes lambda(iType i) for each + * i=0..N-1. + * + * The range i=0..N-1 is mapped to all vector lanes of the the calling thread. + */ +template +KOKKOS_INLINE_FUNCTION void parallel_for( + const Impl::ThreadVectorRangeBoundariesStruct< + iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries, + const Lambda& lambda) { +#pragma omp simd + for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) lambda(i); +} + +/** \brief Intra-team vector parallel_for. Executes lambda(iType i) for each + * i=0..N-1. + * + * The range i=0..N-1 is mapped to all vector lanes of the the calling team. 
+ */ +template +KOKKOS_INLINE_FUNCTION void parallel_for( + const Impl::TeamVectorRangeBoundariesStruct< + iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries, + const Lambda& lambda) { +#pragma omp for simd nowait schedule(static, 1) + for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) lambda(i); +} + +namespace Impl { + +template +class ParallelFor, + Kokkos::Experimental::OpenMPTarget> { + private: + using Policy = + Kokkos::Impl::TeamPolicyInternal; + using WorkTag = typename Policy::work_tag; + using Member = typename Policy::member_type; + + const FunctorType m_functor; + const Policy m_policy; + const size_t m_shmem_size; + + public: + void execute() const { + OpenMPTargetExec::verify_is_process( + "Kokkos::Experimental::OpenMPTarget parallel_for"); + OpenMPTargetExec::verify_initialized( + "Kokkos::Experimental::OpenMPTarget parallel_for"); + execute_impl(); + } + + private: + template + void execute_impl() const { + OpenMPTargetExec::verify_is_process( + "Kokkos::Experimental::OpenMPTarget parallel_for"); + OpenMPTargetExec::verify_initialized( + "Kokkos::Experimental::OpenMPTarget parallel_for"); + const auto league_size = m_policy.league_size(); + const auto team_size = m_policy.team_size(); + const auto vector_length = m_policy.impl_vector_length(); + + const size_t shmem_size_L0 = m_policy.scratch_size(0, team_size); + const size_t shmem_size_L1 = m_policy.scratch_size(1, team_size); + OpenMPTargetExec::resize_scratch(team_size, shmem_size_L0, shmem_size_L1, + league_size); + + void* scratch_ptr = OpenMPTargetExec::get_scratch_ptr(); + FunctorType a_functor(m_functor); + + // FIXME_OPENMPTARGET - If the team_size is not a multiple of 32, the + // scratch implementation does not work in the Release or RelWithDebugInfo + // mode but works in the Debug mode. + + // Maximum active teams possible. 
+ int max_active_teams = OpenMPTargetExec::MAX_ACTIVE_THREADS / team_size; + // nteams should not exceed the maximum in-flight teams possible. + const auto nteams = + league_size < max_active_teams ? league_size : max_active_teams; + + // If the league size is <=0, do not launch the kernel. + if (nteams <= 0) return; + +// Performing our own scheduling of teams to avoid separation of code between +// teams-distribute and parallel. Gave a 2x performance boost in test cases with +// the clang compiler. atomic_compare_exchange can be avoided since the standard +// guarantees that the number of teams specified in the `num_teams` clause is +// always less than or equal to the maximum concurrently running teams. +#pragma omp target teams num_teams(nteams) thread_limit(team_size) \ + map(to \ + : a_functor) is_device_ptr(scratch_ptr) +#pragma omp parallel + { + const int blockIdx = omp_get_team_num(); + const int gridDim = omp_get_num_teams(); + + // Iterate through the number of teams until league_size and assign the + // league_id accordingly + // Guarantee that the compilers respect the `num_teams` clause + if (gridDim <= nteams) { + for (int league_id = blockIdx; league_id < league_size; + league_id += gridDim) { + typename Policy::member_type team( + league_id, league_size, team_size, vector_length, scratch_ptr, + blockIdx, shmem_size_L0, shmem_size_L1); + if constexpr (std::is_void::value) + m_functor(team); + else + m_functor(TagType(), team); + } + } else + Kokkos::abort("`num_teams` clause was not respected.\n"); + } + } + + public: + ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy) + : m_functor(arg_functor), + m_policy(arg_policy), + m_shmem_size(arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + + FunctorTeamShmemSize::value( + arg_functor, arg_policy.team_size())) {} +}; + +} // namespace Impl +} // namespace Kokkos + +#endif diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Range.hpp 
b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Range.hpp new file mode 100644 index 0000000000..1ac46b9919 --- /dev/null +++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Range.hpp @@ -0,0 +1,133 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_OPENMPTARGET_PARALLELREDUCE_RANGE_HPP +#define KOKKOS_OPENMPTARGET_PARALLELREDUCE_RANGE_HPP + +#include +#include +#include +#include + +namespace Kokkos { +namespace Impl { + +template +class ParallelReduce, ReducerType, + Kokkos::Experimental::OpenMPTarget> { + private: + using Policy = Kokkos::RangePolicy; + + using WorkTag = typename Policy::work_tag; + using WorkRange = typename Policy::WorkRange; + + using ReducerTypeFwd = + std::conditional_t::value, + FunctorType, ReducerType>; + using Analysis = Impl::FunctorAnalysis; + + using pointer_type = typename Analysis::pointer_type; + using reference_type = typename Analysis::reference_type; + + static constexpr int HasJoin = + Impl::FunctorAnalysis::has_join_member_function; + static constexpr int UseReducer = is_reducer::value; + static constexpr int IsArray = std::is_pointer::value; + + using ParReduceSpecialize = + ParallelReduceSpecialize; + + const FunctorType m_functor; + const Policy m_policy; + const ReducerType m_reducer; + const pointer_type m_result_ptr; + bool m_result_ptr_on_device; + const int m_result_ptr_num_elems; + using TagType = typename Policy::work_tag; + + public: + void execute() const { + if constexpr (HasJoin) { + // 
Enter this loop if the Functor has a init-join. + ParReduceSpecialize::execute_init_join(m_functor, m_policy, m_result_ptr, + m_result_ptr_on_device); + } else if constexpr (UseReducer) { + // Enter this loop if the Functor is a reducer type. + ParReduceSpecialize::execute_reducer(m_functor, m_policy, m_result_ptr, + m_result_ptr_on_device); + } else if constexpr (IsArray) { + // Enter this loop if the reduction is on an array and the routine is + // templated over the size of the array. + if (m_result_ptr_num_elems <= 2) { + ParReduceSpecialize::template execute_array( + m_functor, m_policy, m_result_ptr, m_result_ptr_on_device); + } else if (m_result_ptr_num_elems <= 4) { + ParReduceSpecialize::template execute_array( + m_functor, m_policy, m_result_ptr, m_result_ptr_on_device); + } else if (m_result_ptr_num_elems <= 8) { + ParReduceSpecialize::template execute_array( + m_functor, m_policy, m_result_ptr, m_result_ptr_on_device); + } else if (m_result_ptr_num_elems <= 16) { + ParReduceSpecialize::template execute_array( + m_functor, m_policy, m_result_ptr, m_result_ptr_on_device); + } else if (m_result_ptr_num_elems <= 32) { + ParReduceSpecialize::template execute_array( + m_functor, m_policy, m_result_ptr, m_result_ptr_on_device); + } else { + Kokkos::abort("array reduction length must be <= 32"); + } + } else { + // This loop handles the basic scalar reduction. 
+ ParReduceSpecialize::template execute_array( + m_functor, m_policy, m_result_ptr, m_result_ptr_on_device); + } + } + + template + ParallelReduce(const FunctorType& arg_functor, Policy& arg_policy, + const ViewType& arg_result_view, + std::enable_if_t::value && + !Kokkos::is_reducer::value, + void*> = nullptr) + : m_functor(arg_functor), + m_policy(arg_policy), + m_reducer(InvalidType()), + m_result_ptr(arg_result_view.data()), + m_result_ptr_on_device( + MemorySpaceAccess::accessible), + m_result_ptr_num_elems(arg_result_view.size()) {} + + ParallelReduce(const FunctorType& arg_functor, Policy& arg_policy, + const ReducerType& reducer) + : m_functor(arg_functor), + m_policy(arg_policy), + m_reducer(reducer), + m_result_ptr(reducer.view().data()), + m_result_ptr_on_device( + MemorySpaceAccess::accessible), + m_result_ptr_num_elems(reducer.view().size()) {} +}; + +} // namespace Impl +} // namespace Kokkos + +#endif diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Team.hpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Team.hpp new file mode 100644 index 0000000000..236c6d6f7a --- /dev/null +++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Team.hpp @@ -0,0 +1,551 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_OPENMPTARGET_PARALLELREDUCE_TEAM_HPP +#define KOKKOS_OPENMPTARGET_PARALLELREDUCE_TEAM_HPP + +#include +#include +#include +#include +#include + +// FIXME_OPENMPTARGET - Using this macro to implement a workaround for +// hierarchical reducers. It avoids hitting the code path which we wanted to +// write but doesn't work. undef'ed at the end. +// Intel compilers prefer the non-workaround version. +#ifndef KOKKOS_ARCH_INTEL_GPU +#define KOKKOS_IMPL_HIERARCHICAL_REDUCERS_WORKAROUND +#endif + +namespace Kokkos { + +/** \brief Inter-thread vector parallel_reduce. Executes lambda(iType i, + * ValueType & val) for each i=0..N-1. + * + * The range i=0..N-1 is mapped to all threads of the the calling thread team + * and a summation of val is performed and put into result. + */ + +template +KOKKOS_INLINE_FUNCTION std::enable_if_t::value> +parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct< + iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries, + const Lambda& lambda, ValueType& result) { + // FIXME_OPENMPTARGET - Make sure that if its an array reduction, number of + // elements in the array <= 32. For reduction we allocate, 16 bytes per + // element in the scratch space, hence, 16*32 = 512. 
+ static_assert(sizeof(ValueType) <= + Impl::OpenMPTargetExecTeamMember::TEAM_REDUCE_SIZE); + + ValueType* TeamThread_scratch = + static_cast(loop_boundaries.team.impl_reduce_scratch()); + +#pragma omp barrier + TeamThread_scratch[0] = ValueType(); +#pragma omp barrier + + if constexpr (std::is_arithmetic::value) { +#pragma omp for reduction(+ : TeamThread_scratch[:1]) + for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) { + ValueType tmp = ValueType(); + lambda(i, tmp); + TeamThread_scratch[0] += tmp; + } + } else { +#pragma omp declare reduction(custom:ValueType : omp_out += omp_in) + +#pragma omp for reduction(custom : TeamThread_scratch[:1]) + for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) { + ValueType tmp = ValueType(); + lambda(i, tmp); + TeamThread_scratch[0] += tmp; + } + } + + result = TeamThread_scratch[0]; +} + +#if !defined(KOKKOS_IMPL_HIERARCHICAL_REDUCERS_WORKAROUND) +// For some reason the actual version we wanted to write doesn't work +// and crashes. We should try this with every new compiler +// This is the variant we actually wanted to write +template +KOKKOS_INLINE_FUNCTION std::enable_if_t::value> +parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct< + iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries, + const Lambda& lambda, ReducerType result) { + using ValueType = typename ReducerType::value_type; + +#pragma omp declare reduction( \ + custominner:ValueType \ + : Impl::OpenMPTargetReducerWrapper ::join(omp_out, omp_in)) \ + initializer( \ + Impl::OpenMPTargetReducerWrapper ::init(omp_priv)) + + // FIXME_OPENMPTARGET - Make sure that if its an array reduction, number of + // elements in the array <= 32. For reduction we allocate, 16 bytes per + // element in the scratch space, hence, 16*32 = 512. 
+ static_assert(sizeof(ValueType) <= + Impl::OpenMPTargetExecTeamMember::TEAM_REDUCE_SIZE); + + ValueType* TeamThread_scratch = + static_cast(loop_boundaries.team.impl_reduce_scratch()); + +#pragma omp barrier + Impl::OpenMPTargetReducerWrapper::init(TeamThread_scratch[0]); +#pragma omp barrier + +#pragma omp for reduction(custominner : TeamThread_scratch[:1]) + for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) { + lambda(i, TeamThread_scratch[0]); + } + result.reference() = TeamThread_scratch[0]; +} +#else +template +KOKKOS_INLINE_FUNCTION std::enable_if_t::value> +parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct< + iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries, + const Lambda& lambda, ReducerType result) { + using ValueType = typename ReducerType::value_type; + + // FIXME_OPENMPTARGET - Make sure that if its an array reduction, number of + // elements in the array <= 32. For reduction we allocate, 16 bytes per + // element in the scratch space, hence, 16*32 = 512. + static_assert(sizeof(ValueType) <= + Impl::OpenMPTargetExecTeamMember::TEAM_REDUCE_SIZE); + + ValueType* TeamThread_scratch = + static_cast(loop_boundaries.team.impl_reduce_scratch()); + +#pragma omp declare reduction( \ + omp_red_teamthread_reducer:ValueType \ + : Impl::OpenMPTargetReducerWrapper ::join(omp_out, omp_in)) \ + initializer( \ + Impl::OpenMPTargetReducerWrapper ::init(omp_priv)) + +#pragma omp barrier + ValueType tmp; + result.init(tmp); + TeamThread_scratch[0] = tmp; +#pragma omp barrier + + iType team_size = iType(omp_get_num_threads()); +#pragma omp for reduction(omp_red_teamthread_reducer \ + : TeamThread_scratch[:1]) schedule(static, 1) + for (iType t = 0; t < team_size; t++) { + ValueType tmp2; + result.init(tmp2); + + for (iType i = loop_boundaries.start + t; i < loop_boundaries.end; + i += team_size) { + lambda(i, tmp2); + } + + // FIXME_OPENMPTARGET: Join should work but doesn't. 
Every threads gets a + // private TeamThread_scratch[0] and at the end of the for-loop the `join` + // operation is performed by OpenMP itself and hence the simple assignment + // works. + // result.join(TeamThread_scratch[0], tmp2); + TeamThread_scratch[0] = tmp2; + } + + result.reference() = TeamThread_scratch[0]; +} +#endif // KOKKOS_IMPL_HIERARCHICAL_REDUCERS_WORKAROUND + +/** \brief Intra-thread vector parallel_reduce. Executes lambda(iType i, + * ValueType & val) for each i=0..N-1. + * + * The range i=0..N-1 is mapped to all vector lanes of the the calling thread + * and a reduction of val is performed using JoinType(ValueType& val, const + * ValueType& update) and put into init_result. The input value of init_result + * is used as initializer for temporary variables of ValueType. Therefore the + * input value should be the neutral element with respect to the join operation + * (e.g. '0 for +-' or '1 for *'). + */ +template +KOKKOS_INLINE_FUNCTION void parallel_reduce( + const Impl::TeamThreadRangeBoundariesStruct< + iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries, + const Lambda& lambda, const JoinType& join, ValueType& init_result) { + ValueType* TeamThread_scratch = + static_cast(loop_boundaries.team.impl_reduce_scratch()); + + // FIXME_OPENMPTARGET - Make sure that if its an array reduction, number of + // elements in the array <= 32. For reduction we allocate, 16 bytes per + // element in the scratch space, hence, 16*32 = 512. + static_assert(sizeof(ValueType) <= + Impl::OpenMPTargetExecTeamMember::TEAM_REDUCE_SIZE); + + // FIXME_OPENMPTARGET: Still need to figure out how to get value_count here. + const int value_count = 1; + +#pragma omp barrier + TeamThread_scratch[0] = init_result; +#pragma omp barrier + +#pragma omp for + for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) { + lambda(i, TeamThread_scratch[omp_get_num_threads() * value_count]); + } + + // Reduce all partial results within a team. 
+ const int team_size = omp_get_num_threads(); + int tree_neighbor_offset = 1; + do { +#pragma omp for + for (int i = 0; i < team_size - tree_neighbor_offset; + i += 2 * tree_neighbor_offset) { + const int neighbor = i + tree_neighbor_offset; + join(lambda, &TeamThread_scratch[i * value_count], + &TeamThread_scratch[neighbor * value_count]); + } + tree_neighbor_offset *= 2; + } while (tree_neighbor_offset < team_size); + init_result = TeamThread_scratch[0]; +} + +/** \brief Intra-thread vector parallel_reduce. Executes lambda(iType i, + * ValueType & val) for each i=0..N-1. + * + * The range i=0..N-1 is mapped to all vector lanes of the the calling thread + * and a summation of val is performed and put into result. + */ +template +KOKKOS_INLINE_FUNCTION void parallel_reduce( + const Impl::ThreadVectorRangeBoundariesStruct< + iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries, + const Lambda& lambda, ValueType& result) { + ValueType vector_reduce = ValueType(); + + if constexpr (std::is_arithmetic::value) { +#pragma omp simd reduction(+ : vector_reduce) + for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) { + ValueType tmp = ValueType(); + lambda(i, tmp); + vector_reduce += tmp; + } + } else { +#pragma omp declare reduction(custom:ValueType : omp_out += omp_in) + +#pragma omp simd reduction(custom : vector_reduce) + for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) { + lambda(i, vector_reduce); + } + } + + result = vector_reduce; +} + +template +KOKKOS_INLINE_FUNCTION std::enable_if_t::value> +parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct< + iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries, + const Lambda& lambda, ReducerType const& result) { + using ValueType = typename ReducerType::value_type; + +#pragma omp declare reduction( \ + custom:ValueType \ + : Impl::OpenMPTargetReducerWrapper ::join(omp_out, omp_in)) \ + initializer( \ + Impl::OpenMPTargetReducerWrapper ::init(omp_priv)) + + ValueType 
vector_reduce; + Impl::OpenMPTargetReducerWrapper::init(vector_reduce); + +#pragma omp simd reduction(custom : vector_reduce) + for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) { + lambda(i, vector_reduce); + } + + result.reference() = vector_reduce; +} + +/** \brief Intra-thread vector parallel_reduce. Executes lambda(iType i, + * ValueType & val) for each i=0..N-1. + * + * The range i=0..N-1 is mapped to all vector lanes of the the calling thread + * and a reduction of val is performed using JoinType(ValueType& val, const + * ValueType& update) and put into init_result. The input value of init_result + * is used as initializer for temporary variables of ValueType. Therefore the + * input value should be the neutral element with respect to the join operation + * (e.g. '0 for +-' or '1 for *'). + */ +template +KOKKOS_INLINE_FUNCTION void parallel_reduce( + const Impl::ThreadVectorRangeBoundariesStruct< + iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries, + const Lambda& lambda, const JoinType& join, ValueType& init_result) { + ValueType result = init_result; + + // FIXME_OPENMPTARGET think about omp simd + // join does not work with omp reduction clause + for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) { + ValueType tmp = ValueType(); + lambda(i, tmp); + join(result, tmp); + } + + init_result = result; +} + +/** \brief Intra-team vector parallel_reduce. Executes lambda(iType i, + * ValueType & val) for each i=0..N-1. + * + * The range i=0..N-1 is mapped to all vector lanes of the the calling team + * and a summation of val is performed and put into result. + */ +template +KOKKOS_INLINE_FUNCTION void parallel_reduce( + const Impl::TeamVectorRangeBoundariesStruct< + iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries, + const Lambda& lambda, ValueType& result) { + // FIXME_OPENMPTARGET - Make sure that if its an array reduction, number of + // elements in the array <= 32. 
For reduction we allocate, 16 bytes per + // element in the scratch space, hence, 16*32 = 512. + static_assert(sizeof(ValueType) <= + Impl::OpenMPTargetExecTeamMember::TEAM_REDUCE_SIZE); + + ValueType* TeamVector_scratch = + static_cast(loop_boundaries.team.impl_reduce_scratch()); + +#pragma omp barrier + TeamVector_scratch[0] = ValueType(); +#pragma omp barrier + + if constexpr (std::is_arithmetic::value) { +#pragma omp for simd reduction(+ : TeamVector_scratch[:1]) + for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) { + ValueType tmp = ValueType(); + lambda(i, tmp); + TeamVector_scratch[0] += tmp; + } + } else { +#pragma omp declare reduction(custom:ValueType : omp_out += omp_in) + +#pragma omp for simd reduction(custom : TeamVector_scratch[:1]) + for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) { + ValueType tmp = ValueType(); + lambda(i, tmp); + TeamVector_scratch[0] += tmp; + } + } + + result = TeamVector_scratch[0]; +} + +#if !defined(KOKKOS_IMPL_HIERARCHICAL_REDUCERS_WORKAROUND) +template +KOKKOS_INLINE_FUNCTION std::enable_if_t::value> +parallel_reduce(const Impl::TeamVectorRangeBoundariesStruct< + iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries, + const Lambda& lambda, ReducerType const& result) { + using ValueType = typename ReducerType::value_type; + + // FIXME_OPENMPTARGET - Make sure that if its an array reduction, number of + // elements in the array <= 32. For reduction we allocate, 16 bytes per + // element in the scratch space, hence, 16*32 = 512. 
+ static_assert(sizeof(ValueType) <= + Impl::OpenMPTargetExecTeamMember::TEAM_REDUCE_SIZE); + +#pragma omp declare reduction( \ + custom:ValueType \ + : Impl::OpenMPTargetReducerWrapper ::join(omp_out, omp_in)) \ + initializer( \ + Impl::OpenMPTargetReducerWrapper ::init(omp_priv)) + + ValueType* TeamVector_scratch = + static_cast(loop_boundaries.team.impl_reduce_scratch()); + +#pragma omp barrier + Impl::OpenMPTargetReducerWrapper::init(TeamVector_scratch[0]); +#pragma omp barrier + +#pragma omp for simd reduction(custom : TeamVector_scratch[:1]) + for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) { + lambda(i, TeamVector_scratch[0]); + } + + result.reference() = TeamVector_scratch[0]; +} +#else +template +KOKKOS_INLINE_FUNCTION std::enable_if_t::value> +parallel_reduce(const Impl::TeamVectorRangeBoundariesStruct< + iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries, + const Lambda& lambda, ReducerType const& result) { + using ValueType = typename ReducerType::value_type; + + // FIXME_OPENMPTARGET - Make sure that if its an array reduction, number of + // elements in the array <= 32. For reduction we allocate, 16 bytes per + // element in the scratch space, hence, 16*32 = 512. 
+ static_assert(sizeof(ValueType) <= + Impl::OpenMPTargetExecTeamMember::TEAM_REDUCE_SIZE); + + ValueType* TeamVector_scratch = + static_cast(loop_boundaries.team.impl_reduce_scratch()); + +#pragma omp declare reduction( \ + omp_red_teamthread_reducer:ValueType \ + : Impl::OpenMPTargetReducerWrapper ::join(omp_out, omp_in)) \ + initializer( \ + Impl::OpenMPTargetReducerWrapper ::init(omp_priv)) + +#pragma omp barrier + ValueType tmp; + result.init(tmp); + TeamVector_scratch[0] = tmp; +#pragma omp barrier + + iType team_size = iType(omp_get_num_threads()); +#pragma omp for simd reduction(omp_red_teamthread_reducer \ + : TeamVector_scratch[:1]) schedule(static, 1) + for (iType t = 0; t < team_size; t++) { + ValueType tmp2; + result.init(tmp2); + + for (iType i = loop_boundaries.start + t; i < loop_boundaries.end; + i += team_size) { + lambda(i, tmp2); + } + TeamVector_scratch[0] = tmp2; + } + + result.reference() = TeamVector_scratch[0]; +} +#endif // KOKKOS_IMPL_HIERARCHICAL_REDUCERS_WORKAROUND + +namespace Impl { + +template +class ParallelReduce, + ReducerType, Kokkos::Experimental::OpenMPTarget> { + private: + using Policy = + Kokkos::Impl::TeamPolicyInternal; + + using WorkTag = typename Policy::work_tag; + using Member = typename Policy::member_type; + using ReducerTypeFwd = + std::conditional_t::value, + FunctorType, ReducerType>; + using WorkTagFwd = + std::conditional_t::value, WorkTag, + void>; + using Analysis = Impl::FunctorAnalysis; + + using pointer_type = typename Analysis::pointer_type; + using reference_type = typename Analysis::reference_type; + using value_type = typename Analysis::value_type; + + bool m_result_ptr_on_device; + const int m_result_ptr_num_elems; + + static constexpr int HasJoin = + Impl::FunctorAnalysis::has_join_member_function; + static constexpr int UseReducer = is_reducer::value; + static constexpr int IsArray = std::is_pointer::value; + + using ParReduceSpecialize = + ParallelReduceSpecialize; + + const FunctorType m_functor; + 
const Policy m_policy; + const ReducerType m_reducer; + const pointer_type m_result_ptr; + const size_t m_shmem_size; + + public: + void execute() const { + if constexpr (HasJoin) { + ParReduceSpecialize::execute_init_join(m_functor, m_policy, m_result_ptr, + m_result_ptr_on_device); + } else if constexpr (UseReducer) { + ParReduceSpecialize::execute_reducer(m_functor, m_policy, m_result_ptr, + m_result_ptr_on_device); + } else if constexpr (IsArray) { + if (m_result_ptr_num_elems <= 2) { + ParReduceSpecialize::template execute_array<2>( + m_functor, m_policy, m_result_ptr, m_result_ptr_on_device); + } else if (m_result_ptr_num_elems <= 4) { + ParReduceSpecialize::template execute_array<4>( + m_functor, m_policy, m_result_ptr, m_result_ptr_on_device); + } else if (m_result_ptr_num_elems <= 8) { + ParReduceSpecialize::template execute_array<8>( + m_functor, m_policy, m_result_ptr, m_result_ptr_on_device); + } else if (m_result_ptr_num_elems <= 16) { + ParReduceSpecialize::template execute_array<16>( + m_functor, m_policy, m_result_ptr, m_result_ptr_on_device); + } else if (m_result_ptr_num_elems <= 32) { + ParReduceSpecialize::template execute_array<32>( + m_functor, m_policy, m_result_ptr, m_result_ptr_on_device); + } else { + Kokkos::abort("array reduction length must be <= 32"); + } + } else { + ParReduceSpecialize::template execute_array<1>( + m_functor, m_policy, m_result_ptr, m_result_ptr_on_device); + } + } + + template + ParallelReduce(const FunctorType& arg_functor, const Policy& arg_policy, + const ViewType& arg_result, + std::enable_if_t::value && + !Kokkos::is_reducer::value, + void*> = nullptr) + : m_result_ptr_on_device( + MemorySpaceAccess::accessible), + m_result_ptr_num_elems(arg_result.size()), + m_functor(arg_functor), + m_policy(arg_policy), + m_reducer(InvalidType()), + m_result_ptr(arg_result.data()), + m_shmem_size(arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + + FunctorTeamShmemSize::value( + arg_functor, arg_policy.team_size())) 
{} + + ParallelReduce(const FunctorType& arg_functor, Policy& arg_policy, + const ReducerType& reducer) + : m_result_ptr_on_device( + MemorySpaceAccess::accessible), + m_result_ptr_num_elems(reducer.view().size()), + m_functor(arg_functor), + m_policy(arg_policy), + m_reducer(reducer), + m_result_ptr(reducer.view().data()), + m_shmem_size(arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + + FunctorTeamShmemSize::value( + arg_functor, arg_policy.team_size())) {} +}; + +} // namespace Impl + +#ifdef KOKKOS_IMPL_HIERARCHICAL_REDUCERS_WORKAROUND +#undef KOKKOS_IMPL_HIERARCHICAL_REDUCERS_WORKAROUND +#endif +} // namespace Kokkos + +#endif diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Range.hpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Range.hpp new file mode 100644 index 0000000000..72eefe5683 --- /dev/null +++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Range.hpp @@ -0,0 +1,252 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_OPENMPTARGET_PARALLELSCAN_RANGE_HPP +#define KOKKOS_OPENMPTARGET_PARALLELSCAN_RANGE_HPP + +#include +#include +#include + +namespace Kokkos { +namespace Impl { + +template +class ParallelScan, + Kokkos::Experimental::OpenMPTarget> { + protected: + using Policy = Kokkos::RangePolicy; + + using WorkTag = typename Policy::work_tag; + using WorkRange = typename Policy::WorkRange; + using Member = typename Policy::member_type; + using idx_type = typename Policy::index_type; + + using Analysis = Impl::FunctorAnalysis; + + using value_type = typename Analysis::value_type; + using pointer_type = typename Analysis::pointer_type; + using reference_type = typename Analysis::reference_type; + + const FunctorType m_functor; + const Policy m_policy; + + value_type* m_result_ptr; + const bool m_result_ptr_device_accessible; + + template + std::enable_if_t::value> call_with_tag( + const FunctorType& f, const idx_type& idx, value_type& val, + const bool& is_final) const { + f(idx, val, is_final); + } + template + std::enable_if_t::value> call_with_tag( + const FunctorType& f, const idx_type& idx, value_type& val, + const bool& is_final) const { + f(WorkTag(), idx, val, is_final); + } + + public: + void impl_execute( + Kokkos::View + element_values, + Kokkos::View + chunk_values, + Kokkos::View count) + const { + const idx_type N = m_policy.end() - m_policy.begin(); + const idx_type chunk_size = 128; + const idx_type n_chunks = (N + chunk_size - 1) / chunk_size; + idx_type nteams = n_chunks > 512 ? 
512 : n_chunks; + idx_type team_size = 128; + + FunctorType a_functor(m_functor); +#pragma omp target teams distribute map(to : a_functor) num_teams(nteams) + for (idx_type team_id = 0; team_id < n_chunks; ++team_id) { + typename Analysis::Reducer final_reducer(&a_functor); +#pragma omp parallel num_threads(team_size) + { + const idx_type local_offset = team_id * chunk_size; + +#pragma omp for + for (idx_type i = 0; i < chunk_size; ++i) { + const idx_type idx = local_offset + i; + value_type val; + final_reducer.init(&val); + if (idx < N) call_with_tag(a_functor, idx, val, false); + element_values(team_id, i) = val; + } +#pragma omp barrier + if (omp_get_thread_num() == 0) { + value_type sum; + final_reducer.init(&sum); + for (idx_type i = 0; i < chunk_size; ++i) { + final_reducer.join(&sum, &element_values(team_id, i)); + element_values(team_id, i) = sum; + } + chunk_values(team_id) = sum; + } +#pragma omp barrier + if (omp_get_thread_num() == 0) { + if (Kokkos::atomic_fetch_add(&count(), 1) == n_chunks - 1) { + value_type sum; + final_reducer.init(&sum); + for (idx_type i = 0; i < n_chunks; ++i) { + final_reducer.join(&sum, &chunk_values(i)); + chunk_values(i) = sum; + } + } + } + } + } + +#pragma omp target teams distribute map(to \ + : a_functor) num_teams(nteams) \ + thread_limit(team_size) + for (idx_type team_id = 0; team_id < n_chunks; ++team_id) { + typename Analysis::Reducer final_reducer(&a_functor); +#pragma omp parallel num_threads(team_size) + { + const idx_type local_offset = team_id * chunk_size; + value_type offset_value; + if (team_id > 0) + offset_value = chunk_values(team_id - 1); + else + final_reducer.init(&offset_value); + +#pragma omp for + for (idx_type i = 0; i < chunk_size; ++i) { + const idx_type idx = local_offset + i; + value_type local_offset_value; + if (i > 0) { + local_offset_value = element_values(team_id, i - 1); + // FIXME_OPENMPTARGET We seem to access memory illegaly on AMD GPUs +#ifdef KOKKOS_ARCH_VEGA + if constexpr 
(Analysis::has_join_member_function) { + if constexpr (std::is_void_v) + a_functor.join(local_offset_value, offset_value); + else + a_functor.join(WorkTag{}, local_offset_value, offset_value); + } else + local_offset_value += offset_value; +#else + final_reducer.join(&local_offset_value, &offset_value); +#endif + } else + local_offset_value = offset_value; + if (idx < N) + call_with_tag(a_functor, idx, local_offset_value, true); + if (idx == N - 1 && m_result_ptr_device_accessible) + *m_result_ptr = local_offset_value; + } + } + } + } + + void execute() const { + OpenMPTargetExec::verify_is_process( + "Kokkos::Experimental::OpenMPTarget parallel_for"); + OpenMPTargetExec::verify_initialized( + "Kokkos::Experimental::OpenMPTarget parallel_for"); + const idx_type N = m_policy.end() - m_policy.begin(); + const idx_type chunk_size = 128; + const idx_type n_chunks = (N + chunk_size - 1) / chunk_size; + + // This could be scratch memory per team + Kokkos::View + element_values("element_values", n_chunks, chunk_size); + Kokkos::View + chunk_values("chunk_values", n_chunks); + Kokkos::View count( + "Count"); + + impl_execute(element_values, chunk_values, count); + } + + //---------------------------------------- + + ParallelScan(const FunctorType& arg_functor, const Policy& arg_policy, + pointer_type arg_result_ptr = nullptr, + bool arg_result_ptr_device_accessible = false) + : m_functor(arg_functor), + m_policy(arg_policy), + m_result_ptr(arg_result_ptr), + m_result_ptr_device_accessible(arg_result_ptr_device_accessible) {} + + //---------------------------------------- +}; + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +template +class ParallelScanWithTotal, + ReturnType, Kokkos::Experimental::OpenMPTarget> + : public ParallelScan, + Kokkos::Experimental::OpenMPTarget> { + using base_t = ParallelScan, + Kokkos::Experimental::OpenMPTarget>; + using 
value_type = typename base_t::value_type; + + public: + void execute() const { + OpenMPTargetExec::verify_is_process( + "Kokkos::Experimental::OpenMPTarget parallel_for"); + OpenMPTargetExec::verify_initialized( + "Kokkos::Experimental::OpenMPTarget parallel_for"); + const int64_t N = base_t::m_policy.end() - base_t::m_policy.begin(); + const int chunk_size = 128; + const int64_t n_chunks = (N + chunk_size - 1) / chunk_size; + + if (N > 0) { + // This could be scratch memory per team + Kokkos::View + element_values("element_values", n_chunks, chunk_size); + Kokkos::View + chunk_values("chunk_values", n_chunks); + Kokkos::View count( + "Count"); + + base_t::impl_execute(element_values, chunk_values, count); + + if (!base_t::m_result_ptr_device_accessible) { + const int size = base_t::Analysis::value_size(base_t::m_functor); + DeepCopy( + base_t::m_result_ptr, chunk_values.data() + (n_chunks - 1), size); + } + } else if (!base_t::m_result_ptr_device_accessible) { + *base_t::m_result_ptr = 0; + } + } + + template + ParallelScanWithTotal(const FunctorType& arg_functor, + const typename base_t::Policy& arg_policy, + const ViewType& arg_result_view) + : base_t(arg_functor, arg_policy, arg_result_view.data(), + MemorySpaceAccess::accessible) { + } +}; +} // namespace Impl +} // namespace Kokkos + +#endif diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Team.hpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Team.hpp new file mode 100644 index 0000000000..65002c1830 --- /dev/null +++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Team.hpp @@ -0,0 +1,129 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. 
+// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_OPENMPTARGET_PARALLELSCAN_TEAM_HPP +#define KOKKOS_OPENMPTARGET_PARALLELSCAN_TEAM_HPP + +#include +#include +#include +#include + +// FIXME_OPENMPTARGET - Using this macro to implement a workaround for +// hierarchical scan. It avoids hitting the code path which we wanted to +// write but doesn't work. undef'ed at the end. +#ifndef KOKKOS_ARCH_INTEL_GPU +#define KOKKOS_IMPL_TEAM_SCAN_WORKAROUND +#endif + +namespace Kokkos { + +// This is largely the same code as in HIP and CUDA except for the member name +template +KOKKOS_INLINE_FUNCTION void parallel_scan( + const Impl::TeamThreadRangeBoundariesStruct< + iType, Impl::OpenMPTargetExecTeamMember>& loop_bounds, + const FunctorType& lambda) { + using Analysis = Impl::FunctorAnalysis, + FunctorType>; + using value_type = typename Analysis::value_type; + + const auto start = loop_bounds.start; + const auto end = loop_bounds.end; + // Note this thing is called .member in the CUDA specialization of + // TeamThreadRangeBoundariesStruct + auto& member = loop_bounds.team; + const auto team_rank = member.team_rank(); + +#if defined(KOKKOS_IMPL_TEAM_SCAN_WORKAROUND) + value_type scan_val = value_type(); + + if (team_rank == 0) { + for (iType i = start; i < end; ++i) { + lambda(i, scan_val, true); + } + } +#pragma omp barrier +#else + const auto team_size = member.team_size(); + const auto nchunk = (end - start + team_size - 1) / team_size; + value_type accum = 0; + // each team has to process one or + // more chunks of the prefix scan + for (iType i = 0; i < nchunk; ++i) { + auto ii = start + i * team_size + team_rank; + // local accumulation for this chunk + value_type local_accum = 0; + // user updates value with prefix value + if (ii < loop_bounds.end) lambda(ii, local_accum, false); + // perform 
team scan + local_accum = member.team_scan(local_accum); + // add this blocks accum to total accumulation + auto val = accum + local_accum; + // user updates their data with total accumulation + if (ii < loop_bounds.end) lambda(ii, val, true); + // the last value needs to be propogated to next chunk + if (team_rank == team_size - 1) accum = val; + // broadcast last value to rest of the team + member.team_broadcast(accum, team_size - 1); + } +#endif +} + +} // namespace Kokkos + +namespace Kokkos { + +/** \brief Intra-thread vector parallel exclusive prefix sum. Executes + * lambda(iType i, ValueType & val, bool final) for each i=0..N-1. + * + * The range i=0..N-1 is mapped to all vector lanes in the thread and a scan + * operation is performed. Depending on the target execution space the operator + * might be called twice: once with final=false and once with final=true. When + * final==true val contains the prefix sum value. The contribution of this "i" + * needs to be added to val no matter whether final==true or not. In a serial + * execution (i.e. team_size==1) the operator is only called once with + * final==true. Scan_val will be set to the final sum value over all vector + * lanes. 
+ */ +template +KOKKOS_INLINE_FUNCTION void parallel_scan( + const Impl::ThreadVectorRangeBoundariesStruct< + iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries, + const FunctorType& lambda) { + using Analysis = Impl::FunctorAnalysis, + FunctorType>; + using value_type = typename Analysis::value_type; + + value_type scan_val = value_type(); + +#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP +#pragma ivdep +#endif + for (iType i = loop_boundaries.start; i < loop_boundaries.end; ++i) { + lambda(i, scan_val, true); + } +} + +} // namespace Kokkos + +#ifdef KOKKOS_IMPL_TEAM_SCAN_WORKAROUND +#undef KOKKOS_IMPL_TEAM_SCAN_WORKAROUND +#endif + +#endif diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_Common.hpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_Common.hpp new file mode 100644 index 0000000000..75b17b7235 --- /dev/null +++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_Common.hpp @@ -0,0 +1,675 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_OPENMPTARGET_PARALLEL_COMMON_HPP +#define KOKKOS_OPENMPTARGET_PARALLEL_COMMON_HPP + +#include +#include +#include +#include + +namespace Kokkos { +namespace Impl { + +// This class has the memcpy routine that is commonly used by ParallelReduce +// over RangePolicy and TeamPolicy. +template +struct ParallelReduceCopy { + // Copy the result back to device if the view is on the device. 
+ static void memcpy_result(PointerType dest, PointerType src, size_t size, + bool ptr_on_device) { + if (ptr_on_device) { + KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy(dest, src, size, 0, 0, + omp_get_default_device(), + omp_get_initial_device())); + } else { + *dest = *src; + } + } +}; + +// template +template +struct ParallelReduceSpecialize { + inline static void execute(const FunctorType& /*f*/, const PolicyType& /*p*/, + PointerType /*result_ptr*/) { + constexpr int FunctorHasJoin = + Impl::FunctorAnalysis::has_join_member_function; + constexpr int UseReducerType = is_reducer::value; + + std::stringstream error_message; + error_message << "Error: Invalid Specialization " << FunctorHasJoin << ' ' + << UseReducerType << '\n'; + // FIXME_OPENMPTARGET + OpenMPTarget_abort(error_message.str().c_str()); + } +}; + +template +struct ParallelReduceSpecialize, + ReducerType, PointerType, ValueType> { + using PolicyType = Kokkos::RangePolicy; + using TagType = typename PolicyType::work_tag; + using ReducerTypeFwd = + std::conditional_t::value, + FunctorType, ReducerType>; + using Analysis = Impl::FunctorAnalysis; + using ReferenceType = typename Analysis::reference_type; + + using ParReduceCopy = ParallelReduceCopy; + + static void execute_reducer(const FunctorType& f, const PolicyType& p, + PointerType result_ptr, bool ptr_on_device) { + OpenMPTargetExec::verify_is_process( + "Kokkos::Experimental::OpenMPTarget parallel_for"); + OpenMPTargetExec::verify_initialized( + "Kokkos::Experimental::OpenMPTarget parallel_for"); + const auto begin = p.begin(); + const auto end = p.end(); + + ValueType result; + OpenMPTargetReducerWrapper::init(result); + + // Initialize and copy back the result even if it is a zero length + // reduction. 
+ if (end <= begin) { + ParReduceCopy::memcpy_result(result_ptr, &result, sizeof(ValueType), + ptr_on_device); + return; + } + +#pragma omp declare reduction( \ + custom:ValueType \ + : OpenMPTargetReducerWrapper ::join(omp_out, omp_in)) \ + initializer(OpenMPTargetReducerWrapper ::init(omp_priv)) + +#pragma omp target teams distribute parallel for map(to \ + : f) reduction(custom \ + : result) + for (auto i = begin; i < end; ++i) { + if constexpr (std::is_void::value) { + f(i, result); + } else { + f(TagType(), i, result); + } + } + + ParReduceCopy::memcpy_result(result_ptr, &result, sizeof(ValueType), + ptr_on_device); + } + + template + static void execute_array(const FunctorType& f, const PolicyType& p, + PointerType result_ptr, bool ptr_on_device) { + OpenMPTargetExec::verify_is_process( + "Kokkos::Experimental::OpenMPTarget parallel_for"); + OpenMPTargetExec::verify_initialized( + "Kokkos::Experimental::OpenMPTarget parallel_for"); + const auto begin = p.begin(); + const auto end = p.end(); + + // Enter the loop if the reduction is on a scalar type. + if constexpr (NumReductions == 1) { + ValueType result = ValueType(); + + // Initialize and copy back the result even if it is a zero length + // reduction. + if (end <= begin) { + ParReduceCopy::memcpy_result(result_ptr, &result, sizeof(ValueType), + ptr_on_device); + return; + } + // Case where reduction is on a native data type. 
+ if constexpr (std::is_arithmetic::value) { +#pragma omp target teams distribute parallel for \ + map(to:f) reduction(+: result) + for (auto i = begin; i < end; ++i) + + if constexpr (std::is_void::value) { + f(i, result); + } else { + f(TagType(), i, result); + } + } else { +#pragma omp declare reduction(custom:ValueType : omp_out += omp_in) +#pragma omp target teams distribute parallel for map(to \ + : f) reduction(custom \ + : result) + for (auto i = begin; i < end; ++i) + + if constexpr (std::is_void::value) { + f(i, result); + } else { + f(TagType(), i, result); + } + } + + ParReduceCopy::memcpy_result(result_ptr, &result, sizeof(ValueType), + ptr_on_device); + } else { + ValueType result[NumReductions] = {}; + + // Initialize and copy back the result even if it is a zero length + // reduction. + if (end <= begin) { + ParReduceCopy::memcpy_result(result_ptr, result, + NumReductions * sizeof(ValueType), + ptr_on_device); + return; + } +#pragma omp target teams distribute parallel for map(to:f) reduction(+:result[:NumReductions]) + for (auto i = begin; i < end; ++i) { + if constexpr (std::is_void::value) { + f(i, result); + } else { + f(TagType(), i, result); + } + } + + ParReduceCopy::memcpy_result( + result_ptr, result, NumReductions * sizeof(ValueType), ptr_on_device); + } + } + + static void execute_init_join(const FunctorType& f, const PolicyType& p, + PointerType ptr, const bool ptr_on_device) { + const auto begin = p.begin(); + const auto end = p.end(); + + using FunctorAnalysis = + Impl::FunctorAnalysis; + constexpr int HasInit = FunctorAnalysis::has_init_member_function; + + // Initialize the result pointer. + + const auto size = end - begin; + + // FIXME_OPENMPTARGET: The team size and MAX_ACTIVE_THREADS are currently + // based on NVIDIA-V100 and should be modifid to be based on the + // architecture in the future. 
+ const int max_team_threads = 32; + const int max_teams = + OpenMPTargetExec::MAX_ACTIVE_THREADS / max_team_threads; + // Number of elements in the reduction + const auto value_count = FunctorAnalysis::value_count(f); + + // Allocate scratch per active thread. Achieved by setting the first + // parameter of `resize_scratch=1`. + OpenMPTargetExec::resize_scratch(1, 0, value_count * sizeof(ValueType), + std::numeric_limits::max()); + ValueType* scratch_ptr = + static_cast(OpenMPTargetExec::get_scratch_ptr()); + +#pragma omp target map(to : f) is_device_ptr(scratch_ptr) + { + typename FunctorAnalysis::Reducer final_reducer(&f); + // Enter this loop if the functor has an `init` + if constexpr (HasInit) { + // The `init` routine needs to be called on the device since it might + // need device members. + final_reducer.init(scratch_ptr); + final_reducer.final(scratch_ptr); + } else { + for (int i = 0; i < value_count; ++i) { + static_cast(scratch_ptr)[i] = ValueType(); + } + + final_reducer.final(scratch_ptr); + } + } + + if (end <= begin) { + // If there is no work to be done, copy back the initialized values and + // exit. + if (!ptr_on_device) + KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy( + ptr, scratch_ptr, value_count * sizeof(ValueType), 0, 0, + omp_get_initial_device(), omp_get_default_device())); + else + KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy( + ptr, scratch_ptr, value_count * sizeof(ValueType), 0, 0, + omp_get_default_device(), omp_get_default_device())); + + return; + } + +#pragma omp target teams num_teams(max_teams) thread_limit(max_team_threads) \ + map(to \ + : f) is_device_ptr(scratch_ptr) + { + typename FunctorAnalysis::Reducer final_reducer(&f); +#pragma omp parallel + { + const int team_num = omp_get_team_num(); + const int num_teams = omp_get_num_teams(); + const auto chunk_size = size / num_teams; + const auto team_begin = begin + team_num * chunk_size; + const auto team_end = + (team_num == num_teams - 1) ? 
end : (team_begin + chunk_size); + ValueType* team_scratch = + scratch_ptr + team_num * max_team_threads * value_count; + ReferenceType result = final_reducer.init( + &team_scratch[omp_get_thread_num() * value_count]); + + // Accumulate partial results in thread specific storage. +#pragma omp for simd + for (auto i = team_begin; i < team_end; ++i) { + if constexpr (std::is_void::value) { + f(i, result); + } else { + f(TagType(), i, result); + } + } + + // Reduce all paritial results within a team. + const int team_size = max_team_threads; + int tree_neighbor_offset = 1; + do { +#pragma omp for simd + for (int i = 0; i < team_size - tree_neighbor_offset; + i += 2 * tree_neighbor_offset) { + const int neighbor = i + tree_neighbor_offset; + final_reducer.join(&team_scratch[i * value_count], + &team_scratch[neighbor * value_count]); + } + tree_neighbor_offset *= 2; + } while (tree_neighbor_offset < team_size); + } // end parallel + } // end target + + int tree_neighbor_offset = 1; + do { +#pragma omp target teams distribute parallel for simd map(to \ + : f) \ + is_device_ptr(scratch_ptr) + for (int i = 0; i < max_teams - tree_neighbor_offset; + i += 2 * tree_neighbor_offset) { + typename FunctorAnalysis::Reducer final_reducer(&f); + ValueType* team_scratch = scratch_ptr; + const int team_offset = max_team_threads * value_count; + final_reducer.join( + &team_scratch[i * team_offset], + &team_scratch[(i + tree_neighbor_offset) * team_offset]); + + // If `final` is provided by the functor. + // Do the final only once at the end. + if (tree_neighbor_offset * 2 >= max_teams && omp_get_team_num() == 0 && + omp_get_thread_num() == 0) { + final_reducer.final(scratch_ptr); + } + } + tree_neighbor_offset *= 2; + } while (tree_neighbor_offset < max_teams); + + // If the result view is on the host, copy back the values via memcpy. 
+ if (!ptr_on_device) + KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy( + ptr, scratch_ptr, value_count * sizeof(ValueType), 0, 0, + omp_get_initial_device(), omp_get_default_device())); + else + KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy( + ptr, scratch_ptr, value_count * sizeof(ValueType), 0, 0, + omp_get_default_device(), omp_get_default_device())); + } +}; + +template +struct ParallelReduceSpecialize, + ReducerType, PointerType, ValueType> { + using PolicyType = TeamPolicyInternal; + using TagType = typename PolicyType::work_tag; + using ReducerTypeFwd = + std::conditional_t::value, + FunctorType, ReducerType>; + using Analysis = Impl::FunctorAnalysis; + + using ReferenceType = typename Analysis::reference_type; + + using ParReduceCopy = ParallelReduceCopy; + + static void execute_reducer(const FunctorType& f, const PolicyType& p, + PointerType result_ptr, bool ptr_on_device) { + OpenMPTargetExec::verify_is_process( + "Kokkos::Experimental::OpenMPTarget parallel_for"); + OpenMPTargetExec::verify_initialized( + "Kokkos::Experimental::OpenMPTarget parallel_for"); + + const int league_size = p.league_size(); + const int team_size = p.team_size(); + const int vector_length = p.impl_vector_length(); + + const size_t shmem_size_L0 = p.scratch_size(0, team_size); + const size_t shmem_size_L1 = p.scratch_size(1, team_size); + OpenMPTargetExec::resize_scratch(PolicyType::member_type::TEAM_REDUCE_SIZE, + shmem_size_L0, shmem_size_L1, league_size); + void* scratch_ptr = OpenMPTargetExec::get_scratch_ptr(); + + ValueType result = ValueType(); + + // Maximum active teams possible. + int max_active_teams = OpenMPTargetExec::MAX_ACTIVE_THREADS / team_size; + const auto nteams = + league_size < max_active_teams ? league_size : max_active_teams; + + // If the league size is <=0, do not launch the kernel. 
+ if (nteams <= 0) return; + +#pragma omp declare reduction( \ + custom:ValueType \ + : OpenMPTargetReducerWrapper ::join(omp_out, omp_in)) \ + initializer(OpenMPTargetReducerWrapper ::init(omp_priv)) + +#pragma omp target teams num_teams(nteams) thread_limit(team_size) map(to \ + : f) \ + is_device_ptr(scratch_ptr) reduction(custom \ + : result) +#pragma omp parallel reduction(custom : result) + { + const int blockIdx = omp_get_team_num(); + const int gridDim = omp_get_num_teams(); + + // Guarantee that the compilers respect the `num_teams` clause + if (gridDim <= nteams) { + for (int league_id = blockIdx; league_id < league_size; + league_id += gridDim) { + typename PolicyType::member_type team( + league_id, league_size, team_size, vector_length, scratch_ptr, + blockIdx, shmem_size_L0, shmem_size_L1); + if constexpr (std::is_void::value) + f(team, result); + else + f(TagType(), team, result); + } + } else + Kokkos::abort("`num_teams` clause was not respected.\n"); + } + + // Copy results back to device if `parallel_reduce` is on a device view. + ParReduceCopy::memcpy_result(result_ptr, &result, sizeof(ValueType), + ptr_on_device); + } + + template + static void execute_array(const FunctorType& f, const PolicyType& p, + PointerType result_ptr, bool ptr_on_device) { + OpenMPTargetExec::verify_is_process( + "Kokkos::Experimental::OpenMPTarget parallel_for"); + OpenMPTargetExec::verify_initialized( + "Kokkos::Experimental::OpenMPTarget parallel_for"); + + const int league_size = p.league_size(); + const int team_size = p.team_size(); + const int vector_length = p.impl_vector_length(); + + const size_t shmem_size_L0 = p.scratch_size(0, team_size); + const size_t shmem_size_L1 = p.scratch_size(1, team_size); + OpenMPTargetExec::resize_scratch(PolicyType::member_type::TEAM_REDUCE_SIZE, + shmem_size_L0, shmem_size_L1, league_size); + void* scratch_ptr = OpenMPTargetExec::get_scratch_ptr(); + + // Maximum active teams possible. 
+ int max_active_teams = OpenMPTargetExec::MAX_ACTIVE_THREADS / team_size; + const auto nteams = + league_size < max_active_teams ? league_size : max_active_teams; + + // If the league size is <=0, do not launch the kernel. + if (nteams <= 0) return; + + // Case where the number of reduction items is 1. + if constexpr (NumReductions == 1) { + ValueType result = ValueType(); + + // Case where reduction is on a native data type. + if constexpr (std::is_arithmetic::value) { +#pragma omp target teams num_teams(nteams) thread_limit(team_size) map(to \ + : f) \ + is_device_ptr(scratch_ptr) reduction(+: result) +#pragma omp parallel reduction(+ : result) + { + const int blockIdx = omp_get_team_num(); + const int gridDim = omp_get_num_teams(); + + // Guarantee that the compilers respect the `num_teams` clause + if (gridDim <= nteams) { + for (int league_id = blockIdx; league_id < league_size; + league_id += gridDim) { + typename PolicyType::member_type team( + league_id, league_size, team_size, vector_length, scratch_ptr, + blockIdx, shmem_size_L0, shmem_size_L1); + if constexpr (std::is_void::value) + f(team, result); + else + f(TagType(), team, result); + } + } else + Kokkos::abort("`num_teams` clause was not respected.\n"); + } + } else { + // Case where the reduction is on a non-native data type. 
+#pragma omp declare reduction(custom:ValueType : omp_out += omp_in) +#pragma omp target teams num_teams(nteams) thread_limit(team_size) map(to \ + : f) \ + is_device_ptr(scratch_ptr) reduction(custom \ + : result) +#pragma omp parallel reduction(custom : result) + { + const int blockIdx = omp_get_team_num(); + const int gridDim = omp_get_num_teams(); + + // Guarantee that the compilers respect the `num_teams` clause + if (gridDim <= nteams) { + for (int league_id = blockIdx; league_id < league_size; + league_id += gridDim) { + typename PolicyType::member_type team( + league_id, league_size, team_size, vector_length, scratch_ptr, + blockIdx, shmem_size_L0, shmem_size_L1); + if constexpr (std::is_void::value) + f(team, result); + else + f(TagType(), team, result); + } + } else + Kokkos::abort("`num_teams` clause was not respected.\n"); + } + } + + // Copy results back to device if `parallel_reduce` is on a device view. + ParReduceCopy::memcpy_result(result_ptr, &result, sizeof(ValueType), + ptr_on_device); + } else { + ValueType result[NumReductions] = {}; + // Case where the reduction is on an array. +#pragma omp target teams num_teams(nteams) thread_limit(team_size) map(to \ + : f) \ + is_device_ptr(scratch_ptr) reduction(+ : result[:NumReductions]) +#pragma omp parallel reduction(+ : result[:NumReductions]) + { + const int blockIdx = omp_get_team_num(); + const int gridDim = omp_get_num_teams(); + + // Guarantee that the compilers respect the `num_teams` clause + if (gridDim <= nteams) { + for (int league_id = blockIdx; league_id < league_size; + league_id += gridDim) { + typename PolicyType::member_type team( + league_id, league_size, team_size, vector_length, scratch_ptr, + blockIdx, shmem_size_L0, shmem_size_L1); + if constexpr (std::is_void::value) + f(team, result); + else + f(TagType(), team, result); + } + } else + Kokkos::abort("`num_teams` clause was not respected.\n"); + } + + // Copy results back to device if `parallel_reduce` is on a device view. 
+ ParReduceCopy::memcpy_result( + result_ptr, result, NumReductions * sizeof(ValueType), ptr_on_device); + } + } + + // FIXME_OPENMPTARGET : This routine is a copy from `parallel_reduce` over + // RangePolicy. Need a new implementation. + static void execute_init_join(const FunctorType& f, const PolicyType& p, + PointerType ptr, const bool ptr_on_device) { + using FunctorAnalysis = + Impl::FunctorAnalysis; + constexpr int HasInit = FunctorAnalysis::has_init_member_function; + + const int league_size = p.league_size(); + const int team_size = p.team_size(); + const int vector_length = p.impl_vector_length(); + + auto begin = 0; + auto end = league_size * team_size + team_size * vector_length; + + const size_t shmem_size_L0 = p.scratch_size(0, team_size); + const size_t shmem_size_L1 = p.scratch_size(1, team_size); + + // FIXME_OPENMPTARGET: This would oversubscribe scratch memory since we are + // already using the available scratch memory to create temporaries for each + // thread. + if ((shmem_size_L0 + shmem_size_L1) > 0) { + Kokkos::abort( + "OpenMPTarget: Scratch memory is not supported in `parallel_reduce` " + "over functors with init/join."); + } + + const auto nteams = league_size; + + // Number of elements in the reduction + const auto value_count = FunctorAnalysis::value_count(f); + + // Allocate scratch per active thread. + OpenMPTargetExec::resize_scratch(1, 0, value_count * sizeof(ValueType), + league_size); + void* scratch_ptr = OpenMPTargetExec::get_scratch_ptr(); + + // Enter this loop if the functor has an `init` + if constexpr (HasInit) { + // The `init` routine needs to be called on the device since it might need + // device members. 
+#pragma omp target map(to : f) is_device_ptr(scratch_ptr) + { + typename FunctorAnalysis::Reducer final_reducer(&f); + final_reducer.init(scratch_ptr); + final_reducer.final(scratch_ptr); + } + } else { +#pragma omp target map(to : f) is_device_ptr(scratch_ptr) + { + for (int i = 0; i < value_count; ++i) { + static_cast(scratch_ptr)[i] = ValueType(); + } + + typename FunctorAnalysis::Reducer final_reducer(&f); + final_reducer.final(static_cast(scratch_ptr)); + } + } + + if (end <= begin) { + // If there is no work to be done, copy back the initialized values and + // exit. + if (!ptr_on_device) + KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy( + ptr, scratch_ptr, value_count * sizeof(ValueType), 0, 0, + omp_get_initial_device(), omp_get_default_device())); + else + KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy( + ptr, scratch_ptr, value_count * sizeof(ValueType), 0, 0, + omp_get_default_device(), omp_get_default_device())); + + return; + } + +#pragma omp target teams num_teams(nteams) thread_limit(team_size) map(to \ + : f) \ + is_device_ptr(scratch_ptr) + { +#pragma omp parallel + { + const int team_num = omp_get_team_num(); + const int num_teams = omp_get_num_teams(); + ValueType* team_scratch = static_cast(scratch_ptr) + + team_num * team_size * value_count; + typename FunctorAnalysis::Reducer final_reducer(&f); + ReferenceType result = final_reducer.init(&team_scratch[0]); + + for (int league_id = team_num; league_id < league_size; + league_id += num_teams) { + typename PolicyType::member_type team( + league_id, league_size, team_size, vector_length, scratch_ptr, + team_num, shmem_size_L0, shmem_size_L1); + if constexpr (std::is_void::value) { + f(team, result); + } else { + f(TagType(), team, result); + } + } + } // end parallel + } // end target + + int tree_neighbor_offset = 1; + do { +#pragma omp target teams distribute parallel for simd map(to \ + : f) \ + is_device_ptr(scratch_ptr) + for (int i = 0; i < nteams - tree_neighbor_offset; + i += 2 * 
tree_neighbor_offset) { + ValueType* team_scratch = static_cast(scratch_ptr); + const int team_offset = team_size * value_count; + typename FunctorAnalysis::Reducer final_reducer(&f); + final_reducer.join( + &team_scratch[i * team_offset], + &team_scratch[(i + tree_neighbor_offset) * team_offset]); + + // If `final` is provided by the functor. + // Do the final only once at the end. + if (tree_neighbor_offset * 2 >= nteams && omp_get_team_num() == 0 && + omp_get_thread_num() == 0) { + final_reducer.final(scratch_ptr); + } + } + tree_neighbor_offset *= 2; + } while (tree_neighbor_offset < nteams); + + // If the result view is on the host, copy back the values via memcpy. + if (!ptr_on_device) + KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy( + ptr, scratch_ptr, value_count * sizeof(ValueType), 0, 0, + omp_get_initial_device(), omp_get_default_device())); + else + KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy( + ptr, scratch_ptr, value_count * sizeof(ValueType), 0, 0, + omp_get_default_device(), omp_get_default_device())); + } +}; + +} // namespace Impl +} // namespace Kokkos + +#endif diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_MDRange.hpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_MDRange.hpp index 21bdb67e34..251ca20b44 100644 --- a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_MDRange.hpp +++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_MDRange.hpp @@ -19,7 +19,8 @@ #include #include -#include +#include +#include // WORKAROUND OPENMPTARGET: sometimes tile sizes don't make it correctly, // this was tracked down to a bug in clang with regards of mapping structs @@ -437,7 +438,7 @@ class ParallelReduce, ReducerType, const Policy m_policy; const ReducerType m_reducer; - using ParReduceCommon = ParallelReduceCommon; + using ParReduceCopy = ParallelReduceCopy; bool m_result_ptr_on_device; @@ -518,8 +519,8 @@ reduction(+:result) } } - ParReduceCommon::memcpy_result(ptr, &result, sizeof(ValueType), - m_result_ptr_on_device); + 
ParReduceCopy::memcpy_result(ptr, &result, sizeof(ValueType), + m_result_ptr_on_device); } template @@ -573,8 +574,8 @@ reduction(+:result) } } - ParReduceCommon::memcpy_result(ptr, &result, sizeof(ValueType), - m_result_ptr_on_device); + ParReduceCopy::memcpy_result(ptr, &result, sizeof(ValueType), + m_result_ptr_on_device); } template @@ -636,8 +637,8 @@ reduction(+:result) } } - ParReduceCommon::memcpy_result(ptr, &result, sizeof(ValueType), - m_result_ptr_on_device); + ParReduceCopy::memcpy_result(ptr, &result, sizeof(ValueType), + m_result_ptr_on_device); } template @@ -707,8 +708,8 @@ reduction(+:result) } } - ParReduceCommon::memcpy_result(ptr, &result, sizeof(ValueType), - m_result_ptr_on_device); + ParReduceCopy::memcpy_result(ptr, &result, sizeof(ValueType), + m_result_ptr_on_device); } template @@ -784,8 +785,8 @@ reduction(+:result) } } - ParReduceCommon::memcpy_result(ptr, &result, sizeof(ValueType), - m_result_ptr_on_device); + ParReduceCopy::memcpy_result(ptr, &result, sizeof(ValueType), + m_result_ptr_on_device); } template diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Reducer.hpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Reducer.hpp new file mode 100644 index 0000000000..672271ed6b --- /dev/null +++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Reducer.hpp @@ -0,0 +1,694 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_OPENMPTARGETREDUCER_HPP +#define KOKKOS_OPENMPTARGETREDUCER_HPP + +#include +#include + +#include +#include "Kokkos_OpenMPTarget_Abort.hpp" + +namespace Kokkos { +namespace Impl { + +template +struct OpenMPTargetReducerWrapper { + using value_type = typename Reducer::value_type; + + // Using a generic unknown Reducer for the OpenMPTarget backend is not + // implemented. + KOKKOS_INLINE_FUNCTION + static void join(value_type&, const value_type&) = delete; + + KOKKOS_INLINE_FUNCTION + static void join(volatile value_type&, const volatile value_type&) = delete; + + KOKKOS_INLINE_FUNCTION + static void init(value_type&) = delete; +}; + +template +struct OpenMPTargetReducerWrapper> { + public: + // Required + using value_type = std::remove_cv_t; + + // Required + KOKKOS_INLINE_FUNCTION + static void join(value_type& dest, const value_type& src) { dest += src; } + + KOKKOS_INLINE_FUNCTION + static void join(volatile value_type& dest, const volatile value_type& src) { + dest += src; + } + + KOKKOS_INLINE_FUNCTION + static void init(value_type& val) { + val = reduction_identity::sum(); + } +}; + +template +struct OpenMPTargetReducerWrapper> { + public: + // Required + using value_type = std::remove_cv_t; + + // Required + KOKKOS_INLINE_FUNCTION + static void join(value_type& dest, const value_type& src) { dest *= src; } + + KOKKOS_INLINE_FUNCTION + static void join(volatile value_type& dest, const volatile value_type& src) { + dest *= src; + } + + KOKKOS_INLINE_FUNCTION + static void init(value_type& val) { + val = reduction_identity::prod(); + } +}; + +template +struct OpenMPTargetReducerWrapper> { + public: + // Required + using value_type = std::remove_cv_t; + + // Required + KOKKOS_INLINE_FUNCTION + static void join(value_type& dest, const value_type& src) { + if (src < dest) dest = src; + } + + KOKKOS_INLINE_FUNCTION + static void join(volatile value_type& dest, const volatile 
value_type& src) { + if (src < dest) dest = src; + } + + KOKKOS_INLINE_FUNCTION + static void init(value_type& val) { + val = reduction_identity::min(); + } +}; + +template +struct OpenMPTargetReducerWrapper> { + public: + // Required + using value_type = std::remove_cv_t; + + // Required + KOKKOS_INLINE_FUNCTION + static void join(value_type& dest, const value_type& src) { + if (src > dest) dest = src; + } + + KOKKOS_INLINE_FUNCTION + static void join(volatile value_type& dest, const volatile value_type& src) { + if (src > dest) dest = src; + } + + // Required + KOKKOS_INLINE_FUNCTION + static void init(value_type& val) { + val = reduction_identity::max(); + } +}; + +template +struct OpenMPTargetReducerWrapper> { + public: + // Required + using value_type = std::remove_cv_t; + + KOKKOS_INLINE_FUNCTION + static void join(value_type& dest, const value_type& src) { + dest = dest && src; + } + + KOKKOS_INLINE_FUNCTION + static void join(volatile value_type& dest, const volatile value_type& src) { + dest = dest && src; + } + + KOKKOS_INLINE_FUNCTION + static void init(value_type& val) { + val = reduction_identity::land(); + } +}; + +template +struct OpenMPTargetReducerWrapper> { + public: + // Required + using value_type = std::remove_cv_t; + + using result_view_type = Kokkos::View; + + // Required + KOKKOS_INLINE_FUNCTION + static void join(value_type& dest, const value_type& src) { + dest = dest || src; + } + + KOKKOS_INLINE_FUNCTION + static void join(volatile value_type& dest, const volatile value_type& src) { + dest = dest || src; + } + + KOKKOS_INLINE_FUNCTION + static void init(value_type& val) { + val = reduction_identity::lor(); + } +}; + +template +struct OpenMPTargetReducerWrapper> { + public: + // Required + using value_type = std::remove_cv_t; + + // Required + KOKKOS_INLINE_FUNCTION + static void join(value_type& dest, const value_type& src) { + dest = dest & src; + } + + KOKKOS_INLINE_FUNCTION + static void join(volatile value_type& dest, const volatile 
value_type& src) { + dest = dest & src; + } + + KOKKOS_INLINE_FUNCTION + static void init(value_type& val) { + val = reduction_identity::band(); + } +}; + +template +struct OpenMPTargetReducerWrapper> { + public: + // Required + using value_type = std::remove_cv_t; + + // Required + KOKKOS_INLINE_FUNCTION + static void join(value_type& dest, const value_type& src) { + dest = dest | src; + } + + KOKKOS_INLINE_FUNCTION + static void join(volatile value_type& dest, const volatile value_type& src) { + dest = dest | src; + } + + KOKKOS_INLINE_FUNCTION + static void init(value_type& val) { + val = reduction_identity::bor(); + } +}; + +template +struct OpenMPTargetReducerWrapper> { + private: + using scalar_type = std::remove_cv_t; + using index_type = std::remove_cv_t; + + public: + // Required + using value_type = ValLocScalar; + + // Required + KOKKOS_INLINE_FUNCTION + static void join(value_type& dest, const value_type& src) { + if (src.val < dest.val) dest = src; + } + + KOKKOS_INLINE_FUNCTION + static void join(volatile value_type& dest, const volatile value_type& src) { + if (src.val < dest.val) dest = src; + } + + KOKKOS_INLINE_FUNCTION + static void init(value_type& val) { + val.val = reduction_identity::min(); + val.loc = reduction_identity::min(); + } +}; + +template +struct OpenMPTargetReducerWrapper> { + private: + using scalar_type = std::remove_cv_t; + using index_type = std::remove_cv_t; + + public: + // Required + using value_type = ValLocScalar; + + KOKKOS_INLINE_FUNCTION + static void join(value_type& dest, const value_type& src) { + if (src.val > dest.val) dest = src; + } + + KOKKOS_INLINE_FUNCTION + static void join(volatile value_type& dest, const volatile value_type& src) { + if (src.val > dest.val) dest = src; + } + + KOKKOS_INLINE_FUNCTION + static void init(value_type& val) { + val.val = reduction_identity::max(); + val.loc = reduction_identity::min(); + } +}; + +template +struct OpenMPTargetReducerWrapper> { + private: + using scalar_type = 
std::remove_cv_t; + + public: + // Required + using value_type = MinMaxScalar; + + // Required + KOKKOS_INLINE_FUNCTION + static void join(value_type& dest, const value_type& src) { + if (src.min_val < dest.min_val) { + dest.min_val = src.min_val; + } + if (src.max_val > dest.max_val) { + dest.max_val = src.max_val; + } + } + + KOKKOS_INLINE_FUNCTION + static void join(volatile value_type& dest, const volatile value_type& src) { + if (src.min_val < dest.min_val) { + dest.min_val = src.min_val; + } + if (src.max_val > dest.max_val) { + dest.max_val = src.max_val; + } + } + + KOKKOS_INLINE_FUNCTION + static void init(value_type& val) { + val.max_val = reduction_identity::max(); + val.min_val = reduction_identity::min(); + } +}; + +template +struct OpenMPTargetReducerWrapper> { + private: + using scalar_type = std::remove_cv_t; + using index_type = std::remove_cv_t; + + public: + // Required + using value_type = MinMaxLocScalar; + + // Required + KOKKOS_INLINE_FUNCTION + static void join(value_type& dest, const value_type& src) { + if (src.min_val < dest.min_val) { + dest.min_val = src.min_val; + dest.min_loc = src.min_loc; + } + if (src.max_val > dest.max_val) { + dest.max_val = src.max_val; + dest.max_loc = src.max_loc; + } + } + + KOKKOS_INLINE_FUNCTION + static void join(volatile value_type& dest, const volatile value_type& src) { + if (src.min_val < dest.min_val) { + dest.min_val = src.min_val; + dest.min_loc = src.min_loc; + } + if (src.max_val > dest.max_val) { + dest.max_val = src.max_val; + dest.max_loc = src.max_loc; + } + } + + KOKKOS_INLINE_FUNCTION + static void init(value_type& val) { + val.max_val = reduction_identity::max(); + val.min_val = reduction_identity::min(); + val.max_loc = reduction_identity::min(); + val.min_loc = reduction_identity::min(); + } +}; + +// +// specialize for MaxFirstLoc +// +template +struct OpenMPTargetReducerWrapper> { + private: + using scalar_type = std::remove_cv_t; + using index_type = std::remove_cv_t; + + public: + // 
Required + using value_type = ValLocScalar; + +// WORKAROUND OPENMPTARGET +// This pragma omp declare target should not be necessary, but Intel compiler +// fails without it +#pragma omp declare target + KOKKOS_INLINE_FUNCTION + static void join(value_type& dest, const value_type& src) { + if (dest.val < src.val) { + dest = src; + } else if (!(src.val < dest.val)) { + dest.loc = (src.loc < dest.loc) ? src.loc : dest.loc; + } + } + + KOKKOS_INLINE_FUNCTION + static void join(volatile value_type& dest, const volatile value_type& src) { + if (dest.val < src.val) { + dest = src; + } else if (!(src.val < dest.val)) { + dest.loc = (src.loc < dest.loc) ? src.loc : dest.loc; + } + } + + KOKKOS_INLINE_FUNCTION + static void init(value_type& val) { + val.val = reduction_identity::max(); + val.loc = reduction_identity::min(); + } +#pragma omp end declare target +}; + +// +// specialize for MinFirstLoc +// +template +struct OpenMPTargetReducerWrapper> { + private: + using scalar_type = std::remove_cv_t; + using index_type = std::remove_cv_t; + + public: + // Required + using value_type = ValLocScalar; + +// WORKAROUND OPENMPTARGET +// This pragma omp declare target should not be necessary, but Intel compiler +// fails without it +#pragma omp declare target + KOKKOS_INLINE_FUNCTION + static void join(value_type& dest, const value_type& src) { + if (src.val < dest.val) { + dest = src; + } else if (!(dest.val < src.val)) { + dest.loc = (src.loc < dest.loc) ? src.loc : dest.loc; + } + } + + KOKKOS_INLINE_FUNCTION + static void join(volatile value_type& dest, const volatile value_type& src) { + if (src.val < dest.val) { + dest = src; + } else if (!(dest.val < src.val)) { + dest.loc = (src.loc < dest.loc) ? 
src.loc : dest.loc; + } + } + + KOKKOS_INLINE_FUNCTION + static void init(value_type& val) { + val.val = reduction_identity::min(); + val.loc = reduction_identity::min(); + } +#pragma omp end declare target +}; + +// +// specialize for MinMaxFirstLastLoc +// +template +struct OpenMPTargetReducerWrapper> { + private: + using scalar_type = std::remove_cv_t; + using index_type = std::remove_cv_t; + + public: + // Required + using value_type = MinMaxLocScalar; + +// WORKAROUND OPENMPTARGET +// This pragma omp declare target should not be necessary, but Intel compiler +// fails without it +#pragma omp declare target + // Required + KOKKOS_INLINE_FUNCTION + static void join(value_type& dest, const value_type& src) { + if (src.min_val < dest.min_val) { + dest.min_val = src.min_val; + dest.min_loc = src.min_loc; + } else if (!(dest.min_val < src.min_val)) { + dest.min_loc = (src.min_loc < dest.min_loc) ? src.min_loc : dest.min_loc; + } + + if (dest.max_val < src.max_val) { + dest.max_val = src.max_val; + dest.max_loc = src.max_loc; + } else if (!(src.max_val < dest.max_val)) { + dest.max_loc = (src.max_loc > dest.max_loc) ? src.max_loc : dest.max_loc; + } + } + + KOKKOS_INLINE_FUNCTION + static void join(volatile value_type& dest, const volatile value_type& src) { + if (src.min_val < dest.min_val) { + dest.min_val = src.min_val; + dest.min_loc = src.min_loc; + } else if (!(dest.min_val < src.min_val)) { + dest.min_loc = (src.min_loc < dest.min_loc) ? src.min_loc : dest.min_loc; + } + + if (dest.max_val < src.max_val) { + dest.max_val = src.max_val; + dest.max_loc = src.max_loc; + } else if (!(src.max_val < dest.max_val)) { + dest.max_loc = (src.max_loc > dest.max_loc) ? 
src.max_loc : dest.max_loc; + } + } + + KOKKOS_INLINE_FUNCTION + static void init(value_type& val) { + val.max_val = reduction_identity::max(); + val.min_val = reduction_identity::min(); + val.max_loc = reduction_identity::max(); + val.min_loc = reduction_identity::min(); + } +#pragma omp end declare target +}; + +// +// specialize for FirstLoc +// +template +struct OpenMPTargetReducerWrapper> { + private: + using index_type = std::remove_cv_t; + + public: + // Required + using value_type = FirstLocScalar; + +// WORKAROUND OPENMPTARGET +// This pragma omp declare target should not be necessary, but Intel compiler +// fails without it +#pragma omp declare target + // Required + KOKKOS_INLINE_FUNCTION + static void join(value_type& dest, const value_type& src) { + dest.min_loc_true = (src.min_loc_true < dest.min_loc_true) + ? src.min_loc_true + : dest.min_loc_true; + } + + KOKKOS_INLINE_FUNCTION + static void join(volatile value_type& dest, const volatile value_type& src) { + dest.min_loc_true = (src.min_loc_true < dest.min_loc_true) + ? src.min_loc_true + : dest.min_loc_true; + } + + KOKKOS_INLINE_FUNCTION + static void init(value_type& val) { + val.min_loc_true = reduction_identity::min(); + } +#pragma omp end declare target +}; + +// +// specialize for LastLoc +// +template +struct OpenMPTargetReducerWrapper> { + private: + using index_type = std::remove_cv_t; + + public: + // Required + using value_type = LastLocScalar; + +// WORKAROUND OPENMPTARGET +// This pragma omp declare target should not be necessary, but Intel compiler +// fails without it +#pragma omp declare target + // Required + KOKKOS_INLINE_FUNCTION + static void join(value_type& dest, const value_type& src) { + dest.max_loc_true = (src.max_loc_true > dest.max_loc_true) + ? src.max_loc_true + : dest.max_loc_true; + } + + KOKKOS_INLINE_FUNCTION + static void join(volatile value_type& dest, const volatile value_type& src) { + dest.max_loc_true = (src.max_loc_true > dest.max_loc_true) + ? 
src.max_loc_true + : dest.max_loc_true; + } + + KOKKOS_INLINE_FUNCTION + static void init(value_type& val) { + val.max_loc_true = reduction_identity::max(); + } +#pragma omp end declare target +}; + +// +// specialize for StdIsPartitioned +// +template +struct OpenMPTargetReducerWrapper> { + private: + using index_type = std::remove_cv_t; + + public: + // Required + using value_type = StdIsPartScalar; + +// WORKAROUND OPENMPTARGET +// This pragma omp declare target should not be necessary, but Intel compiler +// fails without it +#pragma omp declare target + // Required + KOKKOS_INLINE_FUNCTION + static void join(value_type& dest, const value_type& src) { + dest.max_loc_true = (dest.max_loc_true < src.max_loc_true) + ? src.max_loc_true + : dest.max_loc_true; + + dest.min_loc_false = (dest.min_loc_false < src.min_loc_false) + ? dest.min_loc_false + : src.min_loc_false; + } + + KOKKOS_INLINE_FUNCTION + static void join(volatile value_type& dest, const volatile value_type& src) { + dest.max_loc_true = (dest.max_loc_true < src.max_loc_true) + ? src.max_loc_true + : dest.max_loc_true; + + dest.min_loc_false = (dest.min_loc_false < src.min_loc_false) + ? dest.min_loc_false + : src.min_loc_false; + } + + KOKKOS_INLINE_FUNCTION + static void init(value_type& val) { + val.max_loc_true = ::Kokkos::reduction_identity::max(); + val.min_loc_false = ::Kokkos::reduction_identity::min(); + } +#pragma omp end declare target +}; + +// +// specialize for StdPartitionPoint +// +template +struct OpenMPTargetReducerWrapper> { + private: + using index_type = std::remove_cv_t; + + public: + // Required + using value_type = StdPartPointScalar; + +// WORKAROUND OPENMPTARGET +// This pragma omp declare target should not be necessary, but Intel compiler +// fails without it +#pragma omp declare target + // Required + KOKKOS_INLINE_FUNCTION + static void join(value_type& dest, const value_type& src) { + dest.min_loc_false = (dest.min_loc_false < src.min_loc_false) + ? 
dest.min_loc_false + : src.min_loc_false; + } + + KOKKOS_INLINE_FUNCTION + static void join(volatile value_type& dest, const volatile value_type& src) { + dest.min_loc_false = (dest.min_loc_false < src.min_loc_false) + ? dest.min_loc_false + : src.min_loc_false; + } + + KOKKOS_INLINE_FUNCTION + static void init(value_type& val) { + val.min_loc_false = ::Kokkos::reduction_identity::min(); + } +#pragma omp end declare target +}; + +/* +template +class OpenMPTargetReducerWrapper { + public: + const ReducerType& reducer; + using value_type = typename ReducerType::value_type; + value_type& value; + + KOKKOS_INLINE_FUNCTION + void join(const value_type& upd) { + reducer.join(value,upd); + } + + KOKKOS_INLINE_FUNCTION + void init(const value_type& upd) { + reducer.init(value,upd); + } +};*/ + +} // namespace Impl +} // namespace Kokkos + +#endif diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_UniqueToken.hpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_UniqueToken.hpp index c7f146871b..d9ea555055 100644 --- a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_UniqueToken.hpp +++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_UniqueToken.hpp @@ -20,7 +20,7 @@ #include #ifdef KOKKOS_ENABLE_OPENMPTARGET -#include +#include #include #include #include diff --git a/core/src/decl/Kokkos_Declare_OPENMPTARGET.hpp b/core/src/decl/Kokkos_Declare_OPENMPTARGET.hpp index 0bd89ef4cf..6bde8f59d8 100644 --- a/core/src/decl/Kokkos_Declare_OPENMPTARGET.hpp +++ b/core/src/decl/Kokkos_Declare_OPENMPTARGET.hpp @@ -18,10 +18,17 @@ #define KOKKOS_DECLARE_OPENMPTARGET_HPP #if defined(KOKKOS_ENABLE_OPENMPTARGET) -#include -#include +#include +#include +#include #include #include +#include +#include +#include +#include +#include +#include #endif #endif From c10edf35f8ab06088f8014bdbebf24793ea55ea3 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Wed, 1 Feb 2023 14:23:57 -0500 Subject: [PATCH 143/496] Skip Tpetra reproducer with NVHPC compiler --- core/unit_test/TestAtomics.hpp | 8 +++++++- 1 file changed, 
7 insertions(+), 1 deletion(-) diff --git a/core/unit_test/TestAtomics.hpp b/core/unit_test/TestAtomics.hpp index 4491893c4c..4c3f2f0c1c 100644 --- a/core/unit_test/TestAtomics.hpp +++ b/core/unit_test/TestAtomics.hpp @@ -578,6 +578,12 @@ struct TpetraUseCase { } }; -TEST(TEST_CATEGORY, atomics_tpetra_max_abs) { TpetraUseCase().check(); } +TEST(TEST_CATEGORY, atomics_tpetra_max_abs) { +#ifdef KOKKOS_COMPILER_NVHPC + GTEST_SKIP() << "FIXME_NVHPC (?)"; +#endif + + TpetraUseCase().check(); +} } // namespace Test From b8603a749be8834684b9e56b7ed657565eae6cd6 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Wed, 1 Feb 2023 14:29:19 -0500 Subject: [PATCH 144/496] Fixup typo `#ifdef KOKKOS_ENABLE_DEPRECA{R -> T}ED_CODE_3` --- core/src/Kokkos_Macros.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/src/Kokkos_Macros.hpp b/core/src/Kokkos_Macros.hpp index f79713eb37..7e9e23b64f 100644 --- a/core/src/Kokkos_Macros.hpp +++ b/core/src/Kokkos_Macros.hpp @@ -451,7 +451,7 @@ //---------------------------------------------------------------------------- // Determine for what space the code is being compiled: -#if defined(KOKKOS_ENABLE_DEPRECARED_CODE_3) +#if defined(KOKKOS_ENABLE_DEPRECATED_CODE_3) #if defined(__CUDACC__) && defined(__CUDA_ARCH__) && defined(KOKKOS_ENABLE_CUDA) #define KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA From ccbfb0086f6669bf44e4ecbd08d73b15f650bcaf Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Wed, 1 Feb 2023 16:05:04 -0500 Subject: [PATCH 145/496] Set native flags according to CMAKE_SYSTEM_PROCESSOR (#5831) * Add ARMClang and set native flags according to CMAKE_SYSTEM_PROCESSOR * Compare uppercase CMAKE_SYSTEM_PROCESSOR Co-authored-by: Damien L-G --------- Co-authored-by: Damien L-G --- cmake/kokkos_arch.cmake | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/cmake/kokkos_arch.cmake b/cmake/kokkos_arch.cmake index b60215e60a..f05b22079f 100644 --- a/cmake/kokkos_arch.cmake +++ b/cmake/kokkos_arch.cmake @@ 
-229,9 +229,15 @@ IF(KOKKOS_ARCH_NATIVE) MESSAGE(FATAL_ERROR "MSVC doesn't support ARCH_NATIVE!") ENDIF() + STRING(TOUPPER "${CMAKE_SYSTEM_PROCESSOR}" KOKKOS_UC_SYSTEM_PROCESSOR) + IF(KOKKOS_UC_SYSTEM_PROCESSOR MATCHES "(X86)|(AMD64)") + SET(KOKKOS_NATIVE_FLAGS "-march=native;-mtune=native") + ELSE() + SET(KOKKOS_NATIVE_FLAGS "-mcpu=native") + ENDIF() COMPILER_SPECIFIC_FLAGS( COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - DEFAULT -march=native -mtune=native + DEFAULT ${KOKKOS_NATIVE_FLAGS} ) ENDIF() From 93487cf1e87149d76bc217d6f6f97275fbd41edd Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Wed, 1 Feb 2023 16:07:46 -0500 Subject: [PATCH 146/496] Fix flag passed to NVHPC when `Kokkos_ARCH_NATIVE` is `ON` --- cmake/kokkos_arch.cmake | 1 + 1 file changed, 1 insertion(+) diff --git a/cmake/kokkos_arch.cmake b/cmake/kokkos_arch.cmake index f05b22079f..c33eccb319 100644 --- a/cmake/kokkos_arch.cmake +++ b/cmake/kokkos_arch.cmake @@ -237,6 +237,7 @@ IF(KOKKOS_ARCH_NATIVE) ENDIF() COMPILER_SPECIFIC_FLAGS( COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID + NVHPC -tp=native DEFAULT ${KOKKOS_NATIVE_FLAGS} ) ENDIF() From 23e2d85b06bf411aa609c24abf1715f8f91efddc Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Sun, 29 Jan 2023 19:49:06 -0500 Subject: [PATCH 147/496] Desul atomics: conditionally append the CUDA/HIP/SYCL source files --- core/src/CMakeLists.txt | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/core/src/CMakeLists.txt b/core/src/CMakeLists.txt index 0f9de74707..0be3d71682 100644 --- a/core/src/CMakeLists.txt +++ b/core/src/CMakeLists.txt @@ -89,7 +89,13 @@ IF (KOKKOS_ENABLE_SYCL) ENDIF() IF (NOT desul_FOUND) - APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/../../tpls/desul/src/*.cpp) + IF (KOKKOS_ENABLE_CUDA) + APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/../../tpls/desul/src/Lock_Array_CUDA.cpp) + ELSEIF (KOKKOS_ENABLE_HIP) + APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/../../tpls/desul/src/Lock_Array_HIP.cpp) + ELSEIF 
(KOKKOS_ENABLE_SYCL) + APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/../../tpls/desul/src/Lock_Array_SYCL.cpp) + ENDIF() APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/../../tpls/desul/include/desul/*.hpp) APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/../../tpls/desul/include/desul/*/*.hpp) APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/../../tpls/desul/include/desul/*/*/*.hpp) From 37bcd4129917e1f60ae92e90da5ea3aa50390c0e Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Wed, 1 Feb 2023 16:23:51 -0500 Subject: [PATCH 148/496] Desul atomics: cleanup macro guards in CUDA/HIP lock guard files --- .../include/desul/atomics/Lock_Array_CUDA.hpp | 25 +++---------------- .../include/desul/atomics/Lock_Array_HIP.hpp | 13 +--------- tpls/desul/src/Lock_Array_CUDA.cpp | 2 -- tpls/desul/src/Lock_Array_HIP.cpp | 2 -- 4 files changed, 4 insertions(+), 38 deletions(-) diff --git a/tpls/desul/include/desul/atomics/Lock_Array_CUDA.hpp b/tpls/desul/include/desul/atomics/Lock_Array_CUDA.hpp index e514061ed0..4ff7196eed 100644 --- a/tpls/desul/include/desul/atomics/Lock_Array_CUDA.hpp +++ b/tpls/desul/include/desul/atomics/Lock_Array_CUDA.hpp @@ -9,13 +9,11 @@ SPDX-License-Identifier: (BSD-3-Clause) #ifndef DESUL_ATOMICS_LOCK_ARRAY_CUDA_HPP_ #define DESUL_ATOMICS_LOCK_ARRAY_CUDA_HPP_ +#include + #include "desul/atomics/Common.hpp" #include "desul/atomics/Macros.hpp" -#ifdef DESUL_HAVE_CUDA_ATOMICS - -#include - namespace desul { namespace Impl { @@ -42,14 +40,6 @@ void init_lock_arrays_cuda(); template void finalize_lock_arrays_cuda(); -} // namespace Impl -} // namespace desul - -#if defined(__CUDACC__) - -namespace desul { -namespace Impl { - /// \brief This global variable in CUDA space is what kernels use /// to get access to the lock arrays. 
/// @@ -118,12 +108,7 @@ __device__ inline void unlock_address_cuda(void* ptr, desul::MemoryScopeNode) { atomicExch(&desul::Impl::CUDA_SPACE_ATOMIC_LOCKS_NODE[offset], 0); } -} // namespace Impl -} // namespace desul - // Make lock_array_copied an explicit translation unit scope thingy -namespace desul { -namespace Impl { namespace { static int lock_array_copied = 0; } // namespace @@ -149,13 +134,9 @@ inline static } // namespace Impl } // namespace desul -#endif /* defined( __CUDACC__ ) */ - -#endif /* defined( DESUL_HAVE_CUDA_ATOMICS ) */ - namespace desul { -#if defined(__CUDACC_RDC__) || (!defined(__CUDACC__)) +#if defined(__CUDACC_RDC__) inline void ensure_cuda_lock_arrays_on_device() {} #else static inline void ensure_cuda_lock_arrays_on_device() { diff --git a/tpls/desul/include/desul/atomics/Lock_Array_HIP.hpp b/tpls/desul/include/desul/atomics/Lock_Array_HIP.hpp index 33450e32ec..9290aea2b3 100644 --- a/tpls/desul/include/desul/atomics/Lock_Array_HIP.hpp +++ b/tpls/desul/include/desul/atomics/Lock_Array_HIP.hpp @@ -43,12 +43,6 @@ void init_lock_arrays_hip(); /// snapshotted version while also linking against pure Desul template void finalize_lock_arrays_hip(); -} // namespace Impl -} // namespace desul - -#ifdef __HIPCC__ -namespace desul { -namespace Impl { /** * \brief This global variable in HIP space is what kernels use to get access @@ -120,13 +114,8 @@ __device__ inline void unlock_address_hip(void* ptr, desul::MemoryScopeNode) { offset = offset & HIP_SPACE_ATOMIC_MASK; atomicExch(&desul::Impl::HIP_SPACE_ATOMIC_LOCKS_NODE[offset], 0); } -#endif -} // namespace Impl -} // namespace desul // Make lock_array_copied an explicit translation unit scope thing -namespace desul { -namespace Impl { namespace { static int lock_array_copied = 0; } // namespace @@ -150,7 +139,7 @@ inline static } } // namespace Impl -#if defined(DESUL_HIP_RDC) || (!defined(__HIPCC__)) +#if defined(DESUL_HIP_RDC) inline void ensure_hip_lock_arrays_on_device() {} #else static 
inline void ensure_hip_lock_arrays_on_device() { diff --git a/tpls/desul/src/Lock_Array_CUDA.cpp b/tpls/desul/src/Lock_Array_CUDA.cpp index 19944b378e..d8ab895b2b 100644 --- a/tpls/desul/src/Lock_Array_CUDA.cpp +++ b/tpls/desul/src/Lock_Array_CUDA.cpp @@ -11,7 +11,6 @@ SPDX-License-Identifier: (BSD-3-Clause) #include #include -#ifdef DESUL_HAVE_CUDA_ATOMICS #ifdef __CUDACC_RDC__ namespace desul { namespace Impl { @@ -96,4 +95,3 @@ template void finalize_lock_arrays_cuda(); } // namespace Impl } // namespace desul -#endif diff --git a/tpls/desul/src/Lock_Array_HIP.cpp b/tpls/desul/src/Lock_Array_HIP.cpp index 986f5475ae..6191fe81e2 100644 --- a/tpls/desul/src/Lock_Array_HIP.cpp +++ b/tpls/desul/src/Lock_Array_HIP.cpp @@ -11,7 +11,6 @@ SPDX-License-Identifier: (BSD-3-Clause) #include #include -#ifdef DESUL_HAVE_HIP_ATOMICS #ifdef DESUL_HIP_RDC namespace desul { namespace Impl { @@ -99,4 +98,3 @@ template void finalize_lock_arrays_hip(); } // namespace Impl } // namespace desul -#endif From 1d19328ed60cdaaa33b38eec71a6143caaab1ea8 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Wed, 1 Feb 2023 16:25:15 -0500 Subject: [PATCH 149/496] Desul atomics: SYCL lock arrays out of sync --- .../include/desul/atomics/Lock_Array_SYCL.hpp | 13 +++++++++++ tpls/desul/src/Lock_Array_SYCL.cpp | 23 ++++++------------- 2 files changed, 20 insertions(+), 16 deletions(-) diff --git a/tpls/desul/include/desul/atomics/Lock_Array_SYCL.hpp b/tpls/desul/include/desul/atomics/Lock_Array_SYCL.hpp index 8f42c6b37e..d0691de85b 100644 --- a/tpls/desul/include/desul/atomics/Lock_Array_SYCL.hpp +++ b/tpls/desul/include/desul/atomics/Lock_Array_SYCL.hpp @@ -128,13 +128,26 @@ inline void unlock_address_sycl(void* ptr, MemoryScopeNode) { lock_node_ref.exchange(0); } #else + +template +void init_lock_arrays_sycl(sycl::queue q) { + assert(false); +} + +template +void finalize_lock_arrays_sycl(sycl::queue q) { + assert(false); +} + inline bool lock_address_sycl(void*, MemoryScopeDevice) { assert(false); + 
// return true so that the CAS loops don't hang. return true; } inline bool lock_address_sycl(void*, MemoryScopeNode) { assert(false); + // return true so that the CAS loops don't hang. return true; } diff --git a/tpls/desul/src/Lock_Array_SYCL.cpp b/tpls/desul/src/Lock_Array_SYCL.cpp index 6bc9a890a8..9e84c60e41 100644 --- a/tpls/desul/src/Lock_Array_SYCL.cpp +++ b/tpls/desul/src/Lock_Array_SYCL.cpp @@ -12,23 +12,18 @@ SPDX-License-Identifier: (BSD-3-Clause) #include #include -namespace desul { -namespace Impl { +namespace desul::Impl { + SYCL_EXTERNAL sycl_device_global SYCL_SPACE_ATOMIC_LOCKS_DEVICE; SYCL_EXTERNAL sycl_device_global SYCL_SPACE_ATOMIC_LOCKS_NODE; -} // namespace Impl -} // namespace desul - -namespace desul { -namespace Impl { int32_t* SYCL_SPACE_ATOMIC_LOCKS_DEVICE_h = nullptr; int32_t* SYCL_SPACE_ATOMIC_LOCKS_NODE_h = nullptr; -template -void init_lock_arrays_sycl(sycl::queue q) { +template <> +void init_lock_arrays_sycl(sycl::queue q) { if (SYCL_SPACE_ATOMIC_LOCKS_DEVICE_h != nullptr) return; SYCL_SPACE_ATOMIC_LOCKS_DEVICE_h = @@ -60,8 +55,8 @@ void init_lock_arrays_sycl(sycl::queue q) { q.wait_and_throw(); } -template -void finalize_lock_arrays_sycl(sycl::queue q) { +template <> +void finalize_lock_arrays_sycl(sycl::queue q) { if (SYCL_SPACE_ATOMIC_LOCKS_DEVICE_h == nullptr) return; sycl::free(SYCL_SPACE_ATOMIC_LOCKS_DEVICE_h, q); @@ -70,9 +65,5 @@ void finalize_lock_arrays_sycl(sycl::queue q) { SYCL_SPACE_ATOMIC_LOCKS_NODE_h = nullptr; } -template void init_lock_arrays_sycl(sycl::queue); -template void finalize_lock_arrays_sycl(sycl::queue); - -} // namespace Impl -} // namespace desul +} // namespace desul::Impl #endif From 43ccea6a576afac6716907919af1d37426427b68 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Wed, 1 Feb 2023 16:26:28 -0500 Subject: [PATCH 150/496] Desul atomics: Drop `DESUL_HAVE_{GPU_LIKE,FORWARD}_PROGRESS` macros --- tpls/desul/include/desul/atomics/Macros.hpp | 10 ---------- 1 file changed, 10 deletions(-) diff --git 
a/tpls/desul/include/desul/atomics/Macros.hpp b/tpls/desul/include/desul/atomics/Macros.hpp index 992fb9fa66..5b4df2661e 100644 --- a/tpls/desul/include/desul/atomics/Macros.hpp +++ b/tpls/desul/include/desul/atomics/Macros.hpp @@ -39,12 +39,6 @@ SPDX-License-Identifier: (BSD-3-Clause) #define DESUL_HAVE_MSVC_ATOMICS #endif -#if (defined(DESUL_ATOMICS_ENABLE_CUDA) && defined(__CUDA_ARCH__)) || \ - (defined(DESUL_ATOMICS_ENABLE_HIP) && defined(__HIP_DEVICE_COMPILE__)) || \ - (defined(DESUL_ATOMICS_ENABLE_SYCL) && defined(__SYCL_DEVICE_ONLY__)) -#define DESUL_HAVE_GPU_LIKE_PROGRESS -#endif - #if defined(DESUL_HAVE_CUDA_ATOMICS) || defined(DESUL_HAVE_HIP_ATOMICS) #define DESUL_FORCEINLINE_FUNCTION inline __host__ __device__ #define DESUL_INLINE_FUNCTION inline __host__ __device__ @@ -59,10 +53,6 @@ SPDX-License-Identifier: (BSD-3-Clause) #define DESUL_IMPL_DEVICE_FUNCTION #endif -#if !defined(DESUL_HAVE_GPU_LIKE_PROGRESS) -#define DESUL_HAVE_FORWARD_PROGRESS -#endif - #define DESUL_IMPL_STRIP_PARENS(X) DESUL_IMPL_ESC(DESUL_IMPL_ISH X) #define DESUL_IMPL_ISH(...) DESUL_IMPL_ISH __VA_ARGS__ #define DESUL_IMPL_ESC(...) 
DESUL_IMPL_ESC_(__VA_ARGS__) From 879d60798edd78e7151233714ecba0c7bade9c8d Mon Sep 17 00:00:00 2001 From: Bruno Turcksin Date: Thu, 2 Feb 2023 07:52:24 -0500 Subject: [PATCH 151/496] Make OpenMP::concurrency and impl_thread_pool_size non-static (#5836) * Make OpenMP::concurrency non-static * Make OpenMP::impl_thread_pool_size non-static * Remove useless inline * Use copy of object instead of reference --- core/src/OpenMP/Kokkos_OpenMP.cpp | 15 +++++---- core/src/OpenMP/Kokkos_OpenMP.hpp | 14 +++++--- core/src/OpenMP/Kokkos_OpenMP_Task.hpp | 5 +-- core/src/OpenMP/Kokkos_OpenMP_Team.hpp | 30 +++++++---------- core/src/OpenMP/Kokkos_OpenMP_UniqueToken.hpp | 32 +++++++++++-------- .../OpenMP/Kokkos_OpenMP_WorkGraphPolicy.hpp | 3 +- 6 files changed, 52 insertions(+), 47 deletions(-) diff --git a/core/src/OpenMP/Kokkos_OpenMP.cpp b/core/src/OpenMP/Kokkos_OpenMP.cpp index a35541257a..687f6e3c5d 100644 --- a/core/src/OpenMP/Kokkos_OpenMP.cpp +++ b/core/src/OpenMP/Kokkos_OpenMP.cpp @@ -70,9 +70,13 @@ void OpenMP::print_configuration(std::ostream &os, bool /*verbose*/) const { m_space_instance->print_configuration(os); } +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 int OpenMP::concurrency(OpenMP const &instance) { - return impl_thread_pool_size(instance); + return instance.impl_thread_pool_size(); } +#else +int OpenMP::concurrency() const { return impl_thread_pool_size(); } +#endif void OpenMP::fence(const std::string &name) const { Kokkos::Tools::Experimental::Impl::profile_fence_event( @@ -94,17 +98,16 @@ bool OpenMP::in_parallel(OpenMP const &exec_space) noexcept { #endif } -int OpenMP::impl_thread_pool_size(OpenMP const &exec_space) noexcept { +int OpenMP::impl_thread_pool_size() const noexcept { #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 return OpenMP::in_parallel(exec_space) ? omp_get_num_threads() : (Impl::t_openmp_instance ? 
Impl::t_openmp_instance->m_pool_size - : exec_space.impl_internal_space_instance()->m_pool_size); + : impl_internal_space_instance()->m_pool_size); #else - return OpenMP::in_parallel(exec_space) - ? omp_get_num_threads() - : exec_space.impl_internal_space_instance()->m_pool_size; + return OpenMP::in_parallel() ? omp_get_num_threads() + : impl_internal_space_instance()->m_pool_size; #endif } diff --git a/core/src/OpenMP/Kokkos_OpenMP.hpp b/core/src/OpenMP/Kokkos_OpenMP.hpp index 897554f8f2..56937f32f5 100644 --- a/core/src/OpenMP/Kokkos_OpenMP.hpp +++ b/core/src/OpenMP/Kokkos_OpenMP.hpp @@ -116,7 +116,11 @@ class OpenMP { int requested_partition_size = 0); #endif +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 static int concurrency(OpenMP const& = OpenMP()); +#else + int concurrency() const; +#endif static void impl_initialize(InitializationSettings const&); @@ -127,13 +131,13 @@ class OpenMP { /// \brief Free any resources being consumed by the default execution space static void impl_finalize(); - static int impl_thread_pool_size(OpenMP const& = OpenMP()) noexcept; + int impl_thread_pool_size() const noexcept; + + int impl_thread_pool_size(int depth) const; /** \brief The rank of the executing thread in this thread pool */ inline static int impl_thread_pool_rank() noexcept; - inline static int impl_thread_pool_size(int depth, OpenMP const& = OpenMP()); - // use UniqueToken static int impl_max_hardware_threads() noexcept; @@ -186,8 +190,8 @@ inline bool OpenMP::is_asynchronous(OpenMP const& /*instance*/) noexcept { return false; } -inline int OpenMP::impl_thread_pool_size(int depth, OpenMP const& exec_space) { - return depth < 2 ? impl_thread_pool_size(exec_space) : 1; +inline int OpenMP::impl_thread_pool_size(int depth) const { + return depth < 2 ? 
impl_thread_pool_size() : 1; } KOKKOS_INLINE_FUNCTION diff --git a/core/src/OpenMP/Kokkos_OpenMP_Task.hpp b/core/src/OpenMP/Kokkos_OpenMP_Task.hpp index ff7e16c384..669607392b 100644 --- a/core/src/OpenMP/Kokkos_OpenMP_Task.hpp +++ b/core/src/OpenMP/Kokkos_OpenMP_Task.hpp @@ -156,7 +156,7 @@ class TaskQueueSpecialization> { } static uint32_t get_max_team_count(execution_space const& espace) { - return static_cast(OpenMP::impl_thread_pool_size(espace)); + return static_cast(espace.impl_thread_pool_size()); } // TODO @tasking @optimization DSH specialize this for trivially destructible @@ -189,7 +189,8 @@ class TaskQueueSpecializationConstrained< using task_base_type = typename scheduler_type::task_base; using queue_type = typename scheduler_type::queue_type; - if (1 == OpenMP::impl_thread_pool_size()) { + execution_space exec; + if (1 == exec.impl_thread_pool_size()) { task_base_type* const end = (task_base_type*)task_base_type::EndTag; HostThreadTeamData& team_data_single = diff --git a/core/src/OpenMP/Kokkos_OpenMP_Team.hpp b/core/src/OpenMP/Kokkos_OpenMP_Team.hpp index 280b1701ad..dbc30c5d02 100644 --- a/core/src/OpenMP/Kokkos_OpenMP_Team.hpp +++ b/core/src/OpenMP/Kokkos_OpenMP_Team.hpp @@ -59,7 +59,7 @@ class TeamPolicyInternal template int team_size_max(const FunctorType&, const ParallelForTag&) const { - int pool_size = traits::execution_space::impl_thread_pool_size(1, m_space); + int pool_size = m_space.impl_thread_pool_size(1); int max_host_team_size = Impl::HostThreadTeamData::max_team_members; return pool_size < max_host_team_size ? pool_size : max_host_team_size; } @@ -68,7 +68,7 @@ class TeamPolicyInternal template int team_size_max(const FunctorType&, const ParallelReduceTag&) const { - int pool_size = traits::execution_space::impl_thread_pool_size(1, m_space); + int pool_size = m_space.impl_thread_pool_size(1); int max_host_team_size = Impl::HostThreadTeamData::max_team_members; return pool_size < max_host_team_size ? 
pool_size : max_host_team_size; } @@ -79,12 +79,12 @@ class TeamPolicyInternal } template int team_size_recommended(const FunctorType&, const ParallelForTag&) const { - return traits::execution_space::impl_thread_pool_size(2, m_space); + return m_space.impl_thread_pool_size(2); } template int team_size_recommended(const FunctorType&, const ParallelReduceTag&) const { - return traits::execution_space::impl_thread_pool_size(2, m_space); + return m_space.impl_thread_pool_size(2); } template inline int team_size_recommended(const FunctorType& f, const ReducerType&, @@ -120,10 +120,8 @@ class TeamPolicyInternal typename traits::execution_space m_space; inline void init(const int league_size_request, const int team_size_request) { - const int pool_size = - traits::execution_space::impl_thread_pool_size(0, m_space); - const int team_grain = - traits::execution_space::impl_thread_pool_size(2, m_space); + const int pool_size = m_space.impl_thread_pool_size(0); + const int team_grain = m_space.impl_thread_pool_size(2); const int max_host_team_size = Impl::HostThreadTeamData::max_team_members; const int team_max = ((pool_size < max_host_team_size) ? 
pool_size : max_host_team_size); @@ -192,8 +190,7 @@ class TeamPolicyInternal m_tune_team(true), m_tune_vector(false), m_space(space) { - init(league_size_request, - traits::execution_space::impl_thread_pool_size(2, m_space)); + init(league_size_request, m_space.impl_thread_pool_size(2)); } TeamPolicyInternal(const typename traits::execution_space& space, @@ -207,8 +204,7 @@ class TeamPolicyInternal m_tune_team(true), m_tune_vector(true), m_space(space) { - init(league_size_request, - traits::execution_space::impl_thread_pool_size(2, m_space)); + init(league_size_request, m_space.impl_thread_pool_size(2)); } TeamPolicyInternal(const typename traits::execution_space& space, @@ -242,8 +238,7 @@ class TeamPolicyInternal m_chunk_size(0), m_tune_team(true), m_tune_vector(false) { - init(league_size_request, - traits::execution_space::impl_thread_pool_size(2, m_space)); + init(league_size_request, m_space.impl_thread_pool_size(2)); } TeamPolicyInternal(int league_size_request, @@ -255,8 +250,7 @@ class TeamPolicyInternal m_chunk_size(0), m_tune_team(true), m_tune_vector(true) { - init(league_size_request, - traits::execution_space::impl_thread_pool_size(2, m_space)); + init(league_size_request, m_space.impl_thread_pool_size(2)); } TeamPolicyInternal(int league_size_request, int team_size_request, @@ -310,9 +304,7 @@ class TeamPolicyInternal private: /** \brief finalize chunk_size if it was set to AUTO*/ inline void set_auto_chunk_size() { - int concurrency = - traits::execution_space::impl_thread_pool_size(0, m_space) / - m_team_alloc; + int concurrency = m_space.impl_thread_pool_size(0) / m_team_alloc; if (concurrency == 0) concurrency = 1; if (m_chunk_size > 0) { diff --git a/core/src/OpenMP/Kokkos_OpenMP_UniqueToken.hpp b/core/src/OpenMP/Kokkos_OpenMP_UniqueToken.hpp index 0f195aa06d..a37e1758a2 100644 --- a/core/src/OpenMP/Kokkos_OpenMP_UniqueToken.hpp +++ b/core/src/OpenMP/Kokkos_OpenMP_UniqueToken.hpp @@ -22,26 +22,31 @@ namespace Kokkos::Experimental { template <> 
class UniqueToken { + public: + using execution_space = OpenMP; + using size_type = int; + private: using buffer_type = Kokkos::View; - int m_count; + execution_space m_exec; + size_type m_count; buffer_type m_buffer_view; uint32_t volatile* m_buffer; public: - using execution_space = OpenMP; - using size_type = int; - /// \brief create object size for concurrency on the given instance /// /// This object should not be shared between instances - UniqueToken(execution_space const& = execution_space()) noexcept - : m_count(::Kokkos::OpenMP::impl_thread_pool_size()), + UniqueToken(execution_space const& exec = execution_space()) noexcept + : m_exec(exec), + m_count(m_exec.impl_thread_pool_size()), m_buffer_view(buffer_type()), m_buffer(nullptr) {} - UniqueToken(size_type max_size, execution_space const& = execution_space()) - : m_count(max_size), + UniqueToken(size_type max_size, + execution_space const& exec = execution_space()) + : m_exec(exec), + m_count(max_size), m_buffer_view("UniqueToken::m_buffer_view", ::Kokkos::Impl::concurrent_bitset::buffer_bound(m_count)), m_buffer(m_buffer_view.data()) {} @@ -58,8 +63,8 @@ class UniqueToken { KOKKOS_INLINE_FUNCTION int acquire() const noexcept { KOKKOS_IF_ON_HOST( - (if (m_count >= ::Kokkos::OpenMP::impl_thread_pool_size()) return :: - Kokkos::OpenMP::impl_thread_pool_rank(); + (if (m_count >= m_exec.impl_thread_pool_size()) return m_exec + .impl_thread_pool_rank(); const ::Kokkos::pair result = ::Kokkos::Impl::concurrent_bitset::acquire_bounded( m_buffer, m_count, ::Kokkos::Impl::clock_tic() % m_count); @@ -78,10 +83,9 @@ class UniqueToken { /// \brief release a value acquired by generate KOKKOS_INLINE_FUNCTION void release(int i) const noexcept { - KOKKOS_IF_ON_HOST( - (if (m_count < ::Kokkos::OpenMP::impl_thread_pool_size()) { - ::Kokkos::Impl::concurrent_bitset::release(m_buffer, i); - })) + KOKKOS_IF_ON_HOST((if (m_count < m_exec.impl_thread_pool_size()) { + ::Kokkos::Impl::concurrent_bitset::release(m_buffer, i); + 
})) KOKKOS_IF_ON_DEVICE(((void)i;)) } diff --git a/core/src/OpenMP/Kokkos_OpenMP_WorkGraphPolicy.hpp b/core/src/OpenMP/Kokkos_OpenMP_WorkGraphPolicy.hpp index 8ad9f176d7..a030a2b706 100644 --- a/core/src/OpenMP/Kokkos_OpenMP_WorkGraphPolicy.hpp +++ b/core/src/OpenMP/Kokkos_OpenMP_WorkGraphPolicy.hpp @@ -49,7 +49,8 @@ class ParallelFor, // We need to introduce pool_size to work around NVHPC 22.5 ICE // We need to use [[maybe_unused]] to work around an unused-variable warning // from HIP - [[maybe_unused]] int pool_size = OpenMP::impl_thread_pool_size(); + OpenMP exec; + [[maybe_unused]] int pool_size = exec.impl_thread_pool_size(); #pragma omp parallel num_threads(pool_size) { // Spin until COMPLETED_TOKEN. From 97ad51b9b3d337a462d022c30f797768f355a323 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Thu, 2 Feb 2023 10:36:35 -0500 Subject: [PATCH 152/496] Fix unused parameter warning in SYCL lock array and add comment --- tpls/desul/include/desul/atomics/Lock_Array_SYCL.hpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tpls/desul/include/desul/atomics/Lock_Array_SYCL.hpp b/tpls/desul/include/desul/atomics/Lock_Array_SYCL.hpp index d0691de85b..8216f9a797 100644 --- a/tpls/desul/include/desul/atomics/Lock_Array_SYCL.hpp +++ b/tpls/desul/include/desul/atomics/Lock_Array_SYCL.hpp @@ -127,15 +127,16 @@ inline void unlock_address_sycl(void* ptr, MemoryScopeNode) { lock_node_ref(SYCL_SPACE_ATOMIC_LOCKS_NODE[offset]); lock_node_ref.exchange(0); } -#else + +#else // not supported template -void init_lock_arrays_sycl(sycl::queue q) { +void init_lock_arrays_sycl(sycl::queue) { assert(false); } template -void finalize_lock_arrays_sycl(sycl::queue q) { +void finalize_lock_arrays_sycl(sycl::queue) { assert(false); } From 5ea96bcaa746c5a4054a225542207274547a27ba Mon Sep 17 00:00:00 2001 From: Mikael Simberg Date: Thu, 2 Feb 2023 16:56:39 +0100 Subject: [PATCH 153/496] Update HPX backend to use HPX's sender/receiver functionality (#5628) * Use sender 
functionality in HPX backend * Disable certain HPX tests with async dispatch disabled * Add optimization to HPX backend to run certain kernels inline * Make sure HPX TeamPolicy actually stores the execution space instance * Add unit tests to check that independent HPX instances are correctly fenced * Slightly refactor instance locking in HPX backend to avoid taking lock twice * Update includes in HPX backend * Fix license header in HPX test * Check that instance data is non-null in HPX backend before accessing it * Reuse existing pointer variable in HPX backend ParallelReduce * Use KOKKOS_ASSERT to replace a few KOKKOS_ENABLE_DEBUG + Kokkos::abort in HPX backend * Use KOKKOS_EXPECTS instead of KOKKOS_ASSERT in HPX backend * Fix ifdef for CUDA in TestViewMapping_a.hpp * Use ASSERT_FALSE instead of ASSERT_TRUE in HPX independent instances test * Use member variable for calculating buffer size in HPX backend * Use unique_ptr for hpx_thread_buffer instead of manual lifetime management * Use HostSharedPtr instead of std::shared_ptr in HPX backend * Refactor HPX backend fence functions Move default arguments to public fence member function. * Replace CUDA lambda ifdefs with KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA in TestViewMapping_a.hpp * Make hpx_range members non-const * Explicitly default some destructors in HPX backend * Don't use make_unique for thread buffer allocation in HPX backend * Make KOKKOS_ENABLE_HPX_ASYNC_DISPATCH an IMPL option The option should primarily be used for debugging. 
* Fix formatting * Attempt to update ifdef in TestViewMapping_a.hpp again * Format Kokkos_HPX.hpp --- .../continuous-integration-workflow-hpx.yml | 3 +- BUILD.md | 4 +- Makefile.kokkos | 2 +- cmake/KokkosCore_config.h.in | 2 +- cmake/kokkos_enable_options.cmake | 4 +- .../unit_tests/TestWithoutInitializing.hpp | 7 + core/src/HPX/Kokkos_HPX.cpp | 181 +- core/src/HPX/Kokkos_HPX_Task.hpp | 250 ++- core/src/HPX/Kokkos_HPX_WorkGraphPolicy.hpp | 61 +- core/src/Kokkos_HPX.hpp | 1644 ++++++----------- core/src/impl/Kokkos_ExecSpaceManager.hpp | 2 - core/src/impl/Kokkos_HostSpace_deepcopy.cpp | 3 +- core/unit_test/CMakeLists.txt | 1 + core/unit_test/TestViewMapping_a.hpp | 5 +- .../hpx/TestHPX_IndependentInstances.cpp | 19 +- ...X_IndependentInstancesDelayedExecution.cpp | 26 +- ...estHPX_IndependentInstancesInstanceIds.cpp | 35 +- ...estHPX_IndependentInstancesRefCounting.cpp | 45 +- ...PX_IndependentInstancesSynchronization.cpp | 162 ++ core/unit_test/tools/TestEventCorrectness.hpp | 14 + 20 files changed, 1164 insertions(+), 1306 deletions(-) create mode 100644 core/unit_test/hpx/TestHPX_IndependentInstancesSynchronization.cpp diff --git a/.github/workflows/continuous-integration-workflow-hpx.yml b/.github/workflows/continuous-integration-workflow-hpx.yml index ef316b014b..e4584aa492 100644 --- a/.github/workflows/continuous-integration-workflow-hpx.yml +++ b/.github/workflows/continuous-integration-workflow-hpx.yml @@ -29,7 +29,7 @@ jobs: uses: actions/checkout@v3 with: repository: STELLAR-GROUP/hpx - ref: 1.7.1 + ref: 1.8.0 path: hpx - uses: actions/cache@v3 id: cache-hpx @@ -74,7 +74,6 @@ jobs: -DKokkos_ENABLE_DEPRECATED_CODE_4=OFF \ -DKokkos_ENABLE_EXAMPLES=ON \ -DKokkos_ENABLE_HPX=ON \ - -DKokkos_ENABLE_HPX_ASYNC_DISPATCH=ON \ -DKokkos_ENABLE_SERIAL=OFF \ -DKokkos_ENABLE_TESTS=ON \ .. diff --git a/BUILD.md b/BUILD.md index b0d603e6db..cfeed1044d 100644 --- a/BUILD.md +++ b/BUILD.md @@ -174,9 +174,9 @@ Options can be enabled by specifying `-DKokkos_ENABLE_X`. 
* Kokkos_ENABLE_EXAMPLES * Whether to enable building examples * BOOL Default: OFF -* Kokkos_ENABLE_HPX_ASYNC_DISPATCH +* Kokkos_ENABLE_IMPL_HPX_ASYNC_DISPATCH * Whether HPX supports asynchronous dispatch - * BOOL Default: OFF + * BOOL Default: ON * Kokkos_ENABLE_IMPL_CUDA_MALLOC_ASYNC * Whether to enable CudaMallocAsync (requires CUDA Toolkit 11.2). This is an experimental performance feature and currently has issue when using with UCX. See https://github.com/kokkos/kokkos/issues/4228 for more details. * BOOL Default: OFF diff --git a/Makefile.kokkos b/Makefile.kokkos index c0ca398570..4fe9268b50 100644 --- a/Makefile.kokkos +++ b/Makefile.kokkos @@ -697,7 +697,7 @@ endif ifeq ($(KOKKOS_INTERNAL_USE_HPX), 1) ifeq ($(KOKKOS_INTERNAL_HPX_ENABLE_ASYNC_DISPATCH), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_HPX_ASYNC_DISPATCH") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_IMPL_HPX_ASYNC_DISPATCH") endif endif diff --git a/cmake/KokkosCore_config.h.in b/cmake/KokkosCore_config.h.in index badfac1499..509a0d44a2 100644 --- a/cmake/KokkosCore_config.h.in +++ b/cmake/KokkosCore_config.h.in @@ -40,7 +40,7 @@ #cmakedefine KOKKOS_ENABLE_IMPL_CUDA_MALLOC_ASYNC #cmakedefine KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE #cmakedefine KOKKOS_ENABLE_HIP_MULTIPLE_KERNEL_INSTANTIATIONS -#cmakedefine KOKKOS_ENABLE_HPX_ASYNC_DISPATCH +#cmakedefine KOKKOS_ENABLE_IMPL_HPX_ASYNC_DISPATCH #cmakedefine KOKKOS_ENABLE_DEBUG #cmakedefine KOKKOS_ENABLE_DEBUG_DUALVIEW_MODIFY_CHECK #cmakedefine KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK diff --git a/cmake/kokkos_enable_options.cmake b/cmake/kokkos_enable_options.cmake index 478821c525..558a7353a2 100644 --- a/cmake/kokkos_enable_options.cmake +++ b/cmake/kokkos_enable_options.cmake @@ -35,7 +35,7 @@ KOKKOS_ENABLE_OPTION(DEPRECATED_CODE_3 OFF "Whether code deprecated in major KOKKOS_ENABLE_OPTION(DEPRECATED_CODE_4 ON "Whether code deprecated in major release 4 is available" ) KOKKOS_ENABLE_OPTION(DEPRECATION_WARNINGS ON 
"Whether to emit deprecation warnings" ) KOKKOS_ENABLE_OPTION(HIP_RELOCATABLE_DEVICE_CODE OFF "Whether to enable relocatable device code (RDC) for HIP") -KOKKOS_ENABLE_OPTION(HPX_ASYNC_DISPATCH OFF "Whether HPX supports asynchronous dispatch") +KOKKOS_ENABLE_OPTION(IMPL_HPX_ASYNC_DISPATCH ON "Whether HPX supports asynchronous dispatch") KOKKOS_ENABLE_OPTION(TESTS OFF "Whether to build the unit tests") KOKKOS_ENABLE_OPTION(BENCHMARKS OFF "Whether to build the benchmarks") KOKKOS_ENABLE_OPTION(EXAMPLES OFF "Whether to build the examples") @@ -119,7 +119,7 @@ ENDFUNCTION() CHECK_DEVICE_SPECIFIC_OPTIONS(DEVICE CUDA OPTIONS CUDA_UVM CUDA_RELOCATABLE_DEVICE_CODE CUDA_LAMBDA CUDA_CONSTEXPR CUDA_LDG_INTRINSIC) CHECK_DEVICE_SPECIFIC_OPTIONS(DEVICE HIP OPTIONS HIP_RELOCATABLE_DEVICE_CODE) -CHECK_DEVICE_SPECIFIC_OPTIONS(DEVICE HPX OPTIONS HPX_ASYNC_DISPATCH) +CHECK_DEVICE_SPECIFIC_OPTIONS(DEVICE HPX OPTIONS IMPL_HPX_ASYNC_DISPATCH) # Needed due to change from deprecated name to new header define name IF (KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION) diff --git a/containers/unit_tests/TestWithoutInitializing.hpp b/containers/unit_tests/TestWithoutInitializing.hpp index d3bb05195c..0554ddd1a5 100644 --- a/containers/unit_tests/TestWithoutInitializing.hpp +++ b/containers/unit_tests/TestWithoutInitializing.hpp @@ -373,6 +373,13 @@ TEST(TEST_CATEGORY, realloc_exec_space_scatterview) { if (std::is_same::value) GTEST_SKIP() << "skipping since the Threads backend isn't asynchronous"; #endif +#if defined(KOKKOS_ENABLE_HPX) && \ + !defined(KOKKOS_ENABLE_IMPL_HPX_ASYNC_DISPATCH) + if (std::is_same::value) + GTEST_SKIP() << "skipping since the HPX backend always fences with async " + "dispatch disabled"; +#endif using namespace Kokkos::Test::Tools; listen_tool_events(Config::DisableAll(), Config::EnableFences()); diff --git a/core/src/HPX/Kokkos_HPX.cpp b/core/src/HPX/Kokkos_HPX.cpp index 2074123a15..aadac9e8e1 100644 --- a/core/src/HPX/Kokkos_HPX.cpp +++ b/core/src/HPX/Kokkos_HPX.cpp @@ 
-27,30 +27,131 @@ #include #include +#include #include #include #include #include -#include #include +#include #include #include namespace Kokkos { +namespace Impl { +void hpx_thread_buffer::resize(const std::size_t num_threads, + const std::size_t size_per_thread, + const std::size_t extra_space) noexcept { + m_num_threads = num_threads; + m_size_per_thread = size_per_thread; + m_extra_space = extra_space; + + pad_to_cache_line(m_size_per_thread); + + std::size_t size_total_new = + m_num_threads * m_size_per_thread + m_extra_space; + + if (m_size_total < size_total_new) { + // Don't use make_unique here as it value-initializes the elements of the + // array, which we have no use for, and can be very slow for large arrays. + m_data = std::unique_ptr(new char[size_total_new]); + m_size_total = size_total_new; + } +} + +void *hpx_thread_buffer::get(std::size_t thread_num) const noexcept { + KOKKOS_EXPECTS(thread_num < m_num_threads); + if (!m_data) { + return nullptr; + } + return &m_data[thread_num * m_size_per_thread]; +} + +void *hpx_thread_buffer::get_extra_space() const noexcept { + KOKKOS_EXPECTS(m_extra_space > 0); + if (!m_data) { + return nullptr; + } + return &m_data[m_num_threads * m_size_per_thread]; +} +} // namespace Impl + namespace Experimental { bool HPX::m_hpx_initialized = false; -#if defined(KOKKOS_ENABLE_HPX_ASYNC_DISPATCH) std::atomic HPX::m_next_instance_id{HPX::impl_default_instance_id() + 1}; uint32_t HPX::m_active_parallel_region_count{0}; hpx::spinlock HPX::m_active_parallel_region_count_mutex; hpx::condition_variable_any HPX::m_active_parallel_region_count_cond; HPX::instance_data HPX::m_default_instance_data; -#else -Kokkos::Impl::thread_buffer HPX::m_default_buffer; -#endif + +void HPX::print_configuration(std::ostream &os, const bool) const { + os << "HPX backend\n"; + os << "HPX Execution Space:\n"; + os << " KOKKOS_ENABLE_HPX: yes\n"; + os << "\nHPX Runtime Configuration:\n"; +} + +void 
HPX::impl_decrement_active_parallel_region_count() { + std::unique_lock l(m_active_parallel_region_count_mutex); + if (--m_active_parallel_region_count == 0) { + l.unlock(); + m_active_parallel_region_count_cond.notify_all(); + }; +} + +void HPX::impl_increment_active_parallel_region_count() { + std::unique_lock l(m_active_parallel_region_count_mutex); + ++m_active_parallel_region_count; +} + +void HPX::impl_instance_fence_locked(const std::string &name) const { + Kokkos::Tools::Experimental::Impl::profile_fence_event< + Kokkos::Experimental::HPX>( + name, + Kokkos::Tools::Experimental::Impl::DirectFenceIDHandle{ + impl_instance_id()}, + [&]() { + auto &s = impl_get_sender(); + + hpx::this_thread::experimental::sync_wait(std::move(s)); + s = hpx::execution::experimental::unique_any_sender( + hpx::execution::experimental::just()); + }); +} + +void HPX::impl_instance_fence(const std::string &name) const { + std::lock_guard l(impl_get_sender_mutex()); + impl_instance_fence_locked(name); +} + +void HPX::impl_static_fence(const std::string &name) { + Kokkos::Tools::Experimental::Impl::profile_fence_event< + Kokkos::Experimental::HPX>( + name, + Kokkos::Tools::Experimental::SpecialSynchronizationCases:: + GlobalDeviceSynchronization, + [&]() { + auto &s = HPX().impl_get_sender(); + + std::unique_lock l(HPX().impl_get_sender_mutex()); + + // This is a loose fence. Any work scheduled before this will be waited + // for, but work scheduled while waiting may also be waited for. 
+ { + std::unique_lock l_count( + m_active_parallel_region_count_mutex); + m_active_parallel_region_count_cond.wait( + l_count, [&]() { return m_active_parallel_region_count == 0; }); + } + + hpx::this_thread::experimental::sync_wait(std::move(s)); + s = hpx::execution::experimental::unique_any_sender( + hpx::execution::experimental::just()); + }); +} int HPX::concurrency() { hpx::runtime *rt = hpx::get_runtime_ptr(); @@ -106,10 +207,78 @@ void HPX::impl_finalize() { } } +int HPX::impl_thread_pool_size() noexcept { + hpx::runtime *rt = hpx::get_runtime_ptr(); + if (rt == nullptr) { + return 0; + } else { + if (hpx::threads::get_self_ptr() == nullptr) { + return hpx::resource::get_thread_pool(0).get_os_thread_count(); + } else { + return hpx::this_thread::get_pool()->get_os_thread_count(); + } + } +} + +int HPX::impl_thread_pool_rank() noexcept { + hpx::runtime *rt = hpx::get_runtime_ptr(); + if (rt == nullptr) { + return 0; + } else { + if (hpx::threads::get_self_ptr() == nullptr) { + return 0; + } else { + return hpx::this_thread::get_pool()->get_pool_index(); + } + } +} + +int HPX::impl_thread_pool_size(int depth) { + if (depth == 0) { + return impl_thread_pool_size(); + } else { + return 1; + } +} + +template void HPX::impl_bulk_plain_erased( + bool, bool, std::function &&, int const, + hpx::threads::thread_stacksize stacksize) const; + +template void HPX::impl_bulk_plain_erased( + bool, bool, std::function &&, unsigned int const, + hpx::threads::thread_stacksize stacksize) const; + +template void HPX::impl_bulk_plain_erased( + bool, bool, std::function &&, long const, + hpx::threads::thread_stacksize stacksize) const; + +template void HPX::impl_bulk_plain_erased( + bool, bool, std::function &&, std::size_t const, + hpx::threads::thread_stacksize stacksize) const; + +template void HPX::impl_bulk_setup_finalize_erased( + bool, bool, std::function &&, std::function &&, + std::function &&, int const, + hpx::threads::thread_stacksize stacksize) const; + +template 
void HPX::impl_bulk_setup_finalize_erased( + bool, bool, std::function &&, std::function &&, + std::function &&, unsigned int const, + hpx::threads::thread_stacksize stacksize) const; + +template void HPX::impl_bulk_setup_finalize_erased( + bool, bool, std::function &&, std::function &&, + std::function &&, long const, + hpx::threads::thread_stacksize stacksize) const; + +template void HPX::impl_bulk_setup_finalize_erased( + bool, bool, std::function &&, std::function &&, + std::function &&, std::size_t const, + hpx::threads::thread_stacksize stacksize) const; } // namespace Experimental namespace Impl { - int g_hpx_space_factory_initialized = initialize_space_factory("060_HPX"); diff --git a/core/src/HPX/Kokkos_HPX_Task.hpp b/core/src/HPX/Kokkos_HPX_Task.hpp index e75b7be49d..7c87802948 100644 --- a/core/src/HPX/Kokkos_HPX_Task.hpp +++ b/core/src/HPX/Kokkos_HPX_Task.hpp @@ -39,6 +39,51 @@ template class TaskQueueSpecialization< SimpleTaskScheduler> { public: + void setup() const { + const int num_worker_threads = Kokkos::Experimental::HPX::concurrency(); + + hpx_thread_buffer &buffer = Kokkos::Experimental::HPX().impl_get_buffer(); + buffer.resize(num_worker_threads, 512); + } + + void execute_range(int t) const { + // NOTE: This implementation has been simplified based on the + // assumption that team_size = 1. The HPX backend currently only + // supports a team size of 1. 
+ const int num_worker_threads = Kokkos::Experimental::HPX::concurrency(); + + hpx_thread_buffer &buffer = Kokkos::Experimental::HPX().impl_get_buffer(); + + buffer.get(t); + HPXTeamMember member( + TeamPolicyInternal( + Kokkos::Experimental::HPX(), num_worker_threads, 1), + 0, t, buffer.get(t), 512); + + member_type single_exec(*scheduler, member); + member_type &team_exec = single_exec; + + auto &queue = scheduler->queue(); + auto &team_scheduler = team_exec.scheduler(); + + using task_base_type = typename scheduler_type::task_base_type; + auto current_task = OptionalRef(nullptr); + + while (!queue.is_done()) { + current_task = queue.pop_ready_task(team_scheduler.team_scheduler_info()); + + if (current_task) { + KOKKOS_EXPECTS(current_task->is_single_runnable() || + current_task->is_team_runnable()); + current_task->as_runnable_task().run(single_exec); + queue.complete((*std::move(current_task)).as_runnable_task(), + team_scheduler.team_scheduler_info()); + } + } + } + + void finalize() const {} + using execution_space = Kokkos::Experimental::HPX; using scheduler_type = SimpleTaskScheduler; @@ -47,69 +92,14 @@ class TaskQueueSpecialization< using memory_space = Kokkos::HostSpace; static void execute(scheduler_type const &scheduler) { - // NOTE: We create an instance so that we can use dispatch_execute_task. + // NOTE: We create an instance so that we can use impl_bulk_setup_finalize. // This is not necessarily the most efficient, but can be improved later. TaskQueueSpecialization task_queue; - task_queue.scheduler = &scheduler; - Kokkos::Impl::dispatch_execute_task(&task_queue, - Kokkos::Experimental::HPX()); - Kokkos::Experimental::HPX().fence( - "Kokkos::Impl::TaskQueueSpecialization::execute: fence " - "after task execution"); - } - - // Must provide task queue execution function - void execute_task() const { - // See [note 1] in Kokkos_HPX.hpp for an explanation. 
The work graph policy - // does not store an execution space instance, so we only need to reset the - // parallel region count here. - Kokkos::Experimental::HPX::reset_count_on_exit_parallel reset_count_on_exit; - - using hpx::for_loop; - using hpx::execution::par; - using hpx::execution::static_chunk_size; - using task_base_type = typename scheduler_type::task_base_type; - + task_queue.scheduler = &scheduler; const int num_worker_threads = Kokkos::Experimental::HPX::concurrency(); - - thread_buffer &buffer = Kokkos::Experimental::HPX().impl_get_buffer(); - buffer.resize(num_worker_threads, 512); - - auto &queue = scheduler->queue(); - - for_loop(par.with(static_chunk_size(1)), 0, num_worker_threads, - [this, &queue, &buffer, num_worker_threads](int) { - // NOTE: This implementation has been simplified based on the - // assumption that team_size = 1. The HPX backend currently only - // supports a team size of 1. - std::size_t t = - Kokkos::Experimental::HPX::impl_hardware_thread_id(); - - buffer.get(t); - HPXTeamMember member( - TeamPolicyInternal( - Kokkos::Experimental::HPX(), num_worker_threads, 1), - 0, t, buffer.get(t), 512); - - member_type single_exec(*scheduler, member); - member_type &team_exec = single_exec; - - auto &team_scheduler = team_exec.scheduler(); - auto current_task = OptionalRef(nullptr); - - while (!queue.is_done()) { - current_task = - queue.pop_ready_task(team_scheduler.team_scheduler_info()); - - if (current_task) { - KOKKOS_ASSERT(current_task->is_single_runnable() || - current_task->is_team_runnable()); - current_task->as_runnable_task().run(single_exec); - queue.complete((*std::move(current_task)).as_runnable_task(), - team_scheduler.team_scheduler_info()); - } - } - }); + Kokkos::Experimental::HPX().impl_bulk_setup_finalize( + true, false, task_queue, num_worker_threads, + hpx::threads::thread_stacksize::nostack); } static uint32_t get_max_team_count(execution_space const &espace) { @@ -133,6 +123,66 @@ class 
TaskQueueSpecializationConstrained< std::enable_if_t::value>> { public: + void setup() const { + const int num_worker_threads = Kokkos::Experimental::HPX::concurrency(); + + hpx_thread_buffer &buffer = Kokkos::Experimental::HPX().impl_get_buffer(); + buffer.resize(num_worker_threads, 512); + + auto &queue = scheduler->queue(); + queue.initialize_team_queues(num_worker_threads); + } + + void execute_range(int t) const { + // NOTE: This implementation has been simplified based on the + // assumption that team_size = 1. The HPX backend currently only + // supports a team size of 1. + const int num_worker_threads = Kokkos::Experimental::HPX::concurrency(); + + hpx_thread_buffer &buffer = Kokkos::Experimental::HPX().impl_get_buffer(); + + buffer.get(Kokkos::Experimental::HPX::impl_hardware_thread_id()); + HPXTeamMember member( + TeamPolicyInternal( + Kokkos::Experimental::HPX(), num_worker_threads, 1), + 0, t, buffer.get(t), 512); + + using task_base_type = typename scheduler_type::task_base; + using queue_type = typename scheduler_type::queue_type; + + static task_base_type *const end = (task_base_type *)task_base_type::EndTag; + constexpr task_base_type *no_more_tasks_sentinel = nullptr; + + member_type single_exec(*scheduler, member); + member_type &team_exec = single_exec; + + auto &team_queue = team_exec.scheduler().queue(); + task_base_type *task = no_more_tasks_sentinel; + + do { + if (task != no_more_tasks_sentinel && task != end) { + team_queue.complete(task); + } + + if (*((volatile int *)&team_queue.m_ready_count) > 0) { + task = end; + for (int i = 0; i < queue_type::NumQueue && end == task; ++i) { + for (int j = 0; j < 2 && end == task; ++j) { + task = queue_type::pop_ready_task(&team_queue.m_ready[i][j]); + } + } + } else { + task = team_queue.attempt_to_steal_task(); + } + + if (task != no_more_tasks_sentinel && task != end) { + (*task->m_apply)(task, &single_exec); + } + } while (task != no_more_tasks_sentinel); + } + + void finalize() const {} + using 
execution_space = Kokkos::Experimental::HPX; using scheduler_type = Scheduler; using member_type = @@ -175,82 +225,14 @@ class TaskQueueSpecializationConstrained< } static void execute(scheduler_type const &scheduler) { - // NOTE: We create an instance so that we can use dispatch_execute_task. + // NOTE: We create an instance so that we can use impl_bulk_setup_finalize. // This is not necessarily the most efficient, but can be improved later. TaskQueueSpecializationConstrained task_queue; - task_queue.scheduler = &scheduler; - Kokkos::Impl::dispatch_execute_task(&task_queue, - Kokkos::Experimental::HPX()); - Kokkos::Experimental::HPX().fence( - "Kokkos::Impl::TaskQueueSpecialization::execute: fence " - "after task execution"); - } - - // Must provide task queue execution function - void execute_task() const { - // See [note 1] in Kokkos_HPX.hpp for an explanation. The work graph policy - // does not store an execution space instance, so we only need to reset the - // parallel region count here. 
- Kokkos::Experimental::HPX::reset_count_on_exit_parallel reset_count_on_exit; - - using hpx::for_loop; - using hpx::execution::par; - using hpx::execution::static_chunk_size; - - using task_base_type = typename scheduler_type::task_base; - using queue_type = typename scheduler_type::queue_type; - - const int num_worker_threads = Kokkos::Experimental::HPX::concurrency(); - static task_base_type *const end = (task_base_type *)task_base_type::EndTag; - constexpr task_base_type *no_more_tasks_sentinel = nullptr; - - thread_buffer &buffer = Kokkos::Experimental::HPX().impl_get_buffer(); - buffer.resize(num_worker_threads, 512); - - auto &queue = scheduler->queue(); - queue.initialize_team_queues(num_worker_threads); - - auto exec = Kokkos::Experimental::HPX::impl_get_executor(); - - for_loop( - par.on(exec).with(static_chunk_size(1)), 0, num_worker_threads, - [this, &buffer, num_worker_threads](int t) { - // NOTE: This implementation has been simplified based on the - // assumption that team_size = 1. The HPX backend currently only - // supports a team size of 1. 
- buffer.get(Kokkos::Experimental::HPX::impl_hardware_thread_id()); - HPXTeamMember member( - TeamPolicyInternal( - Kokkos::Experimental::HPX(), num_worker_threads, 1), - 0, t, buffer.get(t), 512); - - member_type single_exec(*scheduler, member); - member_type &team_exec = single_exec; - - auto &team_queue = team_exec.scheduler().queue(); - task_base_type *task = no_more_tasks_sentinel; - - do { - if (task != no_more_tasks_sentinel && task != end) { - team_queue.complete(task); - } - - if (*((volatile int *)&team_queue.m_ready_count) > 0) { - task = end; - for (int i = 0; i < queue_type::NumQueue && end == task; ++i) { - for (int j = 0; j < 2 && end == task; ++j) { - task = queue_type::pop_ready_task(&team_queue.m_ready[i][j]); - } - } - } else { - task = team_queue.attempt_to_steal_task(); - } - - if (task != no_more_tasks_sentinel && task != end) { - (*task->m_apply)(task, &single_exec); - } - } while (task != no_more_tasks_sentinel); - }); + task_queue.scheduler = &scheduler; + const int num_worker_threads = Kokkos::Experimental::HPX::concurrency(); + Kokkos::Experimental::HPX().impl_bulk_setup_finalize( + true, false, task_queue, num_worker_threads, + hpx::threads::thread_stacksize::nostack); } template diff --git a/core/src/HPX/Kokkos_HPX_WorkGraphPolicy.hpp b/core/src/HPX/Kokkos_HPX_WorkGraphPolicy.hpp index 72a8019935..85072b7700 100644 --- a/core/src/HPX/Kokkos_HPX_WorkGraphPolicy.hpp +++ b/core/src/HPX/Kokkos_HPX_WorkGraphPolicy.hpp @@ -35,53 +35,28 @@ class ParallelFor, Policy m_policy; FunctorType m_functor; - template - std::enable_if_t::value> execute_functor( - const std::int32_t w) const noexcept { - m_functor(w); - } - - template - std::enable_if_t::value> execute_functor( - const std::int32_t w) const noexcept { - const TagType t{}; - m_functor(t, w); - } - public: - void execute() const { - dispatch_execute_task(this, m_policy.space()); - m_policy.space().fence( - "Kokkos::Experimental::Impl::HPX::ParallelFor: fence " - "after kernel execution"); 
+ void execute_range(int) const { + std::int32_t w = m_policy.pop_work(); + while (w != Policy::COMPLETED_TOKEN) { + if (w != Policy::END_TOKEN) { + if constexpr (std::is_same_v) { + m_functor(w); + } else { + m_functor(WorkTag{}, w); + } + m_policy.completed_work(w); + } + + w = m_policy.pop_work(); + } } - void execute_task() const { - // See [note 1] in Kokkos_HPX.hpp for an explanation. The work graph policy - // does not store an execution space instance, so we only need to reset the - // parallel region count here. - Kokkos::Experimental::HPX::reset_count_on_exit_parallel reset_count_on_exit; - + void execute() const { const int num_worker_threads = Kokkos::Experimental::HPX::concurrency(); - - using hpx::for_loop; - using hpx::execution::par; - using hpx::execution::static_chunk_size; - - auto exec = Kokkos::Experimental::HPX::impl_get_executor(); - - for_loop(par.on(exec).with(static_chunk_size(1)), 0, num_worker_threads, - [this](int) { - std::int32_t w = m_policy.pop_work(); - while (w != Policy::COMPLETED_TOKEN) { - if (w != Policy::END_TOKEN) { - execute_functor(w); - m_policy.completed_work(w); - } - - w = m_policy.pop_work(); - } - }); + Kokkos::Experimental::HPX().impl_bulk_plain( + true, is_light_weight_policy(), *this, num_worker_threads, + hpx::threads::thread_stacksize::nostack); } inline ParallelFor(const FunctorType &arg_functor, const Policy &arg_policy) diff --git a/core/src/Kokkos_HPX.hpp b/core/src/Kokkos_HPX.hpp index 18965a12ee..1baa17b7ae 100644 --- a/core/src/Kokkos_HPX.hpp +++ b/core/src/Kokkos_HPX.hpp @@ -43,130 +43,83 @@ static_assert(false, #include #include #include +#include #include #include #include #include -#include #include #include #include #include -#include #include -#include #include #include +#include #include -#include #include -#include #include #include -// There are currently two different implementations for the parallel dispatch -// functions: -// -// - 0: The HPX way. 
Unfortunately, this comes with unnecessary -// overheads at the moment, so there is -// - 1: The manual way. This uses for_loop, but only spawns one task per worker -// thread. This is significantly faster in most cases. -// -// In the long run 0 should be the preferred implementation, but until HPX is -// improved 1 will be the default. -#ifndef KOKKOS_HPX_IMPLEMENTATION -#define KOKKOS_HPX_IMPLEMENTATION 1 -#endif - -#if (KOKKOS_HPX_IMPLEMENTATION < 0) || (KOKKOS_HPX_IMPLEMENTATION > 1) -#error "You have chosen an invalid value for KOKKOS_HPX_IMPLEMENTATION" -#endif - -// [note 1] -// -// When using the asynchronous backend and independent instances, we explicitly -// reset the shared data at the end of a parallel task (execute_task). We do -// this to avoid circular references with shared pointers that would otherwise -// never be released. -// -// The HPX instance holds shared data for the instance in a shared_ptr. One of -// the pieces of shared data is the future that we use to sequence parallel -// dispatches. When a parallel task is launched, a copy of the closure -// (ParallelFor, ParallelReduce, etc.) is captured in the task. The closure -// also holds the policy, the policy holds the HPX instance, the instance holds -// the shared data (for use of buffers in the parallel task). When attaching a -// continuation to a future, the continuation is stored in the future (shared -// state). This means that there is a cycle future -> continuation -> closure -// -> policy -> HPX -> shared data -> future. We break this by releasing the -// shared data early, as (the pointer to) the shared data will not be used -// anymore by the closure at the end of execute_task. -// -// We also mark the shared instance data as mutable so that we can reset it -// from the const execute_task member function. 
- namespace Kokkos { namespace Impl { -class thread_buffer { +class hpx_thread_buffer { static constexpr std::size_t m_cache_line_size = 64; - std::size_t m_num_threads; - std::size_t m_size_per_thread; - std::size_t m_size_total; - char *m_data; + std::size_t m_num_threads = 0; + std::size_t m_size_per_thread = 0; + std::size_t m_extra_space = 0; + std::size_t m_size_total = 0; + std::unique_ptr m_data = nullptr; - void pad_to_cache_line(std::size_t &size) { + static constexpr void pad_to_cache_line(std::size_t &size) { size = ((size + m_cache_line_size - 1) / m_cache_line_size) * m_cache_line_size; } public: - thread_buffer() - : m_num_threads(0), - m_size_per_thread(0), - m_size_total(0), - m_data(nullptr) {} - thread_buffer(const std::size_t num_threads, - const std::size_t size_per_thread) { - resize(num_threads, size_per_thread); - } - ~thread_buffer() { delete[] m_data; } - - thread_buffer(const thread_buffer &) = delete; - thread_buffer(thread_buffer &&) = delete; - thread_buffer &operator=(const thread_buffer &) = delete; - thread_buffer &operator=(thread_buffer) = delete; - - void resize(const std::size_t num_threads, - const std::size_t size_per_thread) { - m_num_threads = num_threads; - m_size_per_thread = size_per_thread; - - pad_to_cache_line(m_size_per_thread); - - std::size_t size_total_new = m_num_threads * m_size_per_thread; - - if (m_size_total < size_total_new) { - delete[] m_data; - m_data = new char[size_total_new]; - m_size_total = size_total_new; - } - } - - char *get(std::size_t thread_num) { - assert(thread_num < m_num_threads); - if (m_data == nullptr) { - return nullptr; - } - return &m_data[thread_num * m_size_per_thread]; - } + hpx_thread_buffer() = default; + ~hpx_thread_buffer() = default; + hpx_thread_buffer(const hpx_thread_buffer &) = delete; + hpx_thread_buffer(hpx_thread_buffer &&) = delete; + hpx_thread_buffer &operator=(const hpx_thread_buffer &) = delete; + hpx_thread_buffer &operator=(hpx_thread_buffer) = delete; + + void 
resize(const std::size_t num_threads, const std::size_t size_per_thread, + const std::size_t extra_space = 0) noexcept; + void *get(std::size_t thread_num) const noexcept; + void *get_extra_space() const noexcept; +}; - std::size_t size_per_thread() const noexcept { return m_size_per_thread; } - std::size_t size_total() const noexcept { return m_size_total; } +template +struct hpx_range { + T begin; + T end; }; + +template +constexpr T get_num_chunks(const T offset, const T chunk_size, const T max) { + return (max - offset + chunk_size - 1) / chunk_size; +} + +template +constexpr hpx_range get_chunk_range(const T i_chunk, const T offset, + const T chunk_size, const T max) { + const T begin = offset + i_chunk * chunk_size; + const T end = (std::min)(begin + chunk_size, max); + return {begin, end}; +} + +template +constexpr bool is_light_weight_policy() { + constexpr Kokkos::Experimental::WorkItemProperty::HintLightWeight_t + light_weight = Kokkos::Experimental::WorkItemProperty::HintLightWeight; + return (typename Policy::work_item_property() & light_weight) == light_weight; +} } // namespace Impl namespace Experimental { @@ -176,9 +129,6 @@ class HPX { private: static bool m_hpx_initialized; - uint32_t m_instance_id = impl_default_instance_id(); - -#if defined(KOKKOS_ENABLE_HPX_ASYNC_DISPATCH) static std::atomic m_next_instance_id; public: @@ -190,22 +140,28 @@ class HPX { static hpx::condition_variable_any m_active_parallel_region_count_cond; struct instance_data { - instance_data() = default; - instance_data(hpx::shared_future future) : m_future(future) {} - Kokkos::Impl::thread_buffer m_buffer; - hpx::shared_future m_future = hpx::make_ready_future(); - hpx::spinlock m_future_mutex; + instance_data() = default; + ~instance_data() = default; + instance_data(uint32_t instance_id) : m_instance_id(instance_id) {} + instance_data(uint32_t instance_id, + hpx::execution::experimental::unique_any_sender<> &&sender) + : m_instance_id(instance_id), 
m_sender{std::move(sender)} {} + + instance_data(const instance_data &) = delete; + instance_data(instance_data &&) = delete; + instance_data &operator=(const instance_data &) = delete; + instance_data &operator=(instance_data) = delete; + + uint32_t m_instance_id{HPX::impl_default_instance_id()}; + hpx::execution::experimental::unique_any_sender<> m_sender{ + hpx::execution::experimental::just()}; + Kokkos::Impl::hpx_thread_buffer m_buffer; + hpx::spinlock m_sender_mutex; }; - mutable std::shared_ptr m_independent_instance_data; + static void default_instance_deleter(instance_data *) {} static instance_data m_default_instance_data; - - std::reference_wrapper m_buffer; - std::reference_wrapper> m_future; - std::reference_wrapper m_future_mutex; -#else - static Kokkos::Impl::thread_buffer m_default_buffer; -#endif + Kokkos::Impl::HostSharedPtr m_instance_data; public: using execution_space = HPX; @@ -215,123 +171,56 @@ class HPX { using size_type = memory_space::size_type; using scratch_memory_space = ScratchMemorySpace; -#if defined(KOKKOS_ENABLE_HPX_ASYNC_DISPATCH) HPX() - noexcept - : m_instance_id(impl_default_instance_id()), - m_buffer(m_default_instance_data.m_buffer), - m_future(m_default_instance_data.m_future), - m_future_mutex(m_default_instance_data.m_future_mutex) {} - + : m_instance_data(Kokkos::Impl::HostSharedPtr( + &m_default_instance_data, &default_instance_deleter)) {} + ~HPX() = default; HPX(instance_mode mode) - : m_instance_id(mode == instance_mode::independent - ? m_next_instance_id++ - : impl_default_instance_id()), - m_independent_instance_data(mode == instance_mode::independent - ? (new instance_data()) - : nullptr), - m_buffer(mode == instance_mode::independent - ? m_independent_instance_data->m_buffer - : m_default_instance_data.m_buffer), - m_future(mode == instance_mode::independent - ? m_independent_instance_data->m_future - : m_default_instance_data.m_future), - m_future_mutex(mode == instance_mode::independent - ? 
m_independent_instance_data->m_future_mutex - : m_default_instance_data.m_future_mutex) {} - - HPX(hpx::shared_future future) - : m_instance_id(m_next_instance_id++), - - m_independent_instance_data(new instance_data(future)), - m_buffer(m_independent_instance_data->m_buffer), - m_future(m_independent_instance_data->m_future), - m_future_mutex(m_independent_instance_data->m_future_mutex) {} - - HPX(HPX &&other) = default; - HPX &operator=(HPX &&other) = default; - HPX(const HPX &other) = default; - HPX &operator=(const HPX &other) = default; -#else - HPX() noexcept {} -#endif + : m_instance_data( + mode == instance_mode::independent + ? (Kokkos::Impl::HostSharedPtr( + new instance_data(m_next_instance_id++))) + : Kokkos::Impl::HostSharedPtr( + &m_default_instance_data, &default_instance_deleter)) {} + HPX(hpx::execution::experimental::unique_any_sender<> &&sender) + : m_instance_data(Kokkos::Impl::HostSharedPtr( + new instance_data(m_next_instance_id++, std::move(sender)))) {} - void print_configuration(std::ostream &os, bool /*verbose*/ = false) const { - os << "HPX backend\n"; - os << "HPX Execution Space:\n"; - os << " KOKKOS_ENABLE_HPX: yes\n"; - os << "\nHPX Runtime Configuration:\n"; - } - uint32_t impl_instance_id() const noexcept { return m_instance_id; } + HPX(HPX &&other) = default; + HPX(const HPX &other) = default; -#if defined(KOKKOS_ENABLE_HPX_ASYNC_DISPATCH) - static bool in_parallel(HPX const &instance = HPX()) noexcept { - return !instance.impl_get_future().is_ready(); - } -#else - static bool in_parallel(HPX const & = HPX()) noexcept { return false; } -#endif + HPX &operator=(HPX &&) = default; + HPX &operator=(const HPX &) = default; -#if defined(KOKKOS_ENABLE_HPX_ASYNC_DISPATCH) - static void impl_decrement_active_parallel_region_count() { - std::unique_lock l(m_active_parallel_region_count_mutex); - if (--m_active_parallel_region_count == 0) { - l.unlock(); - m_active_parallel_region_count_cond.notify_all(); - }; + void 
print_configuration(std::ostream &os, bool /*verbose*/ = false) const; + instance_data &impl_get_instance_data() const noexcept { + KOKKOS_EXPECTS(m_instance_data.get()); + return *m_instance_data.get(); + } + uint32_t impl_instance_id() const noexcept { + return impl_get_instance_data().m_instance_id; } - static void impl_increment_active_parallel_region_count() { - std::unique_lock l(m_active_parallel_region_count_mutex); - ++m_active_parallel_region_count; + static bool in_parallel(HPX const & = HPX()) noexcept { + // TODO: Very awkward to keep track of. What should this really return? + return false; } -#endif + + static void impl_decrement_active_parallel_region_count(); + static void impl_increment_active_parallel_region_count(); + + void impl_instance_fence_locked(const std::string &name) const; + void impl_instance_fence(const std::string &name) const; + static void impl_static_fence(const std::string &name); void fence( const std::string &name = "Kokkos::Experimental::HPX::fence: Unnamed Instance Fence") const { - Kokkos::Tools::Experimental::Impl::profile_fence_event< - Kokkos::Experimental::HPX>( - name, - Kokkos::Tools::Experimental::Impl::DirectFenceIDHandle{ - impl_instance_id()}, - [&]() { -#if defined(KOKKOS_ENABLE_HPX_ASYNC_DISPATCH) - impl_get_future().wait(); - // Reset the future to free variables that may have been captured in - // parallel regions. 
- impl_get_future() = hpx::make_ready_future(); -#endif - }); - } - - static void impl_static_fence(const std::string &name) { - Kokkos::Tools::Experimental::Impl::profile_fence_event< - Kokkos::Experimental::HPX>( - name, - Kokkos::Tools::Experimental::SpecialSynchronizationCases:: - GlobalDeviceSynchronization, - [&]() { -#if defined(KOKKOS_ENABLE_HPX_ASYNC_DISPATCH) - std::unique_lock l( - m_active_parallel_region_count_mutex); - m_active_parallel_region_count_cond.wait( - l, [&]() { return m_active_parallel_region_count == 0; }); - // Reset the future to free variables that may have been captured in - // parallel regions (however, we don't have access to futures from - // instances other than the default instances, they will only be - // released by fence). - HPX().impl_get_future() = hpx::make_ready_future(); -#endif - }); - } - - static hpx::execution::parallel_executor impl_get_executor() { - return hpx::execution::parallel_executor(); + impl_instance_fence(name); } static bool is_asynchronous(HPX const & = HPX()) noexcept { -#if defined(KOKKOS_ENABLE_HPX_ASYNC_DISPATCH) +#if defined(KOKKOS_ENABLE_IMPL_HPX_ASYNC_DISPATCH) return true; #else return false; @@ -354,112 +243,203 @@ class HPX { static void impl_initialize(InitializationSettings const &); static bool impl_is_initialized() noexcept; static void impl_finalize(); + static int impl_thread_pool_size() noexcept; + static int impl_thread_pool_rank() noexcept; + static int impl_thread_pool_size(int depth); - static int impl_thread_pool_size() noexcept { - hpx::runtime *rt = hpx::get_runtime_ptr(); - if (rt == nullptr) { - return 0; - } else { - if (hpx::threads::get_self_ptr() == nullptr) { - return hpx::resource::get_thread_pool(0).get_os_thread_count(); - } else { - return hpx::this_thread::get_pool()->get_os_thread_count(); - } - } + static int impl_max_hardware_threads() noexcept { + return hpx::threads::hardware_concurrency(); } - static int impl_thread_pool_rank() noexcept { - hpx::runtime *rt = 
hpx::get_runtime_ptr(); - if (rt == nullptr) { - return 0; - } else { - if (hpx::threads::get_self_ptr() == nullptr) { - return 0; - } else { - return hpx::this_thread::get_pool()->get_pool_index(); - } - } + static int impl_hardware_thread_id() noexcept { + return hpx::get_worker_thread_num(); } - static int impl_thread_pool_size(int depth) { - if (depth == 0) { - return impl_thread_pool_size(); - } else { - return 1; - } + Kokkos::Impl::hpx_thread_buffer &impl_get_buffer() const noexcept { + return impl_get_instance_data().m_buffer; } - static int impl_max_hardware_threads() noexcept { - return hpx::threads::hardware_concurrency(); + hpx::execution::experimental::unique_any_sender<> &impl_get_sender() const + noexcept { + return impl_get_instance_data().m_sender; } - static int impl_hardware_thread_id() noexcept { - return hpx::get_worker_thread_num(); + hpx::execution::experimental::any_sender<> get_sender() const noexcept { + std::lock_guard l(impl_get_sender_mutex()); + auto &s = impl_get_sender(); + auto split_s = hpx::execution::experimental::split(std::move(s)); + s = split_s; + return hpx::execution::experimental::any_sender<>{split_s}; } - Kokkos::Impl::thread_buffer &impl_get_buffer() const noexcept { -#if defined(KOKKOS_ENABLE_HPX_ASYNC_DISPATCH) - return m_buffer.get(); -#else - return m_default_buffer; -#endif + hpx::future impl_get_future() const noexcept { + return hpx::execution::experimental::make_future(get_sender()); } -#if defined(KOKKOS_ENABLE_HPX_ASYNC_DISPATCH) - hpx::shared_future &impl_get_future() const noexcept { - return m_future; + hpx::spinlock &impl_get_sender_mutex() const noexcept { + return impl_get_instance_data().m_sender_mutex; } - hpx::spinlock &impl_get_future_mutex() const noexcept { - return m_future_mutex; - } -#endif + template + void impl_bulk_plain_erased( + [[maybe_unused]] bool force_synchronous, bool is_light_weight_policy, + std::function &&f, I const n, + hpx::threads::thread_stacksize stacksize = + 
hpx::threads::thread_stacksize::default_) const { + Kokkos::Experimental::HPX::impl_increment_active_parallel_region_count(); + + namespace ex = hpx::execution::experimental; -#if defined(KOKKOS_ENABLE_HPX_ASYNC_DISPATCH) - struct [[nodiscard]] reset_on_exit_parallel { - HPX const &m_space; - reset_on_exit_parallel(HPX const &space) : m_space(space) {} - ~reset_on_exit_parallel() { - // See [note 1] for an explanation. m_independent_instance_data is - // marked mutable. - m_space.m_independent_instance_data.reset(); + auto &sen = impl_get_sender(); + auto &mut = impl_get_sender_mutex(); - HPX::impl_decrement_active_parallel_region_count(); + std::lock_guard l(mut); + hpx::util::ignore_lock(&mut); + + { + if (n == 1 && is_light_weight_policy && + (hpx::threads::get_self_ptr() != nullptr)) { + sen = std::move(sen) | ex::then(hpx::bind_front(std::move(f), 0)) | + ex::then(Kokkos::Experimental::HPX:: + impl_decrement_active_parallel_region_count) | + ex::ensure_started(); + } else { + sen = std::move(sen) | + ex::transfer( + ex::with_stacksize(ex::thread_pool_scheduler{}, stacksize)) | + ex::bulk(n, std::move(f)) | + ex::then(Kokkos::Experimental::HPX:: + impl_decrement_active_parallel_region_count) | + ex::ensure_started(); + } } - }; - // This struct is identical to the above except it does not reset the shared - // data. It does, however, still decrement the parallel region count. It is - // meant for use in parallel regions which do not capture the execution space - // instance. 
- struct [[nodiscard]] reset_count_on_exit_parallel { - reset_count_on_exit_parallel() = default; - ~reset_count_on_exit_parallel() { - HPX::impl_decrement_active_parallel_region_count(); +#if defined(KOKKOS_ENABLE_IMPL_HPX_ASYNC_DISPATCH) + if (force_synchronous) +#endif + { + impl_instance_fence_locked( + "Kokkos::Experimental::HPX: fence due to forced synchronizations"); } - }; -#else - struct [[nodiscard]] reset_on_exit_parallel { - reset_on_exit_parallel(HPX const &) = default; - ~reset_on_exit_parallel() = default; - }; + } - struct [[nodiscard]] reset_count_on_exit_parallel { - reset_count_on_exit_parallel() = default; - ~reset_count_on_exit_parallel() = default; - }; + template + void impl_bulk_plain(bool force_synchronous, bool is_light_weight_policy, + Functor const &functor, Index const n, + hpx::threads::thread_stacksize stacksize = + hpx::threads::thread_stacksize::default_) const { + impl_bulk_plain_erased(force_synchronous, is_light_weight_policy, + {[functor](Index i) { functor.execute_range(i); }}, + n, stacksize); + } + + template + void impl_bulk_setup_finalize_erased( + [[maybe_unused]] bool force_synchronous, bool is_light_weight_policy, + std::function &&f, std::function &&f_setup, + std::function &&f_finalize, Index const n, + hpx::threads::thread_stacksize stacksize = + hpx::threads::thread_stacksize::default_) const { + Kokkos::Experimental::HPX::impl_increment_active_parallel_region_count(); + + namespace ex = hpx::execution::experimental; + using hpx::threads::thread_stacksize; + + auto &sen = impl_get_sender(); + auto &mut = impl_get_sender_mutex(); + + std::lock_guard l(mut); + hpx::util::ignore_lock(&mut); + + { + if (n == 1 && is_light_weight_policy && + (hpx::threads::get_self_ptr() != nullptr)) { + sen = std::move(sen) | ex::then(std::move(f_setup)) | + ex::then(hpx::bind_front(std::move(f), 0)) | + ex::then(std::move(f_finalize)) | + ex::then(Kokkos::Experimental::HPX:: + impl_decrement_active_parallel_region_count) | + 
ex::ensure_started(); + } else { + sen = std::move(sen) | + ex::transfer( + ex::with_stacksize(ex::thread_pool_scheduler{}, stacksize)) | + ex::then(std::move(f_setup)) | ex::bulk(n, std::move(f)) | + ex::then(std::move(f_finalize)) | + ex::then(Kokkos::Experimental::HPX:: + impl_decrement_active_parallel_region_count) | + ex::ensure_started(); + } + } + +#if defined(KOKKOS_ENABLE_IMPL_HPX_ASYNC_DISPATCH) + if (force_synchronous) #endif + { + impl_instance_fence_locked( + "Kokkos::Experimental::HPX: fence due to forced syncronizations"); + } + } + + template + void impl_bulk_setup_finalize( + bool force_synchronous, bool is_light_weight_policy, + Functor const &functor, Index const n, + hpx::threads::thread_stacksize stacksize = + hpx::threads::thread_stacksize::default_) const { + impl_bulk_setup_finalize_erased( + force_synchronous, is_light_weight_policy, + {[functor](Index i) { functor.execute_range(i); }}, + {[functor]() { functor.setup(); }}, + {[functor]() { functor.finalize(); }}, n, stacksize); + } static constexpr const char *name() noexcept { return "HPX"; } private: friend bool operator==(HPX const &lhs, HPX const &rhs) { - return lhs.m_instance_id == rhs.m_instance_id; + return lhs.impl_instance_id() == rhs.impl_instance_id(); } friend bool operator!=(HPX const &lhs, HPX const &rhs) { return !(lhs == rhs); } }; + +extern template void HPX::impl_bulk_plain_erased( + bool, bool, std::function &&, int const, + hpx::threads::thread_stacksize stacksize) const; + +extern template void HPX::impl_bulk_plain_erased( + bool, bool, std::function &&, unsigned int const, + hpx::threads::thread_stacksize stacksize) const; + +extern template void HPX::impl_bulk_plain_erased( + bool, bool, std::function &&, long const, + hpx::threads::thread_stacksize stacksize) const; + +extern template void HPX::impl_bulk_plain_erased( + bool, bool, std::function &&, std::size_t const, + hpx::threads::thread_stacksize stacksize) const; + +extern template void 
HPX::impl_bulk_setup_finalize_erased( + bool, bool, std::function &&, std::function &&, + std::function &&, int const, + hpx::threads::thread_stacksize stacksize) const; + +extern template void HPX::impl_bulk_setup_finalize_erased( + bool, bool, std::function &&, std::function &&, + std::function &&, unsigned int const, + hpx::threads::thread_stacksize stacksize) const; + +extern template void HPX::impl_bulk_setup_finalize_erased( + bool, bool, std::function &&, std::function &&, + std::function &&, long const, + hpx::threads::thread_stacksize stacksize) const; + +extern template void HPX::impl_bulk_setup_finalize_erased( + bool, bool, std::function &&, std::function &&, + std::function &&, std::size_t const, + hpx::threads::thread_stacksize stacksize) const; } // namespace Experimental namespace Tools { @@ -471,45 +451,6 @@ struct DeviceTypeTraits { }; } // namespace Experimental } // namespace Tools - -namespace Impl { - -#if defined(KOKKOS_ENABLE_HPX_ASYNC_DISPATCH) -template -inline void dispatch_execute_task(Closure *closure, - Kokkos::Experimental::HPX const &instance, - bool force_synchronous = false) { - Kokkos::Experimental::HPX::impl_increment_active_parallel_region_count(); - - Closure closure_copy = *closure; - - { - std::unique_lock l(instance.impl_get_future_mutex()); - hpx::util::ignore_lock(&instance.impl_get_future_mutex()); - hpx::shared_future &fut = instance.impl_get_future(); - - fut = fut.then(hpx::execution::parallel_executor( - hpx::threads::thread_schedule_hint(0)), - [closure_copy](hpx::shared_future &&) { - return closure_copy.execute_task(); - }); - } - - if (force_synchronous) { - instance.fence( - "Kokkos::Experimental::Impl::HPX::dispatch_execute_task: fence due to " - "forced syncronizations"); - } -} -#else -template -inline void dispatch_execute_task(Closure *closure, - Kokkos::Experimental::HPX const &, - bool = false) { - closure->execute_task(); -} -#endif -} // namespace Impl } // namespace Kokkos namespace Kokkos { @@ -707,12 
+648,6 @@ struct HPXTeamMember { template class TeamPolicyInternal : public PolicyTraits { - int m_league_size; - int m_team_size; - std::size_t m_team_scratch_size[2]; - std::size_t m_thread_scratch_size[2]; - int m_chunk_size; - public: using traits = PolicyTraits; @@ -724,6 +659,15 @@ class TeamPolicyInternal //! Execution space of this execution policy: using execution_space = Kokkos::Experimental::HPX; + private: + typename traits::execution_space m_space{}; + int m_league_size; + int m_team_size; + std::size_t m_team_scratch_size[2]; + std::size_t m_thread_scratch_size[2]; + int m_chunk_size; + + public: // NOTE: Max size is 1 for simplicity. In most cases more than 1 is not // necessary on CPU. Implement later if there is a need. template @@ -833,14 +777,12 @@ class TeamPolicyInternal template friend class TeamPolicyInternal; - const typename traits::execution_space &space() const { - static typename traits::execution_space m_space; - return m_space; - } + const typename traits::execution_space &space() const { return m_space; } template TeamPolicyInternal(const TeamPolicyInternal &p) { + m_space = p.m_space; m_league_size = p.m_league_size; m_team_size = p.m_team_size; m_team_scratch_size[0] = p.m_team_scratch_size[0]; @@ -850,39 +792,43 @@ class TeamPolicyInternal m_chunk_size = p.m_chunk_size; } - TeamPolicyInternal(const typename traits::execution_space &, + TeamPolicyInternal(const typename traits::execution_space &space, int league_size_request, int team_size_request, int /* vector_length_request */ = 1) - : m_team_scratch_size{0, 0}, + : m_space{space}, + m_team_scratch_size{0, 0}, m_thread_scratch_size{0, 0}, m_chunk_size(0) { init(league_size_request, team_size_request); } - TeamPolicyInternal(const typename traits::execution_space &, + TeamPolicyInternal(const typename traits::execution_space &space, int league_size_request, const Kokkos::AUTO_t &, int /* vector_length_request */ = 1) - : m_team_scratch_size{0, 0}, + : m_space{space}, + 
m_team_scratch_size{0, 0}, m_thread_scratch_size{0, 0}, m_chunk_size(0) { init(league_size_request, 1); } - TeamPolicyInternal(const typename traits::execution_space &, + TeamPolicyInternal(const typename traits::execution_space &space, int league_size_request, const Kokkos::AUTO_t &, /* team_size_request */ const Kokkos::AUTO_t & /* vector_length_request */) - : m_team_scratch_size{0, 0}, + : m_space{space}, + m_team_scratch_size{0, 0}, m_thread_scratch_size{0, 0}, m_chunk_size(0) { init(league_size_request, 1); } - TeamPolicyInternal(const typename traits::execution_space &, + TeamPolicyInternal(const typename traits::execution_space &space, int league_size_request, int team_size_request, const Kokkos::AUTO_t & /* vector_length_request */ ) - : m_team_scratch_size{0, 0}, + : m_space{space}, + m_team_scratch_size{0, 0}, m_thread_scratch_size{0, 0}, m_chunk_size(0) { init(league_size_request, team_size_request); @@ -956,19 +902,6 @@ class TeamPolicyInternal namespace Kokkos { namespace Impl { -template -typename Policy::member_type get_hpx_adjusted_chunk_size(Policy const &policy) { - const int concurrency = Kokkos::Experimental::HPX::concurrency(); - const typename Policy::member_type n = policy.end() - policy.begin(); - typename Policy::member_type new_chunk_size = policy.chunk_size(); - - while (n >= 4 * concurrency * new_chunk_size) { - new_chunk_size *= 2; - } - - return new_chunk_size; -} - template class ParallelFor, Kokkos::Experimental::HPX> { @@ -981,71 +914,25 @@ class ParallelFor, const FunctorType m_functor; const Policy m_policy; - template - static std::enable_if_t::value> execute_functor( - const FunctorType &functor, const Member i) { - functor(i); - } - - template - static std::enable_if_t::value> execute_functor( - const FunctorType &functor, const Member i) { - const TagType t{}; - functor(t, i); - } - - template - static std::enable_if_t::value> execute_functor_range( - const FunctorType &functor, const Member i_begin, const Member i_end) { - 
for (Member i = i_begin; i < i_end; ++i) { - functor(i); - } - } - - template - static std::enable_if_t::value> execute_functor_range( - const FunctorType &functor, const Member i_begin, const Member i_end) { - const TagType t{}; - for (Member i = i_begin; i < i_end; ++i) { - functor(t, i); + public: + void execute_range(const Member i_chunk) const { + const auto r = get_chunk_range(i_chunk, m_policy.begin(), + m_policy.chunk_size(), m_policy.end()); + for (Member i = r.begin; i < r.end; ++i) { + if constexpr (std::is_same_v) { + m_functor(i); + } else { + m_functor(WorkTag{}, i); + } } } - public: void execute() const { - Kokkos::Impl::dispatch_execute_task(this, m_policy.space()); - } - - void execute_task() const { - // See [note 1] for an explanation. - Kokkos::Experimental::HPX::reset_on_exit_parallel reset_on_exit( - m_policy.space()); - - auto exec = Kokkos::Experimental::HPX::impl_get_executor(); - - using hpx::execution::par; - using hpx::execution::static_chunk_size; - -#if KOKKOS_HPX_IMPLEMENTATION == 0 - using hpx::for_loop; - - for_loop(par.on(exec).with(static_chunk_size(m_policy.chunk_size())), - m_policy.begin(), m_policy.end(), [this](const Member i) { - execute_functor(m_functor, i); - }); - -#elif KOKKOS_HPX_IMPLEMENTATION == 1 - using hpx::for_loop_strided; - - const Member chunk_size = get_hpx_adjusted_chunk_size(m_policy); - - for_loop_strided( - par.on(exec), m_policy.begin(), m_policy.end(), chunk_size, - [this, chunk_size](const Member i_begin) { - const Member i_end = (std::min)(i_begin + chunk_size, m_policy.end()); - execute_functor_range(m_functor, i_begin, i_end); - }); -#endif + const Member num_chunks = + get_num_chunks(m_policy.begin(), m_policy.chunk_size(), m_policy.end()); + m_policy.space().impl_bulk_plain(false, is_light_weight_policy(), + *this, num_chunks, + hpx::threads::thread_stacksize::nostack); } inline ParallelFor(const FunctorType &arg_functor, Policy arg_policy) @@ -1069,40 +956,20 @@ class ParallelFor, const Policy 
m_policy; public: - void execute() const { dispatch_execute_task(this, m_iter.m_rp.space()); } - - inline void execute_task() const { - // See [note 1] for an explanation. - Kokkos::Experimental::HPX::reset_on_exit_parallel reset_on_exit( - m_iter.m_rp.space()); - - auto exec = Kokkos::Experimental::HPX::impl_get_executor(); - - using hpx::execution::par; - using hpx::execution::static_chunk_size; - -#if KOKKOS_HPX_IMPLEMENTATION == 0 - using hpx::for_loop; - - for_loop(par.on(exec).with( - static_chunk_size(get_hpx_adjusted_chunk_size(m_policy))), - m_policy.begin(), m_policy.end(), - [this](const Member i) { iterate_type(i); }); - -#elif KOKKOS_HPX_IMPLEMENTATION == 1 - using hpx::for_loop_strided; - - const Member chunk_size = get_hpx_adjusted_chunk_size(m_policy); + void execute_range(const Member i_chunk) const { + const auto r = get_chunk_range(i_chunk, m_policy.begin(), + m_policy.chunk_size(), m_policy.end()); + for (Member i = r.begin; i < r.end; ++i) { + m_iter(i); + } + } - for_loop_strided(par.on(exec), m_policy.begin(), m_policy.end(), chunk_size, - [this, chunk_size](const Member i_begin) { - const Member i_end = - (std::min)(i_begin + chunk_size, m_policy.end()); - for (Member i = i_begin; i < i_end; ++i) { - m_iter(i); - } - }); -#endif + void execute() const { + const Member num_chunks = + get_num_chunks(m_policy.begin(), m_policy.chunk_size(), m_policy.end()); + m_iter.m_rp.space().impl_bulk_plain( + false, is_light_weight_policy(), *this, num_chunks, + hpx::threads::thread_stacksize::nostack); } inline ParallelFor(const FunctorType &arg_functor, MDRangePolicy arg_policy) @@ -1132,8 +999,8 @@ class ParallelReduce, ReducerType, using WorkRange = typename Policy::WorkRange; using Member = typename Policy::member_type; using ReducerConditional = - Kokkos::Impl::if_c::value, - FunctorType, ReducerType>; + Kokkos::Impl::if_c, FunctorType, + ReducerType>; using ReducerTypeFwd = typename ReducerConditional::type; using Analysis = FunctorAnalysis; @@ 
-1145,198 +1012,46 @@ class ParallelReduce, ReducerType, const Policy m_policy; const ReducerType m_reducer; const pointer_type m_result_ptr; + const bool m_force_synchronous; - bool m_force_synchronous; - - template - inline static std::enable_if_t::value> execute_functor( - const FunctorType &functor, const Member i, reference_type update) { - functor(i, update); - } - - template - inline static std::enable_if_t::value> execute_functor( - const FunctorType &functor, const Member i, reference_type update) { - const TagType t{}; - functor(t, i, update); - } + public: + void setup() const { + const std::size_t value_size = + Analysis::value_size(ReducerConditional::select(m_functor, m_reducer)); + const int num_worker_threads = Kokkos::Experimental::HPX::concurrency(); - template - inline std::enable_if_t::value> execute_functor_range( - reference_type update, const Member i_begin, const Member i_end) const { - for (Member i = i_begin; i < i_end; ++i) { - m_functor(i, update); - } - } + hpx_thread_buffer &buffer = m_policy.space().impl_get_buffer(); + buffer.resize(num_worker_threads, value_size); - template - inline std::enable_if_t::value> execute_functor_range( - reference_type update, const Member i_begin, const Member i_end) const { - const TagType t{}; + typename Analysis::Reducer final_reducer( + &ReducerConditional::select(m_functor, m_reducer)); - for (Member i = i_begin; i < i_end; ++i) { - m_functor(t, i, update); + for (int t = 0; t < num_worker_threads; ++t) { + final_reducer.init(reinterpret_cast(buffer.get(t))); } } - class value_type_wrapper { - private: - std::size_t m_value_size; - char *m_value_buffer; - - public: - value_type_wrapper() : m_value_size(0), m_value_buffer(nullptr) {} - - value_type_wrapper(const std::size_t value_size) - : m_value_size(value_size), m_value_buffer(new char[m_value_size]) {} - - value_type_wrapper(const value_type_wrapper &other) - : m_value_size(0), m_value_buffer(nullptr) { - if (this != &other) { - m_value_buffer = 
new char[other.m_value_size]; - m_value_size = other.m_value_size; - - std::copy(other.m_value_buffer, other.m_value_buffer + m_value_size, - m_value_buffer); - } - } - - ~value_type_wrapper() { delete[] m_value_buffer; } - - value_type_wrapper(value_type_wrapper &&other) - : m_value_size(0), m_value_buffer(nullptr) { - if (this != &other) { - m_value_buffer = other.m_value_buffer; - m_value_size = other.m_value_size; - - other.m_value_buffer = nullptr; - other.m_value_size = 0; - } - } - - value_type_wrapper &operator=(const value_type_wrapper &other) { - if (this != &other) { - delete[] m_value_buffer; - m_value_buffer = new char[other.m_value_size]; - m_value_size = other.m_value_size; - - std::copy(other.m_value_buffer, other.m_value_buffer + m_value_size, - m_value_buffer); - } - - return *this; - } - - value_type_wrapper &operator=(value_type_wrapper &&other) { - if (this != &other) { - delete[] m_value_buffer; - m_value_buffer = other.m_value_buffer; - m_value_size = other.m_value_size; - - other.m_value_buffer = nullptr; - other.m_value_size = 0; - } - - return *this; - } - - pointer_type pointer() const { - return reinterpret_cast(m_value_buffer); - } - - reference_type reference() const { - return Analysis::Reducer::reference( - reinterpret_cast(m_value_buffer)); - } - }; - - public: - void execute() const { - if (m_policy.end() <= m_policy.begin()) { - if (m_result_ptr) { - typename Analysis::Reducer final_reducer( - &ReducerConditional::select(m_functor, m_reducer)); - - final_reducer.init(m_result_ptr); - final_reducer.final(m_result_ptr); + void execute_range(const Member i_chunk) const { + hpx_thread_buffer &buffer = m_policy.space().impl_get_buffer(); + reference_type update = + Analysis::Reducer::reference(reinterpret_cast( + buffer.get(Kokkos::Experimental::HPX::impl_hardware_thread_id()))); + const auto r = get_chunk_range(i_chunk, m_policy.begin(), + m_policy.chunk_size(), m_policy.end()); + for (Member i = r.begin; i < r.end; ++i) { + if 
constexpr (std::is_same_v) { + m_functor(i, update); + } else { + m_functor(WorkTag{}, i, update); } - return; } - dispatch_execute_task(this, m_policy.space(), m_force_synchronous); } - inline void execute_task() const { - // See [note 1] for an explanation. - Kokkos::Experimental::HPX::reset_on_exit_parallel reset_on_exit( - m_policy.space()); - + void finalize() const { + hpx_thread_buffer &buffer = m_policy.space().impl_get_buffer(); typename Analysis::Reducer final_reducer( &ReducerConditional::select(m_functor, m_reducer)); - - const std::size_t value_size = - Analysis::value_size(ReducerConditional::select(m_functor, m_reducer)); - - auto exec = Kokkos::Experimental::HPX::impl_get_executor(); - - using hpx::for_loop; - using hpx::execution::par; - using hpx::execution::static_chunk_size; - -#if KOKKOS_HPX_IMPLEMENTATION == 0 - // NOTE: This version makes the most use of HPX functionality, but - // requires the struct value_type_wrapper to handle different - // reference_types. It is also significantly slower than the version - // below due to not reusing the buffer used by other functions. 
- using hpx::parallel::reduction; - - value_type_wrapper final_value(value_size); - value_type_wrapper identity(value_size); - - final_reducer.init(final_value.pointer()); - final_reducer.init(identity.pointer()); - - for_loop(par.on(exec).with( - static_chunk_size(get_hpx_adjusted_chunk_size(m_policy))), - m_policy.begin(), m_policy.end(), - reduction(final_value, identity, - [final_reducer]( - value_type_wrapper &a, - value_type_wrapper &b) -> value_type_wrapper & { - final_reducer.join(a.pointer(), b.pointer()); - return a; - }), - [this](Member i, value_type_wrapper &update) { - execute_functor(m_functor, i, update.reference()); - }); - - pointer_type final_value_ptr = final_value.pointer(); - -#elif KOKKOS_HPX_IMPLEMENTATION == 1 - using hpx::for_loop_strided; - const int num_worker_threads = Kokkos::Experimental::HPX::concurrency(); - - thread_buffer &buffer = m_policy.space().impl_get_buffer(); - buffer.resize(num_worker_threads, value_size); - - for_loop( - par.on(exec).with(static_chunk_size(1)), 0, num_worker_threads, - [&buffer, final_reducer ](const int t) noexcept { - final_reducer.init(reinterpret_cast(buffer.get(t))); - }); - - const Member chunk_size = get_hpx_adjusted_chunk_size(m_policy); - - for_loop_strided( - par.on(exec), m_policy.begin(), m_policy.end(), chunk_size, - [this, &buffer, chunk_size](const Member i_begin) { - reference_type update = Analysis::Reducer::reference( - reinterpret_cast(buffer.get( - Kokkos::Experimental::HPX::impl_hardware_thread_id()))); - const Member i_end = (std::min)(i_begin + chunk_size, m_policy.end()); - execute_functor_range(update, i_begin, i_end); - }); - for (int i = 1; i < num_worker_threads; ++i) { final_reducer.join(reinterpret_cast(buffer.get(0)), reinterpret_cast(buffer.get(i))); @@ -1344,7 +1059,6 @@ class ParallelReduce, ReducerType, pointer_type final_value_ptr = reinterpret_cast(buffer.get(0)); -#endif final_reducer.final(final_value_ptr); @@ -1358,6 +1072,25 @@ class ParallelReduce, ReducerType, } 
} + void execute() const { + if (m_policy.end() <= m_policy.begin()) { + if (m_result_ptr) { + typename Analysis::Reducer final_reducer( + &ReducerConditional::select(m_functor, m_reducer)); + + final_reducer.init(m_result_ptr); + final_reducer.final(m_result_ptr); + } + return; + } + + const Member num_chunks = + get_num_chunks(m_policy.begin(), m_policy.chunk_size(), m_policy.end()); + m_policy.space().impl_bulk_setup_finalize( + m_force_synchronous, is_light_weight_policy(), *this, + num_chunks, hpx::threads::thread_stacksize::nostack); + } + template inline ParallelReduce( const FunctorType &arg_functor, Policy arg_policy, @@ -1390,8 +1123,8 @@ class ParallelReduce, ReducerType, using WorkRange = typename Policy::WorkRange; using Member = typename Policy::member_type; using ReducerConditional = - Kokkos::Impl::if_c::value, - FunctorType, ReducerType>; + Kokkos::Impl::if_c, FunctorType, + ReducerType>; using ReducerTypeFwd = typename ReducerConditional::type; using Analysis = FunctorAnalysis; @@ -1407,94 +1140,69 @@ class ParallelReduce, ReducerType, const Policy m_policy; const ReducerType m_reducer; const pointer_type m_result_ptr; - - bool m_force_synchronous; + const bool m_force_synchronous; public: - void execute() const { - dispatch_execute_task(this, m_iter.m_rp.space(), m_force_synchronous); - } - - inline void execute_task() const { - // See [note 1] for an explanation. 
- Kokkos::Experimental::HPX::reset_on_exit_parallel reset_on_exit( - m_iter.m_rp.space()); - - const int num_worker_threads = Kokkos::Experimental::HPX::concurrency(); + void setup() const { const std::size_t value_size = Analysis::value_size( ReducerConditional::select(m_iter.m_func, m_reducer)); + const int num_worker_threads = Kokkos::Experimental::HPX::concurrency(); - thread_buffer &buffer = m_iter.m_rp.space().impl_get_buffer(); + typename Analysis::Reducer final_reducer( + &ReducerConditional::select(m_iter.m_func, m_reducer)); + hpx_thread_buffer &buffer = m_iter.m_rp.space().impl_get_buffer(); buffer.resize(num_worker_threads, value_size); - using hpx::for_loop; - using hpx::execution::par; - using hpx::execution::static_chunk_size; + for (int t = 0; t < num_worker_threads; ++t) { + final_reducer.init(reinterpret_cast(buffer.get(t))); + } + } - auto exec = Kokkos::Experimental::HPX::impl_get_executor(); + void execute_range(const Member i_chunk) const { + hpx_thread_buffer &buffer = m_iter.m_rp.space().impl_get_buffer(); + reference_type update = + Analysis::Reducer::reference(reinterpret_cast( + buffer.get(Kokkos::Experimental::HPX::impl_hardware_thread_id()))); + const auto r = get_chunk_range(i_chunk, m_policy.begin(), + m_policy.chunk_size(), m_policy.end()); + for (Member i = r.begin; i < r.end; ++i) { + m_iter(i, update); + } + } + void finalize() const { + hpx_thread_buffer &buffer = m_iter.m_rp.space().impl_get_buffer(); typename Analysis::Reducer final_reducer( &ReducerConditional::select(m_iter.m_func, m_reducer)); - -#if KOKKOS_HPX_IMPLEMENTATION == 0 - - for_loop( - par.on(exec).with(static_chunk_size(1)), 0, num_worker_threads, - [&buffer, final_reducer](std::size_t t) { - final_reducer.init(reinterpret_cast(buffer.get(t))); - }); - - for_loop(par.on(exec).with( - static_chunk_size(get_hpx_adjusted_chunk_size(m_policy))), - m_policy.begin(), m_policy.end(), [this, &buffer](const Member i) { - reference_type update = 
Analysis::Reducer::reference( - reinterpret_cast(buffer.get( - Kokkos::Experimental::HPX::impl_hardware_thread_id()))); - m_iter(i, update); - }); - -#elif KOKKOS_HPX_IMPLEMENTATION == 1 - using hpx::for_loop_strided; - - for_loop( - par.on(exec).with(static_chunk_size(1)), std::size_t(0), - num_worker_threads, [&buffer, final_reducer](const std::size_t t) { - final_reducer.init(reinterpret_cast(buffer.get(t))); - }); - - const Member chunk_size = get_hpx_adjusted_chunk_size(m_policy); - - for_loop_strided( - par.on(exec), m_policy.begin(), m_policy.end(), chunk_size, - [this, &buffer, chunk_size](const Member i_begin) { - reference_type update = Analysis::Reducer::reference( - reinterpret_cast(buffer.get( - Kokkos::Experimental::HPX::impl_hardware_thread_id()))); - const Member i_end = (std::min)(i_begin + chunk_size, m_policy.end()); - - for (Member i = i_begin; i < i_end; ++i) { - m_iter(i, update); - } - }); -#endif - + const int num_worker_threads = Kokkos::Experimental::HPX::concurrency(); for (int i = 1; i < num_worker_threads; ++i) { final_reducer.join(reinterpret_cast(buffer.get(0)), reinterpret_cast(buffer.get(i))); } - final_reducer.final(reinterpret_cast(buffer.get(0))); + pointer_type final_value_ptr = + reinterpret_cast(buffer.get(0)); + + final_reducer.final(final_value_ptr); if (m_result_ptr != nullptr) { const int n = Analysis::value_count( ReducerConditional::select(m_iter.m_func, m_reducer)); for (int j = 0; j < n; ++j) { - m_result_ptr[j] = reinterpret_cast(buffer.get(0))[j]; + m_result_ptr[j] = final_value_ptr[j]; } } } + void execute() const { + const Member num_chunks = + get_num_chunks(m_policy.begin(), m_policy.chunk_size(), m_policy.end()); + m_iter.m_rp.space().impl_bulk_setup_finalize( + m_force_synchronous, is_light_weight_policy(), *this, + num_chunks, hpx::threads::thread_stacksize::nostack); + } + template inline ParallelReduce( const FunctorType &arg_functor, MDRangePolicy arg_policy, @@ -1544,97 +1252,89 @@ class ParallelScan, 
using pointer_type = typename Analysis::pointer_type; using reference_type = typename Analysis::reference_type; using value_type = typename Analysis::value_type; + using barrier_type = hpx::barrier<>; const FunctorType m_functor; const Policy m_policy; - template - inline static std::enable_if_t::value> - execute_functor_range(const FunctorType &functor, const Member i_begin, - const Member i_end, reference_type update, - const bool final) { - for (Member i = i_begin; i < i_end; ++i) { - functor(i, update, final); - } + public: + void setup() const { + const int num_worker_threads = Kokkos::Experimental::HPX::concurrency(); + const std::size_t value_size = Analysis::value_size(m_functor); + + hpx_thread_buffer &buffer = m_policy.space().impl_get_buffer(); + buffer.resize(num_worker_threads, 2 * value_size, sizeof(barrier_type)); + + new (buffer.get_extra_space()) barrier_type(num_worker_threads); } - template - inline static std::enable_if_t::value> - execute_functor_range(const FunctorType &functor, const Member i_begin, - const Member i_end, reference_type update, - const bool final) { - const TagType t{}; + void execute_chunk(const Member i_begin, const Member i_end, + reference_type update, const bool final) const { for (Member i = i_begin; i < i_end; ++i) { - functor(t, i, update, final); + if constexpr (std::is_same_v) { + m_functor(i, update, final); + } else { + m_functor(WorkTag{}, i, update, final); + } } } - public: - void execute() const { dispatch_execute_task(this, m_policy.space()); } - - inline void execute_task() const { - // See [note 1] for an explanation. 
- Kokkos::Experimental::HPX::reset_on_exit_parallel reset_on_exit( - m_policy.space()); - + void execute_range(int t) const { const int num_worker_threads = Kokkos::Experimental::HPX::concurrency(); const int value_count = Analysis::value_count(m_functor); const std::size_t value_size = Analysis::value_size(m_functor); - thread_buffer &buffer = m_policy.space().impl_get_buffer(); - buffer.resize(num_worker_threads, 2 * value_size); + hpx_thread_buffer &buffer = m_policy.space().impl_get_buffer(); + typename Analysis::Reducer final_reducer(&m_functor); + barrier_type &barrier = + *static_cast(buffer.get_extra_space()); + reference_type update_sum = + final_reducer.init(reinterpret_cast(buffer.get(t))); + + const WorkRange range(m_policy, t, num_worker_threads); + execute_chunk(range.begin(), range.end(), update_sum, false); - using hpx::barrier; - using hpx::for_loop; - using hpx::execution::par; - using hpx::execution::static_chunk_size; + barrier.arrive_and_wait(); - barrier<> bar(num_worker_threads); - auto exec = Kokkos::Experimental::HPX::impl_get_executor(); + if (t == 0) { + final_reducer.init(reinterpret_cast( + static_cast(buffer.get(0)) + value_size)); - typename Analysis::Reducer final_reducer(&m_functor); + for (int i = 1; i < num_worker_threads; ++i) { + pointer_type ptr_1_prev = + reinterpret_cast(buffer.get(i - 1)); + pointer_type ptr_2_prev = reinterpret_cast( + static_cast(buffer.get(i - 1)) + value_size); + pointer_type ptr_2 = reinterpret_cast( + static_cast(buffer.get(i)) + value_size); - for_loop( - par.on(exec).with(static_chunk_size(1)), 0, num_worker_threads, - [this, &bar, &buffer, num_worker_threads, value_count, value_size, - final_reducer](int t) { - reference_type update_sum = - final_reducer.init(reinterpret_cast(buffer.get(t))); - - const WorkRange range(m_policy, t, num_worker_threads); - execute_functor_range(m_functor, range.begin(), range.end(), - update_sum, false); - - bar.arrive_and_wait(); - - if (t == 0) { - 
final_reducer.init( - reinterpret_cast(buffer.get(0) + value_size)); - - for (int i = 1; i < num_worker_threads; ++i) { - pointer_type ptr_1_prev = - reinterpret_cast(buffer.get(i - 1)); - pointer_type ptr_2_prev = reinterpret_cast( - buffer.get(i - 1) + value_size); - pointer_type ptr_2 = - reinterpret_cast(buffer.get(i) + value_size); - - for (int j = 0; j < value_count; ++j) { - ptr_2[j] = ptr_2_prev[j]; - } - - final_reducer.join(ptr_2, ptr_1_prev); - } - } + for (int j = 0; j < value_count; ++j) { + ptr_2[j] = ptr_2_prev[j]; + } + + final_reducer.join(ptr_2, ptr_1_prev); + } + } - bar.arrive_and_wait(); + barrier.arrive_and_wait(); - reference_type update_base = Analysis::Reducer::reference( - reinterpret_cast(buffer.get(t) + value_size)); + reference_type update_base = + Analysis::Reducer::reference(reinterpret_cast( + static_cast(buffer.get(t)) + value_size)); - execute_functor_range(m_functor, range.begin(), range.end(), - update_base, true); - }); + execute_chunk(range.begin(), range.end(), update_base, true); + } + + void finalize() const { + hpx_thread_buffer &buffer = m_policy.space().impl_get_buffer(); + static_cast(buffer.get_extra_space())->~barrier_type(); + } + + void execute() const { + const int num_worker_threads = Kokkos::Experimental::HPX::concurrency(); + m_policy.space().impl_bulk_setup_finalize( + false, is_light_weight_policy(), *this, num_worker_threads, + hpx::threads::thread_stacksize::small_); } inline ParallelScan(const FunctorType &arg_functor, const Policy &arg_policy) @@ -1654,102 +1354,94 @@ class ParallelScanWithTotal, using pointer_type = typename Analysis::pointer_type; using reference_type = typename Analysis::reference_type; using value_type = typename Analysis::value_type; + using barrier_type = hpx::barrier<>; const FunctorType m_functor; const Policy m_policy; pointer_type m_result_ptr; - template - inline static std::enable_if_t::value> - execute_functor_range(const FunctorType &functor, const Member i_begin, - const 
Member i_end, reference_type update, - const bool final) { - for (Member i = i_begin; i < i_end; ++i) { - functor(i, update, final); - } + public: + void setup() const { + const int num_worker_threads = Kokkos::Experimental::HPX::concurrency(); + const std::size_t value_size = Analysis::value_size(m_functor); + + hpx_thread_buffer &buffer = m_policy.space().impl_get_buffer(); + buffer.resize(num_worker_threads, 2 * value_size, sizeof(barrier_type)); + + new (buffer.get_extra_space()) barrier_type(num_worker_threads); } - template - inline static std::enable_if_t::value> - execute_functor_range(const FunctorType &functor, const Member i_begin, - const Member i_end, reference_type update, - const bool final) { - const TagType t{}; + void execute_chunk(const Member i_begin, const Member i_end, + reference_type update, const bool final) const { for (Member i = i_begin; i < i_end; ++i) { - functor(t, i, update, final); + if constexpr (std::is_same_v) { + m_functor(i, update, final); + } else { + m_functor(WorkTag{}, i, update, final); + } } } - public: - void execute() const { dispatch_execute_task(this, m_policy.space()); } - - inline void execute_task() const { - // See [note 1] for an explanation. 
- Kokkos::Experimental::HPX::reset_on_exit_parallel reset_on_exit( - m_policy.space()); - + void execute_range(int t) const { const int num_worker_threads = Kokkos::Experimental::HPX::concurrency(); const int value_count = Analysis::value_count(m_functor); const std::size_t value_size = Analysis::value_size(m_functor); - thread_buffer &buffer = m_policy.space().impl_get_buffer(); - buffer.resize(num_worker_threads, 2 * value_size); + hpx_thread_buffer &buffer = m_policy.space().impl_get_buffer(); + typename Analysis::Reducer final_reducer(&m_functor); + barrier_type &barrier = + *static_cast(buffer.get_extra_space()); + reference_type update_sum = + final_reducer.init(reinterpret_cast(buffer.get(t))); - using hpx::barrier; - using hpx::for_loop; - using hpx::execution::par; - using hpx::execution::static_chunk_size; + const WorkRange range(m_policy, t, num_worker_threads); + execute_chunk(range.begin(), range.end(), update_sum, false); - barrier<> bar(num_worker_threads); - auto exec = Kokkos::Experimental::HPX::impl_get_executor(); + barrier.arrive_and_wait(); - typename Analysis::Reducer final_reducer(&m_functor); + if (t == 0) { + final_reducer.init(reinterpret_cast( + static_cast(buffer.get(0)) + value_size)); - for_loop( - par.on(exec).with(static_chunk_size(1)), 0, num_worker_threads, - [this, &bar, &buffer, num_worker_threads, value_count, value_size, - final_reducer](int t) { - reference_type update_sum = - final_reducer.init(reinterpret_cast(buffer.get(t))); - - const WorkRange range(m_policy, t, num_worker_threads); - execute_functor_range(m_functor, range.begin(), range.end(), - update_sum, false); - - bar.arrive_and_wait(); - - if (t == 0) { - final_reducer.init( - reinterpret_cast(buffer.get(0) + value_size)); - - for (int i = 1; i < num_worker_threads; ++i) { - pointer_type ptr_1_prev = - reinterpret_cast(buffer.get(i - 1)); - pointer_type ptr_2_prev = reinterpret_cast( - buffer.get(i - 1) + value_size); - pointer_type ptr_2 = - 
reinterpret_cast(buffer.get(i) + value_size); - - for (int j = 0; j < value_count; ++j) { - ptr_2[j] = ptr_2_prev[j]; - } - - final_reducer.join(ptr_2, ptr_1_prev); - } - } + for (int i = 1; i < num_worker_threads; ++i) { + pointer_type ptr_1_prev = + reinterpret_cast(buffer.get(i - 1)); + pointer_type ptr_2_prev = reinterpret_cast( + static_cast(buffer.get(i - 1)) + value_size); + pointer_type ptr_2 = reinterpret_cast( + static_cast(buffer.get(i)) + value_size); - bar.arrive_and_wait(); + for (int j = 0; j < value_count; ++j) { + ptr_2[j] = ptr_2_prev[j]; + } - reference_type update_base = Analysis::Reducer::reference( - reinterpret_cast(buffer.get(t) + value_size)); + final_reducer.join(ptr_2, ptr_1_prev); + } + } - execute_functor_range(m_functor, range.begin(), range.end(), - update_base, true); + barrier.arrive_and_wait(); - if (t == num_worker_threads - 1) { - *m_result_ptr = update_base; - } - }); + reference_type update_base = + Analysis::Reducer::reference(reinterpret_cast( + static_cast(buffer.get(t)) + value_size)); + + execute_chunk(range.begin(), range.end(), update_base, true); + + if (t == num_worker_threads - 1) { + *m_result_ptr = update_base; + } + } + + void finalize() const { + hpx_thread_buffer &buffer = m_policy.space().impl_get_buffer(); + static_cast(buffer.get_extra_space())->~barrier_type(); + } + + void execute() const { + const int num_worker_threads = Kokkos::Experimental::HPX::concurrency(); + m_policy.space().impl_bulk_setup_finalize( + false, is_light_weight_policy(), *this, num_worker_threads, + hpx::threads::thread_stacksize::small_); } template @@ -1784,92 +1476,37 @@ class ParallelFor, const int m_league; const std::size_t m_shared; - template - inline static std::enable_if_t::value> execute_functor( - const FunctorType &functor, const Policy &policy, const int league_rank, - char *local_buffer, const std::size_t local_buffer_size) { - functor(Member(policy, 0, league_rank, local_buffer, local_buffer_size)); - } - - template - 
inline static std::enable_if_t::value> execute_functor( - const FunctorType &functor, const Policy &policy, const int league_rank, - char *local_buffer, const std::size_t local_buffer_size) { - const TagType t{}; - functor(t, Member(policy, 0, league_rank, local_buffer, local_buffer_size)); - } - - template - inline static std::enable_if_t::value> - execute_functor_range(const FunctorType &functor, const Policy &policy, - const int league_rank_begin, const int league_rank_end, - char *local_buffer, - const std::size_t local_buffer_size) { - for (int league_rank = league_rank_begin; league_rank < league_rank_end; - ++league_rank) { - functor(Member(policy, 0, league_rank, local_buffer, local_buffer_size)); - } + public: + void setup() const { + const int num_worker_threads = Kokkos::Experimental::HPX::concurrency(); + + hpx_thread_buffer &buffer = m_policy.space().impl_get_buffer(); + buffer.resize(num_worker_threads, m_shared); } - template - inline static std::enable_if_t::value> - execute_functor_range(const FunctorType &functor, const Policy &policy, - const int league_rank_begin, const int league_rank_end, - char *local_buffer, - const std::size_t local_buffer_size) { - const TagType t{}; - for (int league_rank = league_rank_begin; league_rank < league_rank_end; - ++league_rank) { - functor(t, - Member(policy, 0, league_rank, local_buffer, local_buffer_size)); + void execute_range(const int i) const { + const int t = Kokkos::Experimental::HPX::impl_hardware_thread_id(); + hpx_thread_buffer &buffer = m_policy.space().impl_get_buffer(); + const auto r = + get_chunk_range(i, 0, m_policy.chunk_size(), m_policy.league_size()); + for (int league_rank = r.begin; league_rank < r.end; ++league_rank) { + if constexpr (std::is_same_v) { + m_functor(Member(m_policy, 0, league_rank, buffer.get(t), m_shared)); + } else { + m_functor(WorkTag{}, + Member(m_policy, 0, league_rank, buffer.get(t), m_shared)); + } } } - public: - void execute() const { dispatch_execute_task(this, 
m_policy.space()); } - - inline void execute_task() const { - // See [note 1] for an explanation. - Kokkos::Experimental::HPX::reset_on_exit_parallel reset_on_exit( - m_policy.space()); + void finalize() const {} - const int num_worker_threads = Kokkos::Experimental::HPX::concurrency(); - - thread_buffer &buffer = m_policy.space().impl_get_buffer(); - buffer.resize(num_worker_threads, m_shared); - - auto exec = Kokkos::Experimental::HPX::impl_get_executor(); - - using hpx::execution::par; - using hpx::execution::static_chunk_size; - -#if KOKKOS_HPX_IMPLEMENTATION == 0 - using hpx::for_loop; - - for_loop( - par.on(exec).with(static_chunk_size(m_policy.chunk_size())), 0, - m_policy.league_size(), [this, &buffer](const int league_rank) { - execute_functor( - m_functor, m_policy, league_rank, - buffer.get(Kokkos::Experimental::HPX::impl_hardware_thread_id()), - m_shared); - }); - -#elif KOKKOS_HPX_IMPLEMENTATION == 1 - using hpx::for_loop_strided; - - for_loop_strided( - par.on(exec), 0, m_policy.league_size(), m_policy.chunk_size(), - [this, &buffer](const int league_rank_begin) { - const int league_rank_end = - (std::min)(league_rank_begin + m_policy.chunk_size(), - m_policy.league_size()); - execute_functor_range( - m_functor, m_policy, league_rank_begin, league_rank_end, - buffer.get(Kokkos::Experimental::HPX::impl_hardware_thread_id()), - m_shared); - }); -#endif + void execute() const { + const int num_chunks = + get_num_chunks(0, m_policy.chunk_size(), m_policy.league_size()); + m_policy.space().impl_bulk_setup_finalize( + false, is_light_weight_policy(), *this, num_chunks, + hpx::threads::thread_stacksize::nostack); } ParallelFor(const FunctorType &arg_functor, const Policy &arg_policy) @@ -1889,8 +1526,8 @@ class ParallelReduce, using Member = typename Policy::member_type; using WorkTag = typename Policy::work_tag; using ReducerConditional = - Kokkos::Impl::if_c::value, - FunctorType, ReducerType>; + Kokkos::Impl::if_c, FunctorType, + ReducerType>; using 
ReducerTypeFwd = typename ReducerConditional::type; using Analysis = FunctorAnalysis; @@ -1904,136 +1541,51 @@ class ParallelReduce, const ReducerType m_reducer; pointer_type m_result_ptr; const std::size_t m_shared; - - bool m_force_synchronous; - - template - inline static std::enable_if_t::value> execute_functor( - const FunctorType &functor, const Policy &policy, const int league_rank, - char *local_buffer, const std::size_t local_buffer_size, - reference_type update) { - functor(Member(policy, 0, league_rank, local_buffer, local_buffer_size), - update); - } - - template - inline static std::enable_if_t::value> execute_functor( - const FunctorType &functor, const Policy &policy, const int league_rank, - char *local_buffer, const std::size_t local_buffer_size, - reference_type update) { - const TagType t{}; - functor(t, Member(policy, 0, league_rank, local_buffer, local_buffer_size), - update); - } - - template - inline static std::enable_if_t::value> - execute_functor_range(const FunctorType &functor, const Policy &policy, - const int league_rank_begin, const int league_rank_end, - char *local_buffer, const std::size_t local_buffer_size, - reference_type update) { - for (int league_rank = league_rank_begin; league_rank < league_rank_end; - ++league_rank) { - functor(Member(policy, 0, league_rank, local_buffer, local_buffer_size), - update); - } - } - - template - inline static std::enable_if_t::value> - execute_functor_range(const FunctorType &functor, const Policy &policy, - const int league_rank_begin, const int league_rank_end, - char *local_buffer, const std::size_t local_buffer_size, - reference_type update) { - const TagType t{}; - for (int league_rank = league_rank_begin; league_rank < league_rank_end; - ++league_rank) { - functor(t, - Member(policy, 0, league_rank, local_buffer, local_buffer_size), - update); - } - } + const bool m_force_synchronous; public: - void execute() const { - if (m_policy.league_size() * m_policy.team_size() == 0) { - if 
(m_result_ptr) { - typename Analysis::Reducer final_reducer( - &ReducerConditional::select(m_functor, m_reducer)); - final_reducer.init(m_result_ptr); - final_reducer.final(m_result_ptr); - } - return; - } - dispatch_execute_task(this, m_policy.space()); - } - - inline void execute_task() const { - // See [note 1] for an explanation. - Kokkos::Experimental::HPX::reset_on_exit_parallel reset_on_exit( - m_policy.space()); - - const int num_worker_threads = Kokkos::Experimental::HPX::concurrency(); + void setup() const { const std::size_t value_size = Analysis::value_size(ReducerConditional::select(m_functor, m_reducer)); + const int num_worker_threads = Kokkos::Experimental::HPX::concurrency(); - thread_buffer &buffer = m_policy.space().impl_get_buffer(); + hpx_thread_buffer &buffer = m_policy.space().impl_get_buffer(); buffer.resize(num_worker_threads, value_size + m_shared); + typename Analysis::Reducer final_reducer( + &ReducerConditional::select(m_functor, m_reducer)); - auto exec = Kokkos::Experimental::HPX::impl_get_executor(); + for (int t = 0; t < num_worker_threads; ++t) { + final_reducer.init(reinterpret_cast(buffer.get(t))); + } + } - using hpx::for_loop; - using hpx::execution::par; - using hpx::execution::static_chunk_size; + void execute_range(const int i) const { + const std::size_t value_size = + Analysis::value_size(ReducerConditional::select(m_functor, m_reducer)); + std::size_t t = Kokkos::Experimental::HPX::impl_hardware_thread_id(); + hpx_thread_buffer &buffer = m_policy.space().impl_get_buffer(); + reference_type update = Analysis::Reducer::reference( + reinterpret_cast(buffer.get(t))); + const auto r = + get_chunk_range(i, 0, m_policy.chunk_size(), m_policy.league_size()); + char *local_buffer = static_cast(buffer.get(t)) + value_size; + for (int league_rank = r.begin; league_rank < r.end; ++league_rank) { + if constexpr (std::is_same_v) { + m_functor(Member(m_policy, 0, league_rank, local_buffer, m_shared), + update); + } else { + 
m_functor(WorkTag{}, + Member(m_policy, 0, league_rank, local_buffer, m_shared), + update); + } + } + } + void finalize() const { + hpx_thread_buffer &buffer = m_policy.space().impl_get_buffer(); typename Analysis::Reducer final_reducer( &ReducerConditional::select(m_functor, m_reducer)); - -#if KOKKOS_HPX_IMPLEMENTATION == 0 - - for_loop( - par.on(exec).with(static_chunk_size(1)), 0, num_worker_threads, - [&buffer, final_reducer](const std::size_t t) { - final_reducer.init(reinterpret_cast(buffer.get(t))); - }); - - for_loop(par.on(exec).with(static_chunk_size(m_policy.chunk_size())), 0, - m_policy.league_size(), - [this, &buffer, value_size](const int league_rank) { - std::size_t t = - Kokkos::Experimental::HPX::impl_hardware_thread_id(); - reference_type update = Analysis::Reducer::reference( - reinterpret_cast(buffer.get(t))); - - execute_functor(m_functor, m_policy, league_rank, - buffer.get(t) + value_size, m_shared, - update); - }); - -#elif KOKKOS_HPX_IMPLEMENTATION == 1 - using hpx::for_loop_strided; - - for_loop( - par.on(exec).with(static_chunk_size(1)), 0, num_worker_threads, - [&buffer, final_reducer](std::size_t const t) { - final_reducer.init(reinterpret_cast(buffer.get(t))); - }); - - for_loop_strided( - par.on(exec), 0, m_policy.league_size(), m_policy.chunk_size(), - [this, &buffer, value_size](int const league_rank_begin) { - std::size_t t = Kokkos::Experimental::HPX::impl_hardware_thread_id(); - reference_type update = Analysis::Reducer::reference( - reinterpret_cast(buffer.get(t))); - const int league_rank_end = - (std::min)(league_rank_begin + m_policy.chunk_size(), - m_policy.league_size()); - execute_functor_range( - m_functor, m_policy, league_rank_begin, league_rank_end, - buffer.get(t) + value_size, m_shared, update); - }); -#endif - + const int num_worker_threads = Kokkos::Experimental::HPX::concurrency(); const pointer_type ptr = reinterpret_cast(buffer.get(0)); for (int t = 1; t < num_worker_threads; ++t) { final_reducer.join(ptr, 
reinterpret_cast(buffer.get(t))); @@ -2051,6 +1603,24 @@ class ParallelReduce, } } + void execute() const { + if (m_policy.league_size() * m_policy.team_size() == 0) { + if (m_result_ptr) { + typename Analysis::Reducer final_reducer( + &ReducerConditional::select(m_functor, m_reducer)); + final_reducer.init(m_result_ptr); + final_reducer.final(m_result_ptr); + } + return; + } + + const int num_chunks = + get_num_chunks(0, m_policy.chunk_size(), m_policy.league_size()); + m_policy.space().impl_bulk_setup_finalize( + m_force_synchronous, is_light_weight_policy(), *this, + num_chunks, hpx::threads::thread_stacksize::nostack); + } + template ParallelReduce(const FunctorType &arg_functor, const Policy &arg_policy, const ViewType &arg_result, diff --git a/core/src/impl/Kokkos_ExecSpaceManager.hpp b/core/src/impl/Kokkos_ExecSpaceManager.hpp index f0edc8ac47..58ed54275a 100644 --- a/core/src/impl/Kokkos_ExecSpaceManager.hpp +++ b/core/src/impl/Kokkos_ExecSpaceManager.hpp @@ -102,9 +102,7 @@ constexpr bool check_valid_execution_space() { static_assert(is_detected_v); static_assert(is_detected_v); static_assert(is_detected_v); -#ifndef KOKKOS_ENABLE_HPX // FIXME_HPX static_assert(sizeof(ExecutionSpace) <= 2 * sizeof(void*)); -#endif return true; } diff --git a/core/src/impl/Kokkos_HostSpace_deepcopy.cpp b/core/src/impl/Kokkos_HostSpace_deepcopy.cpp index 096dfd6b7f..0ce5fe34b6 100644 --- a/core/src/impl/Kokkos_HostSpace_deepcopy.cpp +++ b/core/src/impl/Kokkos_HostSpace_deepcopy.cpp @@ -52,7 +52,8 @@ void hostspace_parallel_deepcopy_async(const DefaultHostExecutionSpace& exec, // synchronously. The deep copy must be correctly sequenced with respect to // other kernels submitted to the same instance, so we only use the fallback // parallel_for version in this case. 
-#if !(defined(KOKKOS_ENABLE_HPX) && defined(KOKKOS_ENABLE_HPX_ASYNC_DISPATCH)) +#if !(defined(KOKKOS_ENABLE_HPX) && \ + defined(KOKKOS_ENABLE_IMPL_HPX_ASYNC_DISPATCH)) constexpr int host_deep_copy_serial_limit = 10 * 8192; if ((n < host_deep_copy_serial_limit) || (DefaultHostExecutionSpace().concurrency() == 1)) { diff --git a/core/unit_test/CMakeLists.txt b/core/unit_test/CMakeLists.txt index 74be49266f..b18a23f986 100644 --- a/core/unit_test/CMakeLists.txt +++ b/core/unit_test/CMakeLists.txt @@ -638,6 +638,7 @@ if(Kokkos_ENABLE_HPX) hpx/TestHPX_IndependentInstancesDelayedExecution.cpp hpx/TestHPX_IndependentInstancesInstanceIds.cpp hpx/TestHPX_IndependentInstancesRefCounting.cpp + hpx/TestHPX_IndependentInstancesSynchronization.cpp ) endif() diff --git a/core/unit_test/TestViewMapping_a.hpp b/core/unit_test/TestViewMapping_a.hpp index 9df044ec7a..e85dfd0472 100644 --- a/core/unit_test/TestViewMapping_a.hpp +++ b/core/unit_test/TestViewMapping_a.hpp @@ -1038,10 +1038,7 @@ void test_view_mapping() { ASSERT_EQ(a.use_count(), 1); ASSERT_EQ(b.use_count(), 0); -// TODO: a.use_count() and x.use_count() are 0 with the asynchronous HPX -// backend. Why? -#if !defined(KOKKOS_ENABLE_CUDA_LAMBDA) && \ - !(defined(KOKKOS_ENABLE_HPX) && defined(KOKKOS_ENABLE_HPX_ASYNC_DISPATCH)) +#if !defined(KOKKOS_ENABLE_CUDA) || !defined(KOKKOS_ENABLE_CUDA_LAMBDA) // Cannot launch host lambda when CUDA lambda is enabled. 
using host_exec_space = diff --git a/core/unit_test/hpx/TestHPX_IndependentInstances.cpp b/core/unit_test/hpx/TestHPX_IndependentInstances.cpp index 8f349cf804..7ca48afeff 100644 --- a/core/unit_test/hpx/TestHPX_IndependentInstances.cpp +++ b/core/unit_test/hpx/TestHPX_IndependentInstances.cpp @@ -20,7 +20,6 @@ #include #include -#ifdef KOKKOS_ENABLE_HPX_ASYNC_DISPATCH #ifndef HPX_COMPUTE_DEVICE_CODE namespace { @@ -99,7 +98,7 @@ TEST(hpx, independent_instances) { Kokkos::Experimental::WorkItemProperty::HintLightWeight), FunctorInitConstant(v1, c)); - Kokkos::Experimental::HPX hpx2(hpx1.impl_get_future()); + Kokkos::Experimental::HPX hpx2(hpx1.get_sender()); Kokkos::parallel_for( "Test::hpx::independent_instances::add", Kokkos::Experimental::require( @@ -107,7 +106,7 @@ TEST(hpx, independent_instances) { Kokkos::Experimental::WorkItemProperty::HintLightWeight), FunctorAdd(v1, v2, d)); - Kokkos::Experimental::HPX hpx3(hpx1.impl_get_future()); + Kokkos::Experimental::HPX hpx3(hpx1.get_sender()); Kokkos::parallel_for( "Test::hpx::independent_instances::add_index", Kokkos::Experimental::require( @@ -115,12 +114,8 @@ TEST(hpx, independent_instances) { Kokkos::Experimental::WorkItemProperty::HintLightWeight), FunctorAddIndex(v1, v3)); - // NOTE: This monstrosity is used to collapse a future, - // future>> (return type of when_all) into a future which is - // ready whenever the un-collapsed future would've been ready. HPX does not - // currently have the functionality to collapse this automatically. 
- Kokkos::Experimental::HPX hpx4(hpx::get<0>(hpx::split_future( - hpx::when_all(hpx2.impl_get_future(), hpx3.impl_get_future())))); + Kokkos::Experimental::HPX hpx4(hpx::execution::experimental::when_all( + hpx2.get_sender(), hpx3.get_sender())); Kokkos::parallel_for( "Test::hpx::independent_instances::pointwise_sum", Kokkos::Experimental::require( @@ -137,11 +132,6 @@ TEST(hpx, independent_instances) { hpx4.fence(); - ASSERT_EQ(true, hpx1.impl_get_future().is_ready()); - ASSERT_EQ(true, hpx2.impl_get_future().is_ready()); - ASSERT_EQ(true, hpx3.impl_get_future().is_ready()); - ASSERT_EQ(true, hpx4.impl_get_future().is_ready()); - const int expected_sum = n * (2 * c + d) + (n * (n - 1) / 2); ASSERT_EQ(expected_sum, sum_v()); } @@ -149,4 +139,3 @@ TEST(hpx, independent_instances) { } // namespace #endif -#endif diff --git a/core/unit_test/hpx/TestHPX_IndependentInstancesDelayedExecution.cpp b/core/unit_test/hpx/TestHPX_IndependentInstancesDelayedExecution.cpp index 177d87c1c8..3759d85438 100644 --- a/core/unit_test/hpx/TestHPX_IndependentInstancesDelayedExecution.cpp +++ b/core/unit_test/hpx/TestHPX_IndependentInstancesDelayedExecution.cpp @@ -17,18 +17,20 @@ #include #include -#include - -#ifdef KOKKOS_ENABLE_HPX_ASYNC_DISPATCH +#include namespace { TEST(hpx, independent_instances_delayed_execution) { Kokkos::View ran("ran"); - hpx::lcos::local::promise p; - hpx::shared_future f = p.get_future(); - Kokkos::Experimental::HPX hpx(f); + // Create a sender that will call set_value on a receiver after a delay. 
+ hpx::execution::experimental::unique_any_sender<> s{ + hpx::execution::experimental::schedule( + hpx::execution::experimental::thread_pool_scheduler{}) | + hpx::execution::experimental::then( + [] { hpx::this_thread::sleep_for(std::chrono::milliseconds(500)); })}; + Kokkos::Experimental::HPX hpx(std::move(s)); Kokkos::parallel_for( "Test::hpx::independent_instances::delay_execution", Kokkos::Experimental::require( @@ -36,15 +38,13 @@ TEST(hpx, independent_instances_delayed_execution) { Kokkos::Experimental::WorkItemProperty::HintLightWeight), KOKKOS_LAMBDA(int) { ran() = true; }); +#if defined(KOKKOS_ENABLE_IMPL_HPX_ASYNC_DISPATCH) ASSERT_FALSE(ran()); - ASSERT_FALSE(hpx.impl_get_future().is_ready()); - - p.set_value(); - +#else + ASSERT_TRUE(ran()); +#endif hpx.fence(); - ASSERT_TRUE(hpx.impl_get_future().is_ready()); + ASSERT_TRUE(ran()); } } // namespace - -#endif diff --git a/core/unit_test/hpx/TestHPX_IndependentInstancesInstanceIds.cpp b/core/unit_test/hpx/TestHPX_IndependentInstancesInstanceIds.cpp index 6441666e64..2c9b50f049 100644 --- a/core/unit_test/hpx/TestHPX_IndependentInstancesInstanceIds.cpp +++ b/core/unit_test/hpx/TestHPX_IndependentInstancesInstanceIds.cpp @@ -17,9 +17,7 @@ #include #include -#include - -#ifdef KOKKOS_ENABLE_HPX_ASYNC_DISPATCH +#include namespace { @@ -59,25 +57,24 @@ TEST(hpx, independent_instances_instance_ids) { ASSERT_EQ(hpx_independent1.impl_instance_id(), hpx_independent4.impl_instance_id()); - hpx::shared_future f = hpx::make_ready_future(); - Kokkos::Experimental::HPX hpx_independent_future1(f); - Kokkos::Experimental::HPX hpx_independent_future2 = hpx_independent_future1; - Kokkos::Experimental::HPX hpx_independent_future3{hpx_independent_future1}; - Kokkos::Experimental::HPX hpx_independent_future4; - hpx_independent_future4 = hpx_independent_future1; + Kokkos::Experimental::HPX hpx_independent_sender1( + hpx::execution::experimental::unique_any_sender<>{ + hpx::execution::experimental::just()}); + 
Kokkos::Experimental::HPX hpx_independent_sender2 = hpx_independent_sender1; + Kokkos::Experimental::HPX hpx_independent_sender3{hpx_independent_sender1}; + Kokkos::Experimental::HPX hpx_independent_sender4; + hpx_independent_sender4 = hpx_independent_sender1; ASSERT_NE(hpx_default1.impl_instance_id(), - hpx_independent1.impl_instance_id()); + hpx_independent_sender1.impl_instance_id()); ASSERT_NE(hpx_independent1.impl_instance_id(), - hpx_independent_future1.impl_instance_id()); - ASSERT_EQ(hpx_independent_future1.impl_instance_id(), - hpx_independent_future2.impl_instance_id()); - ASSERT_EQ(hpx_independent_future1.impl_instance_id(), - hpx_independent_future3.impl_instance_id()); - ASSERT_EQ(hpx_independent_future1.impl_instance_id(), - hpx_independent_future4.impl_instance_id()); + hpx_independent_sender1.impl_instance_id()); + ASSERT_EQ(hpx_independent_sender1.impl_instance_id(), + hpx_independent_sender2.impl_instance_id()); + ASSERT_EQ(hpx_independent_sender1.impl_instance_id(), + hpx_independent_sender3.impl_instance_id()); + ASSERT_EQ(hpx_independent_sender1.impl_instance_id(), + hpx_independent_sender4.impl_instance_id()); } } // namespace - -#endif diff --git a/core/unit_test/hpx/TestHPX_IndependentInstancesRefCounting.cpp b/core/unit_test/hpx/TestHPX_IndependentInstancesRefCounting.cpp index 2e9ca081bc..3fc7d83909 100644 --- a/core/unit_test/hpx/TestHPX_IndependentInstancesRefCounting.cpp +++ b/core/unit_test/hpx/TestHPX_IndependentInstancesRefCounting.cpp @@ -17,13 +17,12 @@ #include #include -#ifdef KOKKOS_ENABLE_HPX_ASYNC_DISPATCH - namespace { std::atomic dummy_count; struct dummy { dummy() { ++dummy_count; } + dummy(dummy &&) { ++dummy_count; } dummy(dummy const &) { ++dummy_count; } ~dummy() { --dummy_count; } void f() const {} @@ -32,28 +31,26 @@ struct dummy { // This test makes sure the independent HPX instances don't hold on to captured // data after destruction. 
TEST(hpx, independent_instances_reference_counting) { - dummy d; - Kokkos::Experimental::HPX hpx( - Kokkos::Experimental::HPX::instance_mode::independent); - Kokkos::parallel_for( - "Test::hpx::reference_counting::dummy", - Kokkos::RangePolicy(hpx, 0, 1), - KOKKOS_LAMBDA(int) { - // Make sure dummy struct is captured. - d.f(); - }); - - hpx.fence(); - - // The fence above makes sure that copies of dummy get released. However, - // all copies are not guaranteed to be released as soon as fence returns. - // Therefore we wait for a short time to make it almost guaranteed that all - // copies have been released. - std::this_thread::sleep_for(std::chrono::milliseconds(100)); - - ASSERT_EQ(1, dummy_count); + ASSERT_EQ(0, dummy_count); + + { + dummy d; + ASSERT_EQ(1, dummy_count); + Kokkos::Experimental::HPX hpx( + Kokkos::Experimental::HPX::instance_mode::independent); + Kokkos::parallel_for( + "Test::hpx::reference_counting::dummy", + Kokkos::RangePolicy(hpx, 0, 1), + KOKKOS_LAMBDA(int) { + // Make sure dummy struct is captured. + d.f(); + }); + + hpx.fence(); + ASSERT_EQ(1, dummy_count); + } + + ASSERT_EQ(0, dummy_count); } } // namespace - -#endif diff --git a/core/unit_test/hpx/TestHPX_IndependentInstancesSynchronization.cpp b/core/unit_test/hpx/TestHPX_IndependentInstancesSynchronization.cpp new file mode 100644 index 0000000000..24eb642f6b --- /dev/null +++ b/core/unit_test/hpx/TestHPX_IndependentInstancesSynchronization.cpp @@ -0,0 +1,162 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include +#include + +// These tests specifically check that work dispatched to independent instances +// is synchronized correctly on fences. A previous bug that this protects +// against is work being mistakenly dispatched to the default instance, but the +// fence fencing the independent instance. In that case these tests will fail. + +namespace { +inline constexpr int n = 1 << 10; + +TEST(hpx, independent_instances_synchronization_parallel_for_range_policy) { + Kokkos::View a("a", n); + + Kokkos::Experimental::HPX instance{ + Kokkos::Experimental::HPX::instance_mode::independent}; + Kokkos::RangePolicy policy(instance, 0, n); + Kokkos::parallel_for( + "parallel_for_range_policy", policy, + KOKKOS_LAMBDA(const auto i) { a[i] = i; }); + + instance.fence(); + + for (int i = 0; i < n; ++i) { + ASSERT_EQ(a[i], i); + } +} + +TEST(hpx, independent_instances_synchronization_parallel_for_mdrange_policy) { + Kokkos::View a("a", n); + + Kokkos::Experimental::HPX instance{ + Kokkos::Experimental::HPX::instance_mode::independent}; + Kokkos::MDRangePolicy> policy( + instance, {{0, 0}}, {{n, 1}}); + Kokkos::parallel_for( + "parallel_for_mdrange_policy", policy, + KOKKOS_LAMBDA(const auto i, const auto) { a[i] = i; }); + + instance.fence(); + + for (int i = 0; i < n; ++i) { + ASSERT_EQ(a[i], i); + } +} + +TEST(hpx, independent_instances_synchronization_parallel_for_team_policy) { + Kokkos::View a("a", n); + + Kokkos::Experimental::HPX instance{ + Kokkos::Experimental::HPX::instance_mode::independent}; + Kokkos::TeamPolicy policy(instance, n, 1); + Kokkos::parallel_for( + "parallel_for_team_policy", policy, KOKKOS_LAMBDA(const auto &handle) { + a[handle.league_rank()] = handle.league_rank(); + }); + + instance.fence(); + + for (int i = 0; i < n; ++i) { + ASSERT_EQ(a[i], i); + } +} + +TEST(hpx, independent_instances_synchronization_parallel_reduce_range_policy) { + Kokkos::View a("a", n); + 
Kokkos::View b("b"); + + Kokkos::Experimental::HPX instance{ + Kokkos::Experimental::HPX::instance_mode::independent}; + Kokkos::RangePolicy policy(instance, 0, n); + Kokkos::parallel_reduce( + "parallel_reduce_range_policy", policy, + KOKKOS_LAMBDA(const int i, int &) { a[i] = i; }, b); + + instance.fence(); + + for (int i = 0; i < n; ++i) { + ASSERT_EQ(a[i], i); + } +} + +TEST(hpx, + independent_instances_synchronization_parallel_reduce_mdrange_policy) { + Kokkos::View a("a", n); + Kokkos::View b("b"); + + Kokkos::Experimental::HPX instance{ + Kokkos::Experimental::HPX::instance_mode::independent}; + Kokkos::MDRangePolicy> policy( + instance, {{0, 0}}, {{n, 1}}); + Kokkos::parallel_reduce( + "parallel_reduce_mdrange_policy", policy, + KOKKOS_LAMBDA(const int i, const int, int &) { a[i] = i; }, b); + + instance.fence(); + + for (int i = 0; i < n; ++i) { + ASSERT_EQ(a[i], i); + } +} + +TEST(hpx, independent_instances_synchronization_parallel_reduce_team_policy) { + Kokkos::View a("a", n); + Kokkos::View b("b"); + + Kokkos::Experimental::HPX instance{ + Kokkos::Experimental::HPX::instance_mode::independent}; + Kokkos::TeamPolicy policy(instance, n, 1); + Kokkos::parallel_reduce( + "parallel_reduce_team_policy", policy, + KOKKOS_LAMBDA(const decltype(policy)::member_type &handle, int &) { + a[handle.league_rank()] = handle.league_rank(); + }, + b); + + instance.fence(); + + for (int i = 0; i < n; ++i) { + ASSERT_EQ(a[i], i); + } +} + +TEST(hpx, independent_instances_synchronization_parallel_scan_range_policy) { + Kokkos::View a("a", n); + Kokkos::View b("b", n); + + Kokkos::Experimental::HPX instance{ + Kokkos::Experimental::HPX::instance_mode::independent}; + Kokkos::RangePolicy policy(instance, 0, n); + Kokkos::parallel_scan( + "parallel_scan_range_policy", policy, + KOKKOS_LAMBDA(const int i, int &, bool final) { + if (!final) { + a[i] = i; + } + }, + b); + + instance.fence(); + + for (int i = 0; i < n; ++i) { + ASSERT_EQ(a[i], i); + } +} +} // namespace diff 
--git a/core/unit_test/tools/TestEventCorrectness.hpp b/core/unit_test/tools/TestEventCorrectness.hpp index ec8cc45a38..3c85f661aa 100644 --- a/core/unit_test/tools/TestEventCorrectness.hpp +++ b/core/unit_test/tools/TestEventCorrectness.hpp @@ -393,6 +393,13 @@ TEST(kokkosp, parallel_scan_no_fence) { #ifdef KOKKOS_ENABLE_THREADS if (std::is_same::value) GTEST_SKIP() << "skipping since the Thread backend always fences"; +#endif +#if defined(KOKKOS_ENABLE_HPX) && \ + !defined(KOKKOS_ENABLE_IMPL_HPX_ASYNC_DISPATCH) + if (std::is_same::value) + GTEST_SKIP() << "skipping since the HPX backend always fences with async " + "dispatch disabled"; #endif // FIXME_OPENMPTARGET #ifdef KOKKOS_ENABLE_OPENMPTARGET @@ -427,6 +434,13 @@ TEST(kokkosp, parallel_scan_no_fence_view) { #ifdef KOKKOS_ENABLE_THREADS if (std::is_same::value) GTEST_SKIP() << "skipping since the Thread backend always fences"; +#endif +#if defined(KOKKOS_ENABLE_HPX) && \ + !defined(KOKKOS_ENABLE_IMPL_HPX_ASYNC_DISPATCH) + if (std::is_same::value) + GTEST_SKIP() << "skipping since the HPX backend always fences with async " + "dispatch disabled"; #endif // FIXME_OPENMPTARGET #ifdef KOKKOS_ENABLE_OPENMPTARGET From 5bb7e0a5466bdf76075884ec49f83b611724d831 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Thu, 2 Feb 2023 12:26:32 -0500 Subject: [PATCH 154/496] Fixup deprecated code 3 code path OpenMP::impl_thread_pool_size --- core/src/OpenMP/Kokkos_OpenMP.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/src/OpenMP/Kokkos_OpenMP.cpp b/core/src/OpenMP/Kokkos_OpenMP.cpp index 687f6e3c5d..6185c32ac8 100644 --- a/core/src/OpenMP/Kokkos_OpenMP.cpp +++ b/core/src/OpenMP/Kokkos_OpenMP.cpp @@ -100,7 +100,7 @@ bool OpenMP::in_parallel(OpenMP const &exec_space) noexcept { int OpenMP::impl_thread_pool_size() const noexcept { #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 - return OpenMP::in_parallel(exec_space) + return OpenMP::in_parallel() ? omp_get_num_threads() : (Impl::t_openmp_instance ? 
Impl::t_openmp_instance->m_pool_size From f68098b4e9a279dc7f706bb9f904be5be721c40d Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Thu, 2 Feb 2023 13:44:36 -0500 Subject: [PATCH 155/496] Fix CMake warning when HPX is not enabled --- cmake/kokkos_enable_options.cmake | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/cmake/kokkos_enable_options.cmake b/cmake/kokkos_enable_options.cmake index 558a7353a2..7d8026989a 100644 --- a/cmake/kokkos_enable_options.cmake +++ b/cmake/kokkos_enable_options.cmake @@ -35,7 +35,6 @@ KOKKOS_ENABLE_OPTION(DEPRECATED_CODE_3 OFF "Whether code deprecated in major KOKKOS_ENABLE_OPTION(DEPRECATED_CODE_4 ON "Whether code deprecated in major release 4 is available" ) KOKKOS_ENABLE_OPTION(DEPRECATION_WARNINGS ON "Whether to emit deprecation warnings" ) KOKKOS_ENABLE_OPTION(HIP_RELOCATABLE_DEVICE_CODE OFF "Whether to enable relocatable device code (RDC) for HIP") -KOKKOS_ENABLE_OPTION(IMPL_HPX_ASYNC_DISPATCH ON "Whether HPX supports asynchronous dispatch") KOKKOS_ENABLE_OPTION(TESTS OFF "Whether to build the unit tests") KOKKOS_ENABLE_OPTION(BENCHMARKS OFF "Whether to build the benchmarks") KOKKOS_ENABLE_OPTION(EXAMPLES OFF "Whether to build the examples") @@ -100,6 +99,13 @@ ELSE() ENDIF() KOKKOS_ENABLE_OPTION(CUDA_CONSTEXPR ${CUDA_CONSTEXPR_DEFAULT} "Whether to activate experimental relaxed constexpr functions") +IF (KOKKOS_ENABLE_HPX) + SET(HPX_ASYNC_DISPATCH_DEFAULT ON) +ELSE() + SET(HPX_ASYNC_DISPATCH_DEFAULT OFF) +ENDIF() +KOKKOS_ENABLE_OPTION(IMPL_HPX_ASYNC_DISPATCH ${HPX_ASYNC_DISPATCH_DEFAULT} "Whether HPX supports asynchronous dispatch") + Kokkos_ENABLE_OPTION(UNSUPPORTED_ARCHS OFF "Whether to allow architectures in backends Kokkos doesn't optimize for") FUNCTION(check_device_specific_options) From c005e607a07f4c1fb9563ca6fce7c4151f9c5519 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Thu, 2 Feb 2023 15:34:05 -0500 Subject: [PATCH 156/496] Pass *this to in_parallel in OpenMP::impl_thread_pool_size() --- 
core/src/OpenMP/Kokkos_OpenMP.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/core/src/OpenMP/Kokkos_OpenMP.cpp b/core/src/OpenMP/Kokkos_OpenMP.cpp index 6185c32ac8..4ad5238654 100644 --- a/core/src/OpenMP/Kokkos_OpenMP.cpp +++ b/core/src/OpenMP/Kokkos_OpenMP.cpp @@ -100,14 +100,15 @@ bool OpenMP::in_parallel(OpenMP const &exec_space) noexcept { int OpenMP::impl_thread_pool_size() const noexcept { #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 - return OpenMP::in_parallel() + return OpenMP::in_parallel(*this) ? omp_get_num_threads() : (Impl::t_openmp_instance ? Impl::t_openmp_instance->m_pool_size : impl_internal_space_instance()->m_pool_size); #else - return OpenMP::in_parallel() ? omp_get_num_threads() - : impl_internal_space_instance()->m_pool_size; + return OpenMP::in_parallel(*this) + ? omp_get_num_threads() + : impl_internal_space_instance()->m_pool_size; #endif } From 568bc2cb4d417b81d995264662d5d82b64d7ea23 Mon Sep 17 00:00:00 2001 From: Christian Trott Date: Thu, 2 Feb 2023 13:52:10 -0700 Subject: [PATCH 157/496] Don't enable deprecated code 3 in Makefile builds anymore --- Makefile.kokkos | 1 - 1 file changed, 1 deletion(-) diff --git a/Makefile.kokkos b/Makefile.kokkos index 4fe9268b50..25080c66e3 100644 --- a/Makefile.kokkos +++ b/Makefile.kokkos @@ -521,7 +521,6 @@ endif #only add the c++ standard flags if this is not CMake tmp := $(call kokkos_append_header,"/* General Settings */") ifneq ($(KOKKOS_INTERNAL_DISABLE_DEPRECATED_CODE), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_DEPRECATED_CODE_3") tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_DEPRECATED_CODE_4") endif ifeq ($(KOKKOS_INTERNAL_ENABLE_DEPRECATION_WARNINGS), 1) From 4c878a0a4deb86911aed4ba369fec89f49d175bb Mon Sep 17 00:00:00 2001 From: Rahulkumar Gayatri Date: Thu, 2 Feb 2023 12:59:59 -0800 Subject: [PATCH 158/496] OpenMPTarget: Adding declare target for constexpr variables. 
Co-authored-by: Daniel Arndt --- core/src/Kokkos_View.hpp | 10 ++++++++++ core/src/impl/Kokkos_Atomic_Memory_Order.hpp | 11 +++++++++++ core/unit_test/TestNumericTraits.hpp | 6 ++++++ 3 files changed, 27 insertions(+) diff --git a/core/src/Kokkos_View.hpp b/core/src/Kokkos_View.hpp index fb03c10e48..7686fa89aa 100644 --- a/core/src/Kokkos_View.hpp +++ b/core/src/Kokkos_View.hpp @@ -494,8 +494,18 @@ constexpr bool is_assignable(const Kokkos::View& dst, namespace Kokkos { +// FIXME_OPENMPTARGET - The `declare target` is needed for the Intel GPUs with +// the OpenMPTarget backend +#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_ARCH_INTEL_PVC) +#pragma omp declare target +#endif + inline constexpr Kokkos::ALL_t ALL{}; +#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_ARCH_INTEL_PVC) +#pragma omp end declare target +#endif + inline constexpr Kokkos::Impl::WithoutInitializing_t WithoutInitializing{}; inline constexpr Kokkos::Impl::AllowPadding_t AllowPadding{}; diff --git a/core/src/impl/Kokkos_Atomic_Memory_Order.hpp b/core/src/impl/Kokkos_Atomic_Memory_Order.hpp index d8c6821267..0cb4f969fa 100644 --- a/core/src/impl/Kokkos_Atomic_Memory_Order.hpp +++ b/core/src/impl/Kokkos_Atomic_Memory_Order.hpp @@ -51,8 +51,19 @@ struct memory_order_relaxed_t { #endif static constexpr auto std_constant = std::memory_order_relaxed; }; + +// FIXME_OPENMPTARGET - The `declare target` is needed for the Intel GPUs with +// the OpenMPTarget backend +#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_ARCH_INTEL_PVC) +#pragma omp declare target +#endif + constexpr memory_order_relaxed_t memory_order_relaxed = {}; +#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_ARCH_INTEL_PVC) +#pragma omp end declare target +#endif + struct memory_order_acquire_t { using memory_order = memory_order_acquire_t; #if defined(KOKKOS_ENABLE_GNU_ATOMICS) || \ diff --git a/core/unit_test/TestNumericTraits.hpp b/core/unit_test/TestNumericTraits.hpp index 9146297cd8..03203f5d25 
100644 --- a/core/unit_test/TestNumericTraits.hpp +++ b/core/unit_test/TestNumericTraits.hpp @@ -481,7 +481,13 @@ CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_FUNCTION(double, round_error); CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_FUNCTION(long double, round_error); CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_FUNCTION(float, denorm_min); CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_FUNCTION(double, denorm_min); + +// FIXME_OPENMPTARGET - The static_assert causes issues on Intel GPUs with the +// OpenMPTarget backend. +#if !defined(KOKKOS_ENABLE_OPENMPTARGET) && !defined(KOKKOS_ARCH_INTEL_PVC) CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_FUNCTION(long double, denorm_min); +#endif + // clang-format off static_assert(Kokkos::Experimental::norm_min::value == std::numeric_limits< float>::min(), ""); static_assert(Kokkos::Experimental::norm_min::value == std::numeric_limits< double>::min(), ""); From f2ec98dab749a3630c6b220f096a377457d4b62e Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Thu, 2 Feb 2023 16:17:46 -0500 Subject: [PATCH 159/496] Fix clang+cuda compiler warning about cudaDeviceSynchronize (#5846) * Fix clang+cuda compiler warning got cudaDeviceSynchronize * Restrict workaround to Clang --- core/src/Cuda/Kokkos_Cuda_Instance.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/core/src/Cuda/Kokkos_Cuda_Instance.cpp b/core/src/Cuda/Kokkos_Cuda_Instance.cpp index 364d8c6416..d0e49e95de 100644 --- a/core/src/Cuda/Kokkos_Cuda_Instance.cpp +++ b/core/src/Cuda/Kokkos_Cuda_Instance.cpp @@ -128,9 +128,17 @@ void cuda_device_synchronize(const std::string &name) { name, Kokkos::Tools::Experimental::SpecialSynchronizationCases:: GlobalDeviceSynchronization, +#if defined(KOKKOS_COMPILER_CLANG) + // annotate with __host__ silence a clang warning about using + // cudaDeviceSynchronize in device code + [] __host__() { // TODO: correct device ID + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaDeviceSynchronize()); + }); +#else []() { // TODO: correct device ID KOKKOS_IMPL_CUDA_SAFE_CALL(cudaDeviceSynchronize()); }); 
+#endif } void cuda_stream_synchronize(const cudaStream_t stream, const CudaInternal *ptr, From 1d8dd90785795abc8982116c29f4d02897bfc783 Mon Sep 17 00:00:00 2001 From: Rahulkumar Gayatri Date: Thu, 2 Feb 2023 15:38:21 -0800 Subject: [PATCH 160/496] OpenMPTarget: Enable declare target for all Intel GPUs. --- core/src/Kokkos_View.hpp | 4 ++-- core/src/impl/Kokkos_Atomic_Memory_Order.hpp | 4 ++-- core/unit_test/TestNumericTraits.hpp | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/core/src/Kokkos_View.hpp b/core/src/Kokkos_View.hpp index 7686fa89aa..2af24918db 100644 --- a/core/src/Kokkos_View.hpp +++ b/core/src/Kokkos_View.hpp @@ -496,13 +496,13 @@ namespace Kokkos { // FIXME_OPENMPTARGET - The `declare target` is needed for the Intel GPUs with // the OpenMPTarget backend -#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_ARCH_INTEL_PVC) +#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_ARCH_INTEL) #pragma omp declare target #endif inline constexpr Kokkos::ALL_t ALL{}; -#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_ARCH_INTEL_PVC) +#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_ARCH_INTEL) #pragma omp end declare target #endif diff --git a/core/src/impl/Kokkos_Atomic_Memory_Order.hpp b/core/src/impl/Kokkos_Atomic_Memory_Order.hpp index 0cb4f969fa..cf05236204 100644 --- a/core/src/impl/Kokkos_Atomic_Memory_Order.hpp +++ b/core/src/impl/Kokkos_Atomic_Memory_Order.hpp @@ -54,13 +54,13 @@ struct memory_order_relaxed_t { // FIXME_OPENMPTARGET - The `declare target` is needed for the Intel GPUs with // the OpenMPTarget backend -#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_ARCH_INTEL_PVC) +#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_ARCH_INTEL) #pragma omp declare target #endif constexpr memory_order_relaxed_t memory_order_relaxed = {}; -#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_ARCH_INTEL_PVC) +#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_ARCH_INTEL) #pragma 
omp end declare target #endif diff --git a/core/unit_test/TestNumericTraits.hpp b/core/unit_test/TestNumericTraits.hpp index 03203f5d25..38c4f0e05d 100644 --- a/core/unit_test/TestNumericTraits.hpp +++ b/core/unit_test/TestNumericTraits.hpp @@ -484,7 +484,7 @@ CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_FUNCTION(double, denorm_min); // FIXME_OPENMPTARGET - The static_assert causes issues on Intel GPUs with the // OpenMPTarget backend. -#if !defined(KOKKOS_ENABLE_OPENMPTARGET) && !defined(KOKKOS_ARCH_INTEL_PVC) +#if !defined(KOKKOS_ENABLE_OPENMPTARGET) && !defined(KOKKOS_ARCH_INTEL) CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_FUNCTION(long double, denorm_min); #endif From 743625604162e0a6d7deaddc44d1293964da92b0 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Thu, 2 Feb 2023 22:16:35 -0500 Subject: [PATCH 161/496] Move { -> Cuda/}Kokkos_Cuda[Space].hpp --- core/src/{ => Cuda}/Kokkos_Cuda.hpp | 2 +- core/src/Cuda/Kokkos_CudaSpace.cpp | 4 ++-- core/src/{ => Cuda}/Kokkos_CudaSpace.hpp | 0 core/src/Cuda/Kokkos_Cuda_GraphNodeKernel.hpp | 2 +- core/src/Cuda/Kokkos_Cuda_GraphNode_Impl.hpp | 2 +- core/src/Cuda/Kokkos_Cuda_Graph_Impl.hpp | 2 +- core/src/Cuda/Kokkos_Cuda_UniqueToken.hpp | 2 +- core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp | 2 +- core/src/decl/Kokkos_Declare_CUDA.hpp | 2 +- 9 files changed, 9 insertions(+), 9 deletions(-) rename core/src/{ => Cuda}/Kokkos_Cuda.hpp (99%) rename core/src/{ => Cuda}/Kokkos_CudaSpace.hpp (100%) diff --git a/core/src/Kokkos_Cuda.hpp b/core/src/Cuda/Kokkos_Cuda.hpp similarity index 99% rename from core/src/Kokkos_Cuda.hpp rename to core/src/Cuda/Kokkos_Cuda.hpp index fce7351b32..3e237a65db 100644 --- a/core/src/Kokkos_Cuda.hpp +++ b/core/src/Cuda/Kokkos_Cuda.hpp @@ -31,7 +31,7 @@ static_assert(false, #include #include -#include +#include #include // CUDA_SAFE_CALL #include diff --git a/core/src/Cuda/Kokkos_CudaSpace.cpp b/core/src/Cuda/Kokkos_CudaSpace.cpp index 87b4c8c00c..5fb4f86414 100644 --- a/core/src/Cuda/Kokkos_CudaSpace.cpp +++ 
b/core/src/Cuda/Kokkos_CudaSpace.cpp @@ -22,8 +22,8 @@ #ifdef KOKKOS_ENABLE_CUDA #include -#include -#include +#include +#include #include #include diff --git a/core/src/Kokkos_CudaSpace.hpp b/core/src/Cuda/Kokkos_CudaSpace.hpp similarity index 100% rename from core/src/Kokkos_CudaSpace.hpp rename to core/src/Cuda/Kokkos_CudaSpace.hpp diff --git a/core/src/Cuda/Kokkos_Cuda_GraphNodeKernel.hpp b/core/src/Cuda/Kokkos_Cuda_GraphNodeKernel.hpp index 8e5d4a0706..b7df78a338 100644 --- a/core/src/Cuda/Kokkos_Cuda_GraphNodeKernel.hpp +++ b/core/src/Cuda/Kokkos_Cuda_GraphNodeKernel.hpp @@ -30,7 +30,7 @@ #include #include -#include +#include #include namespace Kokkos { diff --git a/core/src/Cuda/Kokkos_Cuda_GraphNode_Impl.hpp b/core/src/Cuda/Kokkos_Cuda_GraphNode_Impl.hpp index a586d30147..17de1be7a6 100644 --- a/core/src/Cuda/Kokkos_Cuda_GraphNode_Impl.hpp +++ b/core/src/Cuda/Kokkos_Cuda_GraphNode_Impl.hpp @@ -25,7 +25,7 @@ #include // GraphAccess needs to be complete -#include +#include #include namespace Kokkos { diff --git a/core/src/Cuda/Kokkos_Cuda_Graph_Impl.hpp b/core/src/Cuda/Kokkos_Cuda_Graph_Impl.hpp index 26face64c9..0609d0ffa6 100644 --- a/core/src/Cuda/Kokkos_Cuda_Graph_Impl.hpp +++ b/core/src/Cuda/Kokkos_Cuda_Graph_Impl.hpp @@ -30,7 +30,7 @@ #include #include -#include +#include #include #include diff --git a/core/src/Cuda/Kokkos_Cuda_UniqueToken.hpp b/core/src/Cuda/Kokkos_Cuda_UniqueToken.hpp index 8509f10c03..abb747e39a 100644 --- a/core/src/Cuda/Kokkos_Cuda_UniqueToken.hpp +++ b/core/src/Cuda/Kokkos_Cuda_UniqueToken.hpp @@ -20,7 +20,7 @@ #include #ifdef KOKKOS_ENABLE_CUDA -#include +#include #include #include diff --git a/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp b/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp index d5d11f499a..a945a716bc 100644 --- a/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp +++ b/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp @@ -17,7 +17,7 @@ #ifndef KOKKOS_CUDA_WORKGRAPHPOLICY_HPP #define KOKKOS_CUDA_WORKGRAPHPOLICY_HPP -#include 
+#include #include namespace Kokkos { diff --git a/core/src/decl/Kokkos_Declare_CUDA.hpp b/core/src/decl/Kokkos_Declare_CUDA.hpp index 215b18f221..79d432a35e 100644 --- a/core/src/decl/Kokkos_Declare_CUDA.hpp +++ b/core/src/decl/Kokkos_Declare_CUDA.hpp @@ -18,7 +18,7 @@ #define KOKKOS_DECLARE_CUDA_HPP #if defined(KOKKOS_ENABLE_CUDA) -#include +#include #include #include #include From be83e9a8bfe3dace37cdbe840cf45e44a753e05d Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Thu, 2 Feb 2023 22:17:40 -0500 Subject: [PATCH 162/496] Move { -> Serial/}Kokkos_Serial.hpp --- core/src/SYCL/Kokkos_SYCL.cpp | 1 - core/src/Serial/Kokkos_Serial.cpp | 2 +- core/src/{ => Serial}/Kokkos_Serial.hpp | 0 core/src/Serial/Kokkos_Serial_Task.hpp | 4 ++-- core/src/decl/Kokkos_Declare_SERIAL.hpp | 2 +- core/src/impl/Kokkos_Default_Graph_Impl.hpp | 2 +- 6 files changed, 5 insertions(+), 6 deletions(-) rename core/src/{ => Serial}/Kokkos_Serial.hpp (100%) diff --git a/core/src/SYCL/Kokkos_SYCL.cpp b/core/src/SYCL/Kokkos_SYCL.cpp index ed258b263a..c665631dd6 100644 --- a/core/src/SYCL/Kokkos_SYCL.cpp +++ b/core/src/SYCL/Kokkos_SYCL.cpp @@ -22,7 +22,6 @@ #include #include #include -#include #include #include #include diff --git a/core/src/Serial/Kokkos_Serial.cpp b/core/src/Serial/Kokkos_Serial.cpp index df91d8499a..b5a1fcdd79 100644 --- a/core/src/Serial/Kokkos_Serial.cpp +++ b/core/src/Serial/Kokkos_Serial.cpp @@ -20,7 +20,7 @@ #include -#include +#include #include #include #include diff --git a/core/src/Kokkos_Serial.hpp b/core/src/Serial/Kokkos_Serial.hpp similarity index 100% rename from core/src/Kokkos_Serial.hpp rename to core/src/Serial/Kokkos_Serial.hpp diff --git a/core/src/Serial/Kokkos_Serial_Task.hpp b/core/src/Serial/Kokkos_Serial_Task.hpp index c744f34760..f9c86f55ce 100644 --- a/core/src/Serial/Kokkos_Serial_Task.hpp +++ b/core/src/Serial/Kokkos_Serial_Task.hpp @@ -22,9 +22,9 @@ #include -#include -#include +#include #include +#include 
//---------------------------------------------------------------------------- //---------------------------------------------------------------------------- diff --git a/core/src/decl/Kokkos_Declare_SERIAL.hpp b/core/src/decl/Kokkos_Declare_SERIAL.hpp index bb59ae2ce8..6095901f05 100644 --- a/core/src/decl/Kokkos_Declare_SERIAL.hpp +++ b/core/src/decl/Kokkos_Declare_SERIAL.hpp @@ -18,7 +18,7 @@ #define KOKKOS_DECLARE_SERIAL_HPP #if defined(KOKKOS_ENABLE_SERIAL) -#include +#include #include #endif diff --git a/core/src/impl/Kokkos_Default_Graph_Impl.hpp b/core/src/impl/Kokkos_Default_Graph_Impl.hpp index 4c133f69f6..0b11b251bc 100644 --- a/core/src/impl/Kokkos_Default_Graph_Impl.hpp +++ b/core/src/impl/Kokkos_Default_Graph_Impl.hpp @@ -23,7 +23,7 @@ #include #include -#include +#include #include // FIXME @graph other backends? From 387de48b7b1fdea7c2b6ad2351e43c3673e96aa0 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Thu, 2 Feb 2023 22:17:55 -0500 Subject: [PATCH 163/496] Move { -> Threads/}Kokkos_Threads.hpp --- core/src/{ => Threads}/Kokkos_Threads.hpp | 0 core/src/Threads/Kokkos_Threads_WorkGraphPolicy.hpp | 2 +- core/src/decl/Kokkos_Declare_THREADS.hpp | 2 +- 3 files changed, 2 insertions(+), 2 deletions(-) rename core/src/{ => Threads}/Kokkos_Threads.hpp (100%) diff --git a/core/src/Kokkos_Threads.hpp b/core/src/Threads/Kokkos_Threads.hpp similarity index 100% rename from core/src/Kokkos_Threads.hpp rename to core/src/Threads/Kokkos_Threads.hpp diff --git a/core/src/Threads/Kokkos_Threads_WorkGraphPolicy.hpp b/core/src/Threads/Kokkos_Threads_WorkGraphPolicy.hpp index 7c29ce5739..797044b117 100644 --- a/core/src/Threads/Kokkos_Threads_WorkGraphPolicy.hpp +++ b/core/src/Threads/Kokkos_Threads_WorkGraphPolicy.hpp @@ -18,7 +18,7 @@ #define KOKKOS_THREADS_WORKGRAPHPOLICY_HPP #include -#include +#include namespace Kokkos { namespace Impl { diff --git a/core/src/decl/Kokkos_Declare_THREADS.hpp b/core/src/decl/Kokkos_Declare_THREADS.hpp index b7af04be3c..16f134b6f2 
100644 --- a/core/src/decl/Kokkos_Declare_THREADS.hpp +++ b/core/src/decl/Kokkos_Declare_THREADS.hpp @@ -18,7 +18,7 @@ #define KOKKOS_DECLARE_THREADS_HPP #if defined(KOKKOS_ENABLE_THREADS) -#include +#include #include #endif From 14f9425af6a1f6d0badce9dedebd88f9eba5dec4 Mon Sep 17 00:00:00 2001 From: Rahulkumar Gayatri Date: Thu, 2 Feb 2023 21:53:24 -0800 Subject: [PATCH 164/496] OpenMPTarget: Replace KOKKOS_ARCH_INTEL with KOKKOS_COMPILER_INTEL to protect declare target on Intel GPUs. --- core/src/Kokkos_View.hpp | 4 ++-- core/src/impl/Kokkos_Atomic_Memory_Order.hpp | 4 ++-- core/unit_test/TestNumericTraits.hpp | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/core/src/Kokkos_View.hpp b/core/src/Kokkos_View.hpp index 2af24918db..345708e784 100644 --- a/core/src/Kokkos_View.hpp +++ b/core/src/Kokkos_View.hpp @@ -496,13 +496,13 @@ namespace Kokkos { // FIXME_OPENMPTARGET - The `declare target` is needed for the Intel GPUs with // the OpenMPTarget backend -#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_ARCH_INTEL) +#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_COMPILER_INTEL) #pragma omp declare target #endif inline constexpr Kokkos::ALL_t ALL{}; -#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_ARCH_INTEL) +#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_COMPILER_INTEL) #pragma omp end declare target #endif diff --git a/core/src/impl/Kokkos_Atomic_Memory_Order.hpp b/core/src/impl/Kokkos_Atomic_Memory_Order.hpp index cf05236204..6d1bfb9c82 100644 --- a/core/src/impl/Kokkos_Atomic_Memory_Order.hpp +++ b/core/src/impl/Kokkos_Atomic_Memory_Order.hpp @@ -54,13 +54,13 @@ struct memory_order_relaxed_t { // FIXME_OPENMPTARGET - The `declare target` is needed for the Intel GPUs with // the OpenMPTarget backend -#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_ARCH_INTEL) +#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_COMPILER_INTEL) #pragma omp declare target #endif constexpr 
memory_order_relaxed_t memory_order_relaxed = {}; -#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_ARCH_INTEL) +#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_COMPILER_INTEL) #pragma omp end declare target #endif diff --git a/core/unit_test/TestNumericTraits.hpp b/core/unit_test/TestNumericTraits.hpp index 38c4f0e05d..0302ac1d9b 100644 --- a/core/unit_test/TestNumericTraits.hpp +++ b/core/unit_test/TestNumericTraits.hpp @@ -484,7 +484,7 @@ CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_FUNCTION(double, denorm_min); // FIXME_OPENMPTARGET - The static_assert causes issues on Intel GPUs with the // OpenMPTarget backend. -#if !defined(KOKKOS_ENABLE_OPENMPTARGET) && !defined(KOKKOS_ARCH_INTEL) +#if !defined(KOKKOS_ENABLE_OPENMPTARGET) && !defined(KOKKOS_COMPILER_INTEL) CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_FUNCTION(long double, denorm_min); #endif From 86a44271142bd3b912e356aa284bdfb3c6f13161 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Fri, 3 Feb 2023 09:12:47 -0500 Subject: [PATCH 165/496] Drop (deprecated) KokkosCore_UnitTest_DefaultDeviceTypeInit_* from the makefile --- core/unit_test/Makefile | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/core/unit_test/Makefile b/core/unit_test/Makefile index 0c3e1ee476..05be225265 100644 --- a/core/unit_test/Makefile +++ b/core/unit_test/Makefile @@ -409,15 +409,6 @@ TEST_TARGETS += test-stack-trace TEST_TARGETS += test-stack-trace-terminate TEST_TARGETS += test-stack-trace-generic-term -ifneq ($(KOKKOS_INTERNAL_DISABLE_DEPRECATED_CODE), 1) -NUM_INITTESTS = 16 -INITTESTS_NUMBERS := $(shell seq 1 ${NUM_INITTESTS}) -INITTESTS_TARGETS := $(addprefix KokkosCore_UnitTest_DefaultDeviceTypeInit_,${INITTESTS_NUMBERS}) -TARGETS += ${INITTESTS_TARGETS} -INITTESTS_TEST_TARGETS := $(addprefix test-default-init-,${INITTESTS_NUMBERS}) -TEST_TARGETS += ${INITTESTS_TEST_TARGETS} -endif - KokkosCore_UnitTest_Cuda: $(OBJ_CUDA) $(KOKKOS_LINK_DEPENDS) $(LINK) $(EXTRA_PATH) $(OBJ_CUDA) $(KOKKOS_LIBS) $(LIB) 
$(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosCore_UnitTest_Cuda @@ -465,10 +456,6 @@ KokkosCore_UnitTest_PushFinalizeHook: $(OBJ_DEFAULT) $(KOKKOS_LINK_DEPENDS) KokkosCore_UnitTest_PushFinalizeHook_terminate: $(OBJ_DEFAULT) $(KOKKOS_LINK_DEPENDS) $(LINK) $(EXTRA_PATH) $(OBJ_DEFAULT) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosCore_UnitTest_PushFinalizeHook_terminate - -${INITTESTS_TARGETS}: KokkosCore_UnitTest_DefaultDeviceTypeInit_%: TestDefaultDeviceTypeInit_%.o UnitTestMain.o gtest-all.o $(KOKKOS_LINK_DEPENDS) - $(LINK) $(EXTRA_PATH) TestDefaultDeviceTypeInit_$*.o UnitTestMain.o gtest-all.o $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosCore_UnitTest_DefaultDeviceTypeInit_$* - KokkosCore_UnitTest_StackTraceTestExec: TestStackTrace.o TestStackTrace_f0.o TestStackTrace_f1.o TestStackTrace_f2.o TestStackTrace_f3.o TestStackTrace_f4.o $(KOKKOS_LINK_DEPENDS) gtest-all.o $(LINK) $(EXTRA_PATH) TestStackTrace.o TestStackTrace_f0.o TestStackTrace_f1.o TestStackTrace_f2.o TestStackTrace_f3.o TestStackTrace_f4.o gtest-all.o $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosCore_UnitTest_StackTraceTestExec @@ -521,10 +508,6 @@ test-stack-trace-terminate: KokkosCore_UnitTest_StackTraceTestExec test-stack-trace-generic-term: KokkosCore_UnitTest_StackTraceTestExec ./KokkosCore_UnitTest_StackTraceTestExec --gtest_filter=*generic_term$(STACK_TRACE_TERMINATE_FILTER) - -${INITTESTS_TEST_TARGETS}: test-default-init-%: KokkosCore_UnitTest_DefaultDeviceTypeInit_% - ./KokkosCore_UnitTest_DefaultDeviceTypeInit_$* - build_all: $(TARGETS) test: $(TEST_TARGETS) From 446532e3883918bb2941f1cc85fb65818b9919e9 Mon Sep 17 00:00:00 2001 From: Christian Trott Date: Fri, 3 Feb 2023 08:24:22 -0700 Subject: [PATCH 166/496] Update core/unit_test/TestNumericTraits.hpp Co-authored-by: Daniel Arndt --- core/unit_test/TestNumericTraits.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/unit_test/TestNumericTraits.hpp 
b/core/unit_test/TestNumericTraits.hpp index 0302ac1d9b..735022a107 100644 --- a/core/unit_test/TestNumericTraits.hpp +++ b/core/unit_test/TestNumericTraits.hpp @@ -484,7 +484,7 @@ CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_FUNCTION(double, denorm_min); // FIXME_OPENMPTARGET - The static_assert causes issues on Intel GPUs with the // OpenMPTarget backend. -#if !defined(KOKKOS_ENABLE_OPENMPTARGET) && !defined(KOKKOS_COMPILER_INTEL) +#if !(defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_COMPILER_INTEL)) CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_FUNCTION(long double, denorm_min); #endif From 2aa257671b7e343add998d9772ef605c6d264d4c Mon Sep 17 00:00:00 2001 From: jstrzebonski Date: Fri, 3 Feb 2023 19:26:39 +0100 Subject: [PATCH 167/496] Dispatch Kokkos::sort(Kokkos::View) to SYCL oneDPL (#5229) * Dispatch Kokkos::sort(Kokkos::View) to SYCL oneDPL * Add oneDPL installation to SYCL docker * Add missing sycl docker dependency * Correctly add oneDPL as tpl * Remove (likely) redundant sycl::buffer usage * Change order of includes because of legacy TBB problem * Assert if View's memory_space is accessible from SYCL execution space * Assert if View's layout is contiguous * Refactor finding oneDPL * Enable oneDPL sort if KOKKOS_ENABLE_ONEDPL is defined * Fix possible legacy TBB problem * Corrections after rebase * Add FIXME * Fix formating * Fix typo * Minor cleanup * Use Experimental::begin/end, move * Try detecting toolchain version for oneDPL * Fix for the case that we don't try tp find oneDPL * Use SYCL::sycl_queue() member function * Use pointers again Kokkos iterators are assumed by oneDPL to be on the host * Specify full path for clang++ * Use oneDPL 2022.0.0 --------- Co-authored-by: Daniel Arndt Co-authored-by: Lauren McCoy --- algorithms/src/Kokkos_Sort.hpp | 31 +++++++++++++++++++++ cmake/KokkosCore_config.h.in | 1 + cmake/Modules/FindTPLONEDPL.cmake | 46 +++++++++++++++++++++++++++++++ cmake/kokkos_tpls.cmake | 2 ++ core/src/CMakeLists.txt | 1 + 
scripts/docker/Dockerfile.sycl | 6 ++++ 6 files changed, 87 insertions(+) create mode 100644 cmake/Modules/FindTPLONEDPL.cmake diff --git a/algorithms/src/Kokkos_Sort.hpp b/algorithms/src/Kokkos_Sort.hpp index 033de22164..8f3c6e35b6 100644 --- a/algorithms/src/Kokkos_Sort.hpp +++ b/algorithms/src/Kokkos_Sort.hpp @@ -66,6 +66,11 @@ #endif +#if defined(KOKKOS_ENABLE_ONEDPL) +#include +#include +#endif + namespace Kokkos { namespace Impl { @@ -634,6 +639,32 @@ sort(const ExecutionSpace& exec, bin_sort.sort(exec, view); } +#if defined(KOKKOS_ENABLE_ONEDPL) +template +void sort(const Experimental::SYCL& space, + const Kokkos::View& view) { + using ViewType = Kokkos::View; + + static_assert(SpaceAccessibility::accessible, + "SYCL execution space is not able to access the memory space " + "of the View argument!"); + + auto queue = space.sycl_queue(); + auto policy = oneapi::dpl::execution::make_device_policy(queue); + + // Can't use Experimental::begin/end here since the oneDPL then assumes that + // the data is on the host. 
+ static_assert( + ViewType::rank == 1 && + (std::is_same::value || + std::is_same::value), + "SYCL sort only supports contiguous 1D Views."); + const int n = view.extent(0); + oneapi::dpl::sort(policy, view.data(), view.data() + n); +} +#endif + template std::enable_if_t<(Kokkos::is_execution_space::value) && (SpaceAccessibility< diff --git a/cmake/KokkosCore_config.h.in b/cmake/KokkosCore_config.h.in index 509a0d44a2..520c45137c 100644 --- a/cmake/KokkosCore_config.h.in +++ b/cmake/KokkosCore_config.h.in @@ -62,6 +62,7 @@ #cmakedefine KOKKOS_ENABLE_LIBDL #cmakedefine KOKKOS_ENABLE_LIBQUADMATH #cmakedefine KOKKOS_IMPL_CUDA_CLANG_WORKAROUND +#cmakedefine KOKKOS_ENABLE_ONEDPL #cmakedefine KOKKOS_ARCH_SSE42 #cmakedefine KOKKOS_ARCH_ARMV80 diff --git a/cmake/Modules/FindTPLONEDPL.cmake b/cmake/Modules/FindTPLONEDPL.cmake new file mode 100644 index 0000000000..01791cff44 --- /dev/null +++ b/cmake/Modules/FindTPLONEDPL.cmake @@ -0,0 +1,46 @@ +INCLUDE(CheckIncludeFileCXX) +CHECK_INCLUDE_FILE_CXX(oneapi/dpl/execution KOKKOS_COMPILER_HAS_ONEDPL_EXECUTION_HEADER) +CHECK_INCLUDE_FILE_CXX(oneapi/dpl/algorithm KOKKOS_COMPILER_HAS_ONEDPL_ALGORITHM_HEADER) + +INCLUDE(CheckCXXSourceCompiles) +CHECK_CXX_SOURCE_COMPILES(" + #include + + int main() + { + #if defined(_GLIBCXX_RELEASE) && (_GLIBCXX_RELEASE == 9 || _GLIBCXX_RELEASE == 10) + static_assert(false); + #endif + return 0; + }" + KOKKOS_NO_TBB_CONFLICT) + +IF (KOKKOS_COMPILER_HAS_ONEDPL_EXECUTION_HEADER AND KOKKOS_COMPILER_HAS_ONEDPL_ALGORITHM_HEADER) + IF(KOKKOS_NO_TBB_CONFLICT) + KOKKOS_CREATE_IMPORTED_TPL( + ONEDPL INTERFACE + ) + ELSE() + KOKKOS_CREATE_IMPORTED_TPL( + ONEDPL INTERFACE + # https://stackoverflow.com/questions/67923287/how-to-resolve-no-member-named-task-in-namespace-tbb-error-when-using-oned/ + COMPILE_DEFINITIONS PSTL_USE_PARALLEL_POLICIES=0 _GLIBCXX_USE_TBB_PAR_BACKEND=0 + ) + ENDIF() +ELSE() + FIND_PACKAGE(oneDPL REQUIRED) + + IF(KOKKOS_NO_TBB_CONFLICT) + KOKKOS_CREATE_IMPORTED_TPL( + ONEDPL INTERFACE + 
LINK_LIBRARIES oneDPL + ) + ELSE() + KOKKOS_CREATE_IMPORTED_TPL( + ONEDPL INTERFACE + LINK_LIBRARIES oneDPL + # https://stackoverflow.com/questions/67923287/how-to-resolve-no-member-named-task-in-namespace-tbb-error-when-using-oned/ + COMPILE_DEFINITIONS PSTL_USE_PARALLEL_POLICIES=0 _GLIBCXX_USE_TBB_PAR_BACKEND=0 + ) + ENDIF() +ENDIF() diff --git a/cmake/kokkos_tpls.cmake b/cmake/kokkos_tpls.cmake index ac06f0848f..c768bfe8de 100644 --- a/cmake/kokkos_tpls.cmake +++ b/cmake/kokkos_tpls.cmake @@ -46,6 +46,7 @@ ELSE() SET(ROCM_DEFAULT OFF) ENDIF() KOKKOS_TPL_OPTION(ROCM ${ROCM_DEFAULT}) +KOKKOS_TPL_OPTION(ONEDPL ${Kokkos_ENABLE_SYCL}) IF (WIN32) SET(LIBDL_DEFAULT Off) @@ -85,6 +86,7 @@ IF (NOT WIN32) ENDIF() IF (NOT KOKKOS_ENABLE_COMPILE_AS_CMAKE_LANGUAGE) KOKKOS_IMPORT_TPL(ROCM INTERFACE) + KOKKOS_IMPORT_TPL(ONEDPL INTERFACE) ENDIF() KOKKOS_IMPORT_TPL(LIBQUADMATH) diff --git a/core/src/CMakeLists.txt b/core/src/CMakeLists.txt index 0be3d71682..09e91929d5 100644 --- a/core/src/CMakeLists.txt +++ b/core/src/CMakeLists.txt @@ -187,6 +187,7 @@ IF (NOT WIN32) ENDIF() IF (NOT KOKKOS_ENABLE_COMPILE_AS_CMAKE_LANGUAGE) KOKKOS_LINK_TPL(kokkoscore PUBLIC ROCM) + KOKKOS_LINK_TPL(kokkoscore PUBLIC ONEDPL) ENDIF() # FIXME: We need a proper solution to figure out whether to enable diff --git a/scripts/docker/Dockerfile.sycl b/scripts/docker/Dockerfile.sycl index bda1197fc6..4e185f4c1b 100644 --- a/scripts/docker/Dockerfile.sycl +++ b/scripts/docker/Dockerfile.sycl @@ -10,6 +10,7 @@ RUN apt-get update && apt-get install -y \ ninja-build \ python3 \ git \ + libomp-dev \ && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* @@ -49,3 +50,8 @@ RUN wget https://cloud.cees.ornl.gov/download/oneapi-for-nvidia-gpus-2023.0.0-li chmod +x oneapi-for-nvidia-gpus-2023.0.0-linux.sh && \ ./oneapi-for-nvidia-gpus-2023.0.0-linux.sh -y && \ rm oneapi-for-nvidia-gpus-2023.0.0-linux.sh + +RUN wget https://registrationcenter-download.intel.com/akdlm/irc_nas/19133/l_oneDPL_p_2022.0.0.25335.sh &&\ + chmod 
+x ./l_oneDPL_p_2022.0.0.25335.sh && \ + ./l_oneDPL_p_2022.0.0.25335.sh -a -s --eula accept && \ + rm l_oneDPL_p_2022.0.0.25335.sh From a4af6f7b6f80bba0e38f4763664e93c6440892c0 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Fri, 3 Feb 2023 15:42:00 -0500 Subject: [PATCH 168/496] Add Kokkos::num_threads() and Kokkos::device_id() --- core/src/Kokkos_Core.hpp | 3 +++ core/src/impl/Kokkos_Core.cpp | 20 ++++++++++++++++++++ 2 files changed, 23 insertions(+) diff --git a/core/src/Kokkos_Core.hpp b/core/src/Kokkos_Core.hpp index cf898a71e7..2df89e1df3 100644 --- a/core/src/Kokkos_Core.hpp +++ b/core/src/Kokkos_Core.hpp @@ -99,6 +99,9 @@ void declare_configuration_metadata(const std::string& category, [[nodiscard]] bool is_initialized() noexcept; [[nodiscard]] bool is_finalized() noexcept; +[[nodiscard]] int device_id() noexcept; +[[nodiscard]] int num_threads() noexcept; + bool show_warnings() noexcept; bool tune_internals() noexcept; diff --git a/core/src/impl/Kokkos_Core.cpp b/core/src/impl/Kokkos_Core.cpp index 5e53e42659..7d7380b3db 100644 --- a/core/src/impl/Kokkos_Core.cpp +++ b/core/src/impl/Kokkos_Core.cpp @@ -165,6 +165,26 @@ bool is_valid_map_device_id_by(std::string const& x) { } // namespace +[[nodiscard]] int Kokkos::device_id() noexcept { +#if defined(KOKKOS_ENABLE_CUDA) + return Cuda().cuda_device(); +#elif defined(KOKKOS_ENABLE_HIP) + return HIP().hip_device(); +#elif defined(KOKKOS_ENABLE_OPENACC) + return Experimental::OpenACC().acc_device_number(); +#elif defined(KOKKOS_ENABLE_OPENMPTARGET) + return omp_get_default_device(); // FIXME_OPENMPTARGET +#elif defined(KOKKOS_ENABLE_SYCL) + return Experimental::Impl::SYCLInternal::m_syclDev; +#else + return -1; +#endif +} + +[[nodiscard]] int Kokkos::num_threads() noexcept { + return DefaultHostExecutionSpace().concurrency(); +} + Kokkos::Impl::ExecSpaceManager& Kokkos::Impl::ExecSpaceManager::get_instance() { static ExecSpaceManager space_initializer = {}; return space_initializer; From 
d8d9c58c321ad9efec32e92c8ea17b87c5fac632 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Fri, 3 Feb 2023 15:42:46 -0500 Subject: [PATCH 169/496] Check Kokkos::num_threads and device_id in tests --- core/unit_test/UnitTest_DeviceAndThreads.cpp | 23 ++++++++++++-------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/core/unit_test/UnitTest_DeviceAndThreads.cpp b/core/unit_test/UnitTest_DeviceAndThreads.cpp index 2b3c90ef2a..b522ac3e69 100644 --- a/core/unit_test/UnitTest_DeviceAndThreads.cpp +++ b/core/unit_test/UnitTest_DeviceAndThreads.cpp @@ -38,21 +38,24 @@ int get_device_count() { } int get_device_id() { + int device_id; #if defined(KOKKOS_ENABLE_CUDA) - int device; - KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetDevice(&device)); - return device; + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetDevice(&device_id)); #elif defined(KOKKOS_ENABLE_HIP) - int device_id; KOKKOS_IMPL_HIP_SAFE_CALL(hipGetDevice(&device_id)); - return device_id; #elif defined(KOKKOS_ENABLE_OPENMPTARGET) - return omp_get_device_num(); + device_id = omp_get_device_num(); #elif defined(KOKKOS_ENABLE_OPENACC) - return acc_get_device_num(acc_get_device_type()); + device_id = acc_get_device_num(acc_get_device_type()); +#elif defined(KOKKOS_ENABLE_SYCL) + // FIXME_SYCL ? 
+ assert(false); + return -2; #else - return -1; + device_id = -1; #endif + assert(device_id == Kokkos::device_id()); + return device_id; } int get_max_threads() { @@ -66,7 +69,9 @@ int get_max_threads() { } int get_num_threads() { - return Kokkos::DefaultHostExecutionSpace().concurrency(); + int const num_threads = Kokkos::DefaultHostExecutionSpace().concurrency(); + assert(num_threads == Kokkos::num_threads()); + return num_threads; } int get_disable_warnings() { return !Kokkos::show_warnings(); } From 8b19e2de5bed89e23d01d5f3a6bc8cd344e52172 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Fri, 3 Feb 2023 23:00:45 -0500 Subject: [PATCH 170/496] Drop (unused) Impl::destruct_delete utility --- core/src/impl/Kokkos_Utilities.hpp | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/core/src/impl/Kokkos_Utilities.hpp b/core/src/impl/Kokkos_Utilities.hpp index c3504ffce5..71035dfeaa 100644 --- a/core/src/impl/Kokkos_Utilities.hpp +++ b/core/src/impl/Kokkos_Utilities.hpp @@ -74,25 +74,6 @@ struct is_specialization_of, Template> : std::true_type {}; // end is_specialization_of }}}1 //============================================================================== -//============================================================================== -// destruct_delete is a unique_ptr deleter for objects -// created by placement new into already allocated memory -// by only calling the destructor on the object. -// -// Because unique_ptr never calls its deleter with a nullptr value, -// no need to check if p == nullptr. -// -// Note: This differs in interface from std::default_delete in that the -// function call operator is templated instead of the class, to make -// it easier to use and disallow specialization. 
-struct destruct_delete { - template - KOKKOS_INLINE_FUNCTION constexpr void operator()(T* p) const noexcept { - p->~T(); - } -}; -//============================================================================== - //============================================================================== // {{{1 From 344826082619eccc870a151917544fa4f219d8c2 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Sat, 4 Feb 2023 21:55:18 -0500 Subject: [PATCH 171/496] Remove unused impl/CMakeLists.txt --- core/src/impl/CMakeLists.txt | 18 ------------------ 1 file changed, 18 deletions(-) delete mode 100644 core/src/impl/CMakeLists.txt diff --git a/core/src/impl/CMakeLists.txt b/core/src/impl/CMakeLists.txt deleted file mode 100644 index 203fd4a3a4..0000000000 --- a/core/src/impl/CMakeLists.txt +++ /dev/null @@ -1,18 +0,0 @@ - -SET(HEADERS "") -SET(SOURCES "") - -FILE(GLOB HEADERS *.hpp *.h) -FILE(GLOB SOURCES *.cpp) - -TRIBITS_ADD_LIBRARY( - kokkoscore_impl - NOINSTALLHEADERS ${HEADERS} - SOURCES ${SOURCES} - DEPLIBS - ) - -SET(TRILINOS_INCDIR ${${PROJECT_NAME}_INSTALL_INCLUDE_DIR}) - -INSTALL(FILES ${HEADERS} DESTINATION ${TRILINOS_INCDIR}/impl/) - From bbde3b1344a027c745d63b9a4ac26a86dc2f452a Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Sat, 4 Feb 2023 21:56:32 -0500 Subject: [PATCH 172/496] Remove pointless dummy source file in core --- core/src/dummy.cpp | 24 ------------------------ 1 file changed, 24 deletions(-) delete mode 100644 core/src/dummy.cpp diff --git a/core/src/dummy.cpp b/core/src/dummy.cpp deleted file mode 100644 index 929380b6c3..0000000000 --- a/core/src/dummy.cpp +++ /dev/null @@ -1,24 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. 
-// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -namespace Kokkos { -namespace AvoidCompilerWarnings { -int dontComplain() { - // keep the compiler from complaining about emptiness - return 0; -} -} // namespace AvoidCompilerWarnings -} // namespace Kokkos From 5235c89d5bc457da7a780e4397d9ecedab747255 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cezary=20Skrzy=C5=84ski?= Date: Thu, 25 Aug 2022 15:48:32 +0200 Subject: [PATCH 173/496] Port ViewFill performance tests --- core/perf_test/Benchmark_Context.cpp | 60 ++++++ core/perf_test/Benchmark_Context.hpp | 2 + core/perf_test/CMakeLists.txt | 6 + core/perf_test/PerfTest_ViewCopy.hpp | 1 - core/perf_test/PerfTest_ViewCopy_Raw.cpp | 2 + core/perf_test/PerfTest_ViewFill.cpp | 63 +++++++ core/perf_test/PerfTest_ViewFill.hpp | 229 +++++++---------------- core/perf_test/PerfTest_ViewFill_123.cpp | 37 +++- core/perf_test/PerfTest_ViewFill_45.cpp | 27 ++- core/perf_test/PerfTest_ViewFill_6.cpp | 17 +- core/perf_test/PerfTest_ViewFill_7.cpp | 17 +- core/perf_test/PerfTest_ViewFill_8.cpp | 29 ++- 12 files changed, 301 insertions(+), 189 deletions(-) create mode 100644 core/perf_test/Benchmark_Context.cpp create mode 100644 core/perf_test/PerfTest_ViewFill.cpp diff --git a/core/perf_test/Benchmark_Context.cpp b/core/perf_test/Benchmark_Context.cpp new file mode 100644 index 0000000000..ebca0964cf --- /dev/null +++ b/core/perf_test/Benchmark_Context.cpp @@ -0,0 +1,60 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include + +namespace Test { + +/** + * \brief Mark the label as a figure of merit. 
+ */ +std::string benchmark_fom(const std::string& label) { return "FOM: " + label; } + +void add_benchmark_context(bool verbose) { + std::ostringstream msg; + Kokkos::print_configuration(msg, verbose); + benchmark::AddCustomContext("Kokkos configuration", msg.str()); +} + +} // namespace Test diff --git a/core/perf_test/Benchmark_Context.hpp b/core/perf_test/Benchmark_Context.hpp index e6c086561a..af61f0c014 100644 --- a/core/perf_test/Benchmark_Context.hpp +++ b/core/perf_test/Benchmark_Context.hpp @@ -34,6 +34,8 @@ namespace KokkosBenchmark { +std::string benchmark_fom(const std::string& label); + /// \brief Remove unwanted spaces and colon signs from input string. In case of /// invalid input it will return an empty string. std::string remove_unwanted_characters(std::string str) { diff --git a/core/perf_test/CMakeLists.txt b/core/perf_test/CMakeLists.txt index 5df5e389ee..f16452ba52 100644 --- a/core/perf_test/CMakeLists.txt +++ b/core/perf_test/CMakeLists.txt @@ -200,6 +200,12 @@ SET( PerfTest_ViewCopy_c8.cpp PerfTest_ViewCopy_d8.cpp PerfTest_ViewCopy_Raw.cpp + PerfTest_ViewFill.cpp + PerfTest_ViewFill_123.cpp + PerfTest_ViewFill_45.cpp + PerfTest_ViewFill_6.cpp + PerfTest_ViewFill_7.cpp + PerfTest_ViewFill_8.cpp ) KOKKOS_ADD_BENCHMARK( diff --git a/core/perf_test/PerfTest_ViewCopy.hpp b/core/perf_test/PerfTest_ViewCopy.hpp index 573237a447..ea645f91f7 100644 --- a/core/perf_test/PerfTest_ViewCopy.hpp +++ b/core/perf_test/PerfTest_ViewCopy.hpp @@ -168,7 +168,6 @@ static void ViewDeepCopy_Raw(benchmark::State& state) { Kokkos::parallel_for( N8, KOKKOS_LAMBDA(const int& i) { a_ptr[i] = b_ptr[i]; }); Kokkos::fence(); - report_results(state, a.size(), timer.seconds()); } } diff --git a/core/perf_test/PerfTest_ViewCopy_Raw.cpp b/core/perf_test/PerfTest_ViewCopy_Raw.cpp index 67a8d7e555..66f0793283 100644 --- a/core/perf_test/PerfTest_ViewCopy_Raw.cpp +++ b/core/perf_test/PerfTest_ViewCopy_Raw.cpp @@ -16,6 +16,8 @@ #include "PerfTest_ViewCopy.hpp" +#include + 
namespace Test { #if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) diff --git a/core/perf_test/PerfTest_ViewFill.cpp b/core/perf_test/PerfTest_ViewFill.cpp new file mode 100644 index 0000000000..c98768fa57 --- /dev/null +++ b/core/perf_test/PerfTest_ViewFill.cpp @@ -0,0 +1,63 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include + +#include + +namespace Test { + +void report_results_fill(benchmark::State& state, double time) { + state.SetIterationTime(time); + const auto N8 = std::pow(state.range(0), 8); + const auto size = N8 * 8 / 1024 / 1024; + + state.counters["MB"] = benchmark::Counter(size, benchmark::Counter::kDefaults, + benchmark::Counter::OneK::kIs1024); + state.counters[benchmark_fom("GB/s")] = + benchmark::Counter(size / 1024 / time, benchmark::Counter::kDefaults, + benchmark::Counter::OneK::kIs1024); +} + +} // namespace Test diff --git a/core/perf_test/PerfTest_ViewFill.hpp b/core/perf_test/PerfTest_ViewFill.hpp index 7f76ed0ff7..da073464c1 100644 --- a/core/perf_test/PerfTest_ViewFill.hpp +++ b/core/perf_test/PerfTest_ViewFill.hpp @@ -15,201 +15,114 @@ //@HEADER #include -#include -#include -#include +#include +#include namespace Test { +void report_results_fill(benchmark::State& state, double time); + template -double fill_view(ViewType& a, typename ViewType::const_value_type& val, - int repeat) { - Kokkos::Timer timer; - for (int i = 0; i < repeat; i++) { +void fill_view(ViewType& a, typename ViewType::const_value_type& val, + benchmark::State& state) { + for (auto _ : state) { + Kokkos::fence(); + Kokkos::Timer timer; Kokkos::deep_copy(a, val); + report_results_fill(state, 
timer.seconds()); } - Kokkos::fence(); - return timer.seconds(); } template -void run_fillview_tests123(int N, int R) { - const int N1 = N; +static void ViewFill_Rank1(benchmark::State& state) { + const int N1 = state.range(0); const int N2 = N1 * N1; - const int N3 = N2 * N1; const int N4 = N2 * N2; const int N8 = N4 * N4; - double time1, time2, time3, time_raw = 100000.0; - { - Kokkos::View a("A1", N8); - time1 = fill_view(a, 1.1, R) / R; - } - { - Kokkos::View a("A2", N4, N4); - time2 = fill_view(a, 1.1, R) / R; - } - { - Kokkos::View a("A3", N3, N3, N2); - time3 = fill_view(a, 1.1, R) / R; - } -#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) - { - Kokkos::View a("A1", N8); - double* a_ptr = a.data(); - Kokkos::Timer timer; - for (int r = 0; r < R; r++) { - Kokkos::parallel_for( - N8, KOKKOS_LAMBDA(const int& i) { a_ptr[i] = 1.1; }); - } - Kokkos::fence(); - time_raw = timer.seconds() / R; - } -#endif - double size = 1.0 * N8 * 8 / 1024 / 1024; - printf(" Raw: %lf s %lf MB %lf GB/s\n", time_raw, size, - size / 1024 / time_raw); - printf(" Rank1: %lf s %lf MB %lf GB/s\n", time1, size, - size / 1024 / time1); - printf(" Rank2: %lf s %lf MB %lf GB/s\n", time2, size, - size / 1024 / time2); - printf(" Rank3: %lf s %lf MB %lf GB/s\n", time3, size, - size / 1024 / time3); + Kokkos::View a("A1", N8); + fill_view(a, 1.1, state); } template -void run_fillview_tests45(int N, int R) { - const int N1 = N; +static void ViewFill_Rank2(benchmark::State& state) { + const int N1 = state.range(0); const int N2 = N1 * N1; const int N4 = N2 * N2; - const int N8 = N4 * N4; - double time4, time5, time_raw = 100000.0; - { - Kokkos::View a("A4", N2, N2, N2, N2); - time4 = fill_view(a, 1.1, R) / R; - } - { - Kokkos::View a("A5", N2, N2, N1, N1, N2); - time5 = fill_view(a, 1.1, R) / R; - } -#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) - { - Kokkos::View a("A1", N8); - double* a_ptr = a.data(); - Kokkos::Timer timer; - for (int r = 0; r < 
R; r++) { - Kokkos::parallel_for( - N8, KOKKOS_LAMBDA(const int& i) { a_ptr[i] = 1.1; }); - } - Kokkos::fence(); - time_raw = timer.seconds() / R; - } -#endif - double size = 1.0 * N8 * 8 / 1024 / 1024; - printf(" Raw: %lf s %lf MB %lf GB/s\n", time_raw, size, - size / 1024 / time_raw); - printf(" Rank4: %lf s %lf MB %lf GB/s\n", time4, size, - size / 1024 / time4); - printf(" Rank5: %lf s %lf MB %lf GB/s\n", time5, size, - size / 1024 / time5); + Kokkos::View a("A2", N4, N4); + fill_view(a, 1.1, state); } template -void run_fillview_tests6(int N, int R) { - const int N1 = N; +static void ViewFill_Rank3(benchmark::State& state) { + const int N1 = state.range(0); const int N2 = N1 * N1; - const int N4 = N2 * N2; - const int N8 = N4 * N4; + const int N3 = N2 * N1; - double time6, time_raw = 100000.0; - { - Kokkos::View a("A6", N2, N1, N1, N1, N1, N2); - time6 = fill_view(a, 1.1, R) / R; - } -#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) - { - Kokkos::View a("A1", N8); - double* a_ptr = a.data(); - Kokkos::Timer timer; - for (int r = 0; r < R; r++) { - Kokkos::parallel_for( - N8, KOKKOS_LAMBDA(const int& i) { a_ptr[i] = 1.1; }); - } - Kokkos::fence(); - time_raw = timer.seconds() / R; - } -#endif - double size = 1.0 * N8 * 8 / 1024 / 1024; - printf(" Raw: %lf s %lf MB %lf GB/s\n", time_raw, size, - size / 1024 / time_raw); - printf(" Rank6: %lf s %lf MB %lf GB/s\n", time6, size, - size / 1024 / time6); + Kokkos::View a("A3", N3, N3, N2); + fill_view(a, 1.1, state); } template -void run_fillview_tests7(int N, int R) { - const int N1 = N; +static void ViewFill_Rank4(benchmark::State& state) { + const int N1 = state.range(0); const int N2 = N1 * N1; - const int N4 = N2 * N2; - const int N8 = N4 * N4; - double time7, time_raw = 100000.0; - { - Kokkos::View a("A7", N2, N1, N1, N1, N1, N1, N1); - time7 = fill_view(a, 1.1, R) / R; - } -#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) - { - Kokkos::View a("A1", N8); - double* 
a_ptr = a.data(); - Kokkos::Timer timer; - for (int r = 0; r < R; r++) { - Kokkos::parallel_for( - N8, KOKKOS_LAMBDA(const int& i) { a_ptr[i] = 1.1; }); - } - Kokkos::fence(); - time_raw = timer.seconds() / R; - } -#endif - double size = 1.0 * N8 * 8 / 1024 / 1024; - printf(" Raw: %lf s %lf MB %lf GB/s\n", time_raw, size, - size / 1024 / time_raw); - printf(" Rank7: %lf s %lf MB %lf GB/s\n", time7, size, - size / 1024 / time7); + Kokkos::View a("A4", N2, N2, N2, N2); + fill_view(a, 1.1, state); } template -void run_fillview_tests8(int N, int R) { - const int N1 = N; +static void ViewFill_Rank5(benchmark::State& state) { + const int N1 = state.range(0); const int N2 = N1 * N1; - const int N4 = N2 * N2; - const int N8 = N4 * N4; - double time8, time_raw = 100000.0; - { - Kokkos::View a("A8", N1, N1, N1, N1, N1, N1, N1, - N1); - time8 = fill_view(a, 1.1, R) / R; - } -#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) - { - Kokkos::View a("A1", N8); - double* a_ptr = a.data(); + Kokkos::View a("A5", N2, N2, N1, N1, N2); + fill_view(a, 1.1, state); +} + +template +static void ViewFill_Rank6(benchmark::State& state) { + const int N1 = state.range(0); + const int N2 = N1 * N1; + + Kokkos::View a("A6", N2, N1, N1, N1, N1, N2); + fill_view(a, 1.1, state); +} + +template +static void ViewFill_Rank7(benchmark::State& state) { + const int N1 = state.range(0); + const int N2 = N1 * N1; + + Kokkos::View a("A7", N2, N1, N1, N1, N1, N1, N1); + fill_view(a, 1.1, state); +} + +template +static void ViewFill_Rank8(benchmark::State& state) { + const int N1 = state.range(0); + + Kokkos::View a("A8", N1, N1, N1, N1, N1, N1, N1, N1); + fill_view(a, 1.1, state); +} + +template +static void ViewFill_Raw(benchmark::State& state) { + const int N8 = std::pow(state.range(0), 8); + + Kokkos::View a("A1", N8); + double* a_ptr = a.data(); + + for (auto _ : state) { Kokkos::Timer timer; - for (int r = 0; r < R; r++) { - Kokkos::parallel_for( - N8, KOKKOS_LAMBDA(const int& i) { 
a_ptr[i] = 1.1; }); - } + Kokkos::parallel_for( + N8, KOKKOS_LAMBDA(const int& i) { a_ptr[i] = 1.1; }); Kokkos::fence(); - time_raw = timer.seconds() / R; + + report_results_fill(state, timer.seconds()); } -#endif - double size = 1.0 * N8 * 8 / 1024 / 1024; - printf(" Raw: %lf s %lf MB %lf GB/s\n", time_raw, size, - size / 1024 / time_raw); - printf(" Rank8: %lf s %lf MB %lf GB/s\n", time8, size, - size / 1024 / time8); } } // namespace Test diff --git a/core/perf_test/PerfTest_ViewFill_123.cpp b/core/perf_test/PerfTest_ViewFill_123.cpp index 1e050e2311..9ef2afeca4 100644 --- a/core/perf_test/PerfTest_ViewFill_123.cpp +++ b/core/perf_test/PerfTest_ViewFill_123.cpp @@ -17,10 +17,35 @@ #include namespace Test { -TEST(default_exec, ViewFill_Rank123) { - printf("ViewFill Performance for LayoutLeft:\n"); - run_fillview_tests123(10, 1); - printf("ViewFill Performance for LayoutRight:\n"); - run_fillview_tests123(10, 1); -} + +BENCHMARK(ViewFill_Rank1) + ->ArgName("N") + ->Arg(10) + ->UseManualTime(); + +BENCHMARK(ViewFill_Rank1) + ->ArgName("N") + ->Arg(10) + ->UseManualTime(); + +BENCHMARK(ViewFill_Rank2) + ->ArgName("N") + ->Arg(10) + ->UseManualTime(); + +BENCHMARK(ViewFill_Rank2) + ->ArgName("N") + ->Arg(10) + ->UseManualTime(); + +BENCHMARK(ViewFill_Rank3) + ->ArgName("N") + ->Arg(10) + ->UseManualTime(); + +BENCHMARK(ViewFill_Rank3) + ->ArgName("N") + ->Arg(10) + ->UseManualTime(); + } // namespace Test diff --git a/core/perf_test/PerfTest_ViewFill_45.cpp b/core/perf_test/PerfTest_ViewFill_45.cpp index 2bf93b2048..2d5e75022b 100644 --- a/core/perf_test/PerfTest_ViewFill_45.cpp +++ b/core/perf_test/PerfTest_ViewFill_45.cpp @@ -17,10 +17,25 @@ #include namespace Test { -TEST(default_exec, ViewFill_Rank45) { - printf("ViewFill Performance for LayoutLeft:\n"); - run_fillview_tests45(10, 1); - printf("ViewFill Performance for LayoutRight:\n"); - run_fillview_tests45(10, 1); -} + +BENCHMARK(ViewFill_Rank4) + ->ArgName("N") + ->Arg(10) + ->UseManualTime(); + 
+BENCHMARK(ViewFill_Rank4) + ->ArgName("N") + ->Arg(10) + ->UseManualTime(); + +BENCHMARK(ViewFill_Rank5) + ->ArgName("N") + ->Arg(10) + ->UseManualTime(); + +BENCHMARK(ViewFill_Rank5) + ->ArgName("N") + ->Arg(10) + ->UseManualTime(); + } // namespace Test diff --git a/core/perf_test/PerfTest_ViewFill_6.cpp b/core/perf_test/PerfTest_ViewFill_6.cpp index 588a1e2293..640c46ca89 100644 --- a/core/perf_test/PerfTest_ViewFill_6.cpp +++ b/core/perf_test/PerfTest_ViewFill_6.cpp @@ -17,10 +17,15 @@ #include namespace Test { -TEST(default_exec, ViewFill_Rank6) { - printf("ViewFill Performance for LayoutLeft:\n"); - run_fillview_tests6(10, 1); - printf("ViewFill Performance for LayoutRight:\n"); - run_fillview_tests6(10, 1); -} + +BENCHMARK(ViewFill_Rank6) + ->ArgName("N") + ->Arg(10) + ->UseManualTime(); + +BENCHMARK(ViewFill_Rank6) + ->ArgName("N") + ->Arg(10) + ->UseManualTime(); + } // namespace Test diff --git a/core/perf_test/PerfTest_ViewFill_7.cpp b/core/perf_test/PerfTest_ViewFill_7.cpp index fffeb951c9..025ddafe9b 100644 --- a/core/perf_test/PerfTest_ViewFill_7.cpp +++ b/core/perf_test/PerfTest_ViewFill_7.cpp @@ -17,10 +17,15 @@ #include namespace Test { -TEST(default_exec, ViewFill_Rank7) { - printf("ViewFill Performance for LayoutLeft:\n"); - run_fillview_tests7(10, 1); - printf("ViewFill Performance for LayoutRight:\n"); - run_fillview_tests7(10, 1); -} + +BENCHMARK(ViewFill_Rank7) + ->ArgName("N") + ->Arg(10) + ->UseManualTime(); + +BENCHMARK(ViewFill_Rank7) + ->ArgName("N") + ->Arg(10) + ->UseManualTime(); + } // namespace Test diff --git a/core/perf_test/PerfTest_ViewFill_8.cpp b/core/perf_test/PerfTest_ViewFill_8.cpp index b2188af1a3..0fe733e534 100644 --- a/core/perf_test/PerfTest_ViewFill_8.cpp +++ b/core/perf_test/PerfTest_ViewFill_8.cpp @@ -17,10 +17,27 @@ #include namespace Test { -TEST(default_exec, ViewFill_Rank8) { - printf("ViewFill Performance for LayoutLeft:\n"); - run_fillview_tests8(10, 1); - printf("ViewFill Performance for LayoutRight:\n"); - 
run_fillview_tests8(10, 1); -} + +BENCHMARK(ViewFill_Rank8) + ->ArgName("N") + ->Arg(10) + ->UseManualTime(); + +BENCHMARK(ViewFill_Rank8) + ->ArgName("N") + ->Arg(10) + ->UseManualTime(); + +#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) +BENCHMARK(ViewFill_Raw) + ->ArgName("N") + ->Arg(10) + ->UseManualTime(); + +BENCHMARK(ViewFill_Raw) + ->ArgName("N") + ->Arg(10) + ->UseManualTime(); +#endif + } // namespace Test From 1b2d07a005b3e981219f03fab2de1d2d6db7e76c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cezary=20Skrzy=C5=84ski?= Date: Fri, 16 Sep 2022 20:51:56 +0200 Subject: [PATCH 174/496] Port ViewResize tests --- core/perf_test/CMakeLists.txt | 6 + core/perf_test/PerfTest_ViewFill.cpp | 2 +- core/perf_test/PerfTest_ViewResize.cpp | 64 +++ core/perf_test/PerfTest_ViewResize.hpp | 510 +++++++++------------ core/perf_test/PerfTest_ViewResize_123.cpp | 93 +++- core/perf_test/PerfTest_ViewResize_45.cpp | 56 ++- core/perf_test/PerfTest_ViewResize_6.cpp | 32 +- core/perf_test/PerfTest_ViewResize_7.cpp | 32 +- core/perf_test/PerfTest_ViewResize_8.cpp | 38 +- 9 files changed, 515 insertions(+), 318 deletions(-) create mode 100644 core/perf_test/PerfTest_ViewResize.cpp diff --git a/core/perf_test/CMakeLists.txt b/core/perf_test/CMakeLists.txt index f16452ba52..5df2c95d23 100644 --- a/core/perf_test/CMakeLists.txt +++ b/core/perf_test/CMakeLists.txt @@ -206,6 +206,12 @@ SET( PerfTest_ViewFill_6.cpp PerfTest_ViewFill_7.cpp PerfTest_ViewFill_8.cpp + PerfTest_ViewResize.cpp + PerfTest_ViewResize_123.cpp + PerfTest_ViewResize_45.cpp + PerfTest_ViewResize_6.cpp + PerfTest_ViewResize_7.cpp + PerfTest_ViewResize_8.cpp ) KOKKOS_ADD_BENCHMARK( diff --git a/core/perf_test/PerfTest_ViewFill.cpp b/core/perf_test/PerfTest_ViewFill.cpp index c98768fa57..255dd309dd 100644 --- a/core/perf_test/PerfTest_ViewFill.cpp +++ b/core/perf_test/PerfTest_ViewFill.cpp @@ -55,7 +55,7 @@ void report_results_fill(benchmark::State& state, double time) { state.counters["MB"] = 
benchmark::Counter(size, benchmark::Counter::kDefaults, benchmark::Counter::OneK::kIs1024); - state.counters[benchmark_fom("GB/s")] = + state.counters[KokkosBenchmark::benchmark_fom("GB/s")] = benchmark::Counter(size / 1024 / time, benchmark::Counter::kDefaults, benchmark::Counter::OneK::kIs1024); } diff --git a/core/perf_test/PerfTest_ViewResize.cpp b/core/perf_test/PerfTest_ViewResize.cpp new file mode 100644 index 0000000000..5715b19fa2 --- /dev/null +++ b/core/perf_test/PerfTest_ViewResize.cpp @@ -0,0 +1,64 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include + +#include + +namespace Test { + +void report_results_resize(benchmark::State& state, double time) { + state.SetIterationTime(time); + const auto N8 = std::pow(state.range(0), 8); + // data size in megabytes + const auto size = N8 * 8 / 1000 / 1000; + // data processed in gigabytes + const auto data_processed = 2.0 * size / 1000; + + state.counters["MB"] = benchmark::Counter(size); + state.counters[KokkosBenchmark::benchmark_fom("GB/s")] = benchmark::Counter( + data_processed, benchmark::Counter::kIsIterationInvariantRate); +} + +} // namespace Test diff --git a/core/perf_test/PerfTest_ViewResize.hpp b/core/perf_test/PerfTest_ViewResize.hpp index dfcd3f1347..60c866df51 100644 --- a/core/perf_test/PerfTest_ViewResize.hpp +++ b/core/perf_test/PerfTest_ViewResize.hpp @@ -15,346 +15,290 @@ //@HEADER #include -#include -#include -#include +#include +#include +#include namespace Test { +void report_results_resize(benchmark::State& state, double time); + template -void run_resizeview_tests123(int N, int R) { - const int N1 = N; - const int N2 = N1 * N1; - const int N3 = N2 * N1; - const int N4 = N2 * N2; - const int N8 = N4 * N4; - - double time1, time2, time3, time_raw = 100000.0; - double time1_noinit, time2_noinit, time3_noinit; - { - Kokkos::View a("A1", N8); - 
Kokkos::Timer timer; - for (int r = 0; r < R; r++) { - Kokkos::View a_(a); - Kokkos::resize(a_, int(N8 * 1.1)); - } - time1 = timer.seconds() / R; - } - { - Kokkos::View a("A2", N4, N4); - Kokkos::Timer timer; - for (int r = 0; r < R; r++) { - Kokkos::View a_(a); - Kokkos::resize(a_, int(N4 * 1.1), N4); - } - time2 = timer.seconds() / R; - } - { - Kokkos::View a("A3", N3, N3, N2); - Kokkos::Timer timer; - for (int r = 0; r < R; r++) { - Kokkos::View a_(a); - Kokkos::resize(a_, int(N3 * 1.1), N3, N2); - } - time3 = timer.seconds() / R; - } - { - Kokkos::View a("A1", N8); +static void ViewResize_Rank1(benchmark::State& state) { + const int N8 = std::pow(state.range(0), 8); + Kokkos::View a("A1", N8); + Kokkos::View a_(a); + + for (auto _ : state) { + Kokkos::fence(); Kokkos::Timer timer; - for (int r = 0; r < R; r++) { - Kokkos::View a_(a); - Kokkos::resize(Kokkos::WithoutInitializing, a_, int(N8 * 1.1)); - } - time1_noinit = timer.seconds() / R; + Kokkos::resize(a_, int(N8 * 1.1)); + Kokkos::fence(); + report_results_resize(state, timer.seconds()); } - { - Kokkos::View a("A2", N4, N4); +} + +template +static void ViewResize_Rank2(benchmark::State& state) { + const int N4 = std::pow(state.range(0), 4); + Kokkos::View a("A2", N4, N4); + Kokkos::View a_(a); + + for (auto _ : state) { + Kokkos::fence(); Kokkos::Timer timer; - for (int r = 0; r < R; r++) { - Kokkos::View a_(a); - Kokkos::resize(Kokkos::WithoutInitializing, a_, int(N4 * 1.1), N4); - } - time2_noinit = timer.seconds() / R; + Kokkos::resize(a_, int(N4 * 1.1), N4); + Kokkos::fence(); + report_results_resize(state, timer.seconds()); } - { - Kokkos::View a("A3", N3, N3, N2); +} + +template +static void ViewResize_Rank3(benchmark::State& state) { + const int N2 = std::pow(state.range(0), 2); + const int N3 = std::pow(state.range(0), 3); + Kokkos::View a("A3", N3, N3, N2); + Kokkos::View a_(a); + + for (auto _ : state) { + Kokkos::fence(); Kokkos::Timer timer; - for (int r = 0; r < R; r++) { - Kokkos::View 
a_(a); - Kokkos::resize(Kokkos::WithoutInitializing, a_, int(N3 * 1.1), N3, N2); - } - time3_noinit = timer.seconds() / R; + Kokkos::resize(a_, int(N3 * 1.1), N3, N2); + Kokkos::fence(); + report_results_resize(state, timer.seconds()); } -#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) - { - Kokkos::View a("A1", N8); - double* a_ptr = a.data(); +} + +template +static void ViewResize_Rank4(benchmark::State& state) { + const int N2 = std::pow(state.range(0), 2); + Kokkos::View a("A4", N2, N2, N2, N2); + Kokkos::View a_(a); + + for (auto _ : state) { + Kokkos::fence(); Kokkos::Timer timer; - for (int r = 0; r < R; r++) { - Kokkos::View a1( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "A1"), int(N8 * 1.1)); - double* a1_ptr = a1.data(); - Kokkos::parallel_for( - N8, KOKKOS_LAMBDA(const int& i) { a1_ptr[i] = a_ptr[i]; }); - Kokkos::fence(); - } - Kokkos::fence(); - time_raw = timer.seconds() / R; + Kokkos::resize(a_, int(N2 * 1.1), N2, N2, N2); + Kokkos::fence(); + report_results_resize(state, timer.seconds()); } -#endif - double size = 1.0 * N8 * 8 / 1024 / 1024; - printf(" Raw: %lf s %lf MB %lf GB/s\n", time_raw, size, - 2.0 * size / 1024 / time_raw); - printf(" Rank1: %lf s %lf MB %lf GB/s\n", time1, size, - 2.0 * size / 1024 / time1); - printf(" Rank2: %lf s %lf MB %lf GB/s\n", time2, size, - 2.0 * size / 1024 / time2); - printf(" Rank3: %lf s %lf MB %lf GB/s\n", time3, size, - 2.0 * size / 1024 / time3); - printf(" Rank1 (WithoutInitializing): %lf s %lf MB %lf GB/s\n", - time1_noinit, size, 2.0 * size / 1024 / time1_noinit); - printf(" Rank2 (WithoutInitializing): %lf s %lf MB %lf GB/s\n", - time2_noinit, size, 2.0 * size / 1024 / time2_noinit); - printf(" Rank3 (WithoutInitializing): %lf s %lf MB %lf GB/s\n", - time3_noinit, size, 2.0 * size / 1024 / time3_noinit); } template -void run_resizeview_tests45(int N, int R) { - const int N1 = N; +static void ViewResize_Rank5(benchmark::State& state) { + const int N1 = state.range(0); const 
int N2 = N1 * N1; - const int N4 = N2 * N2; - const int N8 = N4 * N4; - double time4, time5, time_raw = 100000.0; - double time4_noinit, time5_noinit; - { - Kokkos::View a("A4", N2, N2, N2, N2); + Kokkos::View a("A5", N2, N2, N1, N1, N2); + Kokkos::View a_(a); + + for (auto _ : state) { + Kokkos::fence(); Kokkos::Timer timer; - for (int r = 0; r < R; r++) { - Kokkos::View a_(a); - Kokkos::resize(a_, int(N2 * 1.1), N2, N2, N2); - } - time4 = timer.seconds() / R; + Kokkos::resize(a_, int(N2 * 1.1), N2, N1, N1, N2); + Kokkos::fence(); + report_results_resize(state, timer.seconds()); } - { - Kokkos::View a("A5", N2, N2, N1, N1, N2); +} + +template +static void ViewResize_Rank6(benchmark::State& state) { + const int N1 = state.range(0); + const int N2 = N1 * N1; + + Kokkos::View a("A6", N2, N1, N1, N1, N1, N2); + Kokkos::View a_(a); + + for (auto _ : state) { + Kokkos::fence(); Kokkos::Timer timer; - for (int r = 0; r < R; r++) { - Kokkos::View a_(a); - Kokkos::resize(a_, int(N2 * 1.1), N2, N1, N1, N2); - } - time5 = timer.seconds() / R; + Kokkos::resize(a_, int(N2 * 1.1), N1, N1, N1, N1, N2); + Kokkos::fence(); + report_results_resize(state, timer.seconds()); } - { - Kokkos::View a("A4", N2, N2, N2, N2); +} + +template +static void ViewResize_Rank7(benchmark::State& state) { + const int N1 = state.range(0); + const int N2 = N1 * N1; + + Kokkos::View a("A7", N2, N1, N1, N1, N1, N1, N1); + Kokkos::View a_(a); + + for (auto _ : state) { + Kokkos::fence(); Kokkos::Timer timer; - for (int r = 0; r < R; r++) { - Kokkos::View a_(a); - Kokkos::resize(Kokkos::WithoutInitializing, a_, int(N2 * 1.1), N2, N2, - N2); - } - time4_noinit = timer.seconds() / R; + Kokkos::resize(a_, int(N2 * 1.1), N1, N1, N1, N1, N1, N1); + Kokkos::fence(); + report_results_resize(state, timer.seconds()); } - { - Kokkos::View a("A5", N2, N2, N1, N1, N2); +} + +template +static void ViewResize_Rank8(benchmark::State& state) { + const int N1 = state.range(0); + + Kokkos::View a("A8", N1, N1, N1, N1, N1, 
N1, N1, N1); + Kokkos::View a_(a); + + for (auto _ : state) { + Kokkos::fence(); Kokkos::Timer timer; - for (int r = 0; r < R; r++) { - Kokkos::View a_(a); - Kokkos::resize(Kokkos::WithoutInitializing, a_, int(N2 * 1.1), N2, N1, N1, - N2); - } - time5_noinit = timer.seconds() / R; + Kokkos::resize(a_, int(N1 * 1.1), N1, N1, N1, N1, N1, N1, N1); + Kokkos::fence(); + report_results_resize(state, timer.seconds()); } -#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) - { - Kokkos::View a("A1", N8); - double* a_ptr = a.data(); +} + +template +static void ViewResize_NoInit_Rank1(benchmark::State& state) { + const int N8 = std::pow(state.range(0), 8); + Kokkos::View a("A1", N8); + Kokkos::View a_(a); + + for (auto _ : state) { + Kokkos::fence(); Kokkos::Timer timer; - for (int r = 0; r < R; r++) { - Kokkos::View a1( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "A1"), int(N8 * 1.1)); - double* a1_ptr = a1.data(); - Kokkos::parallel_for( - N8, KOKKOS_LAMBDA(const int& i) { a1_ptr[i] = a_ptr[i]; }); - Kokkos::fence(); - } - Kokkos::fence(); - time_raw = timer.seconds() / R; + Kokkos::resize(Kokkos::WithoutInitializing, a_, int(N8 * 1.1)); + Kokkos::fence(); + report_results_resize(state, timer.seconds()); } -#endif - double size = 1.0 * N8 * 8 / 1024 / 1024; - printf(" Raw: %lf s %lf MB %lf GB/s\n", time_raw, size, - 2.0 * size / 1024 / time_raw); - printf(" Rank4: %lf s %lf MB %lf GB/s\n", time4, size, - 2.0 * size / 1024 / time4); - printf(" Rank5: %lf s %lf MB %lf GB/s\n", time5, size, - 2.0 * size / 1024 / time5); - printf(" Rank4 (WithoutInitializing): %lf s %lf MB %lf GB/s\n", - time4_noinit, size, 2.0 * size / 1024 / time4_noinit); - printf(" Rank5 (WithoutInitializing): %lf s %lf MB %lf GB/s\n", - time5_noinit, size, 2.0 * size / 1024 / time5_noinit); } template -void run_resizeview_tests6(int N, int R) { - const int N1 = N; - const int N2 = N1 * N1; - const int N4 = N2 * N2; - const int N8 = N4 * N4; +static void 
ViewResize_NoInit_Rank2(benchmark::State& state) { + const int N4 = std::pow(state.range(0), 4); + Kokkos::View a("A2", N4, N4); + Kokkos::View a_(a); - double time6, time6_noinit, time_raw = 100000.0; - { - Kokkos::View a("A6", N2, N1, N1, N1, N1, N2); + for (auto _ : state) { + Kokkos::fence(); Kokkos::Timer timer; - for (int r = 0; r < R; r++) { - Kokkos::View a_(a); - Kokkos::resize(a_, int(N2 * 1.1), N1, N1, N1, N1, N2); - } - time6 = timer.seconds() / R; + Kokkos::resize(Kokkos::WithoutInitializing, a_, int(N4 * 1.1), N4); + Kokkos::fence(); + report_results_resize(state, timer.seconds()); } - { - Kokkos::View a("A6", N2, N1, N1, N1, N1, N2); +} + +template +static void ViewResize_NoInit_Rank3(benchmark::State& state) { + const int N2 = std::pow(state.range(0), 2); + const int N3 = std::pow(state.range(0), 3); + Kokkos::View a("A3", N3, N3, N2); + Kokkos::View a_(a); + + for (auto _ : state) { + Kokkos::fence(); Kokkos::Timer timer; - for (int r = 0; r < R; r++) { - Kokkos::View a_(a); - Kokkos::resize(Kokkos::WithoutInitializing, a_, int(N2 * 1.1), N1, N1, N1, - N1, N2); - } - time6_noinit = timer.seconds() / R; + Kokkos::resize(Kokkos::WithoutInitializing, a_, int(N3 * 1.1), N3, N2); + Kokkos::fence(); + report_results_resize(state, timer.seconds()); } -#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) - { - Kokkos::View a("A1", N8); - double* a_ptr = a.data(); +} + +template +static void ViewResize_NoInit_Rank4(benchmark::State& state) { + const int N2 = std::pow(state.range(0), 2); + Kokkos::View a("A4", N2, N2, N2, N2); + Kokkos::View a_(a); + + for (auto _ : state) { + Kokkos::fence(); Kokkos::Timer timer; - for (int r = 0; r < R; r++) { - Kokkos::View a1( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "A1"), int(N8 * 1.1)); - double* a1_ptr = a1.data(); - Kokkos::parallel_for( - N8, KOKKOS_LAMBDA(const int& i) { a1_ptr[i] = a_ptr[i]; }); - Kokkos::fence(); - } - Kokkos::fence(); - time_raw = timer.seconds() / R; + 
Kokkos::resize(Kokkos::WithoutInitializing, a_, int(N2 * 1.1), N2, N2, N2); + Kokkos::fence(); + report_results_resize(state, timer.seconds()); } -#endif - double size = 1.0 * N8 * 8 / 1024 / 1024; - printf(" Raw: %lf s %lf MB %lf GB/s\n", time_raw, size, - 2.0 * size / 1024 / time_raw); - printf(" Rank6: %lf s %lf MB %lf GB/s\n", time6, size, - 2.0 * size / 1024 / time6); - printf(" Rank6 (WithoutInitializing): %lf s %lf MB %lf GB/s\n", - time6_noinit, size, 2.0 * size / 1024 / time6_noinit); } template -void run_resizeview_tests7(int N, int R) { - const int N1 = N; +static void ViewResize_NoInit_Rank5(benchmark::State& state) { + const int N1 = state.range(0); const int N2 = N1 * N1; - const int N4 = N2 * N2; - const int N8 = N4 * N4; - double time7, time7_noinit, time_raw = 100000.0; - { - Kokkos::View a("A7", N2, N1, N1, N1, N1, N1, N1); - Kokkos::Timer timer; - for (int r = 0; r < R; r++) { - Kokkos::View a_(a); - Kokkos::resize(a_, int(N2 * 1.1), N1, N1, N1, N1, N1, N1); - } - time7 = timer.seconds() / R; - } - { - Kokkos::View a("A7", N2, N1, N1, N1, N1, N1, N1); + Kokkos::View a("A5", N2, N2, N1, N1, N2); + Kokkos::View a_(a); + + for (auto _ : state) { + Kokkos::fence(); Kokkos::Timer timer; - for (int r = 0; r < R; r++) { - Kokkos::View a_(a); - Kokkos::resize(Kokkos::WithoutInitializing, a_, int(N2 * 1.1), N1, N1, N1, - N1, N1, N1); - } - time7_noinit = timer.seconds() / R; + Kokkos::resize(Kokkos::WithoutInitializing, a_, int(N2 * 1.1), N2, N1, N1, + N2); + Kokkos::fence(); + report_results_resize(state, timer.seconds()); } -#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) - { - Kokkos::View a("A1", N8); - double* a_ptr = a.data(); +} + +template +static void ViewResize_NoInit_Rank6(benchmark::State& state) { + const int N1 = state.range(0); + const int N2 = N1 * N1; + + Kokkos::View a("A6", N2, N1, N1, N1, N1, N2); + Kokkos::View a_(a); + + for (auto _ : state) { + Kokkos::fence(); Kokkos::Timer timer; - for (int r = 0; r < R; 
r++) { - Kokkos::View a1( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "A1"), int(N8 * 1.1)); - double* a1_ptr = a1.data(); - Kokkos::parallel_for( - N8, KOKKOS_LAMBDA(const int& i) { a1_ptr[i] = a_ptr[i]; }); - Kokkos::fence(); - } - Kokkos::fence(); - time_raw = timer.seconds() / R; + Kokkos::resize(Kokkos::WithoutInitializing, a_, int(N2 * 1.1), N1, N1, N1, + N1, N2); + Kokkos::fence(); + report_results_resize(state, timer.seconds()); } -#endif - double size = 1.0 * N8 * 8 / 1024 / 1024; - printf(" Raw: %lf s %lf MB %lf GB/s\n", time_raw, size, - 2.0 * size / 1024 / time_raw); - printf(" Rank7: %lf s %lf MB %lf GB/s\n", time7, size, - 2.0 * size / 1024 / time7); - printf(" Rank7 (WithoutInitializing): %lf s %lf MB %lf GB/s\n", - time7_noinit, size, 2.0 * size / 1024 / time7_noinit); } template -void run_resizeview_tests8(int N, int R) { - const int N1 = N; +static void ViewResize_NoInit_Rank7(benchmark::State& state) { + const int N1 = state.range(0); const int N2 = N1 * N1; - const int N4 = N2 * N2; - const int N8 = N4 * N4; - double time8, time8_noinit, time_raw = 100000.0; - { - Kokkos::View a("A8", N1, N1, N1, N1, N1, N1, N1, - N1); + Kokkos::View a("A7", N2, N1, N1, N1, N1, N1, N1); + Kokkos::View a_(a); + + for (auto _ : state) { + Kokkos::fence(); Kokkos::Timer timer; - for (int r = 0; r < R; r++) { - Kokkos::View a_(a); - Kokkos::resize(a_, int(N1 * 1.1), N1, N1, N1, N1, N1, N1, N1); - } - time8 = timer.seconds() / R; + Kokkos::resize(Kokkos::WithoutInitializing, a_, int(N2 * 1.1), N1, N1, N1, + N1, N1, N1); + Kokkos::fence(); + report_results_resize(state, timer.seconds()); } - { - Kokkos::View a("A8", N1, N1, N1, N1, N1, N1, N1, - N1); +} + +template +static void ViewResize_NoInit_Rank8(benchmark::State& state) { + const int N1 = state.range(0); + + Kokkos::View a("A8", N1, N1, N1, N1, N1, N1, N1, N1); + Kokkos::View a_(a); + + for (auto _ : state) { + Kokkos::fence(); Kokkos::Timer timer; - for (int r = 0; r < R; r++) { - Kokkos::View a_(a); - 
Kokkos::resize(Kokkos::WithoutInitializing, a_, int(N1 * 1.1), N1, N1, N1, - N1, N1, N1, N1); - } - time8_noinit = timer.seconds() / R; + Kokkos::resize(Kokkos::WithoutInitializing, a_, int(N1 * 1.1), N1, N1, N1, + N1, N1, N1, N1); + Kokkos::fence(); + report_results_resize(state, timer.seconds()); } -#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) - { - Kokkos::View a("A1", N8); - double* a_ptr = a.data(); +} + +template +static void ViewResize_NoInit_Raw(benchmark::State& state) { + const int N8 = std::pow(state.range(0), 8); + Kokkos::View a("A1", N8); + double* a_ptr = a.data(); + + for (auto _ : state) { Kokkos::Timer timer; - for (int r = 0; r < R; r++) { - Kokkos::View a1( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "A1"), int(N8 * 1.1)); - double* a1_ptr = a1.data(); - Kokkos::parallel_for( - N8, KOKKOS_LAMBDA(const int& i) { a1_ptr[i] = a_ptr[i]; }); - Kokkos::fence(); - } - Kokkos::fence(); - time_raw = timer.seconds() / R; + Kokkos::View a1( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "A1"), int(N8 * 1.1)); + double* a1_ptr = a1.data(); + Kokkos::parallel_for( + N8, KOKKOS_LAMBDA(const int& i) { a1_ptr[i] = a_ptr[i]; }); + Kokkos::fence(); + report_results_resize(state, timer.seconds()); } -#endif - double size = 1.0 * N8 * 8 / 1024 / 1024; - printf(" Raw: %lf s %lf MB %lf GB/s\n", time_raw, size, - 2.0 * size / 1024 / time_raw); - printf(" Rank8: %lf s %lf MB %lf GB/s\n", time8, size, - 2.0 * size / 1024 / time8); - printf(" Rank8 (WithoutInitializing): %lf s %lf MB %lf GB/s\n", - time8_noinit, size, 2.0 * size / 1024 / time8_noinit); } } // namespace Test diff --git a/core/perf_test/PerfTest_ViewResize_123.cpp b/core/perf_test/PerfTest_ViewResize_123.cpp index ed2e58192c..35b48523bb 100644 --- a/core/perf_test/PerfTest_ViewResize_123.cpp +++ b/core/perf_test/PerfTest_ViewResize_123.cpp @@ -18,11 +18,92 @@ namespace Test { -TEST(default_exec, ViewResize_Rank123) { - printf("Resize View Performance for LayoutLeft:\n"); 
- run_resizeview_tests123(10, 1); - printf("Resize View Performance for LayoutRight:\n"); - run_resizeview_tests123(10, 1); -} +static constexpr int R = 10; + +BENCHMARK(ViewResize_Rank1) + ->ArgName("N") + ->Arg(10) + ->UseManualTime() + ->Iterations(R); + +BENCHMARK(ViewResize_Rank1) + ->ArgName("N") + ->Arg(10) + ->UseManualTime() + ->Iterations(R); + +BENCHMARK(ViewResize_Rank2) + ->ArgName("N") + ->Arg(10) + ->UseManualTime() + ->Iterations(R); + +BENCHMARK(ViewResize_Rank2) + ->ArgName("N") + ->Arg(10) + ->UseManualTime() + ->Iterations(R); + +BENCHMARK(ViewResize_Rank3) + ->ArgName("N") + ->Arg(10) + ->UseManualTime() + ->Iterations(R); + +BENCHMARK(ViewResize_Rank3) + ->ArgName("N") + ->Arg(10) + ->UseManualTime() + ->Iterations(R); + +BENCHMARK(ViewResize_NoInit_Rank1) + ->ArgName("N") + ->Arg(10) + ->UseManualTime() + ->Iterations(R); + +BENCHMARK(ViewResize_NoInit_Rank1) + ->ArgName("N") + ->Arg(10) + ->UseManualTime() + ->Iterations(R); + +BENCHMARK(ViewResize_NoInit_Rank2) + ->ArgName("N") + ->Arg(10) + ->UseManualTime() + ->Iterations(R); + +BENCHMARK(ViewResize_NoInit_Rank2) + ->ArgName("N") + ->Arg(10) + ->UseManualTime() + ->Iterations(R); + +BENCHMARK(ViewResize_NoInit_Rank3) + ->ArgName("N") + ->Arg(10) + ->UseManualTime() + ->Iterations(R); + +BENCHMARK(ViewResize_NoInit_Rank3) + ->ArgName("N") + ->Arg(10) + ->UseManualTime() + ->Iterations(R); + +#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) +BENCHMARK(ViewResize_NoInit_Raw) + ->ArgName("N") + ->Arg(10) + ->UseManualTime() + ->Iterations(R); + +BENCHMARK(ViewResize_NoInit_Raw) + ->ArgName("N") + ->Arg(10) + ->UseManualTime() + ->Iterations(R); +#endif } // namespace Test diff --git a/core/perf_test/PerfTest_ViewResize_45.cpp b/core/perf_test/PerfTest_ViewResize_45.cpp index 69028fab08..76a5209209 100644 --- a/core/perf_test/PerfTest_ViewResize_45.cpp +++ b/core/perf_test/PerfTest_ViewResize_45.cpp @@ -18,11 +18,55 @@ namespace Test { -TEST(default_exec, 
ViewResize_Rank_45) { - printf("Resize View Performance for LayoutLeft:\n"); - run_resizeview_tests45(10, 1); - printf("Resize View Performance for LayoutRight:\n"); - run_resizeview_tests45(10, 1); -} +static constexpr int R = 10; +static constexpr int N = 10; + +BENCHMARK(ViewResize_Rank4) + ->ArgName("N") + ->Arg(N) + ->UseManualTime() + ->Iterations(R); + +BENCHMARK(ViewResize_Rank4) + ->ArgName("N") + ->Arg(N) + ->UseManualTime() + ->Iterations(R); + +BENCHMARK(ViewResize_Rank5) + ->ArgName("N") + ->Arg(N) + ->UseManualTime() + ->Iterations(R); + +BENCHMARK(ViewResize_Rank5) + ->ArgName("N") + ->Arg(N) + ->UseManualTime() + ->Iterations(R); + +BENCHMARK(ViewResize_NoInit_Rank4) + ->ArgName("N") + ->Arg(10) + ->UseManualTime() + ->Iterations(R); + +BENCHMARK(ViewResize_NoInit_Rank4) + ->ArgName("N") + ->Arg(10) + ->UseManualTime() + ->Iterations(R); + +BENCHMARK(ViewResize_NoInit_Rank5) + ->ArgName("N") + ->Arg(10) + ->UseManualTime() + ->Iterations(R); + +BENCHMARK(ViewResize_NoInit_Rank5) + ->ArgName("N") + ->Arg(10) + ->UseManualTime() + ->Iterations(R); } // namespace Test diff --git a/core/perf_test/PerfTest_ViewResize_6.cpp b/core/perf_test/PerfTest_ViewResize_6.cpp index 486b44a0c1..623a39e637 100644 --- a/core/perf_test/PerfTest_ViewResize_6.cpp +++ b/core/perf_test/PerfTest_ViewResize_6.cpp @@ -18,11 +18,31 @@ namespace Test { -TEST(default_exec, ViewResize_Rank6) { - printf("Resize View Performance for LayoutLeft:\n"); - run_resizeview_tests6(10, 1); - printf("Resize View Performance for LayoutRight:\n"); - run_resizeview_tests6(10, 1); -} +static constexpr int R = 10; +static constexpr int N = 10; + +BENCHMARK(ViewResize_Rank6) + ->ArgName("N") + ->Arg(N) + ->UseManualTime() + ->Iterations(R); + +BENCHMARK(ViewResize_Rank6) + ->ArgName("N") + ->Arg(N) + ->UseManualTime() + ->Iterations(R); + +BENCHMARK(ViewResize_NoInit_Rank6) + ->ArgName("N") + ->Arg(10) + ->UseManualTime() + ->Iterations(R); + +BENCHMARK(ViewResize_NoInit_Rank6) + ->ArgName("N") + 
->Arg(10) + ->UseManualTime() + ->Iterations(R); } // namespace Test diff --git a/core/perf_test/PerfTest_ViewResize_7.cpp b/core/perf_test/PerfTest_ViewResize_7.cpp index 84c2a79ad6..9ecf320e64 100644 --- a/core/perf_test/PerfTest_ViewResize_7.cpp +++ b/core/perf_test/PerfTest_ViewResize_7.cpp @@ -18,11 +18,31 @@ namespace Test { -TEST(default_exec, ViewResize_Rank7) { - printf("Resize View Performance for LayoutLeft:\n"); - run_resizeview_tests7(10, 1); - printf("Resize View Performance for LayoutRight:\n"); - run_resizeview_tests7(10, 1); -} +static constexpr int R = 10; +static constexpr int N = 10; + +BENCHMARK(ViewResize_Rank7) + ->ArgName("N") + ->Arg(N) + ->UseManualTime() + ->Iterations(R); + +BENCHMARK(ViewResize_Rank7) + ->ArgName("N") + ->Arg(N) + ->UseManualTime() + ->Iterations(R); + +BENCHMARK(ViewResize_NoInit_Rank7) + ->ArgName("N") + ->Arg(10) + ->UseManualTime() + ->Iterations(R); + +BENCHMARK(ViewResize_NoInit_Rank7) + ->ArgName("N") + ->Arg(10) + ->UseManualTime() + ->Iterations(R); } // namespace Test diff --git a/core/perf_test/PerfTest_ViewResize_8.cpp b/core/perf_test/PerfTest_ViewResize_8.cpp index 25910fb575..c213160102 100644 --- a/core/perf_test/PerfTest_ViewResize_8.cpp +++ b/core/perf_test/PerfTest_ViewResize_8.cpp @@ -18,19 +18,37 @@ namespace Test { -TEST(default_exec, ViewResize_Rank8) { +static constexpr int R = 10; + // FIXME_SYCL Avoid running out of resources on the CUDA GPU used in the CI #ifdef KOKKOS_ENABLE_SYCL - printf("Resize View Performance for LayoutLeft:\n"); - run_resizeview_tests8(9, 1); - printf("Resize View Performance for LayoutRight:\n"); - run_resizeview_tests8(9, 1); +static constexpr int N = 9; #else - printf("Resize View Performance for LayoutLeft:\n"); - run_resizeview_tests8(10, 1); - printf("Resize View Performance for LayoutRight:\n"); - run_resizeview_tests8(10, 1); +static constexpr int N = 10; #endif -} + +BENCHMARK(ViewResize_Rank8) + ->ArgName("N") + ->Arg(N) + ->UseManualTime() + ->Iterations(R); + 
+BENCHMARK(ViewResize_Rank8) + ->ArgName("N") + ->Arg(N) + ->UseManualTime() + ->Iterations(R); + +BENCHMARK(ViewResize_NoInit_Rank8) + ->ArgName("N") + ->Arg(10) + ->UseManualTime() + ->Iterations(R); + +BENCHMARK(ViewResize_NoInit_Rank8) + ->ArgName("N") + ->Arg(10) + ->UseManualTime() + ->Iterations(R); } // namespace Test From 9126797fe287b94832ec8f07bdfcc930492ae2d1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cezary=20Skrzy=C5=84ski?= Date: Mon, 19 Sep 2022 12:20:24 +0200 Subject: [PATCH 175/496] Clean-up Benchmark_Context and hide implementation details --- core/perf_test/Benchmark_Context.cpp | 45 +++++++++++++++--- core/perf_test/Benchmark_Context.hpp | 68 ++++------------------------ 2 files changed, 49 insertions(+), 64 deletions(-) diff --git a/core/perf_test/Benchmark_Context.cpp b/core/perf_test/Benchmark_Context.cpp index ebca0964cf..7cb0079fd9 100644 --- a/core/perf_test/Benchmark_Context.cpp +++ b/core/perf_test/Benchmark_Context.cpp @@ -44,17 +44,50 @@ #include -namespace Test { +namespace KokkosBenchmark { /** - * \brief Mark the label as a figure of merit. + * \brief Remove unwanted spaces and colon signs from input string. In case of + * invalid input it will return an empty string. 
*/ -std::string benchmark_fom(const std::string& label) { return "FOM: " + label; } +std::string remove_unwanted_characters(const std::string& str) { + auto from = str.find_first_not_of(" :"); + auto to = str.find_last_not_of(" :"); -void add_benchmark_context(bool verbose) { + if (from == std::string::npos || to == std::string::npos) { + return ""; + } + + // return extracted part of string without unwanted spaces and colon signs + return str.substr(from, to + 1); +} + +/** + * \brief Extract all key:value pairs from kokkos configuration and add it to + * the benchmark context + */ +void add_kokkos_configuration(bool verbose) { std::ostringstream msg; Kokkos::print_configuration(msg, verbose); - benchmark::AddCustomContext("Kokkos configuration", msg.str()); + + // Iterate over lines returned from kokkos and extract key:value pairs + std::stringstream ss{msg.str()}; + for (std::string line; std::getline(ss, line, '\n');) { + auto found = line.find_first_of(':'); + if (found != std::string::npos) { + auto val = remove_unwanted_characters(line.substr(found + 1)); + // Ignore line without value, for example a category name + if (!val.empty()) { + benchmark::AddCustomContext( + remove_unwanted_characters(line.substr(0, found)), val); + } + } + } +} + +void add_benchmark_context(bool verbose) { + // Add Kokkos configuration to benchmark context data + add_kokkos_configuration(verbose); } -} // namespace Test +} // namespace KokkosBenchmark diff --git a/core/perf_test/Benchmark_Context.hpp b/core/perf_test/Benchmark_Context.hpp index af61f0c014..55d95d9395 100644 --- a/core/perf_test/Benchmark_Context.hpp +++ b/core/perf_test/Benchmark_Context.hpp @@ -34,64 +34,16 @@ namespace KokkosBenchmark { -std::string benchmark_fom(const std::string& label); - -/// \brief Remove unwanted spaces and colon signs from input string. In case of -/// invalid input it will return an empty string. 
-std::string remove_unwanted_characters(std::string str) { - auto from = str.find_first_not_of(" :"); - auto to = str.find_last_not_of(" :"); - - if (from == std::string::npos || to == std::string::npos) { - return ""; - } - - // return extracted part of string without unwanted spaces and colon signs - return str.substr(from, to + 1); -} - -/// \brief Extract all key:value pairs from kokkos configuration and add it to -/// the benchmark context -void add_kokkos_configuration(bool verbose) { - std::ostringstream msg; - Kokkos::print_configuration(msg, verbose); - - // Iterate over lines returned from kokkos and extract key:value pairs - std::stringstream ss{msg.str()}; - for (std::string line; std::getline(ss, line, '\n');) { - auto found = line.find_first_of(':'); - if (found != std::string::npos) { - auto val = remove_unwanted_characters(line.substr(found + 1)); - // Ignore line without value, for example a category name - if (!val.empty()) { - benchmark::AddCustomContext( - remove_unwanted_characters(line.substr(0, found)), val); - } - } - } -} - -/// \brief Add all data related to git to benchmark context -void add_git_info() { - if (!Kokkos::Impl::GIT_BRANCH.empty()) { - benchmark::AddCustomContext("GIT_BRANCH", Kokkos::Impl::GIT_BRANCH); - benchmark::AddCustomContext("GIT_COMMIT_HASH", - Kokkos::Impl::GIT_COMMIT_HASH); - benchmark::AddCustomContext("GIT_CLEAN_STATUS", - Kokkos::Impl::GIT_CLEAN_STATUS); - benchmark::AddCustomContext("GIT_COMMIT_DESCRIPTION", - Kokkos::Impl::GIT_COMMIT_DESCRIPTION); - benchmark::AddCustomContext("GIT_COMMIT_DATE", - Kokkos::Impl::GIT_COMMIT_DATE); - } -} - -/// \brief Gather all context information and add it to benchmark context data -void add_benchmark_context(bool verbose = false) { - // Add Kokkos configuration to benchmark context data - add_kokkos_configuration(verbose); - // Add git information to benchmark context data - add_git_info(); +/** + * \brief Gather all context information and add it to benchmark context data + 
*/ +void add_benchmark_context(bool verbose = false); + +/** + * \brief Mark the label as a figure of merit. + */ +inline std::string benchmark_fom(const std::string& label) { + return "FOM: " + label; } } // namespace KokkosBenchmark From 66e53a9555426b7d1a3c9dc0b5a9a40c1f2f6bc9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cezary=20Skrzy=C5=84ski?= Date: Wed, 21 Sep 2022 14:53:06 +0200 Subject: [PATCH 176/496] Remove redundant include --- core/perf_test/PerfTest_Category.hpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/core/perf_test/PerfTest_Category.hpp b/core/perf_test/PerfTest_Category.hpp index 126c961147..0cfbea9dd3 100644 --- a/core/perf_test/PerfTest_Category.hpp +++ b/core/perf_test/PerfTest_Category.hpp @@ -17,8 +17,6 @@ #ifndef KOKKOS_TEST_PERFTEST_CAT_HPP #define KOKKOS_TEST_PERFTEST_CAT_HPP -#include - namespace Test { extern int command_line_num_args(int n = 0); From 7c9f640ee662d51fa4a65df8a30523efd1d46c1a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cezary=20Skrzy=C5=84ski?= Date: Wed, 21 Sep 2022 14:53:27 +0200 Subject: [PATCH 177/496] Port ViewAllocate tests --- core/perf_test/CMakeLists.txt | 13 +- core/perf_test/PerfTest_ViewAllocate.cpp | 284 ++++++++++++++++------- 2 files changed, 196 insertions(+), 101 deletions(-) diff --git a/core/perf_test/CMakeLists.txt b/core/perf_test/CMakeLists.txt index 5df2c95d23..bd07893576 100644 --- a/core/perf_test/CMakeLists.txt +++ b/core/perf_test/CMakeLists.txt @@ -14,17 +14,6 @@ IF(KOKKOS_ENABLE_TESTS) PerfTestHexGrad.cpp PerfTest_CustomReduction.cpp PerfTest_ExecSpacePartitioning.cpp - PerfTest_ViewAllocate.cpp - PerfTest_ViewFill_123.cpp - PerfTest_ViewFill_45.cpp - PerfTest_ViewFill_6.cpp - PerfTest_ViewFill_7.cpp - PerfTest_ViewFill_8.cpp - PerfTest_ViewResize_123.cpp - PerfTest_ViewResize_45.cpp - PerfTest_ViewResize_6.cpp - PerfTest_ViewResize_7.cpp - PerfTest_ViewResize_8.cpp ) IF(Kokkos_ENABLE_OPENMPTARGET) @@ -179,6 +168,8 @@ ENDFUNCTION() SET( BENCHMARK_SOURCES BenchmarkMain.cpp + 
Benchmark_Context.cpp + PerfTest_ViewAllocate.cpp PerfTest_ViewCopy_a123.cpp PerfTest_ViewCopy_b123.cpp PerfTest_ViewCopy_c123.cpp diff --git a/core/perf_test/PerfTest_ViewAllocate.cpp b/core/perf_test/PerfTest_ViewAllocate.cpp index c1d9895847..3129f99ee7 100644 --- a/core/perf_test/PerfTest_ViewAllocate.cpp +++ b/core/perf_test/PerfTest_ViewAllocate.cpp @@ -15,119 +15,223 @@ //@HEADER #include -#include -#include -#include +#include +#include namespace Test { +static constexpr int N = 10; + +void report_results_allocate(benchmark::State& state, double time) { + state.SetIterationTime(time); + const auto N8 = std::pow(state.range(0), 8); + const auto size = 1.0 * N8 * 8 / 1024 / 1024; + + state.counters["MB"] = benchmark::Counter(size, benchmark::Counter::kDefaults, + benchmark::Counter::OneK::kIs1024); + state.counters[KokkosBenchmark::benchmark_fom("GB/s")] = + benchmark::Counter(size / 1024 / time, benchmark::Counter::kDefaults, + benchmark::Counter::OneK::kIs1024); +} + template -void run_allocateview_tests(int N, int R) { - const int N1 = N; - const int N2 = N * N; - const int N3 = N2 * N; - const int N4 = N2 * N2; - const int N8 = N4 * N4; - - double time1, time2, time3, time4, time5, time6, time7, time8, - time_raw = 100000.0; - { +static void ViewAllocate_Rank1(benchmark::State& state) { + const int N8 = std::pow(state.range(0), 8); + + for (auto _ : state) { Kokkos::Timer timer; - for (int r = 0; r < R; r++) { - Kokkos::View a("A1", N8); - } - time1 = timer.seconds() / R; + Kokkos::View a("A1", N8); + report_results_allocate(state, timer.seconds()); } - { +} + +template +static void ViewAllocate_Rank2(benchmark::State& state) { + const int N4 = std::pow(state.range(0), 4); + + for (auto _ : state) { Kokkos::Timer timer; - for (int r = 0; r < R; r++) { - Kokkos::View a("A2", N4, N4); - } - time2 = timer.seconds() / R; + Kokkos::View a("A2", N4, N4); + report_results_allocate(state, timer.seconds()); } - { +} + +template +static void 
ViewAllocate_Rank3(benchmark::State& state) { + const int N2 = std::pow(state.range(0), 2); + const int N3 = std::pow(state.range(0), 3); + + for (auto _ : state) { Kokkos::Timer timer; - for (int r = 0; r < R; r++) { - Kokkos::View a("A3", N3, N3, N2); - } - time3 = timer.seconds() / R; + Kokkos::View a("A3", N3, N3, N2); + report_results_allocate(state, timer.seconds()); } - { +} + +template +static void ViewAllocate_Rank4(benchmark::State& state) { + const int N2 = std::pow(state.range(0), 2); + + for (auto _ : state) { Kokkos::Timer timer; - for (int r = 0; r < R; r++) { - Kokkos::View a("A4", N2, N2, N2, N2); - } - time4 = timer.seconds() / R; + Kokkos::View a("A4", N2, N2, N2, N2); + report_results_allocate(state, timer.seconds()); } - { +} + +template +static void ViewAllocate_Rank5(benchmark::State& state) { + const int N1 = state.range(0); + const int N2 = N1 * N1; + + for (auto _ : state) { Kokkos::Timer timer; - for (int r = 0; r < R; r++) { - Kokkos::View a("A5", N2, N2, N1, N1, N2); - } - time5 = timer.seconds() / R; + Kokkos::View a("A5", N2, N2, N1, N1, N2); + report_results_allocate(state, timer.seconds()); } - { +} + +template +static void ViewAllocate_Rank6(benchmark::State& state) { + const int N1 = state.range(0); + const int N2 = N1 * N1; + + for (auto _ : state) { Kokkos::Timer timer; - for (int r = 0; r < R; r++) { - Kokkos::View a("A6", N2, N1, N1, N1, N1, N2); - } - time6 = timer.seconds() / R; + Kokkos::View a("A6", N2, N1, N1, N1, N1, N2); + report_results_allocate(state, timer.seconds()); } - { +} + +template +static void ViewAllocate_Rank7(benchmark::State& state) { + const int N1 = state.range(0); + const int N2 = N1 * N1; + + for (auto _ : state) { Kokkos::Timer timer; - for (int r = 0; r < R; r++) { - Kokkos::View a("A7", N2, N1, N1, N1, N1, N1, N1); - } - time7 = timer.seconds() / R; + Kokkos::View a("A7", N2, N1, N1, N1, N1, N1, N1); + report_results_allocate(state, timer.seconds()); } - { +} + +template +static void 
ViewAllocate_Rank8(benchmark::State& state) { + const int N1 = state.range(0); + + for (auto _ : state) { Kokkos::Timer timer; - for (int r = 0; r < R; r++) { - Kokkos::View a("A8", N1, N1, N1, N1, N1, N1, N1, - N1); - } - time8 = timer.seconds() / R; + Kokkos::View a("A8", N1, N1, N1, N1, N1, N1, N1, + N1); + report_results_allocate(state, timer.seconds()); } -#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) - { +} + +template +static void ViewAllocate_Raw(benchmark::State& state) { + const int N8 = std::pow(state.range(0), 8); + for (auto _ : state) { Kokkos::Timer timer; - for (int r = 0; r < R; r++) { - double* a_ptr = - static_cast(Kokkos::kokkos_malloc("A", sizeof(double) * N8)); - Kokkos::parallel_for( - N8, KOKKOS_LAMBDA(const int& i) { a_ptr[i] = 0.0; }); - Kokkos::fence(); - Kokkos::kokkos_free(a_ptr); - } - time_raw = timer.seconds() / R; + double* a_ptr = + static_cast(Kokkos::kokkos_malloc("A", sizeof(double) * N8)); + Kokkos::parallel_for( + N8, KOKKOS_LAMBDA(const int& i) { a_ptr[i] = 0.0; }); + Kokkos::fence(); + Kokkos::kokkos_free(a_ptr); + report_results_allocate(state, timer.seconds()); } -#endif - double size = 1.0 * N8 * 8 / 1024 / 1024; - printf(" Raw: %lf s %lf MB %lf GB/s\n", time_raw, size, - size / 1024 / time_raw); - printf(" Rank1: %lf s %lf MB %lf GB/s\n", time1, size, - size / 1024 / time1); - printf(" Rank2: %lf s %lf MB %lf GB/s\n", time2, size, - size / 1024 / time2); - printf(" Rank3: %lf s %lf MB %lf GB/s\n", time3, size, - size / 1024 / time3); - printf(" Rank4: %lf s %lf MB %lf GB/s\n", time4, size, - size / 1024 / time4); - printf(" Rank5: %lf s %lf MB %lf GB/s\n", time5, size, - size / 1024 / time5); - printf(" Rank6: %lf s %lf MB %lf GB/s\n", time6, size, - size / 1024 / time6); - printf(" Rank7: %lf s %lf MB %lf GB/s\n", time7, size, - size / 1024 / time7); - printf(" Rank8: %lf s %lf MB %lf GB/s\n", time8, size, - size / 1024 / time8); } -TEST(default_exec, ViewCreate) { - printf("Create View 
Performance for LayoutLeft:\n"); - run_allocateview_tests(10, 1); - printf("Create View Performance for LayoutRight:\n"); - run_allocateview_tests(10, 1); -} +BENCHMARK(ViewAllocate_Rank1) + ->ArgName("N") + ->Arg(N) + ->UseManualTime(); + +BENCHMARK(ViewAllocate_Rank1) + ->ArgName("N") + ->Arg(N) + ->UseManualTime(); + +BENCHMARK(ViewAllocate_Rank2) + ->ArgName("N") + ->Arg(N) + ->UseManualTime(); + +BENCHMARK(ViewAllocate_Rank2) + ->ArgName("N") + ->Arg(N) + ->UseManualTime(); + +BENCHMARK(ViewAllocate_Rank3) + ->ArgName("N") + ->Arg(N) + ->UseManualTime(); + +BENCHMARK(ViewAllocate_Rank3) + ->ArgName("N") + ->Arg(N) + ->UseManualTime(); + +BENCHMARK(ViewAllocate_Rank4) + ->ArgName("N") + ->Arg(N) + ->UseManualTime(); + +BENCHMARK(ViewAllocate_Rank4) + ->ArgName("N") + ->Arg(N) + ->UseManualTime(); + +BENCHMARK(ViewAllocate_Rank5) + ->ArgName("N") + ->Arg(N) + ->UseManualTime(); + +BENCHMARK(ViewAllocate_Rank5) + ->ArgName("N") + ->Arg(N) + ->UseManualTime(); + +BENCHMARK(ViewAllocate_Rank6) + ->ArgName("N") + ->Arg(N) + ->UseManualTime(); + +BENCHMARK(ViewAllocate_Rank6) + ->ArgName("N") + ->Arg(N) + ->UseManualTime(); + +BENCHMARK(ViewAllocate_Rank7) + ->ArgName("N") + ->Arg(N) + ->UseManualTime(); + +BENCHMARK(ViewAllocate_Rank7) + ->ArgName("N") + ->Arg(N) + ->UseManualTime(); + +BENCHMARK(ViewAllocate_Rank8) + ->ArgName("N") + ->Arg(N) + ->UseManualTime(); + +BENCHMARK(ViewAllocate_Rank8) + ->ArgName("N") + ->Arg(N) + ->UseManualTime(); + +#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) +BENCHMARK(ViewAllocate_Raw) + ->ArgName("N") + ->Arg(N) + ->UseManualTime(); + +BENCHMARK(ViewAllocate_Raw) + ->ArgName("N") + ->Arg(N) + ->UseManualTime(); +#endif } // namespace Test From 063fe9a14800baf2d3ef1db03ee163d20dcc4f04 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cezary=20Skrzy=C5=84ski?= Date: Wed, 21 Sep 2022 17:10:38 +0200 Subject: [PATCH 178/496] Port HexGrad tests --- core/perf_test/CMakeLists.txt | 2 +- core/perf_test/PerfTestHexGrad.cpp 
| 86 +++++++++--------------------- 2 files changed, 27 insertions(+), 61 deletions(-) diff --git a/core/perf_test/CMakeLists.txt b/core/perf_test/CMakeLists.txt index bd07893576..0fb2419867 100644 --- a/core/perf_test/CMakeLists.txt +++ b/core/perf_test/CMakeLists.txt @@ -11,7 +11,6 @@ IF(KOKKOS_ENABLE_TESTS) SET(SOURCES PerfTestMain.cpp PerfTestGramSchmidt.cpp - PerfTestHexGrad.cpp PerfTest_CustomReduction.cpp PerfTest_ExecSpacePartitioning.cpp ) @@ -169,6 +168,7 @@ SET( BENCHMARK_SOURCES BenchmarkMain.cpp Benchmark_Context.cpp + PerfTestHexGrad.cpp PerfTest_ViewAllocate.cpp PerfTest_ViewCopy_a123.cpp PerfTest_ViewCopy_b123.cpp diff --git a/core/perf_test/PerfTestHexGrad.cpp b/core/perf_test/PerfTestHexGrad.cpp index ef92de7ce1..e1d89dd9dd 100644 --- a/core/perf_test/PerfTestHexGrad.cpp +++ b/core/perf_test/PerfTestHexGrad.cpp @@ -15,7 +15,8 @@ //@HEADER #include -#include +#include +#include #include namespace Test { @@ -195,78 +196,43 @@ struct HexGrad { //-------------------------------------------------------------------------- - static double test(const int count, const int iter = 1) { + static double test(const int count) { elem_coord_type coord("coord", count); elem_grad_type grad("grad", count); // Execute the parallel kernels on the arrays: - - double dt_min = 0; - Kokkos::parallel_for(count, Init(coord)); execution_space().fence(); - for (int i = 0; i < iter; ++i) { - Kokkos::Timer timer; - Kokkos::parallel_for(count, HexGrad(coord, grad)); - execution_space().fence(); - const double dt = timer.seconds(); - if (0 == i) - dt_min = dt; - else - dt_min = dt < dt_min ? 
dt : dt_min; - } - - return dt_min; + Kokkos::Timer timer; + Kokkos::parallel_for(count, HexGrad(coord, grad)); + execution_space().fence(); + return timer.seconds(); } }; -template -void run_test_hexgrad(int exp_beg, int exp_end, int num_trials, - const char deviceTypeName[]) { - std::string label_hexgrad; - label_hexgrad.append("\"HexGrad< double , "); - label_hexgrad.append(deviceTypeName); - label_hexgrad.append(" >\""); - - for (int i = exp_beg; i < exp_end; ++i) { - double min_seconds = 0.0; - double max_seconds = 0.0; - double avg_seconds = 0.0; - - const int parallel_work_length = 1 << i; - - for (int j = 0; j < num_trials; ++j) { - const double seconds = HexGrad::test(parallel_work_length); - - if (0 == j) { - min_seconds = seconds; - max_seconds = seconds; - } else { - if (seconds < min_seconds) min_seconds = seconds; - if (seconds > max_seconds) max_seconds = seconds; - } - avg_seconds += seconds; - } - avg_seconds /= num_trials; +template +static void HexGrad_Benchmark(benchmark::State& state) { + const auto parallel_work_length = state.range(0); + + for (auto _ : state) { + const auto time = + HexGrad::test( + parallel_work_length); - std::cout << label_hexgrad << " , " << parallel_work_length << " , " - << min_seconds << " , " << (min_seconds / parallel_work_length) - << avg_seconds << std::endl; + state.SetIterationTime(time); + state.counters["Count"] = benchmark::Counter(parallel_work_length); + state.counters["Time normalized"] = + benchmark::Counter(time / parallel_work_length); } } -TEST(default_exec, hexgrad) { - int exp_beg = 10; - int exp_end = 20; - int num_trials = 5; - - if (command_line_num_args() > 1) exp_beg = std::stoi(command_line_arg(1)); - if (command_line_num_args() > 2) exp_end = std::stoi(command_line_arg(2)); - if (command_line_num_args() > 3) num_trials = std::stoi(command_line_arg(3)); - - EXPECT_NO_THROW(run_test_hexgrad( - exp_beg, exp_end, num_trials, Kokkos::DefaultExecutionSpace::name())); -} +BENCHMARK(HexGrad_Benchmark) 
+ ->ArgName("count") + ->ArgsProduct({ + benchmark::CreateRange(1 << 10, 1 << 19, 2), + }) + ->UseManualTime() + ->Iterations(5); } // namespace Test From 4b8e0e1cca2cbc0773771a9f074ec3f46b9270d1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cezary=20Skrzy=C5=84ski?= Date: Thu, 22 Sep 2022 16:39:29 +0200 Subject: [PATCH 179/496] Port Atomic MinMax tests --- core/perf_test/CMakeLists.txt | 17 + core/perf_test/test_atomic_minmax_simple.cpp | 495 +++++++++++-------- 2 files changed, 306 insertions(+), 206 deletions(-) diff --git a/core/perf_test/CMakeLists.txt b/core/perf_test/CMakeLists.txt index 0fb2419867..6566ca9a76 100644 --- a/core/perf_test/CMakeLists.txt +++ b/core/perf_test/CMakeLists.txt @@ -89,6 +89,14 @@ IF(KOKKOS_ENABLE_TESTS) ENDIF() # Find or download google/benchmark library +IF(NOT Kokkos_ENABLE_BENCHMARKS) + RETURN() +ENDIF() + +IF (KOKKOS_HAS_TRILINOS) + message(FATAL_ERROR "Benchmarks are not supported when building as part of Trilinos") +ENDIF() + find_package(benchmark QUIET) IF(benchmark_FOUND) MESSAGE(STATUS "Using google benchmark found in ${benchmark_DIR}") @@ -209,3 +217,12 @@ KOKKOS_ADD_BENCHMARK( PerformanceTest_Benchmark SOURCES ${BENCHMARK_SOURCES} ) + +IF(NOT KOKKOS_ENABLE_CUDA OR KOKKOS_ENABLE_CUDA_LAMBDA) + KOKKOS_ADD_BENCHMARK( + Benchmark_Atomic_MinMax + SOURCES + Benchmark_Context.cpp + test_atomic_minmax_simple.cpp + ) +ENDIF() diff --git a/core/perf_test/test_atomic_minmax_simple.cpp b/core/perf_test/test_atomic_minmax_simple.cpp index 4c2ae5c2d1..f4fde65848 100644 --- a/core/perf_test/test_atomic_minmax_simple.cpp +++ b/core/perf_test/test_atomic_minmax_simple.cpp @@ -21,240 +21,323 @@ // core/src/libkokkoscore.a -ldl && OMP_NUM_THREADS=1 // ./test_atomic_minmax_simple.x 10000000 -#include -#include - -#include -#include +#include +#include +#include #include -#include using exec_space = Kokkos::DefaultExecutionSpace; +constexpr int LENGTH = 1000000; + +template +Kokkos::View prepare_input(const int length, const T value) { + 
Kokkos::View input("input", length); + Kokkos::parallel_for( + length, KOKKOS_LAMBDA(const int i) { input(i) = value; }); + Kokkos::fence(); + return input; +} + +int get_length(benchmark::State& state) { + return (Test::command_line_num_args() == 2) + ? std::stoi(Test::command_line_arg(1)) + : state.range(0); +} + template -void test(const int length) { +int check_errors_replacement(Kokkos::View view) { + int errors = 0; + Kokkos::parallel_reduce( + view.size(), + KOKKOS_LAMBDA(const int i, int& inner) { inner += (view(i) != (T)i); }, + errors); + Kokkos::fence(); + return errors; +} + +template +double atomic_min_replacement(Kokkos::View input) { + const int length = input.size(); Kokkos::Timer timer; + Kokkos::parallel_for( + length, KOKKOS_LAMBDA(const int i) { + (void)Kokkos::atomic_fetch_min(&(input(i)), (T)i); + }); + Kokkos::fence(); + return timer.seconds(); +} - using vector = Kokkos::View; - - vector inp("input", length); - T max = std::numeric_limits::max(); - T min = std::numeric_limits::lowest(); - - // input is max values - all min atomics will replace - { - Kokkos::parallel_for( - length, KOKKOS_LAMBDA(const int i) { inp(i) = max; }); - Kokkos::fence(); - - timer.reset(); - Kokkos::parallel_for( - length, KOKKOS_LAMBDA(const int i) { - (void)Kokkos::atomic_fetch_min(&(inp(i)), (T)i); - }); - Kokkos::fence(); - double time = timer.seconds(); - - int errors(0); - Kokkos::parallel_reduce( - length, - KOKKOS_LAMBDA(const int i, int& inner) { inner += (inp(i) != (T)i); }, - errors); - Kokkos::fence(); - - if (errors) { - std::cerr << "Error in 100% min replacements: " << errors << std::endl; - std::cerr << "inp(0)=" << inp(0) << std::endl; +template +static void Atomic_MinReplacements(benchmark::State& state) { + const int length = get_length(state); + auto inp = prepare_input(length, std::numeric_limits::max()); + + for (auto _ : state) { + const auto time = atomic_min_replacement(inp); + const auto errors = check_errors_replacement(inp); + + // report 
results + state.SetIterationTime(time); + if (errors > 0) { + state.counters["Errors"] = benchmark::Counter(errors); } - std::cout << "Time for 100% min replacements: " << time << std::endl; } +} + +template +double atomic_max_replacement(Kokkos::View input) { + const int length = input.size(); + Kokkos::Timer timer; + Kokkos::parallel_for( + length, KOKKOS_LAMBDA(const int i) { + (void)Kokkos::atomic_max_fetch(&(input(i)), (T)i); + }); + Kokkos::fence(); + return timer.seconds(); +} - // input is min values - all max atomics will replace - { - Kokkos::parallel_for( - length, KOKKOS_LAMBDA(const int i) { inp(i) = min; }); - Kokkos::fence(); - - timer.reset(); - Kokkos::parallel_for( - length, KOKKOS_LAMBDA(const int i) { - (void)Kokkos::atomic_max_fetch(&(inp(i)), (T)i); - }); - Kokkos::fence(); - double time = timer.seconds(); - - int errors(0); - Kokkos::parallel_reduce( - length, - KOKKOS_LAMBDA(const int i, int& inner) { inner += (inp(i) != (T)i); }, - errors); - Kokkos::fence(); - - if (errors) { - std::cerr << "Error in 100% max replacements: " << errors << std::endl; - std::cerr << "inp(0)=" << inp(0) << std::endl; +template +static void Atomic_MaxReplacements(benchmark::State& state) { + const auto length = get_length(state); + auto inp = prepare_input(length, std::numeric_limits::lowest()); + + for (auto _ : state) { + const auto time = atomic_max_replacement(inp); + const auto errors = check_errors_replacement(inp); + + // report results + state.SetIterationTime(time); + if (errors > 0) { + state.counters["Errors"] = benchmark::Counter(errors); } - std::cout << "Time for 100% max replacements: " << time << std::endl; } +} - // input is max values - all max atomics will early exit - { - Kokkos::parallel_for( - length, KOKKOS_LAMBDA(const int i) { inp(i) = max; }); - Kokkos::fence(); - - timer.reset(); - Kokkos::parallel_for( - length, KOKKOS_LAMBDA(const int i) { - (void)Kokkos::atomic_max_fetch(&(inp(i)), (T)i); - }); - Kokkos::fence(); - double time = 
timer.seconds(); - - int errors(0); - Kokkos::parallel_reduce( - length, - KOKKOS_LAMBDA(const int i, int& inner) { - T ref = max; - inner += (inp(i) != ref); - }, - errors); - Kokkos::fence(); - - if (errors) { - std::cerr << "Error in 100% max early exits: " << errors << std::endl; - std::cerr << "inp(0)=" << inp(0) << std::endl; +template +int check_errors_early_exit(Kokkos::View view, const T ref) { + int errors = 0; + Kokkos::parallel_reduce( + view.size(), + KOKKOS_LAMBDA(const int i, int& inner) { inner += (view(i) != ref); }, + errors); + Kokkos::fence(); + return errors; +} + +template +static void Atomic_MaxEarlyExits(benchmark::State& state) { + const auto length = get_length(state); + auto inp = prepare_input(length, std::numeric_limits::max()); + + for (auto _ : state) { + const auto time = atomic_max_replacement(inp); + const auto errors = + check_errors_early_exit(inp, std::numeric_limits::max()); + + // report results + state.SetIterationTime(time); + if (errors > 0) { + state.counters["Errors"] = benchmark::Counter(errors); } - std::cout << "Time for 100% max early exits: " << time << std::endl; } +} - // input is min values - all min atomics will early exit - { - Kokkos::parallel_for( - length, KOKKOS_LAMBDA(const int i) { inp(i) = min; }); - Kokkos::fence(); - - timer.reset(); - Kokkos::parallel_for( - length, KOKKOS_LAMBDA(const int i) { - (void)Kokkos::atomic_min_fetch(&(inp(i)), (T)i); - }); - Kokkos::fence(); - double time = timer.seconds(); - - int errors(0); - Kokkos::parallel_reduce( - length, - KOKKOS_LAMBDA(const int i, int& inner) { - T ref = min; - inner += (inp(i) != ref); - }, - errors); - Kokkos::fence(); - - if (errors) { - std::cerr << "Error in 100% min early exits: " << errors << std::endl; - std::cerr << "inp(0)=" << inp(0) << std::endl; - if (length > 9) std::cout << "inp(9)=" << inp(9) << std::endl; +template +static void Atomic_MinEarlyExits(benchmark::State& state) { + const auto length = get_length(state); + auto inp = 
prepare_input(length, std::numeric_limits::lowest()); + + for (auto _ : state) { + const auto time = atomic_min_replacement(inp); + const auto errors = + check_errors_early_exit(inp, std::numeric_limits::lowest()); + + // report results + state.SetIterationTime(time); + if (errors > 0) { + state.counters["Errors"] = benchmark::Counter(errors); } - std::cout << "Time for 100% min early exits: " << time << std::endl; } +} - // limit iterations for contentious test, takes ~50x longer for same length - auto con_length = length / 5; - // input is min values - some max atomics will replace - { - Kokkos::parallel_for( - 1, KOKKOS_LAMBDA(const int i) { inp(i) = min; }); - Kokkos::fence(); - - T current(0); - timer.reset(); - Kokkos::parallel_reduce( - con_length, - KOKKOS_LAMBDA(const int i, T& inner) { - inner = Kokkos::atomic_max_fetch(&(inp(0)), inner + 1); - if (i == con_length - 1) { - Kokkos::atomic_max_fetch(&(inp(0)), max); - inner = max; - } - }, - Kokkos::Max(current)); - Kokkos::fence(); - double time = timer.seconds(); - - if (current < max) { - std::cerr << "Error in contentious max replacements: " << std::endl; - std::cerr << "final=" << current << " inp(0)=" << inp(0) << " max=" << max - << std::endl; - } - std::cout << "Time for contentious max " << con_length - << " replacements: " << time << std::endl; +template +void report_errors_contentious_replacement(benchmark::State& state, + const T final, const T first, + const T expected) { + state.counters["Errors"] = benchmark::Counter(1); + state.counters["Final"] = benchmark::Counter(final); + state.counters["First"] = benchmark::Counter(first); + state.counters["Expected"] = benchmark::Counter(expected); +} + +template +double atomic_contentious_max_replacement(benchmark::State& state, + Kokkos::View input, + const int con_length) { + const auto max = std::numeric_limits::max(); + T current = 0; + + Kokkos::Timer timer; + Kokkos::parallel_reduce( + con_length, + KOKKOS_LAMBDA(const int i, T& inner) { + inner 
= Kokkos::atomic_max_fetch(&(input(0)), inner + 1); + if (i == con_length - 1) { + Kokkos::atomic_max_fetch(&(input(0)), max); + inner = max; + } + }, + Kokkos::Max(current)); + Kokkos::fence(); + const auto time = timer.seconds(); + + if (current < max) { + report_errors_contentious_replacement(state, current, input(0), max); } - // input is max values - some min atomics will replace - { - Kokkos::parallel_for( - 1, KOKKOS_LAMBDA(const int i) { inp(i) = max; }); - Kokkos::fence(); - - timer.reset(); - T current(100000000); - Kokkos::parallel_reduce( - con_length, - KOKKOS_LAMBDA(const int i, T& inner) { - inner = Kokkos::atomic_min_fetch(&(inp(0)), inner - 1); - if (i == con_length - 1) { - Kokkos::atomic_min_fetch(&(inp(0)), min); - inner = min; - } - }, - Kokkos::Min(current)); - Kokkos::fence(); - double time = timer.seconds(); - - if (current > min) { - std::cerr << "Error in contentious min replacements: " << std::endl; - std::cerr << "final=" << current << " inp(0)=" << inp(0) << " min=" << min - << std::endl; - } - std::cout << "Time for contentious min " << con_length - << " replacements: " << time << std::endl; + return time; +} + +template +static void Atomic_ContentiousMaxReplacements(benchmark::State& state) { + const auto length = get_length(state); + auto inp = prepare_input(1, std::numeric_limits::lowest()); + + for (auto _ : state) { + const auto time = atomic_contentious_max_replacement(state, inp, length); + + state.SetIterationTime(time); } } -int main(int argc, char* argv[]) { - Kokkos::initialize(argc, argv); - { - int length = 1000000; - if (argc == 2) { - length = std::stoi(argv[1]); - } +template +double atomic_contentious_min_replacement(benchmark::State& state, + Kokkos::View input, + const int con_length) { + const auto min = std::numeric_limits::lowest(); + T current = 0; - if (length < 1) { - throw std::invalid_argument(""); - } + Kokkos::Timer timer; + Kokkos::parallel_reduce( + con_length, + KOKKOS_LAMBDA(const int i, T& inner) { + 
inner = Kokkos::atomic_min_fetch(&(input(0)), inner - 1); + if (i == con_length - 1) { + Kokkos::atomic_min_fetch(&(input(0)), min); + inner = min; + } + }, + Kokkos::Min(current)); + Kokkos::fence(); + const auto time = timer.seconds(); + + if (current > min) { + report_errors_contentious_replacement(state, current, input(0), min); + } + + return time; +} - std::cout << "================ int" << std::endl; - test(length); - std::cout << "================ long" << std::endl; - test(length); - std::cout << "================ long long" << std::endl; - test(length); - - std::cout << "================ unsigned int" << std::endl; - test(length); - std::cout << "================ unsigned long" << std::endl; - test(length); - std::cout << "================ unsigned long long" << std::endl; - test(length); - - std::cout << "================ float" << std::endl; - test(length); - std::cout << "================ double" << std::endl; - test(length); +template +static void Atomic_ContentiousMinReplacements(benchmark::State& state) { + const auto length = get_length(state); + auto inp = prepare_input(1, std::numeric_limits::max()); + + for (auto _ : state) { + const auto time = atomic_contentious_max_replacement(state, inp, length); + + state.SetIterationTime(time); } +} + +BENCHMARK(Atomic_MinReplacements) + ->ArgName("Length") + ->Arg(LENGTH) + ->UseManualTime() + ->Iterations(10); + +BENCHMARK(Atomic_MaxReplacements) + ->ArgName("Length") + ->Arg(LENGTH) + ->UseManualTime() + ->Iterations(10); + +BENCHMARK(Atomic_MaxEarlyExits) + ->ArgName("Length") + ->Arg(LENGTH) + ->UseManualTime() + ->Iterations(10); + +BENCHMARK(Atomic_MinEarlyExits) + ->ArgName("Length") + ->Arg(LENGTH) + ->UseManualTime() + ->Iterations(10); + +BENCHMARK(Atomic_ContentiousMaxReplacements) + ->ArgName("Length") + ->Arg(LENGTH / 5) + ->UseManualTime() + ->Iterations(10); + +BENCHMARK(Atomic_ContentiousMinReplacements) + ->ArgName("Length") + ->Arg(LENGTH / 5) + ->UseManualTime() + ->Iterations(10); + 
+// FIXME: duplicated +namespace Test { +int command_line_num_args(int n) { + static int n_args = 0; + if (n > 0) n_args = n; + return n_args; +} + +const char* command_line_arg(int k, char** input_args) { + static char** args; + if (input_args != nullptr) args = input_args; + if (command_line_num_args() > k) + return args[k]; + else + return nullptr; +} +} // namespace Test + +int main(int argc, char* argv[]) { + Kokkos::initialize(argc, argv); + benchmark::Initialize(&argc, argv); + benchmark::SetDefaultTimeUnit(benchmark::kSecond); + KokkosBenchmark::add_benchmark_context(true); + + (void)Test::command_line_num_args(argc); + (void)Test::command_line_arg(0, argv); + + benchmark::RunSpecifiedBenchmarks(); + + // std::cout << "================ int" << std::endl; + // test(length); + // std::cout << "================ long" << std::endl; + // test(length); + // std::cout << "================ long long" << std::endl; + // test(length); + + // std::cout << "================ unsigned int" << std::endl; + // test(length); + // std::cout << "================ unsigned long" << std::endl; + // test(length); + // std::cout << "================ unsigned long long" << std::endl; + // test(length); + + // std::cout << "================ float" << std::endl; + // test(length); + // std::cout << "================ double" << std::endl; + // test(length); + // } + + benchmark::Shutdown(); Kokkos::finalize(); return 0; } From 372d03e8bd2bc96fc6f9e36bcf30b166706fd3b1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cezary=20Skrzy=C5=84ski?= Date: Thu, 29 Sep 2022 15:39:32 +0200 Subject: [PATCH 180/496] Fix units - Fill --- core/perf_test/PerfTest_ViewFill.cpp | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/core/perf_test/PerfTest_ViewFill.cpp b/core/perf_test/PerfTest_ViewFill.cpp index 255dd309dd..bd80ab1776 100644 --- a/core/perf_test/PerfTest_ViewFill.cpp +++ b/core/perf_test/PerfTest_ViewFill.cpp @@ -50,14 +50,15 @@ namespace Test { void 
report_results_fill(benchmark::State& state, double time) { state.SetIterationTime(time); - const auto N8 = std::pow(state.range(0), 8); - const auto size = N8 * 8 / 1024 / 1024; + const auto N8 = std::pow(state.range(0), 8); + // data size in megabytes + const auto size = N8 * 8 / 1000 / 1000; + // data processed in gigabytes + const auto data_processed = size / 1000; - state.counters["MB"] = benchmark::Counter(size, benchmark::Counter::kDefaults, - benchmark::Counter::OneK::kIs1024); - state.counters[KokkosBenchmark::benchmark_fom("GB/s")] = - benchmark::Counter(size / 1024 / time, benchmark::Counter::kDefaults, - benchmark::Counter::OneK::kIs1024); + state.counters["MB"] = benchmark::Counter(size); + state.counters[KokkosBenchmark::benchmark_fom("GB/s")] = benchmark::Counter( + data_processed, benchmark::Counter::kIsIterationInvariantRate); } } // namespace Test From 5635e1379c5d46005b50f46a50d37cdb9a9bdf57 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cezary=20Skrzy=C5=84ski?= Date: Tue, 27 Sep 2022 18:02:06 +0200 Subject: [PATCH 181/496] Use common helper for reporting results --- core/perf_test/Benchmark_Context.hpp | 18 +++++++ core/perf_test/CMakeLists.txt | 2 - core/perf_test/PerfTest_ViewAllocate.cpp | 37 ++++++-------- core/perf_test/PerfTest_ViewFill.cpp | 64 ------------------------ core/perf_test/PerfTest_ViewFill.hpp | 10 ++-- core/perf_test/PerfTest_ViewResize.cpp | 64 ------------------------ core/perf_test/PerfTest_ViewResize.hpp | 36 +++++++------ 7 files changed, 55 insertions(+), 176 deletions(-) delete mode 100644 core/perf_test/PerfTest_ViewFill.cpp delete mode 100644 core/perf_test/PerfTest_ViewResize.cpp diff --git a/core/perf_test/Benchmark_Context.hpp b/core/perf_test/Benchmark_Context.hpp index 55d95d9395..a389827104 100644 --- a/core/perf_test/Benchmark_Context.hpp +++ b/core/perf_test/Benchmark_Context.hpp @@ -46,6 +46,24 @@ inline std::string benchmark_fom(const std::string& label) { return "FOM: " + label; } +/** + * \brief Report 
throughput and amount of data processed for simple View + * operations + */ +template +void report_results(benchmark::State& state, ViewType view, int data_ratio, + double time) { + // data processed in megabytes + const double data_processed = data_ratio * view.size() * + sizeof(typename ViewType::value_type) / + 1'000'000; + + state.SetIterationTime(time); + state.counters["MB"] = benchmark::Counter(data_processed); + state.counters[KokkosBenchmark::benchmark_fom("GB/s")] = benchmark::Counter( + data_processed / 1'000, benchmark::Counter::kIsIterationInvariantRate); +} + } // namespace KokkosBenchmark #endif diff --git a/core/perf_test/CMakeLists.txt b/core/perf_test/CMakeLists.txt index 6566ca9a76..40b3f89397 100644 --- a/core/perf_test/CMakeLists.txt +++ b/core/perf_test/CMakeLists.txt @@ -199,13 +199,11 @@ SET( PerfTest_ViewCopy_c8.cpp PerfTest_ViewCopy_d8.cpp PerfTest_ViewCopy_Raw.cpp - PerfTest_ViewFill.cpp PerfTest_ViewFill_123.cpp PerfTest_ViewFill_45.cpp PerfTest_ViewFill_6.cpp PerfTest_ViewFill_7.cpp PerfTest_ViewFill_8.cpp - PerfTest_ViewResize.cpp PerfTest_ViewResize_123.cpp PerfTest_ViewResize_45.cpp PerfTest_ViewResize_6.cpp diff --git a/core/perf_test/PerfTest_ViewAllocate.cpp b/core/perf_test/PerfTest_ViewAllocate.cpp index 3129f99ee7..0122c5f96b 100644 --- a/core/perf_test/PerfTest_ViewAllocate.cpp +++ b/core/perf_test/PerfTest_ViewAllocate.cpp @@ -22,18 +22,6 @@ namespace Test { static constexpr int N = 10; -void report_results_allocate(benchmark::State& state, double time) { - state.SetIterationTime(time); - const auto N8 = std::pow(state.range(0), 8); - const auto size = 1.0 * N8 * 8 / 1024 / 1024; - - state.counters["MB"] = benchmark::Counter(size, benchmark::Counter::kDefaults, - benchmark::Counter::OneK::kIs1024); - state.counters[KokkosBenchmark::benchmark_fom("GB/s")] = - benchmark::Counter(size / 1024 / time, benchmark::Counter::kDefaults, - benchmark::Counter::OneK::kIs1024); -} - template static void ViewAllocate_Rank1(benchmark::State& 
state) { const int N8 = std::pow(state.range(0), 8); @@ -41,7 +29,7 @@ static void ViewAllocate_Rank1(benchmark::State& state) { for (auto _ : state) { Kokkos::Timer timer; Kokkos::View a("A1", N8); - report_results_allocate(state, timer.seconds()); + KokkosBenchmark::report_results(state, a, 1, timer.seconds()); } } @@ -52,7 +40,7 @@ static void ViewAllocate_Rank2(benchmark::State& state) { for (auto _ : state) { Kokkos::Timer timer; Kokkos::View a("A2", N4, N4); - report_results_allocate(state, timer.seconds()); + KokkosBenchmark::report_results(state, a, 1, timer.seconds()); } } @@ -64,7 +52,7 @@ static void ViewAllocate_Rank3(benchmark::State& state) { for (auto _ : state) { Kokkos::Timer timer; Kokkos::View a("A3", N3, N3, N2); - report_results_allocate(state, timer.seconds()); + KokkosBenchmark::report_results(state, a, 1, timer.seconds()); } } @@ -75,7 +63,7 @@ static void ViewAllocate_Rank4(benchmark::State& state) { for (auto _ : state) { Kokkos::Timer timer; Kokkos::View a("A4", N2, N2, N2, N2); - report_results_allocate(state, timer.seconds()); + KokkosBenchmark::report_results(state, a, 1, timer.seconds()); } } @@ -87,7 +75,7 @@ static void ViewAllocate_Rank5(benchmark::State& state) { for (auto _ : state) { Kokkos::Timer timer; Kokkos::View a("A5", N2, N2, N1, N1, N2); - report_results_allocate(state, timer.seconds()); + KokkosBenchmark::report_results(state, a, 1, timer.seconds()); } } @@ -99,7 +87,7 @@ static void ViewAllocate_Rank6(benchmark::State& state) { for (auto _ : state) { Kokkos::Timer timer; Kokkos::View a("A6", N2, N1, N1, N1, N1, N2); - report_results_allocate(state, timer.seconds()); + KokkosBenchmark::report_results(state, a, 1, timer.seconds()); } } @@ -111,7 +99,7 @@ static void ViewAllocate_Rank7(benchmark::State& state) { for (auto _ : state) { Kokkos::Timer timer; Kokkos::View a("A7", N2, N1, N1, N1, N1, N1, N1); - report_results_allocate(state, timer.seconds()); + KokkosBenchmark::report_results(state, a, 1, timer.seconds()); } } 
@@ -123,7 +111,7 @@ static void ViewAllocate_Rank8(benchmark::State& state) { Kokkos::Timer timer; Kokkos::View a("A8", N1, N1, N1, N1, N1, N1, N1, N1); - report_results_allocate(state, timer.seconds()); + KokkosBenchmark::report_results(state, a, 1, timer.seconds()); } } @@ -138,7 +126,14 @@ static void ViewAllocate_Raw(benchmark::State& state) { N8, KOKKOS_LAMBDA(const int& i) { a_ptr[i] = 0.0; }); Kokkos::fence(); Kokkos::kokkos_free(a_ptr); - report_results_allocate(state, timer.seconds()); + + state.SetIterationTime(timer.seconds()); + // data processed in megabytes + const double data_processed = 1 * N8 * sizeof(double) / 1'000'000; + + state.counters["MB"] = benchmark::Counter(data_processed); + state.counters[KokkosBenchmark::benchmark_fom("GB/s")] = benchmark::Counter( + data_processed / 1'000, benchmark::Counter::kIsIterationInvariantRate); } } diff --git a/core/perf_test/PerfTest_ViewFill.cpp b/core/perf_test/PerfTest_ViewFill.cpp deleted file mode 100644 index bd80ab1776..0000000000 --- a/core/perf_test/PerfTest_ViewFill.cpp +++ /dev/null @@ -1,64 +0,0 @@ -/* -//@HEADER -// ************************************************************************ -// -// Kokkos v. 3.0 -// Copyright (2020) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. 
Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Questions? Contact Christian R. Trott (crtrott@sandia.gov) -// -// ************************************************************************ -//@HEADER -*/ - -#include - -#include - -namespace Test { - -void report_results_fill(benchmark::State& state, double time) { - state.SetIterationTime(time); - const auto N8 = std::pow(state.range(0), 8); - // data size in megabytes - const auto size = N8 * 8 / 1000 / 1000; - // data processed in gigabytes - const auto data_processed = size / 1000; - - state.counters["MB"] = benchmark::Counter(size); - state.counters[KokkosBenchmark::benchmark_fom("GB/s")] = benchmark::Counter( - data_processed, benchmark::Counter::kIsIterationInvariantRate); -} - -} // namespace Test diff --git a/core/perf_test/PerfTest_ViewFill.hpp b/core/perf_test/PerfTest_ViewFill.hpp index da073464c1..adceebbc81 100644 --- a/core/perf_test/PerfTest_ViewFill.hpp +++ b/core/perf_test/PerfTest_ViewFill.hpp @@ -14,14 +14,12 @@ // //@HEADER -#include -#include +#include + #include namespace Test { -void 
report_results_fill(benchmark::State& state, double time); - template void fill_view(ViewType& a, typename ViewType::const_value_type& val, benchmark::State& state) { @@ -29,7 +27,7 @@ void fill_view(ViewType& a, typename ViewType::const_value_type& val, Kokkos::fence(); Kokkos::Timer timer; Kokkos::deep_copy(a, val); - report_results_fill(state, timer.seconds()); + KokkosBenchmark::report_results(state, a, 1, timer.seconds()); } } @@ -121,7 +119,7 @@ static void ViewFill_Raw(benchmark::State& state) { N8, KOKKOS_LAMBDA(const int& i) { a_ptr[i] = 1.1; }); Kokkos::fence(); - report_results_fill(state, timer.seconds()); + KokkosBenchmark::report_results(state, a, 1, timer.seconds()); } } diff --git a/core/perf_test/PerfTest_ViewResize.cpp b/core/perf_test/PerfTest_ViewResize.cpp deleted file mode 100644 index 5715b19fa2..0000000000 --- a/core/perf_test/PerfTest_ViewResize.cpp +++ /dev/null @@ -1,64 +0,0 @@ -/* -//@HEADER -// ************************************************************************ -// -// Kokkos v. 3.0 -// Copyright (2020) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. 
-// -// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Questions? Contact Christian R. Trott (crtrott@sandia.gov) -// -// ************************************************************************ -//@HEADER -*/ - -#include - -#include - -namespace Test { - -void report_results_resize(benchmark::State& state, double time) { - state.SetIterationTime(time); - const auto N8 = std::pow(state.range(0), 8); - // data size in megabytes - const auto size = N8 * 8 / 1000 / 1000; - // data processed in gigabytes - const auto data_processed = 2.0 * size / 1000; - - state.counters["MB"] = benchmark::Counter(size); - state.counters[KokkosBenchmark::benchmark_fom("GB/s")] = benchmark::Counter( - data_processed, benchmark::Counter::kIsIterationInvariantRate); -} - -} // namespace Test diff --git a/core/perf_test/PerfTest_ViewResize.hpp b/core/perf_test/PerfTest_ViewResize.hpp index 60c866df51..2db3c4649b 100644 --- a/core/perf_test/PerfTest_ViewResize.hpp +++ b/core/perf_test/PerfTest_ViewResize.hpp @@ -21,8 +21,6 @@ namespace Test { -void report_results_resize(benchmark::State& state, double time); - template static void ViewResize_Rank1(benchmark::State& state) { const int N8 = std::pow(state.range(0), 8); @@ -34,7 +32,7 @@ static void ViewResize_Rank1(benchmark::State& state) { 
Kokkos::Timer timer; Kokkos::resize(a_, int(N8 * 1.1)); Kokkos::fence(); - report_results_resize(state, timer.seconds()); + KokkosBenchmark::report_results(state, a, 2, timer.seconds()); } } @@ -49,7 +47,7 @@ static void ViewResize_Rank2(benchmark::State& state) { Kokkos::Timer timer; Kokkos::resize(a_, int(N4 * 1.1), N4); Kokkos::fence(); - report_results_resize(state, timer.seconds()); + KokkosBenchmark::report_results(state, a, 2, timer.seconds()); } } @@ -65,7 +63,7 @@ static void ViewResize_Rank3(benchmark::State& state) { Kokkos::Timer timer; Kokkos::resize(a_, int(N3 * 1.1), N3, N2); Kokkos::fence(); - report_results_resize(state, timer.seconds()); + KokkosBenchmark::report_results(state, a, 2, timer.seconds()); } } @@ -80,7 +78,7 @@ static void ViewResize_Rank4(benchmark::State& state) { Kokkos::Timer timer; Kokkos::resize(a_, int(N2 * 1.1), N2, N2, N2); Kokkos::fence(); - report_results_resize(state, timer.seconds()); + KokkosBenchmark::report_results(state, a, 2, timer.seconds()); } } @@ -97,7 +95,7 @@ static void ViewResize_Rank5(benchmark::State& state) { Kokkos::Timer timer; Kokkos::resize(a_, int(N2 * 1.1), N2, N1, N1, N2); Kokkos::fence(); - report_results_resize(state, timer.seconds()); + KokkosBenchmark::report_results(state, a, 2, timer.seconds()); } } @@ -114,7 +112,7 @@ static void ViewResize_Rank6(benchmark::State& state) { Kokkos::Timer timer; Kokkos::resize(a_, int(N2 * 1.1), N1, N1, N1, N1, N2); Kokkos::fence(); - report_results_resize(state, timer.seconds()); + KokkosBenchmark::report_results(state, a, 2, timer.seconds()); } } @@ -131,7 +129,7 @@ static void ViewResize_Rank7(benchmark::State& state) { Kokkos::Timer timer; Kokkos::resize(a_, int(N2 * 1.1), N1, N1, N1, N1, N1, N1); Kokkos::fence(); - report_results_resize(state, timer.seconds()); + KokkosBenchmark::report_results(state, a, 2, timer.seconds()); } } @@ -147,7 +145,7 @@ static void ViewResize_Rank8(benchmark::State& state) { Kokkos::Timer timer; Kokkos::resize(a_, int(N1 * 1.1), 
N1, N1, N1, N1, N1, N1, N1); Kokkos::fence(); - report_results_resize(state, timer.seconds()); + KokkosBenchmark::report_results(state, a, 2, timer.seconds()); } } @@ -162,7 +160,7 @@ static void ViewResize_NoInit_Rank1(benchmark::State& state) { Kokkos::Timer timer; Kokkos::resize(Kokkos::WithoutInitializing, a_, int(N8 * 1.1)); Kokkos::fence(); - report_results_resize(state, timer.seconds()); + KokkosBenchmark::report_results(state, a, 2, timer.seconds()); } } @@ -177,7 +175,7 @@ static void ViewResize_NoInit_Rank2(benchmark::State& state) { Kokkos::Timer timer; Kokkos::resize(Kokkos::WithoutInitializing, a_, int(N4 * 1.1), N4); Kokkos::fence(); - report_results_resize(state, timer.seconds()); + KokkosBenchmark::report_results(state, a, 2, timer.seconds()); } } @@ -193,7 +191,7 @@ static void ViewResize_NoInit_Rank3(benchmark::State& state) { Kokkos::Timer timer; Kokkos::resize(Kokkos::WithoutInitializing, a_, int(N3 * 1.1), N3, N2); Kokkos::fence(); - report_results_resize(state, timer.seconds()); + KokkosBenchmark::report_results(state, a, 2, timer.seconds()); } } @@ -208,7 +206,7 @@ static void ViewResize_NoInit_Rank4(benchmark::State& state) { Kokkos::Timer timer; Kokkos::resize(Kokkos::WithoutInitializing, a_, int(N2 * 1.1), N2, N2, N2); Kokkos::fence(); - report_results_resize(state, timer.seconds()); + KokkosBenchmark::report_results(state, a, 2, timer.seconds()); } } @@ -226,7 +224,7 @@ static void ViewResize_NoInit_Rank5(benchmark::State& state) { Kokkos::resize(Kokkos::WithoutInitializing, a_, int(N2 * 1.1), N2, N1, N1, N2); Kokkos::fence(); - report_results_resize(state, timer.seconds()); + KokkosBenchmark::report_results(state, a, 2, timer.seconds()); } } @@ -244,7 +242,7 @@ static void ViewResize_NoInit_Rank6(benchmark::State& state) { Kokkos::resize(Kokkos::WithoutInitializing, a_, int(N2 * 1.1), N1, N1, N1, N1, N2); Kokkos::fence(); - report_results_resize(state, timer.seconds()); + KokkosBenchmark::report_results(state, a, 2, timer.seconds()); } } 
@@ -262,7 +260,7 @@ static void ViewResize_NoInit_Rank7(benchmark::State& state) { Kokkos::resize(Kokkos::WithoutInitializing, a_, int(N2 * 1.1), N1, N1, N1, N1, N1, N1); Kokkos::fence(); - report_results_resize(state, timer.seconds()); + KokkosBenchmark::report_results(state, a, 2, timer.seconds()); } } @@ -279,7 +277,7 @@ static void ViewResize_NoInit_Rank8(benchmark::State& state) { Kokkos::resize(Kokkos::WithoutInitializing, a_, int(N1 * 1.1), N1, N1, N1, N1, N1, N1, N1); Kokkos::fence(); - report_results_resize(state, timer.seconds()); + KokkosBenchmark::report_results(state, a, 2, timer.seconds()); } } @@ -297,7 +295,7 @@ static void ViewResize_NoInit_Raw(benchmark::State& state) { Kokkos::parallel_for( N8, KOKKOS_LAMBDA(const int& i) { a1_ptr[i] = a_ptr[i]; }); Kokkos::fence(); - report_results_resize(state, timer.seconds()); + KokkosBenchmark::report_results(state, a, 2, timer.seconds()); } } From 25876cfccb5afd7f403edd44272f54a7567c4e75 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cezary=20Skrzy=C5=84ski?= Date: Mon, 26 Sep 2022 17:55:28 +0200 Subject: [PATCH 182/496] Port Custom Reduction tests --- core/perf_test/CMakeLists.txt | 5 +- core/perf_test/PerfTest_CustomReduction.cpp | 107 ++++++++++++-------- 2 files changed, 68 insertions(+), 44 deletions(-) diff --git a/core/perf_test/CMakeLists.txt b/core/perf_test/CMakeLists.txt index 40b3f89397..e633c21385 100644 --- a/core/perf_test/CMakeLists.txt +++ b/core/perf_test/CMakeLists.txt @@ -11,9 +11,9 @@ IF(KOKKOS_ENABLE_TESTS) SET(SOURCES PerfTestMain.cpp PerfTestGramSchmidt.cpp - PerfTest_CustomReduction.cpp PerfTest_ExecSpacePartitioning.cpp - ) + ) + IF(Kokkos_ENABLE_OPENMPTARGET) # FIXME OPENMPTARGET requires TeamPolicy Reductions and Custom Reduction @@ -176,6 +176,7 @@ SET( BENCHMARK_SOURCES BenchmarkMain.cpp Benchmark_Context.cpp + PerfTest_CustomReduction.cpp PerfTestHexGrad.cpp PerfTest_ViewAllocate.cpp PerfTest_ViewCopy_a123.cpp diff --git a/core/perf_test/PerfTest_CustomReduction.cpp 
b/core/perf_test/PerfTest_CustomReduction.cpp index 049301f9a7..2fdab006e9 100644 --- a/core/perf_test/PerfTest_CustomReduction.cpp +++ b/core/perf_test/PerfTest_CustomReduction.cpp @@ -15,14 +15,16 @@ //@HEADER #include -#include +#include +#include "Benchmark_Context.hpp" #include #include +#include #ifdef KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA namespace Test { template -void custom_reduction_test(int N, int R, int num_trials) { +std::pair custom_reduction_test(int N, int R) { Kokkos::Random_XorShift64_Pool<> rand_pool(183291); Kokkos::View a("A", N); Kokkos::fill_random(a, rand_pool, 1.0); @@ -62,49 +64,70 @@ void custom_reduction_test(int N, int R, int num_trials) { // Timing Kokkos::Timer timer; - for (int r = 0; r < num_trials; r++) { - Kokkos::parallel_reduce( - Kokkos::TeamPolicy<>(N / 1024, team_size), - KOKKOS_LAMBDA(const Kokkos::TeamPolicy<>::member_type& team, - Scalar& lmax) { - Scalar team_max = Scalar(0); - for (int rr = 0; rr < R; rr++) { - int i = team.league_rank(); - Kokkos::parallel_reduce( - Kokkos::TeamThreadRange(team, 32), - [&](const int& j, Scalar& thread_max) { - Scalar t_max = Scalar(0); - Kokkos::parallel_reduce( - Kokkos::ThreadVectorRange(team, 32), - [&](const int& k, Scalar& max_) { - const Scalar val = a((i * 32 + j) * 32 + k); - if (val > max_) max_ = val; - if ((k == 11) && (j == 17) && (i == 2)) max_ = 11.5; - }, - Kokkos::Max(t_max)); - if (t_max > thread_max) thread_max = t_max; - }, - Kokkos::Max(team_max)); - } - if (team_max > lmax) lmax = team_max; - }, - Kokkos::Max(max)); - } - double time = timer.seconds(); - printf("%e %e %e\n", time, - 1.0 * N * R * num_trials * sizeof(Scalar) / time / 1024 / 1024 / 1024, - max); + Kokkos::parallel_reduce( + Kokkos::TeamPolicy<>(N / 1024, team_size), + KOKKOS_LAMBDA(const Kokkos::TeamPolicy<>::member_type& team, + Scalar& lmax) { + Scalar team_max = Scalar(0); + for (int rr = 0; rr < R; rr++) { + int i = team.league_rank(); + Kokkos::parallel_reduce( + Kokkos::TeamThreadRange(team, 32), 
+ [&](const int& j, Scalar& thread_max) { + Scalar t_max = Scalar(0); + Kokkos::parallel_reduce( + Kokkos::ThreadVectorRange(team, 32), + [&](const int& k, Scalar& max_) { + const Scalar val = a((i * 32 + j) * 32 + k); + if (val > max_) max_ = val; + if ((k == 11) && (j == 17) && (i == 2)) max_ = 11.5; + }, + Kokkos::Max(t_max)); + if (t_max > thread_max) thread_max = t_max; + }, + Kokkos::Max(team_max)); + } + if (team_max > lmax) lmax = team_max; + }, + Kokkos::Max(max)); + + return std::make_pair(timer.seconds(), max); } -TEST(default_exec, custom_reduction) { - int N = 100000; - int R = 1000; - int num_trials = 1; +int get_N(benchmark::State& state) { + return (Test::command_line_num_args() > 1) + ? std::stoi(Test::command_line_arg(1)) + : state.range(0); +} - if (command_line_num_args() > 1) N = std::stoi(command_line_arg(1)); - if (command_line_num_args() > 2) R = std::stoi(command_line_arg(2)); - if (command_line_num_args() > 3) num_trials = std::stoi(command_line_arg(3)); - custom_reduction_test(N, R, num_trials); +int get_R(benchmark::State& state) { + return (Test::command_line_num_args() > 2) + ? 
std::stoi(Test::command_line_arg(2)) + : state.range(1); } + +template +static void CustomReduction(benchmark::State& state) { + int N = get_N(state); + int R = get_R(state); + + for (auto _ : state) { + auto results = custom_reduction_test(N, R); + // data processed in gigabytes + const double data_processed = + N * R * sizeof(Scalar) / results.first / 1'000'000'000; + + state.SetIterationTime(results.first); + state.counters[KokkosBenchmark::benchmark_fom("GB/s")] = benchmark::Counter( + data_processed, benchmark::Counter::kIsIterationInvariantRate); + state.counters["Max"] = benchmark::Counter(results.second); + } +} + +BENCHMARK(CustomReduction) + ->ArgNames({"N", "R"}) + ->Args({100'000, 1'000}) + ->UseManualTime(); + } // namespace Test #endif From b1a3135d1c5b3ce175277290f718830922207a0c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cezary=20Skrzy=C5=84ski?= Date: Thu, 10 Nov 2022 23:12:02 +0100 Subject: [PATCH 183/496] Reduce repetition in ViewResize benchmarks --- core/perf_test/CMakeLists.txt | 1 + core/perf_test/PerfTest_ViewResize.hpp | 3 ++ core/perf_test/PerfTest_ViewResize_123.cpp | 40 +++++--------- core/perf_test/PerfTest_ViewResize_45.cpp | 11 ++-- core/perf_test/PerfTest_ViewResize_6.cpp | 3 -- core/perf_test/PerfTest_ViewResize_7.cpp | 3 -- core/perf_test/PerfTest_ViewResize_8.cpp | 14 +++-- core/perf_test/PerfTest_ViewResize_Raw.cpp | 63 ++++++++++++++++++++++ 8 files changed, 89 insertions(+), 49 deletions(-) create mode 100644 core/perf_test/PerfTest_ViewResize_Raw.cpp diff --git a/core/perf_test/CMakeLists.txt b/core/perf_test/CMakeLists.txt index e633c21385..3a27619628 100644 --- a/core/perf_test/CMakeLists.txt +++ b/core/perf_test/CMakeLists.txt @@ -210,6 +210,7 @@ SET( PerfTest_ViewResize_6.cpp PerfTest_ViewResize_7.cpp PerfTest_ViewResize_8.cpp + PerfTest_ViewResize_Raw.cpp ) KOKKOS_ADD_BENCHMARK( diff --git a/core/perf_test/PerfTest_ViewResize.hpp b/core/perf_test/PerfTest_ViewResize.hpp index 2db3c4649b..4f6a316803 100644 --- 
a/core/perf_test/PerfTest_ViewResize.hpp +++ b/core/perf_test/PerfTest_ViewResize.hpp @@ -21,6 +21,9 @@ namespace Test { +static constexpr int R = 10; +static constexpr int N = 10; + template static void ViewResize_Rank1(benchmark::State& state) { const int N8 = std::pow(state.range(0), 8); diff --git a/core/perf_test/PerfTest_ViewResize_123.cpp b/core/perf_test/PerfTest_ViewResize_123.cpp index 35b48523bb..0f5fceb5aa 100644 --- a/core/perf_test/PerfTest_ViewResize_123.cpp +++ b/core/perf_test/PerfTest_ViewResize_123.cpp @@ -18,92 +18,76 @@ namespace Test { -static constexpr int R = 10; - BENCHMARK(ViewResize_Rank1) ->ArgName("N") - ->Arg(10) + ->Arg(N) ->UseManualTime() ->Iterations(R); BENCHMARK(ViewResize_Rank1) ->ArgName("N") - ->Arg(10) + ->Arg(N) ->UseManualTime() ->Iterations(R); BENCHMARK(ViewResize_Rank2) ->ArgName("N") - ->Arg(10) + ->Arg(N) ->UseManualTime() ->Iterations(R); BENCHMARK(ViewResize_Rank2) ->ArgName("N") - ->Arg(10) + ->Arg(N) ->UseManualTime() ->Iterations(R); BENCHMARK(ViewResize_Rank3) ->ArgName("N") - ->Arg(10) + ->Arg(N) ->UseManualTime() ->Iterations(R); BENCHMARK(ViewResize_Rank3) ->ArgName("N") - ->Arg(10) + ->Arg(N) ->UseManualTime() ->Iterations(R); BENCHMARK(ViewResize_NoInit_Rank1) ->ArgName("N") - ->Arg(10) + ->Arg(N) ->UseManualTime() ->Iterations(R); BENCHMARK(ViewResize_NoInit_Rank1) ->ArgName("N") - ->Arg(10) + ->Arg(N) ->UseManualTime() ->Iterations(R); BENCHMARK(ViewResize_NoInit_Rank2) ->ArgName("N") - ->Arg(10) + ->Arg(N) ->UseManualTime() ->Iterations(R); BENCHMARK(ViewResize_NoInit_Rank2) ->ArgName("N") - ->Arg(10) + ->Arg(N) ->UseManualTime() ->Iterations(R); BENCHMARK(ViewResize_NoInit_Rank3) ->ArgName("N") - ->Arg(10) + ->Arg(N) ->UseManualTime() ->Iterations(R); BENCHMARK(ViewResize_NoInit_Rank3) ->ArgName("N") - ->Arg(10) - ->UseManualTime() - ->Iterations(R); - -#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) -BENCHMARK(ViewResize_NoInit_Raw) - ->ArgName("N") - ->Arg(10) - ->UseManualTime() 
- ->Iterations(R); - -BENCHMARK(ViewResize_NoInit_Raw) - ->ArgName("N") - ->Arg(10) + ->Arg(N) ->UseManualTime() ->Iterations(R); -#endif } // namespace Test diff --git a/core/perf_test/PerfTest_ViewResize_45.cpp b/core/perf_test/PerfTest_ViewResize_45.cpp index 76a5209209..aba09a5d21 100644 --- a/core/perf_test/PerfTest_ViewResize_45.cpp +++ b/core/perf_test/PerfTest_ViewResize_45.cpp @@ -18,9 +18,6 @@ namespace Test { -static constexpr int R = 10; -static constexpr int N = 10; - BENCHMARK(ViewResize_Rank4) ->ArgName("N") ->Arg(N) @@ -47,25 +44,25 @@ BENCHMARK(ViewResize_Rank5) BENCHMARK(ViewResize_NoInit_Rank4) ->ArgName("N") - ->Arg(10) + ->Arg(N) ->UseManualTime() ->Iterations(R); BENCHMARK(ViewResize_NoInit_Rank4) ->ArgName("N") - ->Arg(10) + ->Arg(N) ->UseManualTime() ->Iterations(R); BENCHMARK(ViewResize_NoInit_Rank5) ->ArgName("N") - ->Arg(10) + ->Arg(N) ->UseManualTime() ->Iterations(R); BENCHMARK(ViewResize_NoInit_Rank5) ->ArgName("N") - ->Arg(10) + ->Arg(N) ->UseManualTime() ->Iterations(R); diff --git a/core/perf_test/PerfTest_ViewResize_6.cpp b/core/perf_test/PerfTest_ViewResize_6.cpp index 623a39e637..811d7c4ffc 100644 --- a/core/perf_test/PerfTest_ViewResize_6.cpp +++ b/core/perf_test/PerfTest_ViewResize_6.cpp @@ -18,9 +18,6 @@ namespace Test { -static constexpr int R = 10; -static constexpr int N = 10; - BENCHMARK(ViewResize_Rank6) ->ArgName("N") ->Arg(N) diff --git a/core/perf_test/PerfTest_ViewResize_7.cpp b/core/perf_test/PerfTest_ViewResize_7.cpp index 9ecf320e64..51863b0653 100644 --- a/core/perf_test/PerfTest_ViewResize_7.cpp +++ b/core/perf_test/PerfTest_ViewResize_7.cpp @@ -18,9 +18,6 @@ namespace Test { -static constexpr int R = 10; -static constexpr int N = 10; - BENCHMARK(ViewResize_Rank7) ->ArgName("N") ->Arg(N) diff --git a/core/perf_test/PerfTest_ViewResize_8.cpp b/core/perf_test/PerfTest_ViewResize_8.cpp index c213160102..3e2d610cdd 100644 --- a/core/perf_test/PerfTest_ViewResize_8.cpp +++ b/core/perf_test/PerfTest_ViewResize_8.cpp @@ 
-18,36 +18,34 @@ namespace Test { -static constexpr int R = 10; - // FIXME_SYCL Avoid running out of resources on the CUDA GPU used in the CI #ifdef KOKKOS_ENABLE_SYCL -static constexpr int N = 9; +static constexpr int N_8 = N - 1; #else -static constexpr int N = 10; +static constexpr int N_8 = N; #endif BENCHMARK(ViewResize_Rank8) ->ArgName("N") - ->Arg(N) + ->Arg(N_8) ->UseManualTime() ->Iterations(R); BENCHMARK(ViewResize_Rank8) ->ArgName("N") - ->Arg(N) + ->Arg(N_8) ->UseManualTime() ->Iterations(R); BENCHMARK(ViewResize_NoInit_Rank8) ->ArgName("N") - ->Arg(10) + ->Arg(N_8) ->UseManualTime() ->Iterations(R); BENCHMARK(ViewResize_NoInit_Rank8) ->ArgName("N") - ->Arg(10) + ->Arg(N_8) ->UseManualTime() ->Iterations(R); diff --git a/core/perf_test/PerfTest_ViewResize_Raw.cpp b/core/perf_test/PerfTest_ViewResize_Raw.cpp new file mode 100644 index 0000000000..da4cddc946 --- /dev/null +++ b/core/perf_test/PerfTest_ViewResize_Raw.cpp @@ -0,0 +1,63 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. 
Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include + +namespace Test { + +#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) +BENCHMARK(ViewResize_NoInit_Raw) + ->ArgName("N") + ->Arg(N) + ->UseManualTime() + ->Iterations(R); + +BENCHMARK(ViewResize_NoInit_Raw) + ->ArgName("N") + ->Arg(N) + ->UseManualTime() + ->Iterations(R); +#endif + +} // namespace Test From 924600bf7902c33d6fe271a9db9e65a96edae021 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cezary=20Skrzy=C5=84ski?= Date: Thu, 10 Nov 2022 23:20:55 +0100 Subject: [PATCH 184/496] Reduce repetition in ViewFill benchmarks --- core/perf_test/CMakeLists.txt | 1 + core/perf_test/PerfTest_ViewFill.hpp | 2 + core/perf_test/PerfTest_ViewFill_123.cpp | 12 ++--- core/perf_test/PerfTest_ViewFill_45.cpp | 8 ++-- core/perf_test/PerfTest_ViewFill_6.cpp | 4 +- core/perf_test/PerfTest_ViewFill_7.cpp | 4 +- core/perf_test/PerfTest_ViewFill_8.cpp | 16 +------ 
core/perf_test/PerfTest_ViewFill_Raw.cpp | 61 ++++++++++++++++++++++++ 8 files changed, 80 insertions(+), 28 deletions(-) create mode 100644 core/perf_test/PerfTest_ViewFill_Raw.cpp diff --git a/core/perf_test/CMakeLists.txt b/core/perf_test/CMakeLists.txt index 3a27619628..13f083f3d8 100644 --- a/core/perf_test/CMakeLists.txt +++ b/core/perf_test/CMakeLists.txt @@ -205,6 +205,7 @@ SET( PerfTest_ViewFill_6.cpp PerfTest_ViewFill_7.cpp PerfTest_ViewFill_8.cpp + PerfTest_ViewFill_Raw.cpp PerfTest_ViewResize_123.cpp PerfTest_ViewResize_45.cpp PerfTest_ViewResize_6.cpp diff --git a/core/perf_test/PerfTest_ViewFill.hpp b/core/perf_test/PerfTest_ViewFill.hpp index adceebbc81..9ac0be467e 100644 --- a/core/perf_test/PerfTest_ViewFill.hpp +++ b/core/perf_test/PerfTest_ViewFill.hpp @@ -20,6 +20,8 @@ namespace Test { +static constexpr int N = 10; + template void fill_view(ViewType& a, typename ViewType::const_value_type& val, benchmark::State& state) { diff --git a/core/perf_test/PerfTest_ViewFill_123.cpp b/core/perf_test/PerfTest_ViewFill_123.cpp index 9ef2afeca4..d04f1fbea2 100644 --- a/core/perf_test/PerfTest_ViewFill_123.cpp +++ b/core/perf_test/PerfTest_ViewFill_123.cpp @@ -20,32 +20,32 @@ namespace Test { BENCHMARK(ViewFill_Rank1) ->ArgName("N") - ->Arg(10) + ->Arg(N) ->UseManualTime(); BENCHMARK(ViewFill_Rank1) ->ArgName("N") - ->Arg(10) + ->Arg(N) ->UseManualTime(); BENCHMARK(ViewFill_Rank2) ->ArgName("N") - ->Arg(10) + ->Arg(N) ->UseManualTime(); BENCHMARK(ViewFill_Rank2) ->ArgName("N") - ->Arg(10) + ->Arg(N) ->UseManualTime(); BENCHMARK(ViewFill_Rank3) ->ArgName("N") - ->Arg(10) + ->Arg(N) ->UseManualTime(); BENCHMARK(ViewFill_Rank3) ->ArgName("N") - ->Arg(10) + ->Arg(N) ->UseManualTime(); } // namespace Test diff --git a/core/perf_test/PerfTest_ViewFill_45.cpp b/core/perf_test/PerfTest_ViewFill_45.cpp index 2d5e75022b..90e597cc11 100644 --- a/core/perf_test/PerfTest_ViewFill_45.cpp +++ b/core/perf_test/PerfTest_ViewFill_45.cpp @@ -20,22 +20,22 @@ namespace Test { 
BENCHMARK(ViewFill_Rank4) ->ArgName("N") - ->Arg(10) + ->Arg(N) ->UseManualTime(); BENCHMARK(ViewFill_Rank4) ->ArgName("N") - ->Arg(10) + ->Arg(N) ->UseManualTime(); BENCHMARK(ViewFill_Rank5) ->ArgName("N") - ->Arg(10) + ->Arg(N) ->UseManualTime(); BENCHMARK(ViewFill_Rank5) ->ArgName("N") - ->Arg(10) + ->Arg(N) ->UseManualTime(); } // namespace Test diff --git a/core/perf_test/PerfTest_ViewFill_6.cpp b/core/perf_test/PerfTest_ViewFill_6.cpp index 640c46ca89..b99e5a4eb3 100644 --- a/core/perf_test/PerfTest_ViewFill_6.cpp +++ b/core/perf_test/PerfTest_ViewFill_6.cpp @@ -20,12 +20,12 @@ namespace Test { BENCHMARK(ViewFill_Rank6) ->ArgName("N") - ->Arg(10) + ->Arg(N) ->UseManualTime(); BENCHMARK(ViewFill_Rank6) ->ArgName("N") - ->Arg(10) + ->Arg(N) ->UseManualTime(); } // namespace Test diff --git a/core/perf_test/PerfTest_ViewFill_7.cpp b/core/perf_test/PerfTest_ViewFill_7.cpp index 025ddafe9b..99dacfdbb2 100644 --- a/core/perf_test/PerfTest_ViewFill_7.cpp +++ b/core/perf_test/PerfTest_ViewFill_7.cpp @@ -20,12 +20,12 @@ namespace Test { BENCHMARK(ViewFill_Rank7) ->ArgName("N") - ->Arg(10) + ->Arg(N) ->UseManualTime(); BENCHMARK(ViewFill_Rank7) ->ArgName("N") - ->Arg(10) + ->Arg(N) ->UseManualTime(); } // namespace Test diff --git a/core/perf_test/PerfTest_ViewFill_8.cpp b/core/perf_test/PerfTest_ViewFill_8.cpp index 0fe733e534..0093e50100 100644 --- a/core/perf_test/PerfTest_ViewFill_8.cpp +++ b/core/perf_test/PerfTest_ViewFill_8.cpp @@ -20,24 +20,12 @@ namespace Test { BENCHMARK(ViewFill_Rank8) ->ArgName("N") - ->Arg(10) + ->Arg(N) ->UseManualTime(); BENCHMARK(ViewFill_Rank8) ->ArgName("N") - ->Arg(10) + ->Arg(N) ->UseManualTime(); -#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) -BENCHMARK(ViewFill_Raw) - ->ArgName("N") - ->Arg(10) - ->UseManualTime(); - -BENCHMARK(ViewFill_Raw) - ->ArgName("N") - ->Arg(10) - ->UseManualTime(); -#endif - } // namespace Test diff --git a/core/perf_test/PerfTest_ViewFill_Raw.cpp 
b/core/perf_test/PerfTest_ViewFill_Raw.cpp new file mode 100644 index 0000000000..d530aa4bb0 --- /dev/null +++ b/core/perf_test/PerfTest_ViewFill_Raw.cpp @@ -0,0 +1,61 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include + +namespace Test { + +#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) +BENCHMARK(ViewFill_Raw) + ->ArgName("N") + ->Arg(N) + ->UseManualTime(); + +BENCHMARK(ViewFill_Raw) + ->ArgName("N") + ->Arg(N) + ->UseManualTime(); +#endif + +} // namespace Test From 076d9318996764e38cd03009ee9abb8572c31eda Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cezary=20Skrzy=C5=84ski?= Date: Mon, 14 Nov 2022 22:10:25 +0100 Subject: [PATCH 185/496] Use named constants --- core/perf_test/PerfTest_ViewResize_6.cpp | 4 ++-- core/perf_test/PerfTest_ViewResize_7.cpp | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/core/perf_test/PerfTest_ViewResize_6.cpp b/core/perf_test/PerfTest_ViewResize_6.cpp index 811d7c4ffc..7d14315d84 100644 --- a/core/perf_test/PerfTest_ViewResize_6.cpp +++ b/core/perf_test/PerfTest_ViewResize_6.cpp @@ -32,13 +32,13 @@ BENCHMARK(ViewResize_Rank6) BENCHMARK(ViewResize_NoInit_Rank6) ->ArgName("N") - ->Arg(10) + ->Arg(N) ->UseManualTime() ->Iterations(R); BENCHMARK(ViewResize_NoInit_Rank6) ->ArgName("N") - ->Arg(10) + ->Arg(N) ->UseManualTime() ->Iterations(R); diff --git a/core/perf_test/PerfTest_ViewResize_7.cpp b/core/perf_test/PerfTest_ViewResize_7.cpp index 51863b0653..27dc9f1598 100644 --- a/core/perf_test/PerfTest_ViewResize_7.cpp +++ b/core/perf_test/PerfTest_ViewResize_7.cpp @@ -32,13 +32,13 @@ BENCHMARK(ViewResize_Rank7) BENCHMARK(ViewResize_NoInit_Rank7) ->ArgName("N") - ->Arg(10) + ->Arg(N) ->UseManualTime() ->Iterations(R); BENCHMARK(ViewResize_NoInit_Rank7) ->ArgName("N") - ->Arg(10) + ->Arg(N) ->UseManualTime() ->Iterations(R); From 5534a8ff50ff957e6b386ebe4f3e87a8b23cae0f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cezary=20Skrzy=C5=84ski?= Date: Mon, 14 Nov 2022 22:12:30 +0100 Subject: [PATCH 186/496] Remove redundant include --- 
core/perf_test/PerfTest_ViewCopy_Raw.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/core/perf_test/PerfTest_ViewCopy_Raw.cpp b/core/perf_test/PerfTest_ViewCopy_Raw.cpp index 66f0793283..67a8d7e555 100644 --- a/core/perf_test/PerfTest_ViewCopy_Raw.cpp +++ b/core/perf_test/PerfTest_ViewCopy_Raw.cpp @@ -16,8 +16,6 @@ #include "PerfTest_ViewCopy.hpp" -#include - namespace Test { #if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) From e0b5846cd16e2574b88f266e9724787b4f29a3bd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cezary=20Skrzy=C5=84ski?= Date: Mon, 14 Nov 2022 22:18:03 +0100 Subject: [PATCH 187/496] Measure only allocation time --- core/perf_test/PerfTest_ViewAllocate.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/core/perf_test/PerfTest_ViewAllocate.cpp b/core/perf_test/PerfTest_ViewAllocate.cpp index 0122c5f96b..bffcca17aa 100644 --- a/core/perf_test/PerfTest_ViewAllocate.cpp +++ b/core/perf_test/PerfTest_ViewAllocate.cpp @@ -125,13 +125,13 @@ static void ViewAllocate_Raw(benchmark::State& state) { Kokkos::parallel_for( N8, KOKKOS_LAMBDA(const int& i) { a_ptr[i] = 0.0; }); Kokkos::fence(); + const auto time = timer.seconds(); Kokkos::kokkos_free(a_ptr); - state.SetIterationTime(timer.seconds()); + state.SetIterationTime(time); // data processed in megabytes const double data_processed = 1 * N8 * sizeof(double) / 1'000'000; - - state.counters["MB"] = benchmark::Counter(data_processed); + state.counters["MB"] = benchmark::Counter(data_processed); state.counters[KokkosBenchmark::benchmark_fom("GB/s")] = benchmark::Counter( data_processed / 1'000, benchmark::Counter::kIsIterationInvariantRate); } From 7dd33f81bfd87cc4f9664380680d16c372003887 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cezary=20Skrzy=C5=84ski?= Date: Tue, 15 Nov 2022 17:28:45 +0100 Subject: [PATCH 188/496] Remove ported benchmarks from Makefile --- core/perf_test/Makefile | 27 ++++----------------------- 1 file changed, 4 insertions(+), 23 
deletions(-) diff --git a/core/perf_test/Makefile b/core/perf_test/Makefile index 396387c622..8d574a40b7 100644 --- a/core/perf_test/Makefile +++ b/core/perf_test/Makefile @@ -14,7 +14,7 @@ else CXX = g++ endif -CXXFLAGS = -O3 +CXXFLAGS = -O3 #CXXFLAGS += -DGENERIC_REDUCER LINK ?= $(CXX) LDFLAGS ?= @@ -29,43 +29,24 @@ TARGETS = # -OBJ_PERF = PerfTestMain.o gtest-all.o -OBJ_PERF += PerfTest_ExecSpacePartitioning.o -OBJ_PERF += PerfTestGramSchmidt.o -OBJ_PERF += PerfTestHexGrad.o -OBJ_PERF += PerfTest_CustomReduction.o -OBJ_PERF += PerfTest_ViewAllocate.o -OBJ_PERF += PerfTest_ViewFill_123.o PerfTest_ViewFill_45.o PerfTest_ViewFill_6.o PerfTest_ViewFill_7.o PerfTest_ViewFill_8.o -OBJ_PERF += PerfTest_ViewResize_123.o PerfTest_ViewResize_45.o PerfTest_ViewResize_6.o PerfTest_ViewResize_7.o PerfTest_ViewResize_8.o -TARGETS += KokkosCore_PerformanceTest -TEST_TARGETS += test-performance - -# - -OBJ_ATOMICS = test_atomic.o +OBJ_ATOMICS = test_atomic.o TARGETS += KokkosCore_PerformanceTest_Atomics TEST_TARGETS += test-atomic # -OBJ_MEMPOOL = test_mempool.o +OBJ_MEMPOOL = test_mempool.o TARGETS += KokkosCore_PerformanceTest_Mempool TEST_TARGETS += test-mempool # -OBJ_TASKDAG = test_taskdag.o +OBJ_TASKDAG = test_taskdag.o TARGETS += KokkosCore_PerformanceTest_TaskDAG TEST_TARGETS += test-taskdag # -OBJ_ATOMICS_MINMAX = test_atomic_minmax_simple.o -TARGETS += KokkosCore_PerformanceTest_Atomics_MinMax -TEST_TARGETS += test-atomic-minmax - -# - KokkosCore_PerformanceTest: $(OBJ_PERF) $(KOKKOS_LINK_DEPENDS) $(LINK) $(EXTRA_PATH) $(OBJ_PERF) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosCore_PerformanceTest From e250ce3b6215a87bde9db2cea317308e1bf477d2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cezary=20Skrzy=C5=84ski?= Date: Thu, 17 Nov 2022 22:51:30 +0100 Subject: [PATCH 189/496] Move command line helpers implementation into a header --- core/perf_test/PerfTestMain.cpp | 19 +------------------ core/perf_test/PerfTest_Category.hpp | 16 ++++++++++++++-- 
core/perf_test/test_atomic_minmax_simple.cpp | 18 ------------------ 3 files changed, 15 insertions(+), 38 deletions(-) diff --git a/core/perf_test/PerfTestMain.cpp b/core/perf_test/PerfTestMain.cpp index 2729432adc..7315f26e5c 100644 --- a/core/perf_test/PerfTestMain.cpp +++ b/core/perf_test/PerfTestMain.cpp @@ -18,24 +18,7 @@ #include #include - -namespace Test { -int command_line_num_args(int n = 0) { - static int n_args = 0; - if (n > 0) n_args = n; - return n_args; -} - -const char* command_line_arg(int k, char** input_args = nullptr) { - static char** args; - if (input_args != nullptr) args = input_args; - if (command_line_num_args() > k) - return args[k]; - else - return nullptr; -} - -} // namespace Test +#include int main(int argc, char* argv[]) { ::testing::InitGoogleTest(&argc, argv); diff --git a/core/perf_test/PerfTest_Category.hpp b/core/perf_test/PerfTest_Category.hpp index 0cfbea9dd3..60f76ea8f5 100644 --- a/core/perf_test/PerfTest_Category.hpp +++ b/core/perf_test/PerfTest_Category.hpp @@ -19,8 +19,20 @@ namespace Test { -extern int command_line_num_args(int n = 0); -extern const char* command_line_arg(int k, char** input_args = nullptr); +inline int command_line_num_args(int n = 0) { + static int n_args = 0; + if (n > 0) n_args = n; + return n_args; +} + +inline const char* command_line_arg(int k, char** input_args = nullptr) { + static char** args; + if (input_args != nullptr) args = input_args; + if (command_line_num_args() > k) + return args[k]; + else + return nullptr; +} } // namespace Test diff --git a/core/perf_test/test_atomic_minmax_simple.cpp b/core/perf_test/test_atomic_minmax_simple.cpp index f4fde65848..63f3b4fb6f 100644 --- a/core/perf_test/test_atomic_minmax_simple.cpp +++ b/core/perf_test/test_atomic_minmax_simple.cpp @@ -288,24 +288,6 @@ BENCHMARK(Atomic_ContentiousMinReplacements) ->UseManualTime() ->Iterations(10); -// FIXME: duplicated -namespace Test { -int command_line_num_args(int n) { - static int n_args = 0; - if (n > 0) 
n_args = n; - return n_args; -} - -const char* command_line_arg(int k, char** input_args) { - static char** args; - if (input_args != nullptr) args = input_args; - if (command_line_num_args() > k) - return args[k]; - else - return nullptr; -} -} // namespace Test - int main(int argc, char* argv[]) { Kokkos::initialize(argc, argv); benchmark::Initialize(&argc, argv); From b6c619ae936360453101a447309eb18631f228fc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cezary=20Skrzy=C5=84ski?= Date: Fri, 18 Nov 2022 23:45:49 +0100 Subject: [PATCH 190/496] Add missing tests to Atomic minmax benchmark --- core/perf_test/test_atomic_minmax_simple.cpp | 290 +++++++++++++++++-- 1 file changed, 269 insertions(+), 21 deletions(-) diff --git a/core/perf_test/test_atomic_minmax_simple.cpp b/core/perf_test/test_atomic_minmax_simple.cpp index 63f3b4fb6f..f6b9ce13ff 100644 --- a/core/perf_test/test_atomic_minmax_simple.cpp +++ b/core/perf_test/test_atomic_minmax_simple.cpp @@ -29,7 +29,7 @@ using exec_space = Kokkos::DefaultExecutionSpace; -constexpr int LENGTH = 1000000; +constexpr int LENGTH = 1'000'000; template Kokkos::View prepare_input(const int length, const T value) { @@ -252,6 +252,7 @@ static void Atomic_ContentiousMinReplacements(benchmark::State& state) { } } +// int BENCHMARK(Atomic_MinReplacements) ->ArgName("Length") ->Arg(LENGTH) @@ -287,6 +288,273 @@ BENCHMARK(Atomic_ContentiousMinReplacements) ->Arg(LENGTH / 5) ->UseManualTime() ->Iterations(10); +/////////////////////////////////////////////////////////////////////// + +// long +BENCHMARK(Atomic_MinReplacements) + ->ArgName("Length") + ->Arg(LENGTH) + ->UseManualTime() + ->Iterations(10); + +BENCHMARK(Atomic_MaxReplacements) + ->ArgName("Length") + ->Arg(LENGTH) + ->UseManualTime() + ->Iterations(10); + +BENCHMARK(Atomic_MaxEarlyExits) + ->ArgName("Length") + ->Arg(LENGTH) + ->UseManualTime() + ->Iterations(10); + +BENCHMARK(Atomic_MinEarlyExits) + ->ArgName("Length") + ->Arg(LENGTH) + ->UseManualTime() + ->Iterations(10); + 
+BENCHMARK(Atomic_ContentiousMaxReplacements) + ->ArgName("Length") + ->Arg(LENGTH / 5) + ->UseManualTime() + ->Iterations(10); + +BENCHMARK(Atomic_ContentiousMinReplacements) + ->ArgName("Length") + ->Arg(LENGTH / 5) + ->UseManualTime() + ->Iterations(10); +/////////////////////////////////////////////////////////////////////// + +// long long +BENCHMARK(Atomic_MinReplacements) + ->ArgName("Length") + ->Arg(LENGTH) + ->UseManualTime() + ->Iterations(10); + +BENCHMARK(Atomic_MaxReplacements) + ->ArgName("Length") + ->Arg(LENGTH) + ->UseManualTime() + ->Iterations(10); + +BENCHMARK(Atomic_MaxEarlyExits) + ->ArgName("Length") + ->Arg(LENGTH) + ->UseManualTime() + ->Iterations(10); + +BENCHMARK(Atomic_MinEarlyExits) + ->ArgName("Length") + ->Arg(LENGTH) + ->UseManualTime() + ->Iterations(10); + +BENCHMARK(Atomic_ContentiousMaxReplacements) + ->ArgName("Length") + ->Arg(LENGTH / 5) + ->UseManualTime() + ->Iterations(10); + +BENCHMARK(Atomic_ContentiousMinReplacements) + ->ArgName("Length") + ->Arg(LENGTH / 5) + ->UseManualTime() + ->Iterations(10); +/////////////////////////////////////////////////////////////////////// + +// unsigned int +BENCHMARK(Atomic_MinReplacements) + ->ArgName("Length") + ->Arg(LENGTH) + ->UseManualTime() + ->Iterations(10); + +BENCHMARK(Atomic_MaxReplacements) + ->ArgName("Length") + ->Arg(LENGTH) + ->UseManualTime() + ->Iterations(10); + +BENCHMARK(Atomic_MaxEarlyExits) + ->ArgName("Length") + ->Arg(LENGTH) + ->UseManualTime() + ->Iterations(10); + +BENCHMARK(Atomic_MinEarlyExits) + ->ArgName("Length") + ->Arg(LENGTH) + ->UseManualTime() + ->Iterations(10); + +BENCHMARK(Atomic_ContentiousMaxReplacements) + ->ArgName("Length") + ->Arg(LENGTH / 5) + ->UseManualTime() + ->Iterations(10); + +BENCHMARK(Atomic_ContentiousMinReplacements) + ->ArgName("Length") + ->Arg(LENGTH / 5) + ->UseManualTime() + ->Iterations(10); +/////////////////////////////////////////////////////////////////////// + +// unsigned long +BENCHMARK(Atomic_MinReplacements) + 
->ArgName("Length") + ->Arg(LENGTH) + ->UseManualTime() + ->Iterations(10); + +BENCHMARK(Atomic_MaxReplacements) + ->ArgName("Length") + ->Arg(LENGTH) + ->UseManualTime() + ->Iterations(10); + +BENCHMARK(Atomic_MaxEarlyExits) + ->ArgName("Length") + ->Arg(LENGTH) + ->UseManualTime() + ->Iterations(10); + +BENCHMARK(Atomic_MinEarlyExits) + ->ArgName("Length") + ->Arg(LENGTH) + ->UseManualTime() + ->Iterations(10); + +BENCHMARK(Atomic_ContentiousMaxReplacements) + ->ArgName("Length") + ->Arg(LENGTH / 5) + ->UseManualTime() + ->Iterations(10); + +BENCHMARK(Atomic_ContentiousMinReplacements) + ->ArgName("Length") + ->Arg(LENGTH / 5) + ->UseManualTime() + ->Iterations(10); +/////////////////////////////////////////////////////////////////////// + +// unsigned long long +BENCHMARK(Atomic_MinReplacements) + ->ArgName("Length") + ->Arg(LENGTH) + ->UseManualTime() + ->Iterations(10); + +BENCHMARK(Atomic_MaxReplacements) + ->ArgName("Length") + ->Arg(LENGTH) + ->UseManualTime() + ->Iterations(10); + +BENCHMARK(Atomic_MaxEarlyExits) + ->ArgName("Length") + ->Arg(LENGTH) + ->UseManualTime() + ->Iterations(10); + +BENCHMARK(Atomic_MinEarlyExits) + ->ArgName("Length") + ->Arg(LENGTH) + ->UseManualTime() + ->Iterations(10); + +BENCHMARK(Atomic_ContentiousMaxReplacements) + ->ArgName("Length") + ->Arg(LENGTH / 5) + ->UseManualTime() + ->Iterations(10); + +BENCHMARK(Atomic_ContentiousMinReplacements) + ->ArgName("Length") + ->Arg(LENGTH / 5) + ->UseManualTime() + ->Iterations(10); +/////////////////////////////////////////////////////////////////////// + +// float +BENCHMARK(Atomic_MinReplacements) + ->ArgName("Length") + ->Arg(LENGTH) + ->UseManualTime() + ->Iterations(10); + +BENCHMARK(Atomic_MaxReplacements) + ->ArgName("Length") + ->Arg(LENGTH) + ->UseManualTime() + ->Iterations(10); + +BENCHMARK(Atomic_MaxEarlyExits) + ->ArgName("Length") + ->Arg(LENGTH) + ->UseManualTime() + ->Iterations(10); + +BENCHMARK(Atomic_MinEarlyExits) + ->ArgName("Length") + ->Arg(LENGTH) + 
->UseManualTime() + ->Iterations(10); + +BENCHMARK(Atomic_ContentiousMaxReplacements) + ->ArgName("Length") + ->Arg(LENGTH / 5) + ->UseManualTime() + ->Iterations(10); + +BENCHMARK(Atomic_ContentiousMinReplacements) + ->ArgName("Length") + ->Arg(LENGTH / 5) + ->UseManualTime() + ->Iterations(10); +/////////////////////////////////////////////////////////////////////// + +// double +BENCHMARK(Atomic_MinReplacements) + ->ArgName("Length") + ->Arg(LENGTH) + ->UseManualTime() + ->Iterations(10); + +BENCHMARK(Atomic_MaxReplacements) + ->ArgName("Length") + ->Arg(LENGTH) + ->UseManualTime() + ->Iterations(10); + +BENCHMARK(Atomic_MaxEarlyExits) + ->ArgName("Length") + ->Arg(LENGTH) + ->UseManualTime() + ->Iterations(10); + +BENCHMARK(Atomic_MinEarlyExits) + ->ArgName("Length") + ->Arg(LENGTH) + ->UseManualTime() + ->Iterations(10); + +BENCHMARK(Atomic_ContentiousMaxReplacements) + ->ArgName("Length") + ->Arg(LENGTH / 5) + ->UseManualTime() + ->Iterations(10); + +BENCHMARK(Atomic_ContentiousMinReplacements) + ->ArgName("Length") + ->Arg(LENGTH / 5) + ->UseManualTime() + ->Iterations(10); +/////////////////////////////////////////////////////////////////////// int main(int argc, char* argv[]) { Kokkos::initialize(argc, argv); @@ -299,26 +567,6 @@ int main(int argc, char* argv[]) { benchmark::RunSpecifiedBenchmarks(); - // std::cout << "================ int" << std::endl; - // test(length); - // std::cout << "================ long" << std::endl; - // test(length); - // std::cout << "================ long long" << std::endl; - // test(length); - - // std::cout << "================ unsigned int" << std::endl; - // test(length); - // std::cout << "================ unsigned long" << std::endl; - // test(length); - // std::cout << "================ unsigned long long" << std::endl; - // test(length); - - // std::cout << "================ float" << std::endl; - // test(length); - // std::cout << "================ double" << std::endl; - // test(length); - // } - 
benchmark::Shutdown(); Kokkos::finalize(); return 0; From 90b71cb4cf64ac28a2948894c734f9e87afdfe0f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cezary=20Skrzy=C5=84ski?= Date: Thu, 15 Dec 2022 14:54:02 +0100 Subject: [PATCH 191/496] Use correct license headers --- core/perf_test/Benchmark_Context.cpp | 38 +++------------------- core/perf_test/PerfTest_ViewFill_Raw.cpp | 38 +++------------------- core/perf_test/PerfTest_ViewResize_Raw.cpp | 38 +++------------------- 3 files changed, 15 insertions(+), 99 deletions(-) diff --git a/core/perf_test/Benchmark_Context.cpp b/core/perf_test/Benchmark_Context.cpp index 7cb0079fd9..d859f0aff8 100644 --- a/core/perf_test/Benchmark_Context.cpp +++ b/core/perf_test/Benchmark_Context.cpp @@ -1,46 +1,18 @@ -/* //@HEADER // ************************************************************************ // -// Kokkos v. 3.0 -// Copyright (2020) National Technology & Engineering +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering // Solutions of Sandia, LLC (NTESS). // // Under the terms of Contract DE-NA0003525 with NTESS, // the U.S. Government retains certain rights in this software. // -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. 
Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Questions? Contact Christian R. Trott (crtrott@sandia.gov) -// -// ************************************************************************ //@HEADER -*/ #include diff --git a/core/perf_test/PerfTest_ViewFill_Raw.cpp b/core/perf_test/PerfTest_ViewFill_Raw.cpp index d530aa4bb0..f4f4f07a1e 100644 --- a/core/perf_test/PerfTest_ViewFill_Raw.cpp +++ b/core/perf_test/PerfTest_ViewFill_Raw.cpp @@ -1,46 +1,18 @@ -/* //@HEADER // ************************************************************************ // -// Kokkos v. 3.0 -// Copyright (2020) National Technology & Engineering +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering // Solutions of Sandia, LLC (NTESS). // // Under the terms of Contract DE-NA0003525 with NTESS, // the U.S. Government retains certain rights in this software. // -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Questions? Contact Christian R. Trott (crtrott@sandia.gov) -// -// ************************************************************************ //@HEADER -*/ #include diff --git a/core/perf_test/PerfTest_ViewResize_Raw.cpp b/core/perf_test/PerfTest_ViewResize_Raw.cpp index da4cddc946..6cef390816 100644 --- a/core/perf_test/PerfTest_ViewResize_Raw.cpp +++ b/core/perf_test/PerfTest_ViewResize_Raw.cpp @@ -1,46 +1,18 @@ -/* //@HEADER // ************************************************************************ // -// Kokkos v. 
3.0 -// Copyright (2020) National Technology & Engineering +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering // Solutions of Sandia, LLC (NTESS). // // Under the terms of Contract DE-NA0003525 with NTESS, // the U.S. Government retains certain rights in this software. // -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Questions? 
Contact Christian R. Trott (crtrott@sandia.gov) -// -// ************************************************************************ //@HEADER -*/ #include From 62b8421c269fd64d032e7634c5908c8b579837f7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cezary=20Skrzy=C5=84ski?= Date: Thu, 22 Dec 2022 17:00:55 +0100 Subject: [PATCH 192/496] Remove duplicated helper --- core/perf_test/PerfTest_ViewCopy.hpp | 27 ++++----------------------- 1 file changed, 4 insertions(+), 23 deletions(-) diff --git a/core/perf_test/PerfTest_ViewCopy.hpp b/core/perf_test/PerfTest_ViewCopy.hpp index ea645f91f7..412f794c4c 100644 --- a/core/perf_test/PerfTest_ViewCopy.hpp +++ b/core/perf_test/PerfTest_ViewCopy.hpp @@ -29,31 +29,12 @@ #include #endif +#include "Benchmark_Context.hpp" #include namespace Test { -/** - * \brief Mark the label as a figure of merit. - */ -inline std::string benchmark_fom(const std::string& label) { - return "FOM: " + label; -} - -inline void report_results(benchmark::State& state, std::size_t num_elems, - double time) { - state.SetIterationTime(time); - - // data size in megabytes - const auto size = 1.0 * num_elems * sizeof(double) / 1000 / 1000; - // data processed in gigabytes - const auto data_processed = 2 * size / 1000; - - state.counters["MB"] = - benchmark::Counter(size, benchmark::Counter::kDefaults); - state.counters[benchmark_fom("GB/s")] = benchmark::Counter( - data_processed, benchmark::Counter::kIsIterationInvariantRate); -} +static constexpr int DATA_RATIO = 2; template void deepcopy_view(ViewTypeA& a, ViewTypeB& b, benchmark::State& state) { @@ -61,7 +42,7 @@ void deepcopy_view(ViewTypeA& a, ViewTypeB& b, benchmark::State& state) { Kokkos::fence(); Kokkos::Timer timer; Kokkos::deep_copy(a, b); - report_results(state, a.size(), timer.seconds()); + KokkosBenchmark::report_results(state, a, DATA_RATIO, timer.seconds()); } } @@ -168,7 +149,7 @@ static void ViewDeepCopy_Raw(benchmark::State& state) { Kokkos::parallel_for( N8, KOKKOS_LAMBDA(const int& i) { 
a_ptr[i] = b_ptr[i]; }); Kokkos::fence(); - report_results(state, a.size(), timer.seconds()); + KokkosBenchmark::report_results(state, a, DATA_RATIO, timer.seconds()); } } From 36bc91e063da70f0c878664c54cc4b181d476645 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cezary=20Skrzy=C5=84ski?= Date: Thu, 22 Dec 2022 17:24:25 +0100 Subject: [PATCH 193/496] Port GramSchmidt tests --- core/perf_test/CMakeLists.txt | 3 +- core/perf_test/PerfTestGramSchmidt.cpp | 87 ++++++++------------------ 2 files changed, 27 insertions(+), 63 deletions(-) diff --git a/core/perf_test/CMakeLists.txt b/core/perf_test/CMakeLists.txt index 13f083f3d8..46ed70d3fd 100644 --- a/core/perf_test/CMakeLists.txt +++ b/core/perf_test/CMakeLists.txt @@ -10,11 +10,9 @@ IF(KOKKOS_ENABLE_TESTS) SET(SOURCES PerfTestMain.cpp - PerfTestGramSchmidt.cpp PerfTest_ExecSpacePartitioning.cpp ) - IF(Kokkos_ENABLE_OPENMPTARGET) # FIXME OPENMPTARGET requires TeamPolicy Reductions and Custom Reduction LIST(REMOVE_ITEM SOURCES @@ -176,6 +174,7 @@ SET( BENCHMARK_SOURCES BenchmarkMain.cpp Benchmark_Context.cpp + PerfTestGramSchmidt.cpp PerfTest_CustomReduction.cpp PerfTestHexGrad.cpp PerfTest_ViewAllocate.cpp diff --git a/core/perf_test/PerfTestGramSchmidt.cpp b/core/perf_test/PerfTestGramSchmidt.cpp index c8f8487ffc..b34947968d 100644 --- a/core/perf_test/PerfTestGramSchmidt.cpp +++ b/core/perf_test/PerfTestGramSchmidt.cpp @@ -15,7 +15,7 @@ //@HEADER #include -#include +#include #include #include @@ -137,87 +137,52 @@ struct ModifiedGramSchmidt { //-------------------------------------------------------------------------- - static double test(const size_type length, const size_type count, - const size_t iter = 1) { + static double test(const size_type length, const size_type count) { multivector_type Q_("Q", length, count); multivector_type R_("R", count, count); typename multivector_type::HostMirror A = Kokkos::create_mirror(Q_); // Create and fill A on the host - for (size_type j = 0; j < count; ++j) { for (size_type i = 
0; i < length; ++i) { A(i, j) = (i + 1) * (j + 1); } } - double dt_min = 0; - - for (size_t i = 0; i < iter; ++i) { - Kokkos::deep_copy(Q_, A); - - // A = Q * R + Kokkos::deep_copy(Q_, A); - const double dt = factorization(Q_, R_); + // A = Q * R + const double dt = factorization(Q_, R_); - if (0 == i) - dt_min = dt; - else - dt_min = dt < dt_min ? dt : dt_min; - } - - return dt_min; + return dt; } }; -template -void run_test_gramschmidt(int exp_beg, int exp_end, int num_trials, - const char deviceTypeName[]) { +template +static void GramSchmidt(benchmark::State& state) { std::string label_gramschmidt; - label_gramschmidt.append("\"GramSchmidt< double , "); - label_gramschmidt.append(deviceTypeName); - label_gramschmidt.append(" >\""); - - for (int i = exp_beg; i < exp_end; ++i) { - double min_seconds = 0.0; - double max_seconds = 0.0; - double avg_seconds = 0.0; - - const int parallel_work_length = 1 << i; - - for (int j = 0; j < num_trials; ++j) { - const double seconds = ModifiedGramSchmidt::test( - parallel_work_length, 32); - - if (0 == j) { - min_seconds = seconds; - max_seconds = seconds; - } else { - if (seconds < min_seconds) min_seconds = seconds; - if (seconds > max_seconds) max_seconds = seconds; - } - avg_seconds += seconds; - } - avg_seconds /= num_trials; - - std::cout << label_gramschmidt << " , " << parallel_work_length << " , " - << min_seconds << " , " << (min_seconds / parallel_work_length) - << ", " << avg_seconds << std::endl; - } -} -TEST(default_exec, gramschmidt) { - int exp_beg = 10; - int exp_end = 20; - int num_trials = 5; + const int parallel_work_length = state.range(0); - if (command_line_num_args() > 1) exp_beg = std::stoi(command_line_arg(1)); - if (command_line_num_args() > 2) exp_end = std::stoi(command_line_arg(2)); - if (command_line_num_args() > 3) num_trials = std::stoi(command_line_arg(3)); + for (auto _ : state) { + const double seconds = + ModifiedGramSchmidt::test( + parallel_work_length, 32); - 
EXPECT_NO_THROW(run_test_gramschmidt( - exp_beg, exp_end, num_trials, Kokkos::DefaultExecutionSpace::name())); + state.SetIterationTime(seconds); + state.counters["Count"] = benchmark::Counter(parallel_work_length); + state.counters["Time normalized"] = + benchmark::Counter(seconds / parallel_work_length); + } } +BENCHMARK(GramSchmidt) + ->ArgName("Count") + ->ArgsProduct({ + benchmark::CreateRange(1 << 10, 1 << 19, 2), + }) + ->UseManualTime() + ->Iterations(5); + } // namespace Test From 4da9dd924fed75e48fd0e7e12fde089c1ff77a79 Mon Sep 17 00:00:00 2001 From: Mikael Simberg Date: Mon, 6 Feb 2023 19:39:50 +0100 Subject: [PATCH 194/496] Move Kokkos_HPX.hpp header into HPX subdirectory --- core/src/HPX/Kokkos_HPX.cpp | 2 +- core/src/{ => HPX}/Kokkos_HPX.hpp | 0 core/src/HPX/Kokkos_HPX_Task.hpp | 2 +- core/src/HPX/Kokkos_HPX_WorkGraphPolicy.hpp | 2 +- core/src/decl/Kokkos_Declare_HPX.hpp | 2 +- 5 files changed, 4 insertions(+), 4 deletions(-) rename core/src/{ => HPX}/Kokkos_HPX.hpp (100%) diff --git a/core/src/HPX/Kokkos_HPX.cpp b/core/src/HPX/Kokkos_HPX.cpp index aadac9e8e1..11e11a4573 100644 --- a/core/src/HPX/Kokkos_HPX.cpp +++ b/core/src/HPX/Kokkos_HPX.cpp @@ -21,7 +21,7 @@ #include #ifdef KOKKOS_ENABLE_HPX -#include +#include #include diff --git a/core/src/Kokkos_HPX.hpp b/core/src/HPX/Kokkos_HPX.hpp similarity index 100% rename from core/src/Kokkos_HPX.hpp rename to core/src/HPX/Kokkos_HPX.hpp diff --git a/core/src/HPX/Kokkos_HPX_Task.hpp b/core/src/HPX/Kokkos_HPX_Task.hpp index 7c87802948..f7a4f641e4 100644 --- a/core/src/HPX/Kokkos_HPX_Task.hpp +++ b/core/src/HPX/Kokkos_HPX_Task.hpp @@ -22,7 +22,7 @@ #include -#include +#include #include #include diff --git a/core/src/HPX/Kokkos_HPX_WorkGraphPolicy.hpp b/core/src/HPX/Kokkos_HPX_WorkGraphPolicy.hpp index 85072b7700..cb6d5a875e 100644 --- a/core/src/HPX/Kokkos_HPX_WorkGraphPolicy.hpp +++ b/core/src/HPX/Kokkos_HPX_WorkGraphPolicy.hpp @@ -17,7 +17,7 @@ #ifndef KOKKOS_HPX_WORKGRAPHPOLICY_HPP #define 
KOKKOS_HPX_WORKGRAPHPOLICY_HPP -#include +#include #include #include diff --git a/core/src/decl/Kokkos_Declare_HPX.hpp b/core/src/decl/Kokkos_Declare_HPX.hpp index 73f94591f5..f901236246 100644 --- a/core/src/decl/Kokkos_Declare_HPX.hpp +++ b/core/src/decl/Kokkos_Declare_HPX.hpp @@ -18,7 +18,7 @@ #define KOKKOS_DECLARE_HPX_HPP #if defined(KOKKOS_ENABLE_HPX) -#include +#include #include #endif From e3324b37f1b17da4d66251c4712b872efd3828a8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cezary=20Skrzy=C5=84ski?= Date: Thu, 22 Dec 2022 19:25:35 +0100 Subject: [PATCH 195/496] Port ExecSpacePartitionig tests --- core/perf_test/CMakeLists.txt | 23 +- .../PerfTest_ExecSpacePartitioning.cpp | 997 +++++++++--------- 2 files changed, 518 insertions(+), 502 deletions(-) diff --git a/core/perf_test/CMakeLists.txt b/core/perf_test/CMakeLists.txt index 46ed70d3fd..d81bb5a5a8 100644 --- a/core/perf_test/CMakeLists.txt +++ b/core/perf_test/CMakeLists.txt @@ -8,20 +8,6 @@ ENDIF() # TODO: finish converting these into benchmarks (in progress) IF(KOKKOS_ENABLE_TESTS) - SET(SOURCES - PerfTestMain.cpp - PerfTest_ExecSpacePartitioning.cpp - ) - - IF(Kokkos_ENABLE_OPENMPTARGET) - # FIXME OPENMPTARGET requires TeamPolicy Reductions and Custom Reduction - LIST(REMOVE_ITEM SOURCES - PerfTestGramSchmidt.cpp - PerfTest_CustomReduction.cpp - PerfTest_ExecSpacePartitioning.cpp - ) - ENDIF() - IF(KOKKOS_ENABLE_CUDA OR KOKKOS_ENABLE_HIP OR KOKKOS_ENABLE_SYCL) KOKKOS_ADD_EXECUTABLE ( PerformanceTest_SharedSpace @@ -58,14 +44,6 @@ IF(KOKKOS_ENABLE_TESTS) CATEGORIES PERFORMANCE ) - IF(NOT KOKKOS_ENABLE_CUDA OR KOKKOS_ENABLE_CUDA_LAMBDA) - KOKKOS_ADD_EXECUTABLE_AND_TEST( - PerformanceTest_Atomic_MinMax - SOURCES test_atomic_minmax_simple.cpp - CATEGORIES PERFORMANCE - ) - ENDIF() - # FIXME_NVHPC IF(NOT KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) KOKKOS_ADD_EXECUTABLE_AND_TEST( @@ -176,6 +154,7 @@ SET( Benchmark_Context.cpp PerfTestGramSchmidt.cpp PerfTest_CustomReduction.cpp + PerfTest_ExecSpacePartitioning.cpp 
PerfTestHexGrad.cpp PerfTest_ViewAllocate.cpp PerfTest_ViewCopy_a123.cpp diff --git a/core/perf_test/PerfTest_ExecSpacePartitioning.cpp b/core/perf_test/PerfTest_ExecSpacePartitioning.cpp index 2a07dfa286..8fd8310653 100644 --- a/core/perf_test/PerfTest_ExecSpacePartitioning.cpp +++ b/core/perf_test/PerfTest_ExecSpacePartitioning.cpp @@ -15,7 +15,7 @@ //@HEADER #include -#include +#include #include namespace Test { @@ -154,14 +154,10 @@ struct FunctorTeamReduce { } }; -TEST(default_exec, overlap_range_policy) { -#if defined(KOKKOS_ENABLE_SYCL) && !defined(KOKKOS_ARCH_INTEL_GPU) - GTEST_SKIP() << "skipping for SYCL+Cuda"; -#endif - - int N = 2000; - int M = 10000; - int R = 10; +static void OverlapRangePolicy(benchmark::State& state) { + int N = state.range(0); + int M = state.range(1); + int R = state.range(2); TEST_EXECSPACE space; std::vector execution_space_instances = @@ -169,164 +165,175 @@ TEST(default_exec, overlap_range_policy) { TEST_EXECSPACE space1 = execution_space_instances[0]; TEST_EXECSPACE space2 = execution_space_instances[1]; - Kokkos::View a("A", N, M); - FunctorRange f(M, R, a); - FunctorRangeReduce fr(M, R, a); - Kokkos::parallel_for("default_exec::overlap_range_policy::kernel0", - Kokkos::RangePolicy(0, N), - FunctorRange(M, R, a)); - - Kokkos::parallel_for( - "default_exec::overlap_range_policy::kernel1", - Kokkos::Experimental::require( - Kokkos::RangePolicy(space1, 0, N), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - f); - Kokkos::parallel_for( - "default_exec::overlap_range_policy::kernel2", - Kokkos::Experimental::require( - Kokkos::RangePolicy(space2, 0, N), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - f); - Kokkos::fence(); - - Kokkos::Timer timer; - Kokkos::parallel_for( - "default_exec::overlap_range_policy::kernel3", - Kokkos::Experimental::require( - Kokkos::RangePolicy(space, 0, N), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - f); - Kokkos::parallel_for( - 
"default_exec::overlap_range_policy::kernel4", - Kokkos::Experimental::require( - Kokkos::RangePolicy(space, 0, N), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - f); - Kokkos::fence(); - - timer.reset(); - Kokkos::parallel_for( - "default_exec::overlap_range_policy::kernel5", - Kokkos::Experimental::require( - Kokkos::RangePolicy(space1, 0, N), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - FunctorRange(M, R, a)); - Kokkos::parallel_for( - "default_exec::overlap_range_policy::kernel6", - Kokkos::Experimental::require( - Kokkos::RangePolicy(space2, 0, N), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - FunctorRange(M, R, a)); - Kokkos::fence(); - double time_overlap = timer.seconds(); - - timer.reset(); - Kokkos::parallel_for( - "default_exec::overlap_range_policy::kernel7", - Kokkos::Experimental::require( - Kokkos::RangePolicy(space, 0, N), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - f); - Kokkos::parallel_for( - "default_exec::overlap_range_policy::kernel8", - Kokkos::Experimental::require( - Kokkos::RangePolicy(space, 0, N), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - f); - Kokkos::fence(); - double time_end = timer.seconds(); - - if (is_overlapping(space)) { - ASSERT_GT(time_end, 1.5 * time_overlap); - } - printf("Time RangePolicy: NonOverlap: %lf Time Overlap: %lf\n", time_end, - time_overlap); - - Kokkos::View result("result"); - Kokkos::View result1("result1"); - Kokkos::View result2("result2"); - Kokkos::View h_result("h_result"); - Kokkos::View h_result1("h_result1"); - Kokkos::View h_result2("h_result2"); - - timer.reset(); - Kokkos::parallel_reduce( - "default_exec::overlap_range_policy::kernel_reduce", - Kokkos::Experimental::require( - Kokkos::RangePolicy(space, 0, N), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - fr, result); - Kokkos::fence(); - double time_fenced = timer.seconds(); - Kokkos::deep_copy(h_result, result); - - timer.reset(); - 
Kokkos::parallel_reduce( - "default_exec::overlap_range_policy::kernel_reduce", - Kokkos::Experimental::require( - Kokkos::RangePolicy(space, 0, N), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - fr, result); - double time_not_fenced = timer.seconds(); - Kokkos::fence(); - if (is_overlapping(space)) { - ASSERT_GT(time_fenced, 2.0 * time_not_fenced); - } + for (auto _ : state) { + Kokkos::View a("A", N, M); + FunctorRange f(M, R, a); + FunctorRangeReduce fr(M, R, a); + Kokkos::parallel_for("default_exec::overlap_range_policy::kernel0", + Kokkos::RangePolicy(0, N), + FunctorRange(M, R, a)); + + Kokkos::parallel_for( + "default_exec::overlap_range_policy::kernel1", + Kokkos::Experimental::require( + Kokkos::RangePolicy(space1, 0, N), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + f); + Kokkos::parallel_for( + "default_exec::overlap_range_policy::kernel2", + Kokkos::Experimental::require( + Kokkos::RangePolicy(space2, 0, N), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + f); + Kokkos::fence(); + + Kokkos::Timer timer; + Kokkos::parallel_for( + "default_exec::overlap_range_policy::kernel3", + Kokkos::Experimental::require( + Kokkos::RangePolicy(space, 0, N), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + f); + Kokkos::parallel_for( + "default_exec::overlap_range_policy::kernel4", + Kokkos::Experimental::require( + Kokkos::RangePolicy(space, 0, N), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + f); + Kokkos::fence(); + + timer.reset(); + Kokkos::parallel_for( + "default_exec::overlap_range_policy::kernel5", + Kokkos::Experimental::require( + Kokkos::RangePolicy(space1, 0, N), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + FunctorRange(M, R, a)); + Kokkos::parallel_for( + "default_exec::overlap_range_policy::kernel6", + Kokkos::Experimental::require( + Kokkos::RangePolicy(space2, 0, N), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + FunctorRange(M, R, a)); + 
Kokkos::fence(); + double time_overlap = timer.seconds(); + + timer.reset(); + Kokkos::parallel_for( + "default_exec::overlap_range_policy::kernel7", + Kokkos::Experimental::require( + Kokkos::RangePolicy(space, 0, N), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + f); + Kokkos::parallel_for( + "default_exec::overlap_range_policy::kernel8", + Kokkos::Experimental::require( + Kokkos::RangePolicy(space, 0, N), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + f); + Kokkos::fence(); + double time_end = timer.seconds(); + + if (is_overlapping(space)) { + KOKKOS_ASSERT(time_end > 1.5 * time_overlap); + } + state.counters["Time NonOverlap"] = benchmark::Counter(time_end); + state.counters["Time Overlap"] = benchmark::Counter(time_overlap); + + Kokkos::View result("result"); + Kokkos::View result1("result1"); + Kokkos::View result2("result2"); + Kokkos::View h_result("h_result"); + Kokkos::View h_result1("h_result1"); + Kokkos::View h_result2("h_result2"); + + timer.reset(); + Kokkos::parallel_reduce( + "default_exec::overlap_range_policy::kernel_reduce", + Kokkos::Experimental::require( + Kokkos::RangePolicy(space, 0, N), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + fr, result); + Kokkos::fence(); + double time_fenced = timer.seconds(); + Kokkos::deep_copy(h_result, result); + + timer.reset(); + Kokkos::parallel_reduce( + "default_exec::overlap_range_policy::kernel_reduce", + Kokkos::Experimental::require( + Kokkos::RangePolicy(space, 0, N), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + fr, result); + double time_not_fenced = timer.seconds(); + Kokkos::fence(); + if (is_overlapping(space)) { + KOKKOS_ASSERT(time_fenced > 2.0 * time_not_fenced); + } - timer.reset(); - Kokkos::parallel_reduce( - "default_exec::overlap_range_policy::kernel_reduce", - Kokkos::Experimental::require( - Kokkos::RangePolicy(space, 0, N), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - fr, result); - 
Kokkos::parallel_reduce( - "default_exec::overlap_range_policy::kernel_reduce", - Kokkos::Experimental::require( - Kokkos::RangePolicy(space, 0, N), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - fr, result); - Kokkos::fence(); - double time_no_overlapped_reduce = timer.seconds(); - - timer.reset(); - Kokkos::parallel_reduce( - "default_exec::overlap_range_policy::kernel_reduce", - Kokkos::Experimental::require( - Kokkos::RangePolicy(space1, 0, N), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - fr, result1); - Kokkos::parallel_reduce( - "default_exec::overlap_range_policy::kernel_reduce", - Kokkos::Experimental::require( - Kokkos::RangePolicy(space2, 0, N), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - fr, result2); - Kokkos::fence(); - double time_overlapped_reduce = timer.seconds(); - - Kokkos::deep_copy(h_result2, result2); - Kokkos::deep_copy(h_result1, result1); - - ASSERT_EQ(h_result1(), h_result()); - ASSERT_EQ(h_result2(), h_result()); - - if (is_overlapping(space)) { - ASSERT_LT(time_overlapped_reduce, 1.5 * time_no_overlapped_reduce); + state.counters["Time fenced"] = benchmark::Counter(time_fenced); + state.counters["Time not fenced"] = benchmark::Counter(time_not_fenced); + + timer.reset(); + Kokkos::parallel_reduce( + "default_exec::overlap_range_policy::kernel_reduce", + Kokkos::Experimental::require( + Kokkos::RangePolicy(space, 0, N), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + fr, result); + Kokkos::parallel_reduce( + "default_exec::overlap_range_policy::kernel_reduce", + Kokkos::Experimental::require( + Kokkos::RangePolicy(space, 0, N), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + fr, result); + Kokkos::fence(); + double time_no_overlapped_reduce = timer.seconds(); + + timer.reset(); + Kokkos::parallel_reduce( + "default_exec::overlap_range_policy::kernel_reduce", + Kokkos::Experimental::require( + Kokkos::RangePolicy(space1, 0, N), + 
Kokkos::Experimental::WorkItemProperty::HintLightWeight), + fr, result1); + Kokkos::parallel_reduce( + "default_exec::overlap_range_policy::kernel_reduce", + Kokkos::Experimental::require( + Kokkos::RangePolicy(space2, 0, N), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + fr, result2); + Kokkos::fence(); + double time_overlapped_reduce = timer.seconds(); + + Kokkos::deep_copy(h_result2, result2); + Kokkos::deep_copy(h_result1, result1); + + KOKKOS_ASSERT(h_result1() == h_result()); + KOKKOS_ASSERT(h_result2() == h_result()); + + if (is_overlapping(space)) { + KOKKOS_ASSERT(time_overlapped_reduce < 1.5 * time_no_overlapped_reduce); + } + + state.counters["Time Reduce: NonOverlap"] = + benchmark::Counter(time_no_overlapped_reduce); + state.counters["Time Reduce: Overlap"] = + benchmark::Counter(time_overlapped_reduce); } - printf("Time RangePolicy Reduce: NonOverlap: %lf Time Overlap: %lf\n", - time_no_overlapped_reduce, time_overlapped_reduce); } -TEST(default_exec, overlap_mdrange_policy) { -#if defined(KOKKOS_ENABLE_SYCL) && !defined(KOKKOS_ARCH_INTEL_GPU) - GTEST_SKIP() << "skipping for SYCL+Cuda"; +// skip for SYCL+Cuda +#if !defined(KOKKOS_ENABLE_SYCL) || defined(KOKKOS_ARCH_INTEL_GPU) +BENCHMARK(OverlapRangePolicy) + ->ArgNames({"N", "M", "R"}) + ->Args({2'000, 10'000, 10}); #endif - int N = 200; - int M = 10000; - int R = 10; +static void OverlapMDRangePolicy(benchmark::State& state) { + int N = state.range(0); + int M = state.range(1); + int R = state.range(2); TEST_EXECSPACE space; std::vector execution_space_instances = @@ -334,182 +341,194 @@ TEST(default_exec, overlap_mdrange_policy) { TEST_EXECSPACE space1 = execution_space_instances[0]; TEST_EXECSPACE space2 = execution_space_instances[1]; - Kokkos::View a("A", N, M); - FunctorMDRange f(M, R, a); - FunctorMDRangeReduce fr(M, R, a); - Kokkos::parallel_for( - "default_exec::overlap_range_policy::kernel0", - Kokkos::Experimental::require( - Kokkos::MDRangePolicy>({0, 0}, - {N, R}), - 
Kokkos::Experimental::WorkItemProperty::HintLightWeight), - FunctorMDRange(M, R, a)); - - Kokkos::parallel_for( - "default_exec::overlap_range_policy::kernel1", - Kokkos::Experimental::require( - Kokkos::MDRangePolicy>(space1, {0, 0}, - {N, R}), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - f); - Kokkos::parallel_for( - "default_exec::overlap_range_policy::kernel2", - Kokkos::Experimental::require( - Kokkos::MDRangePolicy>(space2, {0, 0}, - {N, R}), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - f); - Kokkos::fence(); - - Kokkos::Timer timer; - Kokkos::parallel_for( - "default_exec::overlap_range_policy::kernel3", - Kokkos::Experimental::require( - Kokkos::MDRangePolicy>(space, {0, 0}, - {N, R}), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - f); - Kokkos::parallel_for( - "default_exec::overlap_range_policy::kernel4", - Kokkos::Experimental::require( - Kokkos::MDRangePolicy>(space, {0, 0}, - {N, R}), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - f); - Kokkos::fence(); - - timer.reset(); - Kokkos::parallel_for( - "default_exec::overlap_range_policy::kernel5", - Kokkos::Experimental::require( - Kokkos::MDRangePolicy>(space1, {0, 0}, - {N, R}), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - FunctorMDRange(M, R, a)); - Kokkos::parallel_for( - "default_exec::overlap_range_policy::kernel6", - Kokkos::Experimental::require( - Kokkos::MDRangePolicy>(space2, {0, 0}, - {N, R}), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - FunctorMDRange(M, R, a)); - Kokkos::fence(); - double time_overlap = timer.seconds(); - - timer.reset(); - Kokkos::parallel_for( - "default_exec::overlap_range_policy::kernel7", - Kokkos::Experimental::require( - Kokkos::MDRangePolicy>(space, {0, 0}, - {N, R}), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - f); - Kokkos::parallel_for( - "default_exec::overlap_range_policy::kernel8", - Kokkos::Experimental::require( - Kokkos::MDRangePolicy>(space, {0, 
0}, - {N, R}), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - f); - Kokkos::fence(); - double time_end = timer.seconds(); - - if (is_overlapping(space)) { - ASSERT_GT(time_end, 1.5 * time_overlap); - } - printf("Time MDRangePolicy: NonOverlap: %lf Time Overlap: %lf\n", time_end, - time_overlap); - - Kokkos::View result("result"); - Kokkos::View result1("result1"); - Kokkos::View result2("result2"); - Kokkos::View h_result("h_result"); - Kokkos::View h_result1("h_result1"); - Kokkos::View h_result2("h_result2"); - - timer.reset(); - Kokkos::parallel_reduce( - "default_exec::overlap_mdrange_policy::kernel_reduce", - Kokkos::Experimental::require( - Kokkos::MDRangePolicy>(space, {0, 0}, - {N, R}), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - fr, result); - Kokkos::fence(); - double time_fenced = timer.seconds(); - Kokkos::deep_copy(h_result, result); - - timer.reset(); - Kokkos::parallel_reduce( - "default_exec::overlap_mdrange_policy::kernel_reduce", - Kokkos::Experimental::require( - Kokkos::MDRangePolicy>(space, {0, 0}, - {N, R}), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - fr, result); - double time_not_fenced = timer.seconds(); - Kokkos::fence(); - if (is_overlapping(space)) { - ASSERT_GT(time_fenced, 2.0 * time_not_fenced); - } + for (auto _ : state) { + Kokkos::View a("A", N, M); + FunctorMDRange f(M, R, a); + FunctorMDRangeReduce fr(M, R, a); + Kokkos::parallel_for( + "default_exec::overlap_range_policy::kernel0", + Kokkos::Experimental::require( + Kokkos::MDRangePolicy>({0, 0}, + {N, R}), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + FunctorMDRange(M, R, a)); + + Kokkos::parallel_for( + "default_exec::overlap_range_policy::kernel1", + Kokkos::Experimental::require( + Kokkos::MDRangePolicy>( + space1, {0, 0}, {N, R}), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + f); + Kokkos::parallel_for( + "default_exec::overlap_range_policy::kernel2", + Kokkos::Experimental::require( + 
Kokkos::MDRangePolicy>( + space2, {0, 0}, {N, R}), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + f); + Kokkos::fence(); + + Kokkos::Timer timer; + Kokkos::parallel_for( + "default_exec::overlap_range_policy::kernel3", + Kokkos::Experimental::require( + Kokkos::MDRangePolicy>( + space, {0, 0}, {N, R}), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + f); + Kokkos::parallel_for( + "default_exec::overlap_range_policy::kernel4", + Kokkos::Experimental::require( + Kokkos::MDRangePolicy>( + space, {0, 0}, {N, R}), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + f); + Kokkos::fence(); + + timer.reset(); + Kokkos::parallel_for( + "default_exec::overlap_range_policy::kernel5", + Kokkos::Experimental::require( + Kokkos::MDRangePolicy>( + space1, {0, 0}, {N, R}), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + FunctorMDRange(M, R, a)); + Kokkos::parallel_for( + "default_exec::overlap_range_policy::kernel6", + Kokkos::Experimental::require( + Kokkos::MDRangePolicy>( + space2, {0, 0}, {N, R}), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + FunctorMDRange(M, R, a)); + Kokkos::fence(); + double time_overlap = timer.seconds(); + + timer.reset(); + Kokkos::parallel_for( + "default_exec::overlap_range_policy::kernel7", + Kokkos::Experimental::require( + Kokkos::MDRangePolicy>( + space, {0, 0}, {N, R}), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + f); + Kokkos::parallel_for( + "default_exec::overlap_range_policy::kernel8", + Kokkos::Experimental::require( + Kokkos::MDRangePolicy>( + space, {0, 0}, {N, R}), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + f); + Kokkos::fence(); + double time_end = timer.seconds(); + + if (is_overlapping(space)) { + KOKKOS_ASSERT(time_end > 1.5 * time_overlap); + } + + state.counters["Time NonOverlap"] = benchmark::Counter(time_end); + state.counters["Time Overlap"] = benchmark::Counter(time_overlap); + + Kokkos::View result("result"); + Kokkos::View 
result1("result1"); + Kokkos::View result2("result2"); + Kokkos::View h_result("h_result"); + Kokkos::View h_result1("h_result1"); + Kokkos::View h_result2("h_result2"); + + timer.reset(); + Kokkos::parallel_reduce( + "default_exec::overlap_mdrange_policy::kernel_reduce", + Kokkos::Experimental::require( + Kokkos::MDRangePolicy>( + space, {0, 0}, {N, R}), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + fr, result); + Kokkos::fence(); + double time_fenced = timer.seconds(); + Kokkos::deep_copy(h_result, result); + + timer.reset(); + Kokkos::parallel_reduce( + "default_exec::overlap_mdrange_policy::kernel_reduce", + Kokkos::Experimental::require( + Kokkos::MDRangePolicy>( + space, {0, 0}, {N, R}), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + fr, result); + double time_not_fenced = timer.seconds(); + Kokkos::fence(); + if (is_overlapping(space)) { + KOKKOS_ASSERT(time_fenced > 2.0 * time_not_fenced); + } - timer.reset(); - Kokkos::parallel_reduce( - "default_exec::overlap_mdrange_policy::kernel_reduce", - Kokkos::Experimental::require( - Kokkos::MDRangePolicy>(space, {0, 0}, - {N, R}), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - fr, result); - Kokkos::parallel_reduce( - "default_exec::overlap_mdrange_policy::kernel_reduce", - Kokkos::Experimental::require( - Kokkos::MDRangePolicy>(space, {0, 0}, - {N, R}), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - fr, result); - Kokkos::fence(); - double time_no_overlapped_reduce = timer.seconds(); - - timer.reset(); - Kokkos::parallel_reduce( - "default_exec::overlap_mdrange_policy::kernel_reduce", - Kokkos::Experimental::require( - Kokkos::MDRangePolicy>(space1, {0, 0}, - {N, R}), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - fr, result1); - Kokkos::parallel_reduce( - "default_exec::overlap_mdrange_policy::kernel_reduce", - Kokkos::Experimental::require( - Kokkos::MDRangePolicy>(space2, {0, 0}, - {N, R}), - 
Kokkos::Experimental::WorkItemProperty::HintLightWeight), - fr, result2); - Kokkos::fence(); - double time_overlapped_reduce = timer.seconds(); - - Kokkos::deep_copy(h_result2, result2); - Kokkos::deep_copy(h_result1, result1); - - ASSERT_EQ(h_result1(), h_result()); - ASSERT_EQ(h_result2(), h_result()); - - if (is_overlapping(space)) { - ASSERT_LT(time_overlapped_reduce, 1.5 * time_no_overlapped_reduce); + state.counters["Time fenced"] = benchmark::Counter(time_fenced); + state.counters["Time not fenced"] = benchmark::Counter(time_not_fenced); + + timer.reset(); + Kokkos::parallel_reduce( + "default_exec::overlap_mdrange_policy::kernel_reduce", + Kokkos::Experimental::require( + Kokkos::MDRangePolicy>( + space, {0, 0}, {N, R}), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + fr, result); + Kokkos::parallel_reduce( + "default_exec::overlap_mdrange_policy::kernel_reduce", + Kokkos::Experimental::require( + Kokkos::MDRangePolicy>( + space, {0, 0}, {N, R}), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + fr, result); + Kokkos::fence(); + double time_no_overlapped_reduce = timer.seconds(); + + timer.reset(); + Kokkos::parallel_reduce( + "default_exec::overlap_mdrange_policy::kernel_reduce", + Kokkos::Experimental::require( + Kokkos::MDRangePolicy>( + space1, {0, 0}, {N, R}), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + fr, result1); + Kokkos::parallel_reduce( + "default_exec::overlap_mdrange_policy::kernel_reduce", + Kokkos::Experimental::require( + Kokkos::MDRangePolicy>( + space2, {0, 0}, {N, R}), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + fr, result2); + Kokkos::fence(); + double time_overlapped_reduce = timer.seconds(); + + Kokkos::deep_copy(h_result2, result2); + Kokkos::deep_copy(h_result1, result1); + + KOKKOS_ASSERT(h_result1() == h_result()); + KOKKOS_ASSERT(h_result2() == h_result()); + + if (is_overlapping(space)) { + KOKKOS_ASSERT(time_overlapped_reduce < 1.5 * time_no_overlapped_reduce); + } + 
+ state.counters["Time Reduce: NonOverlap"] = + benchmark::Counter(time_no_overlapped_reduce); + state.counters["Time Reduce: Time Overlap"] = + benchmark::Counter(time_overlapped_reduce); } - printf("Time MDRangePolicy Reduce: NonOverlap: %lf Time Overlap: %lf\n", - time_no_overlapped_reduce, time_overlapped_reduce); } -TEST(default_exec, overlap_team_policy) { -#if defined(KOKKOS_ENABLE_SYCL) && !defined(KOKKOS_ARCH_INTEL_GPU) - GTEST_SKIP() << "skipping for SYCL+Cuda"; +// skip for SYCL+Cuda +#if !defined(KOKKOS_ENABLE_SYCL) || defined(KOKKOS_ARCH_INTEL_GPU) +BENCHMARK(OverlapMDRangePolicy) + ->ArgNames({"N", "M", "R"}) + ->Args({200, 10'000, 10}); #endif - int N = 20; - int M = 1000000; - int R = 10; +static void OverlapTeamPolicy(benchmark::State& state) { + int N = state.range(0); + int M = state.range(1); + int R = state.range(2); TEST_EXECSPACE space; std::vector execution_space_instances = @@ -517,155 +536,173 @@ TEST(default_exec, overlap_team_policy) { TEST_EXECSPACE space1 = execution_space_instances[0]; TEST_EXECSPACE space2 = execution_space_instances[1]; - Kokkos::View a("A", N, M); - FunctorTeam f(M, R, a); - FunctorTeamReduce fr(M, R, a); - Kokkos::parallel_for( - "default_exec::overlap_range_policy::kernel0", - Kokkos::Experimental::require( - Kokkos::TeamPolicy(N, Kokkos::AUTO), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - FunctorTeam(M, R, a)); - - Kokkos::parallel_for( - "default_exec::overlap_range_policy::kernel1", - Kokkos::Experimental::require( - Kokkos::TeamPolicy(space1, N, Kokkos::AUTO), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - f); - Kokkos::parallel_for( - "default_exec::overlap_range_policy::kernel2", - Kokkos::Experimental::require( - Kokkos::TeamPolicy(space2, N, Kokkos::AUTO), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - f); - Kokkos::fence(); - - Kokkos::Timer timer; - Kokkos::parallel_for( - "default_exec::overlap_range_policy::kernel3", - Kokkos::Experimental::require( - 
Kokkos::TeamPolicy(space, N, Kokkos::AUTO), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - f); - Kokkos::parallel_for( - "default_exec::overlap_range_policy::kernel4", - Kokkos::Experimental::require( - Kokkos::TeamPolicy(space, N, Kokkos::AUTO), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - f); - Kokkos::fence(); - - timer.reset(); - Kokkos::parallel_for( - "default_exec::overlap_range_policy::kernel5", - Kokkos::Experimental::require( - Kokkos::TeamPolicy(space1, N, Kokkos::AUTO), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - FunctorTeam(M, R, a)); - Kokkos::parallel_for( - "default_exec::overlap_range_policy::kernel6", - Kokkos::Experimental::require( - Kokkos::TeamPolicy(space2, N, Kokkos::AUTO), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - FunctorTeam(M, R, a)); - Kokkos::fence(); - double time_overlap = timer.seconds(); - - timer.reset(); - Kokkos::parallel_for( - "default_exec::overlap_range_policy::kernel7", - Kokkos::Experimental::require( - Kokkos::TeamPolicy(space, N, Kokkos::AUTO), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - f); - Kokkos::parallel_for( - "default_exec::overlap_range_policy::kernel8", - Kokkos::Experimental::require( - Kokkos::TeamPolicy(space, N, Kokkos::AUTO), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - f); - Kokkos::fence(); - double time_end = timer.seconds(); - - if (is_overlapping(space)) { - ASSERT_GT(time_end, 1.5 * time_overlap); - } - printf("Time TeamPolicy: NonOverlap: %lf Time Overlap: %lf\n", time_end, - time_overlap); - - Kokkos::View result("result"); - Kokkos::View result1("result1"); - Kokkos::View result2("result2"); - Kokkos::View h_result("h_result"); - Kokkos::View h_result1("h_result1"); - Kokkos::View h_result2("h_result2"); - - timer.reset(); - Kokkos::parallel_reduce( - "default_exec::overlap_team_policy::kernel_reduce", - Kokkos::Experimental::require( - Kokkos::TeamPolicy(space, N, Kokkos::AUTO), - 
Kokkos::Experimental::WorkItemProperty::HintLightWeight), - fr, result); - Kokkos::fence(); - double time_fenced = timer.seconds(); - Kokkos::deep_copy(h_result, result); - - timer.reset(); - Kokkos::parallel_reduce( - "default_exec::overlap_team_policy::kernel_reduce", - Kokkos::Experimental::require( - Kokkos::TeamPolicy(space, N, Kokkos::AUTO), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - fr, result); - double time_not_fenced = timer.seconds(); - Kokkos::fence(); - if (is_overlapping(space)) { - ASSERT_GT(time_fenced, 2.0 * time_not_fenced); - } - timer.reset(); - Kokkos::parallel_reduce( - "default_exec::overlap_team_policy::kernel_reduce", - Kokkos::Experimental::require( - Kokkos::TeamPolicy(space, N, Kokkos::AUTO), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - fr, result); - Kokkos::parallel_reduce( - "default_exec::overlap_team_policy::kernel_reduce", - Kokkos::Experimental::require( - Kokkos::TeamPolicy(space, N, Kokkos::AUTO), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - fr, result); - Kokkos::fence(); - double time_no_overlapped_reduce = timer.seconds(); - - timer.reset(); - Kokkos::parallel_reduce( - "default_exec::overlap_team_policy::kernel_reduce", - Kokkos::Experimental::require( - Kokkos::TeamPolicy(space1, N, Kokkos::AUTO), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - fr, result1); - Kokkos::parallel_reduce( - "default_exec::overlap_team_policy::kernel_reduce", - Kokkos::Experimental::require( - Kokkos::TeamPolicy(space2, N, Kokkos::AUTO), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - fr, result2); - Kokkos::fence(); - double time_overlapped_reduce = timer.seconds(); - - Kokkos::deep_copy(h_result2, result2); - Kokkos::deep_copy(h_result1, result1); - - ASSERT_EQ(h_result1(), h_result()); - ASSERT_EQ(h_result2(), h_result()); - - if (is_overlapping(space)) { - ASSERT_LT(time_overlapped_reduce, 1.5 * time_no_overlapped_reduce); + for (auto _ : state) { + Kokkos::View 
a("A", N, M); + FunctorTeam f(M, R, a); + FunctorTeamReduce fr(M, R, a); + Kokkos::parallel_for( + "default_exec::overlap_range_policy::kernel0", + Kokkos::Experimental::require( + Kokkos::TeamPolicy(N, Kokkos::AUTO), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + FunctorTeam(M, R, a)); + + Kokkos::parallel_for( + "default_exec::overlap_range_policy::kernel1", + Kokkos::Experimental::require( + Kokkos::TeamPolicy(space1, N, Kokkos::AUTO), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + f); + Kokkos::parallel_for( + "default_exec::overlap_range_policy::kernel2", + Kokkos::Experimental::require( + Kokkos::TeamPolicy(space2, N, Kokkos::AUTO), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + f); + Kokkos::fence(); + + Kokkos::Timer timer; + Kokkos::parallel_for( + "default_exec::overlap_range_policy::kernel3", + Kokkos::Experimental::require( + Kokkos::TeamPolicy(space, N, Kokkos::AUTO), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + f); + Kokkos::parallel_for( + "default_exec::overlap_range_policy::kernel4", + Kokkos::Experimental::require( + Kokkos::TeamPolicy(space, N, Kokkos::AUTO), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + f); + Kokkos::fence(); + + timer.reset(); + Kokkos::parallel_for( + "default_exec::overlap_range_policy::kernel5", + Kokkos::Experimental::require( + Kokkos::TeamPolicy(space1, N, Kokkos::AUTO), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + FunctorTeam(M, R, a)); + Kokkos::parallel_for( + "default_exec::overlap_range_policy::kernel6", + Kokkos::Experimental::require( + Kokkos::TeamPolicy(space2, N, Kokkos::AUTO), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + FunctorTeam(M, R, a)); + Kokkos::fence(); + double time_overlap = timer.seconds(); + + timer.reset(); + Kokkos::parallel_for( + "default_exec::overlap_range_policy::kernel7", + Kokkos::Experimental::require( + Kokkos::TeamPolicy(space, N, Kokkos::AUTO), + 
Kokkos::Experimental::WorkItemProperty::HintLightWeight), + f); + Kokkos::parallel_for( + "default_exec::overlap_range_policy::kernel8", + Kokkos::Experimental::require( + Kokkos::TeamPolicy(space, N, Kokkos::AUTO), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + f); + Kokkos::fence(); + double time_end = timer.seconds(); + + if (is_overlapping(space)) { + KOKKOS_ASSERT(time_end > 1.5 * time_overlap); + } + + state.counters["Time NonOverlap"] = benchmark::Counter(time_end); + state.counters["Time Overlap"] = benchmark::Counter(time_overlap); + + Kokkos::View result("result"); + Kokkos::View result1("result1"); + Kokkos::View result2("result2"); + Kokkos::View h_result("h_result"); + Kokkos::View h_result1("h_result1"); + Kokkos::View h_result2("h_result2"); + + timer.reset(); + Kokkos::parallel_reduce( + "default_exec::overlap_team_policy::kernel_reduce", + Kokkos::Experimental::require( + Kokkos::TeamPolicy(space, N, Kokkos::AUTO), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + fr, result); + Kokkos::fence(); + double time_fenced = timer.seconds(); + Kokkos::deep_copy(h_result, result); + + timer.reset(); + Kokkos::parallel_reduce( + "default_exec::overlap_team_policy::kernel_reduce", + Kokkos::Experimental::require( + Kokkos::TeamPolicy(space, N, Kokkos::AUTO), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + fr, result); + double time_not_fenced = timer.seconds(); + Kokkos::fence(); + if (is_overlapping(space)) { + KOKKOS_ASSERT(time_fenced > 2.0 * time_not_fenced); + } + + state.counters["Time fenced"] = benchmark::Counter(time_fenced); + state.counters["Time not fenced"] = benchmark::Counter(time_not_fenced); + + timer.reset(); + Kokkos::parallel_reduce( + "default_exec::overlap_team_policy::kernel_reduce", + Kokkos::Experimental::require( + Kokkos::TeamPolicy(space, N, Kokkos::AUTO), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + fr, result); + Kokkos::parallel_reduce( + 
"default_exec::overlap_team_policy::kernel_reduce", + Kokkos::Experimental::require( + Kokkos::TeamPolicy(space, N, Kokkos::AUTO), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + fr, result); + Kokkos::fence(); + double time_no_overlapped_reduce = timer.seconds(); + + timer.reset(); + Kokkos::parallel_reduce( + "default_exec::overlap_team_policy::kernel_reduce", + Kokkos::Experimental::require( + Kokkos::TeamPolicy(space1, N, Kokkos::AUTO), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + fr, result1); + Kokkos::parallel_reduce( + "default_exec::overlap_team_policy::kernel_reduce", + Kokkos::Experimental::require( + Kokkos::TeamPolicy(space2, N, Kokkos::AUTO), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + fr, result2); + Kokkos::fence(); + double time_overlapped_reduce = timer.seconds(); + + Kokkos::deep_copy(h_result2, result2); + Kokkos::deep_copy(h_result1, result1); + + KOKKOS_ASSERT(h_result1() == h_result()); + KOKKOS_ASSERT(h_result2() == h_result()); + + if (is_overlapping(space)) { + KOKKOS_ASSERT(time_overlapped_reduce < 1.5 * time_no_overlapped_reduce); + } + + state.counters["Time Reduce: NonOverlap"] = + benchmark::Counter(time_no_overlapped_reduce); + state.counters["Time Reduce: Time Overlap"] = + benchmark::Counter(time_overlapped_reduce); } - printf("Time TeamPolicy Reduce: NonOverlap: %lf Time Overlap: %lf\n", - time_no_overlapped_reduce, time_overlapped_reduce); } + +// skip for SYCL+Cuda +#if !defined(KOKKOS_ENABLE_SYCL) || defined(KOKKOS_ARCH_INTEL_GPU) +BENCHMARK(OverlapTeamPolicy) + ->ArgNames({"N", "M", "R"}) + ->Args({20, 1'000'000, 10}); +#endif + } // namespace Test From ab55654ae07561a3a1e17163e602eef06925c8c3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cezary=20Skrzy=C5=84ski?= Date: Thu, 22 Dec 2022 22:09:13 +0100 Subject: [PATCH 196/496] Disable unsupported benchmarks in OpenMPTarget --- core/perf_test/CMakeLists.txt | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git 
a/core/perf_test/CMakeLists.txt b/core/perf_test/CMakeLists.txt index d81bb5a5a8..7bd93fdb43 100644 --- a/core/perf_test/CMakeLists.txt +++ b/core/perf_test/CMakeLists.txt @@ -192,6 +192,15 @@ SET( PerfTest_ViewResize_Raw.cpp ) +IF(Kokkos_ENABLE_OPENMPTARGET) +# FIXME OPENMPTARGET requires TeamPolicy Reductions and Custom Reduction + LIST(REMOVE_ITEM BENCHMARK_SOURCES + PerfTestGramSchmidt.cpp + PerfTest_CustomReduction.cpp + PerfTest_ExecSpacePartitioning.cpp + ) +ENDIF() + KOKKOS_ADD_BENCHMARK( PerformanceTest_Benchmark SOURCES ${BENCHMARK_SOURCES} From aa20b2b31ddedf71d94da51b46e4494df9dcc7ce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cezary=20Skrzy=C5=84ski?= Date: Thu, 29 Dec 2022 22:16:02 +0100 Subject: [PATCH 197/496] Avoid multiple `main()` definitions --- core/perf_test/BenchmarkMain.cpp | 5 +++++ core/perf_test/CMakeLists.txt | 1 + core/perf_test/test_atomic_minmax_simple.cpp | 17 ----------------- 3 files changed, 6 insertions(+), 17 deletions(-) diff --git a/core/perf_test/BenchmarkMain.cpp b/core/perf_test/BenchmarkMain.cpp index ac252b23a7..2a9776c3a1 100644 --- a/core/perf_test/BenchmarkMain.cpp +++ b/core/perf_test/BenchmarkMain.cpp @@ -27,12 +27,17 @@ #include "Benchmark_Context.hpp" #include +#include "PerfTest_Category.hpp" + int main(int argc, char** argv) { Kokkos::initialize(argc, argv); benchmark::Initialize(&argc, argv); benchmark::SetDefaultTimeUnit(benchmark::kSecond); KokkosBenchmark::add_benchmark_context(true); + (void)Test::command_line_num_args(argc); + (void)Test::command_line_arg(0, argv); + benchmark::RunSpecifiedBenchmarks(); benchmark::Shutdown(); diff --git a/core/perf_test/CMakeLists.txt b/core/perf_test/CMakeLists.txt index 7bd93fdb43..0606d67a00 100644 --- a/core/perf_test/CMakeLists.txt +++ b/core/perf_test/CMakeLists.txt @@ -210,6 +210,7 @@ IF(NOT KOKKOS_ENABLE_CUDA OR KOKKOS_ENABLE_CUDA_LAMBDA) KOKKOS_ADD_BENCHMARK( Benchmark_Atomic_MinMax SOURCES + BenchmarkMain.cpp Benchmark_Context.cpp test_atomic_minmax_simple.cpp ) diff 
--git a/core/perf_test/test_atomic_minmax_simple.cpp b/core/perf_test/test_atomic_minmax_simple.cpp index f6b9ce13ff..a8a8f3f975 100644 --- a/core/perf_test/test_atomic_minmax_simple.cpp +++ b/core/perf_test/test_atomic_minmax_simple.cpp @@ -554,20 +554,3 @@ BENCHMARK(Atomic_ContentiousMinReplacements) ->Arg(LENGTH / 5) ->UseManualTime() ->Iterations(10); -/////////////////////////////////////////////////////////////////////// - -int main(int argc, char* argv[]) { - Kokkos::initialize(argc, argv); - benchmark::Initialize(&argc, argv); - benchmark::SetDefaultTimeUnit(benchmark::kSecond); - KokkosBenchmark::add_benchmark_context(true); - - (void)Test::command_line_num_args(argc); - (void)Test::command_line_arg(0, argv); - - benchmark::RunSpecifiedBenchmarks(); - - benchmark::Shutdown(); - Kokkos::finalize(); - return 0; -} From 1b9a67fd3ea17d3f228f64e470054a71a12b72b6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cezary=20Skrzy=C5=84ski?= Date: Thu, 29 Dec 2022 22:41:57 +0100 Subject: [PATCH 198/496] Port Mempool performance test --- core/perf_test/CMakeLists.txt | 29 ++---- core/perf_test/Makefile | 6 -- core/perf_test/test_mempool.cpp | 171 +++++++++++++------------------- 3 files changed, 78 insertions(+), 128 deletions(-) diff --git a/core/perf_test/CMakeLists.txt b/core/perf_test/CMakeLists.txt index 0606d67a00..873158e048 100644 --- a/core/perf_test/CMakeLists.txt +++ b/core/perf_test/CMakeLists.txt @@ -29,30 +29,12 @@ IF(KOKKOS_ENABLE_TESTS) KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) KOKKOS_INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR}) - # This test currently times out for MSVC - IF(NOT KOKKOS_CXX_COMPILER_ID STREQUAL "MSVC") - KOKKOS_ADD_EXECUTABLE_AND_TEST( - PerfTestExec - SOURCES ${SOURCES} - CATEGORIES PERFORMANCE - ) - ENDIF() - KOKKOS_ADD_EXECUTABLE_AND_TEST( PerformanceTest_Atomic SOURCES test_atomic.cpp CATEGORIES PERFORMANCE ) - # FIXME_NVHPC - IF(NOT KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) - 
KOKKOS_ADD_EXECUTABLE_AND_TEST( - PerformanceTest_Mempool - SOURCES test_mempool.cpp - CATEGORIES PERFORMANCE - ) - ENDIF() - IF(NOT Kokkos_ENABLE_OPENMPTARGET) # FIXME OPENMPTARGET needs tasking KOKKOS_ADD_EXECUTABLE_AND_TEST( @@ -215,3 +197,14 @@ IF(NOT KOKKOS_ENABLE_CUDA OR KOKKOS_ENABLE_CUDA_LAMBDA) test_atomic_minmax_simple.cpp ) ENDIF() + +# FIXME_NVHPC +IF(NOT KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) + KOKKOS_ADD_BENCHMARK( + PerformanceTest_Mempool + SOURCES + BenchmarkMain.cpp + Benchmark_Context.cpp + test_mempool.cpp + ) +ENDIF() diff --git a/core/perf_test/Makefile b/core/perf_test/Makefile index 8d574a40b7..6230845b28 100644 --- a/core/perf_test/Makefile +++ b/core/perf_test/Makefile @@ -35,12 +35,6 @@ TEST_TARGETS += test-atomic # -OBJ_MEMPOOL = test_mempool.o -TARGETS += KokkosCore_PerformanceTest_Mempool -TEST_TARGETS += test-mempool - -# - OBJ_TASKDAG = test_taskdag.o TARGETS += KokkosCore_PerformanceTest_TaskDAG TEST_TARGETS += test-taskdag diff --git a/core/perf_test/test_mempool.cpp b/core/perf_test/test_mempool.cpp index e4e1b4c9a3..abb6180346 100644 --- a/core/perf_test/test_mempool.cpp +++ b/core/perf_test/test_mempool.cpp @@ -19,9 +19,13 @@ #include #include +#include #include #include +#include "Benchmark_Context.hpp" +#include "PerfTest_Category.hpp" + using ExecSpace = Kokkos::DefaultExecutionSpace; using MemorySpace = Kokkos::DefaultExecutionSpace::memory_space; @@ -146,53 +150,8 @@ struct TestFunctor { } }; -int main(int argc, char* argv[]) { - static const char help_flag[] = "--help"; - static const char alloc_size_flag[] = "--alloc_size="; - static const char super_size_flag[] = "--super_size="; - static const char chunk_span_flag[] = "--chunk_span="; - static const char fill_stride_flag[] = "--fill_stride="; - static const char fill_level_flag[] = "--fill_level="; - static const char repeat_outer_flag[] = "--repeat_outer="; - static const char repeat_inner_flag[] = "--repeat_inner="; - - long total_alloc_size = 1000000; - int 
min_superblock_size = 10000; - int chunk_span = 5; - int fill_stride = 1; - int fill_level = 70; - int repeat_outer = 1; - int repeat_inner = 1; - - int ask_help = 0; - - for (int i = 1; i < argc; i++) { - const char* const a = argv[i]; - - if (!strncmp(a, help_flag, strlen(help_flag))) ask_help = 1; - - if (!strncmp(a, alloc_size_flag, strlen(alloc_size_flag))) - total_alloc_size = atol(a + strlen(alloc_size_flag)); - - if (!strncmp(a, super_size_flag, strlen(super_size_flag))) - min_superblock_size = std::stoi(a + strlen(super_size_flag)); - - if (!strncmp(a, fill_stride_flag, strlen(fill_stride_flag))) - fill_stride = std::stoi(a + strlen(fill_stride_flag)); - - if (!strncmp(a, fill_level_flag, strlen(fill_level_flag))) - fill_level = std::stoi(a + strlen(fill_level_flag)); - - if (!strncmp(a, chunk_span_flag, strlen(chunk_span_flag))) - chunk_span = std::stoi(a + strlen(chunk_span_flag)); - - if (!strncmp(a, repeat_outer_flag, strlen(repeat_outer_flag))) - repeat_outer = std::stoi(a + strlen(repeat_outer_flag)); - - if (!strncmp(a, repeat_inner_flag, strlen(repeat_inner_flag))) - repeat_inner = std::stoi(a + strlen(repeat_inner_flag)); - } - +int get_number_alloc(int chunk_span, int min_superblock_size, + long total_alloc_size, int fill_level) { int chunk_span_bytes = 0; for (int i = 0; i < chunk_span; ++i) { auto chunk_bytes = TestFunctor::chunk * (1 + i); @@ -212,81 +171,85 @@ int main(int argc, char* argv[]) { auto bytes_wanted = (actual_total_bytes * fill_level) / 100; auto chunk_spans = bytes_wanted / chunk_span_bytes; auto number_alloc = int(chunk_spans * chunk_span); + return number_alloc; +} + +template +T get_parameter(const char flag[], T default_value) { + auto argc = Test::command_line_num_args(); + auto value = default_value; + + for (int i = 1; i < argc; i++) { + const char* const a = Test::command_line_arg(i); - if (ask_help) { - std::cout << "command line options:" - << " " << help_flag << " " << alloc_size_flag << "##" - << " " << 
super_size_flag << "##" - << " " << fill_stride_flag << "##" - << " " << fill_level_flag << "##" - << " " << chunk_span_flag << "##" - << " " << repeat_outer_flag << "##" - << " " << repeat_inner_flag << "##" << std::endl; - return 0; + if (!strncmp(a, flag, strlen(flag))) value = std::stoi(a + strlen(flag)); } - Kokkos::initialize(argc, argv); + return value; +} - double sum_fill_time = 0; - double sum_cycle_time = 0; - double sum_both_time = 0; - double min_fill_time = std::numeric_limits::max(); - double min_cycle_time = std::numeric_limits::max(); - double min_both_time = std::numeric_limits::max(); - // one alloc in fill, alloc/dealloc pair in repeat_inner - for (int i = 0; i < repeat_outer; ++i) { +static void Mempool_Fill(benchmark::State& state) { + long total_alloc_size = + get_parameter("--alloc_size=", static_cast(state.range(0))); + int min_superblock_size = get_parameter("--super_size=", state.range(1)); + int chunk_span = get_parameter("--chunk_span=", state.range(2)); + int fill_stride = get_parameter("--fill_stride=", state.range(3)); + int fill_level = get_parameter("--fill_level=", state.range(4)); + int repeat_inner = get_parameter("--repeat_inner=", state.range(5)); + int number_alloc = get_number_alloc(chunk_span, min_superblock_size, + total_alloc_size, fill_level); + + for (auto _ : state) { TestFunctor functor(total_alloc_size, min_superblock_size, number_alloc, fill_stride, chunk_span, repeat_inner); - Kokkos::Timer timer; if (!functor.test_fill()) { Kokkos::abort("fill "); } - auto t0 = timer.seconds(); + state.SetIterationTime(timer.seconds()); + state.counters[KokkosBenchmark::benchmark_fom("fill ops per second")] = + benchmark::Counter(number_alloc, + benchmark::Counter::kIsIterationInvariantRate); + } +} + +static void Mempool_Alloc_Dealloc(benchmark::State& state) { + long total_alloc_size = + get_parameter("--alloc_size=", static_cast(state.range(0))); + int min_superblock_size = get_parameter("--super_size=", state.range(1)); + int 
chunk_span = get_parameter("--chunk_span=", state.range(2)); + int fill_stride = get_parameter("--fill_stride=", state.range(3)); + int fill_level = get_parameter("--fill_level=", state.range(4)); + int repeat_inner = get_parameter("--repeat_inner=", state.range(5)); + int number_alloc = get_number_alloc(chunk_span, min_superblock_size, + total_alloc_size, fill_level); + + for (auto _ : state) { + TestFunctor functor(total_alloc_size, min_superblock_size, number_alloc, + fill_stride, chunk_span, repeat_inner); + Kokkos::Timer timer; if (!functor.test_alloc_dealloc()) { Kokkos::abort("alloc/dealloc "); } - auto t1 = timer.seconds(); - auto this_fill_time = t0; - auto this_cycle_time = t1 - t0; - auto this_both_time = t1; - sum_fill_time += this_fill_time; - sum_cycle_time += this_cycle_time; - sum_both_time += this_both_time; - min_fill_time = std::min(min_fill_time, this_fill_time); - min_cycle_time = std::min(min_cycle_time, this_cycle_time); - min_both_time = std::min(min_both_time, this_both_time); + state.SetIterationTime(timer.seconds()); + state.counters[KokkosBenchmark::benchmark_fom("cycle ops per second")] = + benchmark::Counter(2 * number_alloc * repeat_inner, + benchmark::Counter::kIsIterationInvariantRate); } +} - Kokkos::finalize(); - - printf( - "\"mempool: alloc super stride level span inner outer number\" %ld %d %d " - "%d %d %d %d %d\n", - total_alloc_size, min_superblock_size, fill_stride, fill_level, - chunk_span, repeat_inner, repeat_outer, number_alloc); - - auto avg_fill_time = sum_fill_time / repeat_outer; - auto avg_cycle_time = sum_cycle_time / repeat_outer; - auto avg_both_time = sum_both_time / repeat_outer; - - printf("\"mempool: fill time (min, avg)\" %.8f %.8f\n", min_fill_time, - avg_fill_time); - - printf("\"mempool: cycle time (min, avg)\" %.8f %.8f\n", min_cycle_time, - avg_cycle_time); - - printf("\"mempool: test time (min, avg)\" %.8f %.8f\n", min_both_time, - avg_both_time); +const std::vector ARG_NAMES = { + "total_alloc_size", 
"min_superblock_size", "chunk_span", + "fill_stride", "fill_level", "repeat_inner"}; +const std::vector ARGS = {1'000'000, 10'000, 5, 1, 70, 1}; - printf("\"mempool: fill ops per second (max, avg)\" %g %g\n", - number_alloc / min_fill_time, number_alloc / avg_fill_time); +BENCHMARK(Mempool_Fill)->ArgNames(ARG_NAMES)->Args(ARGS)->UseManualTime(); - printf("\"mempool: cycle ops per second (max, avg)\" %g %g\n", - (2 * number_alloc * repeat_inner) / min_cycle_time, - (2 * number_alloc * repeat_inner) / avg_cycle_time); -} +BENCHMARK(Mempool_Alloc_Dealloc) + ->ArgNames(ARG_NAMES) + ->Args(ARGS) + ->UseManualTime(); From 6ab27910522f33370b710ef63b9f93e98141e996 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cezary=20Skrzy=C5=84ski?= Date: Fri, 23 Dec 2022 15:39:41 +0100 Subject: [PATCH 199/496] Clean up perf_test CMakeLists - indent google-benchmark's CMake output - prefer `target_include_directories` - mark benchmark headers as system headers --- core/perf_test/CMakeLists.txt | 41 +++++++++++------------------------ 1 file changed, 13 insertions(+), 28 deletions(-) diff --git a/core/perf_test/CMakeLists.txt b/core/perf_test/CMakeLists.txt index 873158e048..b04868bea2 100644 --- a/core/perf_test/CMakeLists.txt +++ b/core/perf_test/CMakeLists.txt @@ -7,7 +7,6 @@ ENDIF() # all PerformanceTest_* executables are part of regular tests # TODO: finish converting these into benchmarks (in progress) IF(KOKKOS_ENABLE_TESTS) - IF(KOKKOS_ENABLE_CUDA OR KOKKOS_ENABLE_HIP OR KOKKOS_ENABLE_SYCL) KOKKOS_ADD_EXECUTABLE ( PerformanceTest_SharedSpace @@ -15,18 +14,6 @@ IF(KOKKOS_ENABLE_TESTS) ) ENDIF() - # Per #374, we always want to build this test, but we only want to run - # it as a PERFORMANCE test. That's why we separate building the test - # from running the test. 
- - #leave these as basic includes for now - #I don't need anything transitive - # warning: PerfTest_CustomReduction.cpp uses - # ../../algorithms/src/Kokkos_Random.hpp - # we'll just allow it to be included, but note - # that in TriBITS KokkosAlgorithms can be disabled... - KOKKOS_INCLUDE_DIRECTORIES("${CMAKE_CURRENT_SOURCE_DIR}/../../algorithms/src") - KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) KOKKOS_INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR}) KOKKOS_ADD_EXECUTABLE_AND_TEST( @@ -43,10 +30,8 @@ IF(KOKKOS_ENABLE_TESTS) CATEGORIES PERFORMANCE ) ENDIF() - ENDIF() -# Find or download google/benchmark library IF(NOT Kokkos_ENABLE_BENCHMARKS) RETURN() ENDIF() @@ -55,6 +40,7 @@ IF (KOKKOS_HAS_TRILINOS) message(FATAL_ERROR "Benchmarks are not supported when building as part of Trilinos") ENDIF() +# Find or download google/benchmark library find_package(benchmark QUIET) IF(benchmark_FOUND) MESSAGE(STATUS "Using google benchmark found in ${benchmark_DIR}") @@ -63,7 +49,7 @@ ELSE() include(FetchContent) SET(BENCHMARK_ENABLE_TESTING OFF) - list(APPEND CMAKE_MESSAGE_INDENT " ") + list(APPEND CMAKE_MESSAGE_INDENT "[benchmark] ") FetchContent_Declare( googlebenchmark URL https://github.com/google/benchmark/archive/refs/tags/v1.6.2.tar.gz @@ -72,8 +58,6 @@ ELSE() FetchContent_MakeAvailable(googlebenchmark) list(POP_BACK CMAKE_MESSAGE_INDENT) - include_directories(${benchmark_SOURCE_DIR}/include) - # Suppress clang-tidy diagnostics on code that we do not have control over IF(CMAKE_CXX_CLANG_TIDY) SET_TARGET_PROPERTIES(benchmark PROPERTIES CXX_CLANG_TIDY "") @@ -101,6 +85,10 @@ FUNCTION(KOKKOS_ADD_BENCHMARK NAME) ENDIF() SET(BENCHMARK_NAME ${PACKAGE_NAME}_${NAME}) + LIST(APPEND BENCHMARK_SOURCES + BenchmarkMain.cpp + Benchmark_Context.cpp + ) ADD_EXECUTABLE( ${BENCHMARK_NAME} @@ -110,6 +98,11 @@ FUNCTION(KOKKOS_ADD_BENCHMARK NAME) ${BENCHMARK_NAME} PRIVATE benchmark::benchmark Kokkos::kokkos impl_git_version ) + 
TARGET_INCLUDE_DIRECTORIES( + ${BENCHMARK_NAME} + SYSTEM PRIVATE ${benchmark_SOURCE_DIR}/include + ) + FOREACH(SOURCE_FILE ${BENCHMARK_SOURCES}) SET_SOURCE_FILES_PROPERTIES( ${SOURCE_FILE} @@ -132,8 +125,6 @@ ENDFUNCTION() SET( BENCHMARK_SOURCES - BenchmarkMain.cpp - Benchmark_Context.cpp PerfTestGramSchmidt.cpp PerfTest_CustomReduction.cpp PerfTest_ExecSpacePartitioning.cpp @@ -191,10 +182,7 @@ KOKKOS_ADD_BENCHMARK( IF(NOT KOKKOS_ENABLE_CUDA OR KOKKOS_ENABLE_CUDA_LAMBDA) KOKKOS_ADD_BENCHMARK( Benchmark_Atomic_MinMax - SOURCES - BenchmarkMain.cpp - Benchmark_Context.cpp - test_atomic_minmax_simple.cpp + SOURCES test_atomic_minmax_simple.cpp ) ENDIF() @@ -202,9 +190,6 @@ ENDIF() IF(NOT KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) KOKKOS_ADD_BENCHMARK( PerformanceTest_Mempool - SOURCES - BenchmarkMain.cpp - Benchmark_Context.cpp - test_mempool.cpp + SOURCES test_mempool.cpp ) ENDIF() From eb18f1d36a1b13b04515b762ed8511fd290b3728 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cezary=20Skrzy=C5=84ski?= Date: Wed, 8 Feb 2023 14:42:20 +0100 Subject: [PATCH 200/496] Port Atomic tests --- core/perf_test/CMakeLists.txt | 11 ++- core/perf_test/Makefile | 6 -- core/perf_test/test_atomic.cpp | 140 ++++++++------------------------- 3 files changed, 39 insertions(+), 118 deletions(-) diff --git a/core/perf_test/CMakeLists.txt b/core/perf_test/CMakeLists.txt index b04868bea2..66319f43f5 100644 --- a/core/perf_test/CMakeLists.txt +++ b/core/perf_test/CMakeLists.txt @@ -16,12 +16,6 @@ IF(KOKKOS_ENABLE_TESTS) KOKKOS_INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR}) - KOKKOS_ADD_EXECUTABLE_AND_TEST( - PerformanceTest_Atomic - SOURCES test_atomic.cpp - CATEGORIES PERFORMANCE - ) - IF(NOT Kokkos_ENABLE_OPENMPTARGET) # FIXME OPENMPTARGET needs tasking KOKKOS_ADD_EXECUTABLE_AND_TEST( @@ -193,3 +187,8 @@ IF(NOT KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) SOURCES test_mempool.cpp ) ENDIF() + +KOKKOS_ADD_BENCHMARK( + PerformanceTest_Atomic + SOURCES test_atomic.cpp +) 
diff --git a/core/perf_test/Makefile b/core/perf_test/Makefile index 6230845b28..5e1e0f6541 100644 --- a/core/perf_test/Makefile +++ b/core/perf_test/Makefile @@ -29,12 +29,6 @@ TARGETS = # -OBJ_ATOMICS = test_atomic.o -TARGETS += KokkosCore_PerformanceTest_Atomics -TEST_TARGETS += test-atomic - -# - OBJ_TASKDAG = test_taskdag.o TARGETS += KokkosCore_PerformanceTest_TaskDAG TEST_TARGETS += test-taskdag diff --git a/core/perf_test/test_atomic.cpp b/core/perf_test/test_atomic.cpp index 5f10afc45a..ce3059f47d 100644 --- a/core/perf_test/test_atomic.cpp +++ b/core/perf_test/test_atomic.cpp @@ -18,38 +18,14 @@ #include #include +#include +#include "Benchmark_Context.hpp" + #include #include using exec_space = Kokkos::DefaultExecutionSpace; -#define RESET 0 -#define BRIGHT 1 -#define DIM 2 -#define UNDERLINE 3 -#define BLINK 4 -#define REVERSE 7 -#define HIDDEN 8 - -#define BLACK 0 -#define RED 1 -#define GREEN 2 -#define YELLOW 3 -#define BLUE 4 -#define MAGENTA 5 -#define CYAN 6 -#define GREY 7 -#define WHITE 8 - -void textcolor(int attr, int fg, int bg) { - char command[40]; - - /* Command is the control command to the terminal */ - snprintf(command, 40, "%c[%d;%d;%dm", 0x1B, attr, fg + 30, bg + 40); - printf("%s", command); -} -void textcolor_standard() { textcolor(RESET, BLACK, WHITE); } - template struct ZeroFunctor { using execution_space = DEVICE_TYPE; @@ -370,7 +346,9 @@ T LoopVariantNonAtomic(int loop, int test) { } template -void Loop(int loop, int test, const char* type_name) { +void Loop(benchmark::State& state, int test) { + int loop = state.range(0); + LoopVariant(loop, test); Kokkos::Timer timer; @@ -388,86 +366,36 @@ void Loop(int loop, int test, const char* type_name) { time *= 1e6 / loop; timeNonAtomic *= 1e6 / loop; timeSerial *= 1e6 / loop; - // textcolor_standard(); - bool passed = true; - if (resSerial != res) passed = false; - // if(!passed) textcolor(RESET,BLACK,YELLOW); - printf( - "%s Test %i %s --- Loop: %i Value (S,A,NA): %e %e %e Time: %7.4e 
%7.4e " - "%7.4e Size of Type %i)", - type_name, test, passed ? "PASSED" : "FAILED", loop, 1.0 * resSerial, - 1.0 * res, 1.0 * resNonAtomic, timeSerial, time, timeNonAtomic, - (int)sizeof(T)); - // if(!passed) textcolor_standard(); - printf("\n"); -} -template -void Test(int loop, int test, const char* type_name) { - if (test == -1) { - Loop(loop, 1, type_name); - Loop(loop, 2, type_name); - Loop(loop, 3, type_name); - - } else - Loop(loop, test, type_name); -} + bool passed = (resSerial == res); -int main(int argc, char* argv[]) { - int type = -1; - int loop = 100000; - int test = -1; - - for (int i = 0; i < argc; i++) { - if ((strcmp(argv[i], "--test") == 0)) { - test = std::stoi(argv[++i]); - continue; - } - if ((strcmp(argv[i], "--type") == 0)) { - type = std::stoi(argv[++i]); - continue; - } - if ((strcmp(argv[i], "-l") == 0) || (strcmp(argv[i], "--loop") == 0)) { - loop = std::stoi(argv[++i]); - continue; - } - } + state.counters["Passed"] = benchmark::Counter(passed); + state.counters["Value serial"] = benchmark::Counter(resSerial); + state.counters["Value atomic"] = benchmark::Counter(res); + state.counters["Value non-atomic"] = benchmark::Counter(resNonAtomic); + state.counters["Time serial"] = benchmark::Counter(timeSerial); + state.counters["Time atomic"] = benchmark::Counter(time); + state.counters["Time non-atomic"] = benchmark::Counter(timeNonAtomic); + state.counters["Size of type"] = benchmark::Counter(sizeof(T)); +} - Kokkos::initialize(argc, argv); - - printf("Using %s\n", Kokkos::atomic_query_version()); - bool all_tests = false; - if (type == -1) all_tests = true; - while (type < 100) { - if (type == 1) { - Test(loop, test, "int "); - } - if (type == 2) { - Test(loop, test, "long int "); - } - if (type == 3) { - Test(loop, test, "long long int "); - } - if (type == 4) { - Test(loop, test, "unsigned int "); - } - if (type == 5) { - Test(loop, test, "unsigned long int "); - } - if (type == 6) { - Test(loop, test, "unsigned long long int "); - } - 
if (type == 10) { - // Test(loop,test,"float "); - } - if (type == 11) { - Test(loop, test, "double "); - } - if (!all_tests) - type = 100; - else - type++; +template +static void Test_Atomic(benchmark::State& state) { + for (auto _ : state) { + Loop(state, 1); + Loop(state, 2); + Loop(state, 3); } - - Kokkos::finalize(); } + +static constexpr int LOOP = 100'000; + +BENCHMARK(Test_Atomic)->Arg(LOOP)->Iterations(10); +BENCHMARK(Test_Atomic)->Arg(LOOP)->Iterations(10); +BENCHMARK(Test_Atomic)->Arg(LOOP)->Iterations(10); +BENCHMARK(Test_Atomic)->Arg(LOOP)->Iterations(10); +BENCHMARK(Test_Atomic)->Arg(LOOP)->Iterations(10); +BENCHMARK(Test_Atomic)->Arg(LOOP)->Iterations(10); +BENCHMARK(Test_Atomic)->Arg(LOOP)->Iterations(10); +BENCHMARK(Test_Atomic)->Arg(LOOP)->Iterations(10); +BENCHMARK(Test_Atomic)->Arg(LOOP)->Iterations(10); From b4bd01d48c5db57bc8708bcd174ce3c571c1111b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cezary=20Skrzy=C5=84ski?= Date: Fri, 13 Jan 2023 19:46:39 +0100 Subject: [PATCH 201/496] Use double quotes instead of include --- core/perf_test/Benchmark_Context.cpp | 2 +- core/perf_test/PerfTestGramSchmidt.cpp | 4 ++-- core/perf_test/PerfTestHexGrad.cpp | 4 ++-- core/perf_test/PerfTest_CustomReduction.cpp | 2 +- core/perf_test/PerfTest_ExecSpacePartitioning.cpp | 2 +- core/perf_test/PerfTest_ViewAllocate.cpp | 2 +- core/perf_test/PerfTest_ViewFill.hpp | 2 +- core/perf_test/PerfTest_ViewFill_123.cpp | 2 +- core/perf_test/PerfTest_ViewFill_45.cpp | 2 +- core/perf_test/PerfTest_ViewFill_6.cpp | 2 +- core/perf_test/PerfTest_ViewFill_7.cpp | 2 +- core/perf_test/PerfTest_ViewFill_8.cpp | 2 +- core/perf_test/PerfTest_ViewFill_Raw.cpp | 2 +- core/perf_test/PerfTest_ViewResize.hpp | 2 +- core/perf_test/PerfTest_ViewResize_123.cpp | 2 +- core/perf_test/PerfTest_ViewResize_45.cpp | 2 +- core/perf_test/PerfTest_ViewResize_6.cpp | 2 +- core/perf_test/PerfTest_ViewResize_7.cpp | 2 +- core/perf_test/PerfTest_ViewResize_8.cpp | 2 +- 
core/perf_test/PerfTest_ViewResize_Raw.cpp | 2 +- core/perf_test/test_atomic_minmax_simple.cpp | 4 ++-- 21 files changed, 24 insertions(+), 24 deletions(-) diff --git a/core/perf_test/Benchmark_Context.cpp b/core/perf_test/Benchmark_Context.cpp index d859f0aff8..a9652d1525 100644 --- a/core/perf_test/Benchmark_Context.cpp +++ b/core/perf_test/Benchmark_Context.cpp @@ -14,7 +14,7 @@ // //@HEADER -#include +#include "Benchmark_Context.hpp" namespace KokkosBenchmark { diff --git a/core/perf_test/PerfTestGramSchmidt.cpp b/core/perf_test/PerfTestGramSchmidt.cpp index b34947968d..c6c47909d0 100644 --- a/core/perf_test/PerfTestGramSchmidt.cpp +++ b/core/perf_test/PerfTestGramSchmidt.cpp @@ -16,10 +16,10 @@ #include #include -#include +#include "PerfTest_Category.hpp" #include -#include +#include "PerfTestBlasKernels.hpp" //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- diff --git a/core/perf_test/PerfTestHexGrad.cpp b/core/perf_test/PerfTestHexGrad.cpp index e1d89dd9dd..98cb246c71 100644 --- a/core/perf_test/PerfTestHexGrad.cpp +++ b/core/perf_test/PerfTestHexGrad.cpp @@ -16,8 +16,8 @@ #include #include -#include -#include +#include "Benchmark_Context.hpp" +#include "PerfTest_Category.hpp" namespace Test { diff --git a/core/perf_test/PerfTest_CustomReduction.cpp b/core/perf_test/PerfTest_CustomReduction.cpp index 2fdab006e9..2110f38a91 100644 --- a/core/perf_test/PerfTest_CustomReduction.cpp +++ b/core/perf_test/PerfTest_CustomReduction.cpp @@ -17,7 +17,7 @@ #include #include #include "Benchmark_Context.hpp" -#include +#include "PerfTest_Category.hpp" #include #include diff --git a/core/perf_test/PerfTest_ExecSpacePartitioning.cpp b/core/perf_test/PerfTest_ExecSpacePartitioning.cpp index 8fd8310653..54d2ee0038 100644 --- a/core/perf_test/PerfTest_ExecSpacePartitioning.cpp +++ b/core/perf_test/PerfTest_ExecSpacePartitioning.cpp @@ -16,7 +16,7 @@ #include #include 
-#include +#include "PerfTest_Category.hpp" namespace Test { diff --git a/core/perf_test/PerfTest_ViewAllocate.cpp b/core/perf_test/PerfTest_ViewAllocate.cpp index bffcca17aa..63f1d6b2c7 100644 --- a/core/perf_test/PerfTest_ViewAllocate.cpp +++ b/core/perf_test/PerfTest_ViewAllocate.cpp @@ -16,7 +16,7 @@ #include #include -#include +#include "Benchmark_Context.hpp" namespace Test { diff --git a/core/perf_test/PerfTest_ViewFill.hpp b/core/perf_test/PerfTest_ViewFill.hpp index 9ac0be467e..cc5eed85f7 100644 --- a/core/perf_test/PerfTest_ViewFill.hpp +++ b/core/perf_test/PerfTest_ViewFill.hpp @@ -14,7 +14,7 @@ // //@HEADER -#include +#include "Benchmark_Context.hpp" #include diff --git a/core/perf_test/PerfTest_ViewFill_123.cpp b/core/perf_test/PerfTest_ViewFill_123.cpp index d04f1fbea2..b95b5279a1 100644 --- a/core/perf_test/PerfTest_ViewFill_123.cpp +++ b/core/perf_test/PerfTest_ViewFill_123.cpp @@ -14,7 +14,7 @@ // //@HEADER -#include +#include "PerfTest_ViewFill.hpp" namespace Test { diff --git a/core/perf_test/PerfTest_ViewFill_45.cpp b/core/perf_test/PerfTest_ViewFill_45.cpp index 90e597cc11..6a5acfb0d6 100644 --- a/core/perf_test/PerfTest_ViewFill_45.cpp +++ b/core/perf_test/PerfTest_ViewFill_45.cpp @@ -14,7 +14,7 @@ // //@HEADER -#include +#include "PerfTest_ViewFill.hpp" namespace Test { diff --git a/core/perf_test/PerfTest_ViewFill_6.cpp b/core/perf_test/PerfTest_ViewFill_6.cpp index b99e5a4eb3..dca20c70df 100644 --- a/core/perf_test/PerfTest_ViewFill_6.cpp +++ b/core/perf_test/PerfTest_ViewFill_6.cpp @@ -14,7 +14,7 @@ // //@HEADER -#include +#include "PerfTest_ViewFill.hpp" namespace Test { diff --git a/core/perf_test/PerfTest_ViewFill_7.cpp b/core/perf_test/PerfTest_ViewFill_7.cpp index 99dacfdbb2..6fa8a418c6 100644 --- a/core/perf_test/PerfTest_ViewFill_7.cpp +++ b/core/perf_test/PerfTest_ViewFill_7.cpp @@ -14,7 +14,7 @@ // //@HEADER -#include +#include "PerfTest_ViewFill.hpp" namespace Test { diff --git a/core/perf_test/PerfTest_ViewFill_8.cpp 
b/core/perf_test/PerfTest_ViewFill_8.cpp index 0093e50100..954b097d83 100644 --- a/core/perf_test/PerfTest_ViewFill_8.cpp +++ b/core/perf_test/PerfTest_ViewFill_8.cpp @@ -14,7 +14,7 @@ // //@HEADER -#include +#include "PerfTest_ViewFill.hpp" namespace Test { diff --git a/core/perf_test/PerfTest_ViewFill_Raw.cpp b/core/perf_test/PerfTest_ViewFill_Raw.cpp index f4f4f07a1e..c11074d915 100644 --- a/core/perf_test/PerfTest_ViewFill_Raw.cpp +++ b/core/perf_test/PerfTest_ViewFill_Raw.cpp @@ -14,7 +14,7 @@ // //@HEADER -#include +#include "PerfTest_ViewFill.hpp" namespace Test { diff --git a/core/perf_test/PerfTest_ViewResize.hpp b/core/perf_test/PerfTest_ViewResize.hpp index 4f6a316803..de6981e17a 100644 --- a/core/perf_test/PerfTest_ViewResize.hpp +++ b/core/perf_test/PerfTest_ViewResize.hpp @@ -17,7 +17,7 @@ #include #include #include -#include +#include "Benchmark_Context.hpp" namespace Test { diff --git a/core/perf_test/PerfTest_ViewResize_123.cpp b/core/perf_test/PerfTest_ViewResize_123.cpp index 0f5fceb5aa..0b3141eead 100644 --- a/core/perf_test/PerfTest_ViewResize_123.cpp +++ b/core/perf_test/PerfTest_ViewResize_123.cpp @@ -14,7 +14,7 @@ // //@HEADER -#include +#include "PerfTest_ViewResize.hpp" namespace Test { diff --git a/core/perf_test/PerfTest_ViewResize_45.cpp b/core/perf_test/PerfTest_ViewResize_45.cpp index aba09a5d21..f5eec387cb 100644 --- a/core/perf_test/PerfTest_ViewResize_45.cpp +++ b/core/perf_test/PerfTest_ViewResize_45.cpp @@ -14,7 +14,7 @@ // //@HEADER -#include +#include "PerfTest_ViewResize.hpp" namespace Test { diff --git a/core/perf_test/PerfTest_ViewResize_6.cpp b/core/perf_test/PerfTest_ViewResize_6.cpp index 7d14315d84..6b639d3a67 100644 --- a/core/perf_test/PerfTest_ViewResize_6.cpp +++ b/core/perf_test/PerfTest_ViewResize_6.cpp @@ -14,7 +14,7 @@ // //@HEADER -#include +#include "PerfTest_ViewResize.hpp" namespace Test { diff --git a/core/perf_test/PerfTest_ViewResize_7.cpp b/core/perf_test/PerfTest_ViewResize_7.cpp index 
27dc9f1598..8ebf80e3ff 100644 --- a/core/perf_test/PerfTest_ViewResize_7.cpp +++ b/core/perf_test/PerfTest_ViewResize_7.cpp @@ -14,7 +14,7 @@ // //@HEADER -#include +#include "PerfTest_ViewResize.hpp" namespace Test { diff --git a/core/perf_test/PerfTest_ViewResize_8.cpp b/core/perf_test/PerfTest_ViewResize_8.cpp index 3e2d610cdd..5e741e800b 100644 --- a/core/perf_test/PerfTest_ViewResize_8.cpp +++ b/core/perf_test/PerfTest_ViewResize_8.cpp @@ -14,7 +14,7 @@ // //@HEADER -#include +#include "PerfTest_ViewResize.hpp" namespace Test { diff --git a/core/perf_test/PerfTest_ViewResize_Raw.cpp b/core/perf_test/PerfTest_ViewResize_Raw.cpp index 6cef390816..2d1bcbb3ca 100644 --- a/core/perf_test/PerfTest_ViewResize_Raw.cpp +++ b/core/perf_test/PerfTest_ViewResize_Raw.cpp @@ -14,7 +14,7 @@ // //@HEADER -#include +#include "PerfTest_ViewResize.hpp" namespace Test { diff --git a/core/perf_test/test_atomic_minmax_simple.cpp b/core/perf_test/test_atomic_minmax_simple.cpp index a8a8f3f975..b838c8eccf 100644 --- a/core/perf_test/test_atomic_minmax_simple.cpp +++ b/core/perf_test/test_atomic_minmax_simple.cpp @@ -23,8 +23,8 @@ #include -#include -#include +#include "Benchmark_Context.hpp" +#include "PerfTest_Category.hpp" #include using exec_space = Kokkos::DefaultExecutionSpace; From 204b0854784cce685bead9bc32e65cdd93f8a6e1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cezary=20Skrzy=C5=84ski?= Date: Tue, 7 Feb 2023 12:40:53 +0100 Subject: [PATCH 202/496] Remove obsolete warning pragmas --- core/perf_test/BenchmarkMain.cpp | 8 -------- core/perf_test/Benchmark_Context.hpp | 8 -------- core/perf_test/PerfTest_ViewCopy.hpp | 8 -------- 3 files changed, 24 deletions(-) diff --git a/core/perf_test/BenchmarkMain.cpp b/core/perf_test/BenchmarkMain.cpp index 2a9776c3a1..2232019c19 100644 --- a/core/perf_test/BenchmarkMain.cpp +++ b/core/perf_test/BenchmarkMain.cpp @@ -14,15 +14,7 @@ // //@HEADER -// Avoid deprecation warning for ICC -#ifdef __INTEL_COMPILER -#pragma warning(push) -#pragma 
warning(disable : 1786) #include -#pragma warning(pop) -#else -#include -#endif #include "Benchmark_Context.hpp" #include diff --git a/core/perf_test/Benchmark_Context.hpp b/core/perf_test/Benchmark_Context.hpp index a389827104..e823b3a8ad 100644 --- a/core/perf_test/Benchmark_Context.hpp +++ b/core/perf_test/Benchmark_Context.hpp @@ -19,15 +19,7 @@ #include -// Avoid deprecation warning for ICC -#ifdef __INTEL_COMPILER -#pragma warning(push) -#pragma warning(disable : 1786) #include -#pragma warning(pop) -#else -#include -#endif #include #include diff --git a/core/perf_test/PerfTest_ViewCopy.hpp b/core/perf_test/PerfTest_ViewCopy.hpp index 412f794c4c..b7b1e1ad48 100644 --- a/core/perf_test/PerfTest_ViewCopy.hpp +++ b/core/perf_test/PerfTest_ViewCopy.hpp @@ -19,15 +19,7 @@ #include -// Avoid deprecation warning for ICC -#ifdef __INTEL_COMPILER -#pragma warning(push) -#pragma warning(disable : 1786) #include -#pragma warning(pop) -#else -#include -#endif #include "Benchmark_Context.hpp" #include From d25b94b11e08e26014ef9309951aa0a9e6bb577c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cezary=20Skrzy=C5=84ski?= Date: Tue, 7 Feb 2023 13:21:43 +0100 Subject: [PATCH 203/496] Remove unused variable --- core/perf_test/PerfTestGramSchmidt.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/core/perf_test/PerfTestGramSchmidt.cpp b/core/perf_test/PerfTestGramSchmidt.cpp index c6c47909d0..949cc07e6b 100644 --- a/core/perf_test/PerfTestGramSchmidt.cpp +++ b/core/perf_test/PerfTestGramSchmidt.cpp @@ -161,8 +161,6 @@ struct ModifiedGramSchmidt { template static void GramSchmidt(benchmark::State& state) { - std::string label_gramschmidt; - const int parallel_work_length = state.range(0); for (auto _ : state) { From b9d405ade0b6eec9206e8334d6508ee67e0e5f58 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cezary=20Skrzy=C5=84ski?= Date: Wed, 8 Feb 2023 10:04:33 +0100 Subject: [PATCH 204/496] Fix unused function warning (SYCL) --- core/perf_test/PerfTest_ExecSpacePartitioning.cpp | 14 
++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/core/perf_test/PerfTest_ExecSpacePartitioning.cpp b/core/perf_test/PerfTest_ExecSpacePartitioning.cpp index 54d2ee0038..7115661d7d 100644 --- a/core/perf_test/PerfTest_ExecSpacePartitioning.cpp +++ b/core/perf_test/PerfTest_ExecSpacePartitioning.cpp @@ -54,7 +54,7 @@ bool is_overlapping(const Kokkos::HIP&) { } #endif -#ifdef KOKKOS_ENABLE_SYCL +#if defined(KOKKOS_ENABLE_SYCL) && defined(KOKKOS_ARCH_INTEL_GPU) template <> bool is_overlapping( const Kokkos::Experimental::SYCL&) { @@ -154,6 +154,8 @@ struct FunctorTeamReduce { } }; +// skip for SYCL+Cuda +#if !(defined(KOKKOS_ENABLE_SYCL) && !defined(KOKKOS_ARCH_INTEL_GPU)) static void OverlapRangePolicy(benchmark::State& state) { int N = state.range(0); int M = state.range(1); @@ -323,12 +325,9 @@ static void OverlapRangePolicy(benchmark::State& state) { } } -// skip for SYCL+Cuda -#if !defined(KOKKOS_ENABLE_SYCL) || defined(KOKKOS_ARCH_INTEL_GPU) BENCHMARK(OverlapRangePolicy) ->ArgNames({"N", "M", "R"}) ->Args({2'000, 10'000, 10}); -#endif static void OverlapMDRangePolicy(benchmark::State& state) { int N = state.range(0); @@ -518,12 +517,9 @@ static void OverlapMDRangePolicy(benchmark::State& state) { } } -// skip for SYCL+Cuda -#if !defined(KOKKOS_ENABLE_SYCL) || defined(KOKKOS_ARCH_INTEL_GPU) BENCHMARK(OverlapMDRangePolicy) ->ArgNames({"N", "M", "R"}) ->Args({200, 10'000, 10}); -#endif static void OverlapTeamPolicy(benchmark::State& state) { int N = state.range(0); @@ -698,11 +694,9 @@ static void OverlapTeamPolicy(benchmark::State& state) { } } -// skip for SYCL+Cuda -#if !defined(KOKKOS_ENABLE_SYCL) || defined(KOKKOS_ARCH_INTEL_GPU) BENCHMARK(OverlapTeamPolicy) ->ArgNames({"N", "M", "R"}) ->Args({20, 1'000'000, 10}); -#endif +#endif // skip for SYCL+Cuda } // namespace Test From b6c49a9deb62078e00daa0c914d917564fd5eb42 Mon Sep 17 00:00:00 2001 From: Mikael Simberg Date: Mon, 6 Feb 2023 19:53:03 +0100 Subject: [PATCH 205/496] Make 
HPX::concurrency() a non-static member function --- core/src/HPX/Kokkos_HPX.cpp | 4 +++ core/src/HPX/Kokkos_HPX.hpp | 36 +++++++++++---------- core/src/HPX/Kokkos_HPX_Task.hpp | 14 ++++---- core/src/HPX/Kokkos_HPX_WorkGraphPolicy.hpp | 2 +- 4 files changed, 31 insertions(+), 25 deletions(-) diff --git a/core/src/HPX/Kokkos_HPX.cpp b/core/src/HPX/Kokkos_HPX.cpp index 11e11a4573..c4204f7402 100644 --- a/core/src/HPX/Kokkos_HPX.cpp +++ b/core/src/HPX/Kokkos_HPX.cpp @@ -153,7 +153,11 @@ void HPX::impl_static_fence(const std::string &name) { }); } +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 int HPX::concurrency() { +#else +int HPX::concurrency() const { +#endif hpx::runtime *rt = hpx::get_runtime_ptr(); if (rt == nullptr) { return hpx::threads::hardware_concurrency(); diff --git a/core/src/HPX/Kokkos_HPX.hpp b/core/src/HPX/Kokkos_HPX.hpp index 1baa17b7ae..e1abaf8837 100644 --- a/core/src/HPX/Kokkos_HPX.hpp +++ b/core/src/HPX/Kokkos_HPX.hpp @@ -239,7 +239,11 @@ class HPX { } #endif +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 static int concurrency(); +#else + int concurrency() const; +#endif static void impl_initialize(InitializationSettings const &); static bool impl_is_initialized() noexcept; static void impl_finalize(); @@ -739,15 +743,13 @@ class TeamPolicyInternal Kokkos::abort("TeamPolicy blocking granularity must be power of two"); } else { int new_chunk_size = 1; - while (new_chunk_size * 4 * Kokkos::Experimental::HPX::concurrency() < - m_league_size) { + while (new_chunk_size * 4 * m_space.concurrency() < m_league_size) { new_chunk_size *= 2; } if (new_chunk_size < 128) { new_chunk_size = 1; - while ((new_chunk_size * Kokkos::Experimental::HPX::concurrency() < - m_league_size) && + while ((new_chunk_size * m_space.concurrency() < m_league_size) && (new_chunk_size < 128)) new_chunk_size *= 2; } @@ -1018,7 +1020,7 @@ class ParallelReduce, ReducerType, void setup() const { const std::size_t value_size = Analysis::value_size(ReducerConditional::select(m_functor, 
m_reducer)); - const int num_worker_threads = Kokkos::Experimental::HPX::concurrency(); + const int num_worker_threads = m_policy.space().concurrency(); hpx_thread_buffer &buffer = m_policy.space().impl_get_buffer(); buffer.resize(num_worker_threads, value_size); @@ -1051,7 +1053,7 @@ class ParallelReduce, ReducerType, hpx_thread_buffer &buffer = m_policy.space().impl_get_buffer(); typename Analysis::Reducer final_reducer( &ReducerConditional::select(m_functor, m_reducer)); - const int num_worker_threads = Kokkos::Experimental::HPX::concurrency(); + const int num_worker_threads = m_policy.space().concurrency(); for (int i = 1; i < num_worker_threads; ++i) { final_reducer.join(reinterpret_cast(buffer.get(0)), reinterpret_cast(buffer.get(i))); @@ -1146,7 +1148,7 @@ class ParallelReduce, ReducerType, void setup() const { const std::size_t value_size = Analysis::value_size( ReducerConditional::select(m_iter.m_func, m_reducer)); - const int num_worker_threads = Kokkos::Experimental::HPX::concurrency(); + const int num_worker_threads = m_policy.space().concurrency(); typename Analysis::Reducer final_reducer( &ReducerConditional::select(m_iter.m_func, m_reducer)); @@ -1174,7 +1176,7 @@ class ParallelReduce, ReducerType, hpx_thread_buffer &buffer = m_iter.m_rp.space().impl_get_buffer(); typename Analysis::Reducer final_reducer( &ReducerConditional::select(m_iter.m_func, m_reducer)); - const int num_worker_threads = Kokkos::Experimental::HPX::concurrency(); + const int num_worker_threads = m_policy.space().concurrency(); for (int i = 1; i < num_worker_threads; ++i) { final_reducer.join(reinterpret_cast(buffer.get(0)), reinterpret_cast(buffer.get(i))); @@ -1259,7 +1261,7 @@ class ParallelScan, public: void setup() const { - const int num_worker_threads = Kokkos::Experimental::HPX::concurrency(); + const int num_worker_threads = m_policy.space().concurrency(); const std::size_t value_size = Analysis::value_size(m_functor); hpx_thread_buffer &buffer = 
m_policy.space().impl_get_buffer(); @@ -1280,7 +1282,7 @@ class ParallelScan, } void execute_range(int t) const { - const int num_worker_threads = Kokkos::Experimental::HPX::concurrency(); + const int num_worker_threads = m_policy.space().concurrency(); const int value_count = Analysis::value_count(m_functor); const std::size_t value_size = Analysis::value_size(m_functor); @@ -1331,7 +1333,7 @@ class ParallelScan, } void execute() const { - const int num_worker_threads = Kokkos::Experimental::HPX::concurrency(); + const int num_worker_threads = m_policy.space().concurrency(); m_policy.space().impl_bulk_setup_finalize( false, is_light_weight_policy(), *this, num_worker_threads, hpx::threads::thread_stacksize::small_); @@ -1362,7 +1364,7 @@ class ParallelScanWithTotal, public: void setup() const { - const int num_worker_threads = Kokkos::Experimental::HPX::concurrency(); + const int num_worker_threads = m_policy.space().concurrency(); const std::size_t value_size = Analysis::value_size(m_functor); hpx_thread_buffer &buffer = m_policy.space().impl_get_buffer(); @@ -1383,7 +1385,7 @@ class ParallelScanWithTotal, } void execute_range(int t) const { - const int num_worker_threads = Kokkos::Experimental::HPX::concurrency(); + const int num_worker_threads = m_policy.space().concurrency(); const int value_count = Analysis::value_count(m_functor); const std::size_t value_size = Analysis::value_size(m_functor); @@ -1438,7 +1440,7 @@ class ParallelScanWithTotal, } void execute() const { - const int num_worker_threads = Kokkos::Experimental::HPX::concurrency(); + const int num_worker_threads = m_policy.space().concurrency(); m_policy.space().impl_bulk_setup_finalize( false, is_light_weight_policy(), *this, num_worker_threads, hpx::threads::thread_stacksize::small_); @@ -1478,7 +1480,7 @@ class ParallelFor, public: void setup() const { - const int num_worker_threads = Kokkos::Experimental::HPX::concurrency(); + const int num_worker_threads = m_policy.space().concurrency(); 
hpx_thread_buffer &buffer = m_policy.space().impl_get_buffer(); buffer.resize(num_worker_threads, m_shared); @@ -1547,7 +1549,7 @@ class ParallelReduce, void setup() const { const std::size_t value_size = Analysis::value_size(ReducerConditional::select(m_functor, m_reducer)); - const int num_worker_threads = Kokkos::Experimental::HPX::concurrency(); + const int num_worker_threads = m_policy.space().concurrency(); hpx_thread_buffer &buffer = m_policy.space().impl_get_buffer(); buffer.resize(num_worker_threads, value_size + m_shared); @@ -1585,7 +1587,7 @@ class ParallelReduce, hpx_thread_buffer &buffer = m_policy.space().impl_get_buffer(); typename Analysis::Reducer final_reducer( &ReducerConditional::select(m_functor, m_reducer)); - const int num_worker_threads = Kokkos::Experimental::HPX::concurrency(); + const int num_worker_threads = m_policy.space().concurrency(); const pointer_type ptr = reinterpret_cast(buffer.get(0)); for (int t = 1; t < num_worker_threads; ++t) { final_reducer.join(ptr, reinterpret_cast(buffer.get(t))); diff --git a/core/src/HPX/Kokkos_HPX_Task.hpp b/core/src/HPX/Kokkos_HPX_Task.hpp index f7a4f641e4..191e4cf9b3 100644 --- a/core/src/HPX/Kokkos_HPX_Task.hpp +++ b/core/src/HPX/Kokkos_HPX_Task.hpp @@ -40,7 +40,7 @@ class TaskQueueSpecialization< SimpleTaskScheduler> { public: void setup() const { - const int num_worker_threads = Kokkos::Experimental::HPX::concurrency(); + const int num_worker_threads = Kokkos::Experimental::HPX().concurrency(); hpx_thread_buffer &buffer = Kokkos::Experimental::HPX().impl_get_buffer(); buffer.resize(num_worker_threads, 512); @@ -50,7 +50,7 @@ class TaskQueueSpecialization< // NOTE: This implementation has been simplified based on the // assumption that team_size = 1. The HPX backend currently only // supports a team size of 1. 
- const int num_worker_threads = Kokkos::Experimental::HPX::concurrency(); + const int num_worker_threads = Kokkos::Experimental::HPX().concurrency(); hpx_thread_buffer &buffer = Kokkos::Experimental::HPX().impl_get_buffer(); @@ -96,7 +96,7 @@ class TaskQueueSpecialization< // This is not necessarily the most efficient, but can be improved later. TaskQueueSpecialization task_queue; task_queue.scheduler = &scheduler; - const int num_worker_threads = Kokkos::Experimental::HPX::concurrency(); + const int num_worker_threads = Kokkos::Experimental::HPX().concurrency(); Kokkos::Experimental::HPX().impl_bulk_setup_finalize( true, false, task_queue, num_worker_threads, hpx::threads::thread_stacksize::nostack); @@ -124,7 +124,7 @@ class TaskQueueSpecializationConstrained< Kokkos::Experimental::HPX>::value>> { public: void setup() const { - const int num_worker_threads = Kokkos::Experimental::HPX::concurrency(); + const int num_worker_threads = Kokkos::Experimental::HPX().concurrency(); hpx_thread_buffer &buffer = Kokkos::Experimental::HPX().impl_get_buffer(); buffer.resize(num_worker_threads, 512); @@ -137,7 +137,7 @@ class TaskQueueSpecializationConstrained< // NOTE: This implementation has been simplified based on the // assumption that team_size = 1. The HPX backend currently only // supports a team size of 1. 
- const int num_worker_threads = Kokkos::Experimental::HPX::concurrency(); + const int num_worker_threads = Kokkos::Experimental::HPX().concurrency(); hpx_thread_buffer &buffer = Kokkos::Experimental::HPX().impl_get_buffer(); @@ -194,7 +194,7 @@ class TaskQueueSpecializationConstrained< using task_base_type = typename scheduler_type::task_base; using queue_type = typename scheduler_type::queue_type; - if (1 == Kokkos::Experimental::HPX::concurrency()) { + if (1 == Kokkos::Experimental::HPX().concurrency()) { task_base_type *const end = (task_base_type *)task_base_type::EndTag; task_base_type *task = end; @@ -229,7 +229,7 @@ class TaskQueueSpecializationConstrained< // This is not necessarily the most efficient, but can be improved later. TaskQueueSpecializationConstrained task_queue; task_queue.scheduler = &scheduler; - const int num_worker_threads = Kokkos::Experimental::HPX::concurrency(); + const int num_worker_threads = Kokkos::Experimental::HPX().concurrency(); Kokkos::Experimental::HPX().impl_bulk_setup_finalize( true, false, task_queue, num_worker_threads, hpx::threads::thread_stacksize::nostack); diff --git a/core/src/HPX/Kokkos_HPX_WorkGraphPolicy.hpp b/core/src/HPX/Kokkos_HPX_WorkGraphPolicy.hpp index cb6d5a875e..f3fabcfee3 100644 --- a/core/src/HPX/Kokkos_HPX_WorkGraphPolicy.hpp +++ b/core/src/HPX/Kokkos_HPX_WorkGraphPolicy.hpp @@ -53,7 +53,7 @@ class ParallelFor, } void execute() const { - const int num_worker_threads = Kokkos::Experimental::HPX::concurrency(); + const int num_worker_threads = Kokkos::Experimental::HPX().concurrency(); Kokkos::Experimental::HPX().impl_bulk_plain( true, is_light_weight_policy(), *this, num_worker_threads, hpx::threads::thread_stacksize::nostack); From 2caf64137c8219eb5f1efde63123ff8f857efd91 Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Wed, 8 Feb 2023 14:47:56 -0500 Subject: [PATCH 206/496] add support to compile Kokkos for Ada generation (sm_89) consumer GPUs (RTX40x0) --- Makefile.kokkos | 9 ++++++++- 
cmake/KokkosCore_config.h.in | 1 + cmake/compile_tests/cuda_compute_capability.cc | 1 + cmake/kokkos_arch.cmake | 4 +++- 4 files changed, 13 insertions(+), 2 deletions(-) diff --git a/Makefile.kokkos b/Makefile.kokkos index 25080c66e3..d51d023d56 100644 --- a/Makefile.kokkos +++ b/Makefile.kokkos @@ -10,7 +10,7 @@ KOKKOS_VERSION = $(shell echo $(KOKKOS_VERSION_MAJOR)*10000+$(KOKKOS_VERSION_MIN KOKKOS_DEVICES ?= "Threads" # Options: # Intel: KNC,KNL,SNB,HSW,BDW,SKL,SKX,ICL,ICX,SPR -# NVIDIA: Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,Pascal60,Pascal61,Volta70,Volta72,Turing75,Ampere80,Ampere86,Hopper90 +# NVIDIA: Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,Pascal60,Pascal61,Volta70,Volta72,Turing75,Ampere80,Ampere86,Ada89,Hopper90 # ARM: ARMv80,ARMv81,ARMv8-ThunderX,ARMv8-TX2,A64FX # IBM: BGQ,Power7,Power8,Power9 # AMD-GPUS: Vega906,Vega908,Vega90A,Navi1030 @@ -342,6 +342,7 @@ KOKKOS_INTERNAL_USE_ARCH_VOLTA72 := $(call kokkos_has_string,$(KOKKOS_ARCH),Volt KOKKOS_INTERNAL_USE_ARCH_TURING75 := $(call kokkos_has_string,$(KOKKOS_ARCH),Turing75) KOKKOS_INTERNAL_USE_ARCH_AMPERE80 := $(call kokkos_has_string,$(KOKKOS_ARCH),Ampere80) KOKKOS_INTERNAL_USE_ARCH_AMPERE86 := $(call kokkos_has_string,$(KOKKOS_ARCH),Ampere86) +KOKKOS_INTERNAL_USE_ARCH_ADA89 := $(call kokkos_has_string,$(KOKKOS_ARCH),Ada89) KOKKOS_INTERNAL_USE_ARCH_HOPPER90 := $(call kokkos_has_string,$(KOKKOS_ARCH),Hopper90) KOKKOS_INTERNAL_USE_ARCH_NVIDIA := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_KEPLER30) \ + $(KOKKOS_INTERNAL_USE_ARCH_KEPLER32) \ @@ -357,6 +358,7 @@ KOKKOS_INTERNAL_USE_ARCH_NVIDIA := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_KEPLE + $(KOKKOS_INTERNAL_USE_ARCH_TURING75) \ + $(KOKKOS_INTERNAL_USE_ARCH_AMPERE80) \ + $(KOKKOS_INTERNAL_USE_ARCH_AMPERE86) \ + + $(KOKKOS_INTERNAL_USE_ARCH_ADA89) \ + $(KOKKOS_INTERNAL_USE_ARCH_HOPPER90)) #SEK: This seems like a bug to me @@ -1049,6 +1051,11 @@ ifeq 
($(KOKKOS_INTERNAL_USE_CUDA_ARCH), 1) tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMPERE86") KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_86 endif + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ADA89), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMPERE") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_ADA89") + KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_89 + endif ifeq ($(KOKKOS_INTERNAL_USE_ARCH_HOPPER90), 1) tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_HOPPER") tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_HOPPER90") diff --git a/cmake/KokkosCore_config.h.in b/cmake/KokkosCore_config.h.in index 520c45137c..afd96a5f3a 100644 --- a/cmake/KokkosCore_config.h.in +++ b/cmake/KokkosCore_config.h.in @@ -106,6 +106,7 @@ #cmakedefine KOKKOS_ARCH_AMPERE #cmakedefine KOKKOS_ARCH_AMPERE80 #cmakedefine KOKKOS_ARCH_AMPERE86 +#cmakedefine KOKKOS_ARCH_ADA89 #cmakedefine KOKKOS_ARCH_HOPPER #cmakedefine KOKKOS_ARCH_HOPPER90 #cmakedefine KOKKOS_ARCH_AMD_ZEN diff --git a/cmake/compile_tests/cuda_compute_capability.cc b/cmake/compile_tests/cuda_compute_capability.cc index b0bc5d1520..b81c4218a9 100644 --- a/cmake/compile_tests/cuda_compute_capability.cc +++ b/cmake/compile_tests/cuda_compute_capability.cc @@ -46,6 +46,7 @@ int main() { case 75: std::cout << "Set -DKokkos_ARCH_TURING75=ON ." << std::endl; break; case 80: std::cout << "Set -DKokkos_ARCH_AMPERE80=ON ." << std::endl; break; case 86: std::cout << "Set -DKokkos_ARCH_AMPERE86=ON ." << std::endl; break; + case 89: std::cout << "Set -DKokkos_ARCH_ADA89=ON ." << std::endl; break; case 90: std::cout << "Set -DKokkos_ARCH_HOPPER90=ON ." 
<< std::endl; break; default: std::cout << "Compute capability " << compute_capability diff --git a/cmake/kokkos_arch.cmake b/cmake/kokkos_arch.cmake index 993e177629..e0f508b99c 100644 --- a/cmake/kokkos_arch.cmake +++ b/cmake/kokkos_arch.cmake @@ -86,6 +86,7 @@ KOKKOS_ARCH_OPTION(VOLTA72 GPU "NVIDIA Volta generation CC 7.2" "KOKK KOKKOS_ARCH_OPTION(TURING75 GPU "NVIDIA Turing generation CC 7.5" "KOKKOS_SHOW_CUDA_ARCHS") KOKKOS_ARCH_OPTION(AMPERE80 GPU "NVIDIA Ampere generation CC 8.0" "KOKKOS_SHOW_CUDA_ARCHS") KOKKOS_ARCH_OPTION(AMPERE86 GPU "NVIDIA Ampere generation CC 8.6" "KOKKOS_SHOW_CUDA_ARCHS") +KOKKOS_ARCH_OPTION(ADA89 GPU "NVIDIA Ada generation CC 8.9" "KOKKOS_SHOW_CUDA_ARCHS") KOKKOS_ARCH_OPTION(HOPPER90 GPU "NVIDIA Hopper generation CC 9.0" "KOKKOS_SHOW_CUDA_ARCHS") IF(Kokkos_ENABLE_HIP OR Kokkos_ENABLE_OPENMPTARGET) @@ -612,6 +613,7 @@ CHECK_CUDA_ARCH(VOLTA72 sm_72) CHECK_CUDA_ARCH(TURING75 sm_75) CHECK_CUDA_ARCH(AMPERE80 sm_80) CHECK_CUDA_ARCH(AMPERE86 sm_86) +CHECK_CUDA_ARCH(ADA89 sm_89) CHECK_CUDA_ARCH(HOPPER90 sm_90) SET(AMDGPU_ARCH_ALREADY_SPECIFIED "") @@ -866,7 +868,7 @@ IF (KOKKOS_ARCH_VOLTA70 OR KOKKOS_ARCH_VOLTA72) SET(KOKKOS_ARCH_VOLTA ON) ENDIF() -IF (KOKKOS_ARCH_AMPERE80 OR KOKKOS_ARCH_AMPERE86) +IF (KOKKOS_ARCH_AMPERE80 OR KOKKOS_ARCH_AMPERE86 OR KOKKOS_ARCH_ADA89) SET(KOKKOS_ARCH_AMPERE ON) ENDIF() From 1d228fa7465415769031df202c38f19097dc2b81 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Thu, 9 Feb 2023 11:03:16 -0500 Subject: [PATCH 207/496] Fix version macros --- cmake/KokkosCore_config.h.in | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cmake/KokkosCore_config.h.in b/cmake/KokkosCore_config.h.in index 520c45137c..c8257d8664 100644 --- a/cmake/KokkosCore_config.h.in +++ b/cmake/KokkosCore_config.h.in @@ -9,10 +9,10 @@ // KOKKOS_VERSION % 100 is the patch level // KOKKOS_VERSION / 100 % 100 is the minor version // KOKKOS_VERSION / 10000 is the major version -#cmakedefine KOKKOS_VERSION @KOKKOS_VERSION@ 
-#cmakedefine KOKKOS_VERSION_MAJOR @KOKKOS_VERSION_MAJOR@ -#cmakedefine KOKKOS_VERSION_MINOR @KOKKOS_VERSION_MINOR@ -#cmakedefine KOKKOS_VERSION_PATCH @KOKKOS_VERSION_PATCH@ +#define KOKKOS_VERSION @KOKKOS_VERSION@ +#define KOKKOS_VERSION_MAJOR @KOKKOS_VERSION_MAJOR@ +#define KOKKOS_VERSION_MINOR @KOKKOS_VERSION_MINOR@ +#define KOKKOS_VERSION_PATCH @KOKKOS_VERSION_PATCH@ /* Execution Spaces */ #cmakedefine KOKKOS_ENABLE_SERIAL From 31750118d3f6ba9c313b32d0197a2000d421fce2 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Thu, 9 Feb 2023 09:19:27 -0800 Subject: [PATCH 208/496] Add compile-only test to make sure version macros are defined --- core/unit_test/CMakeLists.txt | 1 + core/unit_test/TestVersionMacros.cpp | 37 ++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+) create mode 100644 core/unit_test/TestVersionMacros.cpp diff --git a/core/unit_test/CMakeLists.txt b/core/unit_test/CMakeLists.txt index b18a23f986..ec748cfb03 100644 --- a/core/unit_test/CMakeLists.txt +++ b/core/unit_test/CMakeLists.txt @@ -77,6 +77,7 @@ SET(COMPILE_ONLY_SOURCES TestDetectionIdiom.cpp TestInterOp.cpp TestStringManipulation.cpp + TestVersionMacros.cpp TestViewTypeTraits.cpp TestTypeList.cpp view/TestExtentsDatatypeConversion.cpp diff --git a/core/unit_test/TestVersionMacros.cpp b/core/unit_test/TestVersionMacros.cpp new file mode 100644 index 0000000000..ef4574b992 --- /dev/null +++ b/core/unit_test/TestVersionMacros.cpp @@ -0,0 +1,37 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include + +#ifndef KOKKOS_VERSION +static_assert(false, "KOKKOS_VERSION macro is not defined!"); +#endif + +#ifndef KOKKOS_VERSION_MAJOR +static_assert(false, "KOKKOS_VERSION_MAJOR macro is not defined!"); +#endif + +#ifndef KOKKOS_VERSION_MINOR +static_assert(false, "KOKKOS_VERSION_MINOR macro is not defined!"); +#endif + +#ifndef KOKKOS_VERSION_PATCH +static_assert(false, "KOKKOS_VERSION_PATCH macro is not defined!"); +#endif + +static_assert(KOKKOS_VERSION == KOKKOS_VERSION_MAJOR * 10000 + + KOKKOS_VERSION_MINOR * 100 + + KOKKOS_VERSION_PATCH); From b6cdada5b03f3af5bf54dbac20ac7ea4de99df94 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Thu, 9 Feb 2023 09:20:24 -0800 Subject: [PATCH 209/496] Also test the KOKKOS_VERSION_{LESS,GREATER,EQUAL} --- core/unit_test/TestVersionMacros.cpp | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/core/unit_test/TestVersionMacros.cpp b/core/unit_test/TestVersionMacros.cpp index ef4574b992..e18b597310 100644 --- a/core/unit_test/TestVersionMacros.cpp +++ b/core/unit_test/TestVersionMacros.cpp @@ -35,3 +35,25 @@ static_assert(false, "KOKKOS_VERSION_PATCH macro is not defined!"); static_assert(KOKKOS_VERSION == KOKKOS_VERSION_MAJOR * 10000 + KOKKOS_VERSION_MINOR * 100 + KOKKOS_VERSION_PATCH); + +// clang-format off +static_assert(!KOKKOS_VERSION_LESS (KOKKOS_VERSION_MAJOR , KOKKOS_VERSION_MINOR, KOKKOS_VERSION_PATCH)); +static_assert(!KOKKOS_VERSION_LESS (KOKKOS_VERSION_MAJOR - 1, KOKKOS_VERSION_MINOR, KOKKOS_VERSION_PATCH)); +static_assert( KOKKOS_VERSION_LESS (KOKKOS_VERSION_MAJOR + 1, KOKKOS_VERSION_MINOR, KOKKOS_VERSION_PATCH)); + +static_assert( KOKKOS_VERSION_LESS_EQUAL (KOKKOS_VERSION_MAJOR , KOKKOS_VERSION_MINOR, KOKKOS_VERSION_PATCH)); +static_assert(!KOKKOS_VERSION_LESS_EQUAL (KOKKOS_VERSION_MAJOR - 1, KOKKOS_VERSION_MINOR, KOKKOS_VERSION_PATCH)); +static_assert( KOKKOS_VERSION_LESS_EQUAL (KOKKOS_VERSION_MAJOR + 1, 
KOKKOS_VERSION_MINOR, KOKKOS_VERSION_PATCH)); + +static_assert(!KOKKOS_VERSION_GREATER (KOKKOS_VERSION_MAJOR , KOKKOS_VERSION_MINOR, KOKKOS_VERSION_PATCH)); +static_assert( KOKKOS_VERSION_GREATER (KOKKOS_VERSION_MAJOR - 1, KOKKOS_VERSION_MINOR, KOKKOS_VERSION_PATCH)); +static_assert(!KOKKOS_VERSION_GREATER (KOKKOS_VERSION_MAJOR + 1, KOKKOS_VERSION_MINOR, KOKKOS_VERSION_PATCH)); + +static_assert( KOKKOS_VERSION_GREATER_EQUAL (KOKKOS_VERSION_MAJOR , KOKKOS_VERSION_MINOR, KOKKOS_VERSION_PATCH)); +static_assert( KOKKOS_VERSION_GREATER_EQUAL (KOKKOS_VERSION_MAJOR - 1, KOKKOS_VERSION_MINOR, KOKKOS_VERSION_PATCH)); +static_assert(!KOKKOS_VERSION_GREATER_EQUAL (KOKKOS_VERSION_MAJOR + 1, KOKKOS_VERSION_MINOR, KOKKOS_VERSION_PATCH)); + +static_assert( KOKKOS_VERSION_EQUAL (KOKKOS_VERSION_MAJOR , KOKKOS_VERSION_MINOR, KOKKOS_VERSION_PATCH)); +static_assert(!KOKKOS_VERSION_EQUAL (KOKKOS_VERSION_MAJOR - 1, KOKKOS_VERSION_MINOR, KOKKOS_VERSION_PATCH)); +static_assert(!KOKKOS_VERSION_EQUAL (KOKKOS_VERSION_MAJOR + 1, KOKKOS_VERSION_MINOR, KOKKOS_VERSION_PATCH)); +// clang-format on From 2b532d1f9e5306504c7c17326ce5216540427900 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cezary=20Skrzy=C5=84ski?= Date: Wed, 15 Feb 2023 23:08:52 +0100 Subject: [PATCH 210/496] Fix cache configuration in CI (#5871) * Fix cache keys in CI * Use correct ccache directory --- .github/workflows/continuous-integration-workflow.yml | 6 +++--- .github/workflows/performance-benchmark.yml | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/continuous-integration-workflow.yml b/.github/workflows/continuous-integration-workflow.yml index 0715911f31..9b62c42931 100644 --- a/.github/workflows/continuous-integration-workflow.yml +++ b/.github/workflows/continuous-integration-workflow.yml @@ -69,9 +69,9 @@ jobs: uses: actions/checkout@v3 - uses: actions/cache@v3 with: - path: ~/.ccache - key: kokkos-${{ matrix.distro }}-${{ matrix.cxx }}-${{ matrix.cmake_build_type }}-${{ 
matrix.openmp }}-${github.ref}-${{ github.sha }} - restore-keys: kokkos-${{ matrix.distro }}-${{ matrix.cxx }}-${{ matrix.cmake_build_type }}-${{ matrix.openmp }}-${{github.ref}} + path: ~/.cache/ccache + key: kokkos-${{ matrix.distro }}-${{ matrix.cxx }}-${{ matrix.cmake_build_type }}-${{ matrix.openmp }}-${{ github.ref }}-${{ github.sha }} + restore-keys: kokkos-${{ matrix.distro }}-${{ matrix.cxx }}-${{ matrix.cmake_build_type }}-${{ matrix.openmp }}-${{ github.ref }} - name: maybe_disable_death_tests if: ${{ matrix.distro == 'fedora:rawhide' }} run: echo "GTEST_FILTER=-*DeathTest*" >> $GITHUB_ENV diff --git a/.github/workflows/performance-benchmark.yml b/.github/workflows/performance-benchmark.yml index 6d7a0faa6b..9db41c0dca 100644 --- a/.github/workflows/performance-benchmark.yml +++ b/.github/workflows/performance-benchmark.yml @@ -23,9 +23,9 @@ jobs: uses: actions/checkout@v3 - uses: actions/cache@v3 with: - path: ~/.ccache - key: kokkos-${{ matrix.distro }}-${{ matrix.cxx }}-${{ matrix.openmp }}-${github.ref}-${{ github.sha }} - restore-keys: kokkos-${{ matrix.distro }}-${{ matrix.cxx }}-${{ matrix.openmp }}-${{github.ref}} + path: ~/.cache/ccache + key: kokkos-${{ matrix.distro }}-${{ matrix.cxx }}-${{ matrix.backend }}-${{ github.ref }}-${{ github.sha }} + restore-keys: kokkos-${{ matrix.distro }}-${{ matrix.cxx }}-${{ matrix.backend }}-${{ github.ref }} - name: Configure Kokkos run: | cmake -B builddir \ From 9fb2bbce3533edf57281aba5837fd4371c740d54 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Wed, 15 Feb 2023 21:57:40 -0500 Subject: [PATCH 211/496] Prefer View::{R -> r}ank --- core/src/Kokkos_CopyViews.hpp | 102 ++++++++++---------- core/src/Kokkos_Core_fwd.hpp | 2 +- core/src/Kokkos_View.hpp | 171 +++++++++++++++++----------------- 3 files changed, 139 insertions(+), 136 deletions(-) diff --git a/core/src/Kokkos_CopyViews.hpp b/core/src/Kokkos_CopyViews.hpp index 98a28646c4..64539d8ed8 100644 --- a/core/src/Kokkos_CopyViews.hpp +++ 
b/core/src/Kokkos_CopyViews.hpp @@ -534,7 +534,7 @@ void view_copy(const ExecutionSpace& space, const DstType& dst, "Kokkos::Impl::view_copy called with invalid execution space"); } else { // Figure out iteration order in case we need it - int64_t strides[DstType::Rank + 1]; + int64_t strides[DstType::rank + 1]; dst.stride(strides); Kokkos::Iterate iterate; if (Kokkos::is_layouttiled::value) { @@ -548,7 +548,7 @@ void view_copy(const ExecutionSpace& space, const DstType& dst, iterate = Kokkos::Iterate::Left; } else if (std::is_same::value) { - if (strides[0] > strides[DstType::Rank - 1]) + if (strides[0] > strides[DstType::rank - 1]) iterate = Kokkos::Iterate::Right; else iterate = Kokkos::Iterate::Left; @@ -566,26 +566,26 @@ void view_copy(const ExecutionSpace& space, const DstType& dst, Kokkos::Impl::ViewCopy< typename DstType::uniform_runtime_nomemspace_type, typename SrcType::uniform_runtime_const_nomemspace_type, - Kokkos::LayoutRight, ExecutionSpace, DstType::Rank, int64_t>( + Kokkos::LayoutRight, ExecutionSpace, DstType::rank, int64_t>( dst, src, space); else Kokkos::Impl::ViewCopy< typename DstType::uniform_runtime_nomemspace_type, typename SrcType::uniform_runtime_const_nomemspace_type, - Kokkos::LayoutLeft, ExecutionSpace, DstType::Rank, int64_t>( + Kokkos::LayoutLeft, ExecutionSpace, DstType::rank, int64_t>( dst, src, space); } else { if (iterate == Kokkos::Iterate::Right) Kokkos::Impl::ViewCopy< typename DstType::uniform_runtime_nomemspace_type, typename SrcType::uniform_runtime_const_nomemspace_type, - Kokkos::LayoutRight, ExecutionSpace, DstType::Rank, int>(dst, src, + Kokkos::LayoutRight, ExecutionSpace, DstType::rank, int>(dst, src, space); else Kokkos::Impl::ViewCopy< typename DstType::uniform_runtime_nomemspace_type, typename SrcType::uniform_runtime_const_nomemspace_type, - Kokkos::LayoutLeft, ExecutionSpace, DstType::Rank, int>(dst, src, + Kokkos::LayoutLeft, ExecutionSpace, DstType::rank, int>(dst, src, space); } } @@ -620,7 +620,7 @@ void 
view_copy(const DstType& dst, const SrcType& src) { } // Figure out iteration order in case we need it - int64_t strides[DstType::Rank + 1]; + int64_t strides[DstType::rank + 1]; dst.stride(strides); Kokkos::Iterate iterate; if (Kokkos::is_layouttiled::value) { @@ -634,7 +634,7 @@ void view_copy(const DstType& dst, const SrcType& src) { iterate = Kokkos::Iterate::Left; } else if (std::is_same::value) { - if (strides[0] > strides[DstType::Rank - 1]) + if (strides[0] > strides[DstType::rank - 1]) iterate = Kokkos::Iterate::Right; else iterate = Kokkos::Iterate::Left; @@ -653,26 +653,26 @@ void view_copy(const DstType& dst, const SrcType& src) { Kokkos::Impl::ViewCopy< typename DstType::uniform_runtime_nomemspace_type, typename SrcType::uniform_runtime_const_nomemspace_type, - Kokkos::LayoutRight, dst_execution_space, DstType::Rank, int64_t>( + Kokkos::LayoutRight, dst_execution_space, DstType::rank, int64_t>( dst, src); else Kokkos::Impl::ViewCopy< typename DstType::uniform_runtime_nomemspace_type, typename SrcType::uniform_runtime_const_nomemspace_type, - Kokkos::LayoutLeft, dst_execution_space, DstType::Rank, int64_t>( + Kokkos::LayoutLeft, dst_execution_space, DstType::rank, int64_t>( dst, src); } else { if (iterate == Kokkos::Iterate::Right) Kokkos::Impl::ViewCopy< typename DstType::uniform_runtime_nomemspace_type, typename SrcType::uniform_runtime_const_nomemspace_type, - Kokkos::LayoutRight, src_execution_space, DstType::Rank, int64_t>( + Kokkos::LayoutRight, src_execution_space, DstType::rank, int64_t>( dst, src); else Kokkos::Impl::ViewCopy< typename DstType::uniform_runtime_nomemspace_type, typename SrcType::uniform_runtime_const_nomemspace_type, - Kokkos::LayoutLeft, src_execution_space, DstType::Rank, int64_t>( + Kokkos::LayoutLeft, src_execution_space, DstType::rank, int64_t>( dst, src); } } else { @@ -681,26 +681,26 @@ void view_copy(const DstType& dst, const SrcType& src) { Kokkos::Impl::ViewCopy< typename DstType::uniform_runtime_nomemspace_type, 
typename SrcType::uniform_runtime_const_nomemspace_type, - Kokkos::LayoutRight, dst_execution_space, DstType::Rank, int>(dst, + Kokkos::LayoutRight, dst_execution_space, DstType::rank, int>(dst, src); else Kokkos::Impl::ViewCopy< typename DstType::uniform_runtime_nomemspace_type, typename SrcType::uniform_runtime_const_nomemspace_type, - Kokkos::LayoutLeft, dst_execution_space, DstType::Rank, int>(dst, + Kokkos::LayoutLeft, dst_execution_space, DstType::rank, int>(dst, src); } else { if (iterate == Kokkos::Iterate::Right) Kokkos::Impl::ViewCopy< typename DstType::uniform_runtime_nomemspace_type, typename SrcType::uniform_runtime_const_nomemspace_type, - Kokkos::LayoutRight, src_execution_space, DstType::Rank, int>(dst, + Kokkos::LayoutRight, src_execution_space, DstType::rank, int>(dst, src); else Kokkos::Impl::ViewCopy< typename DstType::uniform_runtime_nomemspace_type, typename SrcType::uniform_runtime_const_nomemspace_type, - Kokkos::LayoutLeft, src_execution_space, DstType::Rank, int>(dst, + Kokkos::LayoutLeft, src_execution_space, DstType::rank, int>(dst, src); } } @@ -832,7 +832,7 @@ struct CommonSubview + int Rank = DstType::rank> struct ViewRemap; template @@ -1310,7 +1310,7 @@ inline void contiguous_fill( using ViewTypeFlat = Kokkos::View< typename ViewType::value_type*, Kokkos::LayoutRight, Kokkos::Device>, Kokkos::MemoryTraits<0>>; @@ -1318,11 +1318,11 @@ inline void contiguous_fill( ViewTypeFlat dst_flat(dst.data(), dst.size()); if (dst.span() < static_cast(std::numeric_limits::max())) { Kokkos::Impl::ViewFill(dst_flat, value, + ViewTypeFlat::rank, int>(dst_flat, value, exec_space); } else Kokkos::Impl::ViewFill(dst_flat, value, + ViewTypeFlat::rank, int64_t>(dst_flat, value, exec_space); } @@ -1447,7 +1447,7 @@ inline void deep_copy( } // Figure out iteration order to do the ViewFill - int64_t strides[ViewType::Rank + 1]; + int64_t strides[ViewType::rank + 1]; dst.stride(strides); Kokkos::Iterate iterate; if (std::is_same::value) { - if (strides[0] > 
strides[ViewType::Rank > 0 ? ViewType::Rank - 1 : 0]) + if (strides[0] > strides[ViewType::rank > 0 ? ViewType::rank - 1 : 0]) iterate = Kokkos::Iterate::Right; else iterate = Kokkos::Iterate::Left; @@ -1473,26 +1473,26 @@ inline void deep_copy( // Lets call the right ViewFill functor based on integer space needed and // iteration type using ViewTypeUniform = - std::conditional_t; if (dst.span() > static_cast(std::numeric_limits::max())) { if (iterate == Kokkos::Iterate::Right) Kokkos::Impl::ViewFill( + exec_space_type, ViewType::rank, int64_t>( dst, value, exec_space_type()); else Kokkos::Impl::ViewFill( + exec_space_type, ViewType::rank, int64_t>( dst, value, exec_space_type()); } else { if (iterate == Kokkos::Iterate::Right) Kokkos::Impl::ViewFill( + exec_space_type, ViewType::rank, int>( dst, value, exec_space_type()); else Kokkos::Impl::ViewFill( + exec_space_type, ViewType::rank, int>( dst, value, exec_space_type()); } Kokkos::fence("Kokkos::deep_copy: scalar copy, post copy fence"); @@ -1636,19 +1636,19 @@ inline void deep_copy( "match: "); message += dst.label(); message += "("; - for (int r = 0; r < dst_type::Rank - 1; r++) { + for (int r = 0; r < dst_type::rank - 1; r++) { message += std::to_string(dst.extent(r)); message += ","; } - message += std::to_string(dst.extent(dst_type::Rank - 1)); + message += std::to_string(dst.extent(dst_type::rank - 1)); message += ") "; message += src.label(); message += "("; - for (int r = 0; r < src_type::Rank - 1; r++) { + for (int r = 0; r < src_type::rank - 1; r++) { message += std::to_string(src.extent(r)); message += ","; } - message += std::to_string(src.extent(src_type::Rank - 1)); + message += std::to_string(src.extent(src_type::rank - 1)); message += ") "; Kokkos::Impl::throw_runtime_exception(message); @@ -1719,19 +1719,19 @@ inline void deep_copy( "Deprecation Error: Kokkos::deep_copy extents of views don't match: "); message += dst.label(); message += "("; - for (int r = 0; r < dst_type::Rank - 1; r++) { + for 
(int r = 0; r < dst_type::rank - 1; r++) { message += std::to_string(dst.extent(r)); message += ","; } - message += std::to_string(dst.extent(dst_type::Rank - 1)); + message += std::to_string(dst.extent(dst_type::rank - 1)); message += ") "; message += src.label(); message += "("; - for (int r = 0; r < src_type::Rank - 1; r++) { + for (int r = 0; r < src_type::rank - 1; r++) { message += std::to_string(src.extent(r)); message += ","; } - message += std::to_string(src.extent(src_type::Rank - 1)); + message += std::to_string(src.extent(src_type::rank - 1)); message += ") "; Kokkos::Impl::throw_runtime_exception(message); @@ -2559,7 +2559,7 @@ inline void deep_copy( } else { using ViewType = View; // Figure out iteration order to do the ViewFill - int64_t strides[ViewType::Rank + 1]; + int64_t strides[ViewType::rank + 1]; dst.stride(strides); Kokkos::Iterate iterate; if (std::is_same::value) { - if (strides[0] > strides[ViewType::Rank > 0 ? ViewType::Rank - 1 : 0]) + if (strides[0] > strides[ViewType::rank > 0 ? 
ViewType::rank - 1 : 0]) iterate = Kokkos::Iterate::Right; else iterate = Kokkos::Iterate::Left; @@ -2585,23 +2585,23 @@ inline void deep_copy( // Lets call the right ViewFill functor based on integer space needed and // iteration type using ViewTypeUniform = - std::conditional_t; if (dst.span() > static_cast(std::numeric_limits::max())) { if (iterate == Kokkos::Iterate::Right) Kokkos::Impl::ViewFill(dst, value, space); + ViewType::rank, int64_t>(dst, value, space); else Kokkos::Impl::ViewFill(dst, value, space); + ViewType::rank, int64_t>(dst, value, space); } else { if (iterate == Kokkos::Iterate::Right) Kokkos::Impl::ViewFill(dst, value, space); + ViewType::rank, int32_t>(dst, value, space); else Kokkos::Impl::ViewFill(dst, value, space); + ViewType::rank, int32_t>(dst, value, space); } } if (Kokkos::Tools::Experimental::get_callbacks().end_deep_copy != nullptr) { @@ -2643,7 +2643,7 @@ inline void deep_copy( Impl::contiguous_fill_or_memset(fill_exec_space(), dst, value); } else { using ViewTypeUniform = std::conditional_t< - View::Rank == 0, + View::rank == 0, typename View::uniform_runtime_type, typename View::uniform_runtime_nomemspace_type>; Kokkos::Impl::ViewFill + int Rank = ViewType::rank, typename iType = int64_t> struct ViewFill; template { */ // KOKKOS_INLINE_FUNCTION // static - // constexpr unsigned rank() { return map_type::Rank; } + // constexpr unsigned rank() { return map_type::rank; } template KOKKOS_INLINE_FUNCTION constexpr std::enable_if_t< @@ -811,14 +811,14 @@ class View : public ViewTraits { template static KOKKOS_FUNCTION void check_access_member_function_valid_args(Is...) { - static_assert(Rank <= sizeof...(Is), ""); + static_assert(traits::rank <= sizeof...(Is), ""); static_assert(sizeof...(Is) <= 8, ""); static_assert(Kokkos::Impl::are_integral::value, ""); } template static KOKKOS_FUNCTION void check_operator_parens_valid_args(Is...) 
{ - static_assert(Rank == sizeof...(Is), ""); + static_assert(traits::rank == sizeof...(Is), ""); static_assert(Kokkos::Impl::are_integral::value, ""); } @@ -827,22 +827,22 @@ class View : public ViewTraits { // Rank 1 default map operator() template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::always_true::value && // - (1 == Rank) && is_default_map && !is_layout_stride), - reference_type> - operator()(I0 i0) const { + KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< + (Kokkos::Impl::always_true::value && // + (1 == traits::rank) && is_default_map && !is_layout_stride), + reference_type> + operator()(I0 i0) const { check_operator_parens_valid_args(i0); KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0) return m_map.m_impl_handle[i0]; } template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::always_true::value && // - (1 == Rank) && is_default_map && is_layout_stride), - reference_type> - operator()(I0 i0) const { + KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< + (Kokkos::Impl::always_true::value && // + (1 == traits::rank) && is_default_map && is_layout_stride), + reference_type> + operator()(I0 i0) const { check_operator_parens_valid_args(i0); KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0) return m_map.m_impl_handle[m_map.m_impl_offset.m_stride.S0 * i0]; @@ -853,7 +853,8 @@ class View : public ViewTraits { template KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - ((1 == Rank) && Kokkos::Impl::are_integral::value && !is_default_map), + ((1 == traits::rank) && Kokkos::Impl::are_integral::value && + !is_default_map), reference_type> operator[](I0 i0) const { KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0) @@ -861,21 +862,21 @@ class View : public ViewTraits { } template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<((1 == Rank) && Kokkos::Impl::are_integral::value && - is_default_map && !is_layout_stride), - reference_type> - operator[](I0 i0) const { + KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< + ((1 == 
traits::rank) && Kokkos::Impl::are_integral::value && + is_default_map && !is_layout_stride), + reference_type> + operator[](I0 i0) const { KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0) return m_map.m_impl_handle[i0]; } template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<((1 == Rank) && Kokkos::Impl::are_integral::value && - is_default_map && is_layout_stride), - reference_type> - operator[](I0 i0) const { + KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< + ((1 == traits::rank) && Kokkos::Impl::are_integral::value && + is_default_map && is_layout_stride), + reference_type> + operator[](I0 i0) const { KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0) return m_map.m_impl_handle[m_map.m_impl_offset.m_stride.S0 * i0]; } @@ -886,8 +887,8 @@ class View : public ViewTraits { template KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t<(Kokkos::Impl::always_true::value && // - (2 == Rank) && is_default_map && is_layout_left && - (traits::rank_dynamic == 0)), + (2 == traits::rank) && is_default_map && + is_layout_left && (traits::rank_dynamic == 0)), reference_type> operator()(I0 i0, I1 i1) const { check_operator_parens_valid_args(i0, i1); @@ -898,8 +899,8 @@ class View : public ViewTraits { template KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t<(Kokkos::Impl::always_true::value && // - (2 == Rank) && is_default_map && is_layout_left && - (traits::rank_dynamic != 0)), + (2 == traits::rank) && is_default_map && + is_layout_left && (traits::rank_dynamic != 0)), reference_type> operator()(I0 i0, I1 i1) const { check_operator_parens_valid_args(i0, i1); @@ -910,8 +911,8 @@ class View : public ViewTraits { template KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t<(Kokkos::Impl::always_true::value && // - (2 == Rank) && is_default_map && is_layout_right && - (traits::rank_dynamic == 0)), + (2 == traits::rank) && is_default_map && + is_layout_right && (traits::rank_dynamic == 0)), reference_type> operator()(I0 i0, I1 i1) const { check_operator_parens_valid_args(i0, i1); @@ -922,8 
+923,8 @@ class View : public ViewTraits { template KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t<(Kokkos::Impl::always_true::value && // - (2 == Rank) && is_default_map && is_layout_right && - (traits::rank_dynamic != 0)), + (2 == traits::rank) && is_default_map && + is_layout_right && (traits::rank_dynamic != 0)), reference_type> operator()(I0 i0, I1 i1) const { check_operator_parens_valid_args(i0, i1); @@ -932,11 +933,11 @@ class View : public ViewTraits { } template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::always_true::value && // - (2 == Rank) && is_default_map && is_layout_stride), - reference_type> - operator()(I0 i0, I1 i1) const { + KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< + (Kokkos::Impl::always_true::value && // + (2 == traits::rank) && is_default_map && is_layout_stride), + reference_type> + operator()(I0 i0, I1 i1) const { check_operator_parens_valid_args(i0, i1); KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1) return m_map.m_impl_handle[i0 * m_map.m_impl_offset.m_stride.S0 + @@ -947,11 +948,12 @@ class View : public ViewTraits { // have "inlined" versions above template - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::always_true::value && // - (2 != Rank) && (1 != Rank) && (0 != Rank) && is_default_map), - reference_type> - operator()(Is... indices) const { + KOKKOS_FORCEINLINE_FUNCTION + std::enable_if_t<(Kokkos::Impl::always_true::value && // + (2 != traits::rank) && (1 != traits::rank) && + (0 != traits::rank) && is_default_map), + reference_type> + operator()(Is... indices) const { check_operator_parens_valid_args(indices...); KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, indices...) 
return m_map.m_impl_handle[m_map.m_impl_offset(indices...)]; @@ -960,7 +962,7 @@ class View : public ViewTraits { template KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t<(Kokkos::Impl::always_true::value && // - ((0 == Rank) || !is_default_map)), + ((0 == traits::rank) || !is_default_map)), reference_type> operator()(Is... indices) const { check_operator_parens_valid_args(indices...); @@ -973,7 +975,8 @@ class View : public ViewTraits { template KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::always_true::value && (0 == Rank)), reference_type> + (Kokkos::Impl::always_true::value && (0 == traits::rank)), + reference_type> access(Is... extra) const { check_access_member_function_valid_args(extra...); KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, extra...) @@ -986,7 +989,7 @@ class View : public ViewTraits { template KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t<(Kokkos::Impl::always_true::value && - (1 == Rank) && !is_default_map), + (1 == traits::rank) && !is_default_map), reference_type> access(I0 i0, Is... extra) const { check_access_member_function_valid_args(i0, extra...); @@ -995,22 +998,22 @@ class View : public ViewTraits { } template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::always_true::value && - (1 == Rank) && is_default_map && !is_layout_stride), - reference_type> - access(I0 i0, Is... extra) const { + KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< + (Kokkos::Impl::always_true::value && (1 == traits::rank) && + is_default_map && !is_layout_stride), + reference_type> + access(I0 i0, Is... extra) const { check_access_member_function_valid_args(i0, extra...); KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, extra...) return m_map.m_impl_handle[i0]; } template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::always_true::value && - (1 == Rank) && is_default_map && is_layout_stride), - reference_type> - access(I0 i0, Is... 
extra) const { + KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< + (Kokkos::Impl::always_true::value && (1 == traits::rank) && + is_default_map && is_layout_stride), + reference_type> + access(I0 i0, Is... extra) const { check_access_member_function_valid_args(i0, extra...); KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, extra...) return m_map.m_impl_handle[m_map.m_impl_offset.m_stride.S0 * i0]; @@ -1022,7 +1025,7 @@ class View : public ViewTraits { template KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t<(Kokkos::Impl::always_true::value && - (2 == Rank) && !is_default_map), + (2 == traits::rank) && !is_default_map), reference_type> access(I0 i0, I1 i1, Is... extra) const { check_access_member_function_valid_args(i0, i1, extra...); @@ -1032,7 +1035,7 @@ class View : public ViewTraits { template KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::always_true::value && (2 == Rank) && + (Kokkos::Impl::always_true::value && (2 == traits::rank) && is_default_map && is_layout_left && (traits::rank_dynamic == 0)), reference_type> access(I0 i0, I1 i1, Is... extra) const { @@ -1043,7 +1046,7 @@ class View : public ViewTraits { template KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::always_true::value && (2 == Rank) && + (Kokkos::Impl::always_true::value && (2 == traits::rank) && is_default_map && is_layout_left && (traits::rank_dynamic != 0)), reference_type> access(I0 i0, I1 i1, Is... extra) const { @@ -1054,7 +1057,7 @@ class View : public ViewTraits { template KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::always_true::value && (2 == Rank) && + (Kokkos::Impl::always_true::value && (2 == traits::rank) && is_default_map && is_layout_right && (traits::rank_dynamic == 0)), reference_type> access(I0 i0, I1 i1, Is... 
extra) const { @@ -1065,7 +1068,7 @@ class View : public ViewTraits { template KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::always_true::value && (2 == Rank) && + (Kokkos::Impl::always_true::value && (2 == traits::rank) && is_default_map && is_layout_right && (traits::rank_dynamic != 0)), reference_type> access(I0 i0, I1 i1, Is... extra) const { @@ -1075,11 +1078,11 @@ class View : public ViewTraits { } template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::always_true::value && - (2 == Rank) && is_default_map && is_layout_stride), - reference_type> - access(I0 i0, I1 i1, Is... extra) const { + KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< + (Kokkos::Impl::always_true::value && (2 == traits::rank) && + is_default_map && is_layout_stride), + reference_type> + access(I0 i0, I1 i1, Is... extra) const { check_access_member_function_valid_args(i0, i1, extra...); KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, extra...) return m_map.m_impl_handle[i0 * m_map.m_impl_offset.m_stride.S0 + @@ -1092,7 +1095,7 @@ class View : public ViewTraits { template KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t<(Kokkos::Impl::always_true::value && - (3 == Rank) && is_default_map), + (3 == traits::rank) && is_default_map), reference_type> access(I0 i0, I1 i1, I2 i2, Is... extra) const { check_access_member_function_valid_args(i0, i1, i2, extra...); @@ -1103,7 +1106,7 @@ class View : public ViewTraits { template KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t<(Kokkos::Impl::always_true::value && - (3 == Rank) && !is_default_map), + (3 == traits::rank) && !is_default_map), reference_type> access(I0 i0, I1 i1, I2 i2, Is... 
extra) const { check_access_member_function_valid_args(i0, i1, i2, extra...); @@ -1116,8 +1119,8 @@ class View : public ViewTraits { template KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::always_true::value && (4 == Rank) && - is_default_map), + (Kokkos::Impl::always_true::value && + (4 == traits::rank) && is_default_map), reference_type> access(I0 i0, I1 i1, I2 i2, I3 i3, Is... extra) const { check_access_member_function_valid_args(i0, i1, i2, i3, extra...); @@ -1127,8 +1130,8 @@ class View : public ViewTraits { template KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::always_true::value && (4 == Rank) && - !is_default_map), + (Kokkos::Impl::always_true::value && + (4 == traits::rank) && !is_default_map), reference_type> access(I0 i0, I1 i1, I2 i2, I3 i3, Is... extra) const { check_access_member_function_valid_args(i0, i1, i2, i3, extra...); @@ -1143,7 +1146,7 @@ class View : public ViewTraits { typename... Is> KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< (Kokkos::Impl::always_true::value && - (5 == Rank) && is_default_map), + (5 == traits::rank) && is_default_map), reference_type> access(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, Is... extra) const { check_access_member_function_valid_args(i0, i1, i2, i3, i4, extra...); @@ -1156,7 +1159,7 @@ class View : public ViewTraits { typename... Is> KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< (Kokkos::Impl::always_true::value && - (5 == Rank) && !is_default_map), + (5 == traits::rank) && !is_default_map), reference_type> access(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, Is... extra) const { check_access_member_function_valid_args(i0, i1, i2, i3, i4, extra...); @@ -1172,7 +1175,7 @@ class View : public ViewTraits { typename I5, typename... Is> KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< (Kokkos::Impl::always_true::value && - (6 == Rank) && is_default_map), + (6 == traits::rank) && is_default_map), reference_type> access(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, I5 i5, Is... 
extra) const { check_access_member_function_valid_args(i0, i1, i2, i3, i4, i5, extra...); @@ -1185,7 +1188,7 @@ class View : public ViewTraits { typename I5, typename... Is> KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< (Kokkos::Impl::always_true::value && - (6 == Rank) && !is_default_map), + (6 == traits::rank) && !is_default_map), reference_type> access(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, I5 i5, Is... extra) const { check_access_member_function_valid_args(i0, i1, i2, i3, i4, i5, extra...); @@ -1201,7 +1204,7 @@ class View : public ViewTraits { typename I5, typename I6, typename... Is> KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< (Kokkos::Impl::always_true::value && - (7 == Rank) && is_default_map), + (7 == traits::rank) && is_default_map), reference_type> access(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, I5 i5, I6 i6, Is... extra) const { check_access_member_function_valid_args(i0, i1, i2, i3, i4, i5, i6, @@ -1215,7 +1218,7 @@ class View : public ViewTraits { typename I5, typename I6, typename... Is> KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< (Kokkos::Impl::always_true::value && - (7 == Rank) && !is_default_map), + (7 == traits::rank) && !is_default_map), reference_type> access(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, I5 i5, I6 i6, Is... extra) const { check_access_member_function_valid_args(i0, i1, i2, i3, i4, i5, i6, @@ -1233,7 +1236,7 @@ class View : public ViewTraits { KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t<(Kokkos::Impl::always_true::value && - (8 == Rank) && is_default_map), + (8 == traits::rank) && is_default_map), reference_type> access(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, I5 i5, I6 i6, I7 i7, Is... extra) const { @@ -1250,7 +1253,7 @@ class View : public ViewTraits { KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t<(Kokkos::Impl::always_true::value && - (8 == Rank) && !is_default_map), + (8 == traits::rank) && !is_default_map), reference_type> access(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, I5 i5, I6 i6, I7 i7, Is... 
extra) const { @@ -1693,7 +1696,7 @@ class View : public ViewTraits { */ template KOKKOS_INLINE_FUNCTION constexpr unsigned rank(const View& V) { - return V.Rank; + return V.rank; } // Temporary until added to view namespace Impl { @@ -1710,7 +1713,7 @@ struct RankDataType { template KOKKOS_FUNCTION std::enable_if_t< - N == View::Rank && + N == View::rank && std::is_same::specialize, void>::value, View> as_view_of_rank_n(View v) { @@ -1721,7 +1724,7 @@ as_view_of_rank_n(View v) { // never be called template KOKKOS_FUNCTION std::enable_if_t< - N != View::Rank && + N != View::rank && std::is_same::specialize, void>::value, View::value_type, N>::type, Args...>> @@ -1753,7 +1756,7 @@ KOKKOS_INLINE_FUNCTION , ViewTraits, Args...>::type subview(const View& src, Args... args) { - static_assert(View::Rank == sizeof...(Args), + static_assert(View::rank == sizeof...(Args), "subview requires one argument for each source View rank"); return typename Kokkos::Impl::ViewMapping< @@ -1768,7 +1771,7 @@ KOKKOS_INLINE_FUNCTION typename Kokkos::Impl::ViewMapping< , ViewTraits, Args...>::template apply::type subview(const View& src, Args... 
args) { - static_assert(View::Rank == sizeof...(Args), + static_assert(View::rank == sizeof...(Args), "subview requires one argument for each source View rank"); return typename Kokkos::Impl::ViewMapping< From 2840e8d70b3c7db93588243328f3ed3e19b0788f Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Wed, 8 Feb 2023 21:45:09 -0800 Subject: [PATCH 212/496] View::{R -> r}ank in algorithms and containers --- algorithms/src/Kokkos_Random.hpp | 2 +- algorithms/src/Kokkos_Sort.hpp | 2 +- containers/src/Kokkos_DynRankView.hpp | 6 +++--- containers/src/Kokkos_OffsetView.hpp | 18 +++++++++--------- containers/src/Kokkos_ScatterView.hpp | 2 +- 5 files changed, 15 insertions(+), 15 deletions(-) diff --git a/algorithms/src/Kokkos_Random.hpp b/algorithms/src/Kokkos_Random.hpp index 91e9ce6fc8..abb028d28e 100644 --- a/algorithms/src/Kokkos_Random.hpp +++ b/algorithms/src/Kokkos_Random.hpp @@ -1514,7 +1514,7 @@ void fill_random(const ExecutionSpace& exec, ViewType a, RandomPool g, "Kokkos::fill_random", Kokkos::RangePolicy(exec, 0, (LDA + 127) / 128), Impl::fill_random_functor_begin_end( + ViewType::rank, IndexType>( a, g, begin, end)); } diff --git a/algorithms/src/Kokkos_Sort.hpp b/algorithms/src/Kokkos_Sort.hpp index 8f3c6e35b6..cb6800409a 100644 --- a/algorithms/src/Kokkos_Sort.hpp +++ b/algorithms/src/Kokkos_Sort.hpp @@ -75,7 +75,7 @@ namespace Kokkos { namespace Impl { -template +template struct CopyOp; template diff --git a/containers/src/Kokkos_DynRankView.hpp b/containers/src/Kokkos_DynRankView.hpp index ce433b0bfc..0ff6b43f50 100644 --- a/containers/src/Kokkos_DynRankView.hpp +++ b/containers/src/Kokkos_DynRankView.hpp @@ -346,7 +346,7 @@ class ViewMapping< dst.m_map.m_impl_handle = Kokkos::Impl::ViewDataHandle::assign( src.m_map.m_impl_handle, src.m_track.m_tracker); dst.m_track.assign(src.m_track.m_tracker, DstTraits::is_managed); - dst.m_rank = src.Rank; + dst.m_rank = src.rank; } }; @@ -1025,7 +1025,7 @@ class DynRankView : public ViewTraits { // Copy/Assign View to 
DynRankView template KOKKOS_INLINE_FUNCTION DynRankView(const View& rhs) - : m_track(), m_map(), m_rank(rhs.Rank) { + : m_track(), m_map(), m_rank(rhs.rank) { using SrcTraits = typename View::traits; using Mapping = Kokkos::Impl::ViewMapping -struct DynRankViewFill> { +struct DynRankViewFill> { DynRankViewFill(const OutputView& dst, const typename OutputView::const_value_type& src) { Kokkos::Impl::DeepCopy { "Incompatible OffsetView copy construction"); Mapping::assign(m_map, aview.impl_map(), m_track); - for (int i = 0; i < aview.Rank; ++i) { + for (int i = 0; i < aview.rank; ++i) { m_begins[i] = 0; } } @@ -1301,7 +1301,7 @@ KOKKOS_INLINE_FUNCTION Kokkos::Impl::ViewMapping, T>::type::Rank; + ViewTraits, T>::type::rank; auto theSubview = Kokkos::subview(theView, shiftedArg); @@ -1340,7 +1340,7 @@ KOKKOS_INLINE_FUNCTION Kokkos::Impl::ViewMapping, T0, T1>::type::Rank; + ViewTraits, T0, T1>::type::rank; Kokkos::Array subviewBegins; size_t counter = 0; @@ -1381,7 +1381,7 @@ KOKKOS_INLINE_FUNCTION Kokkos::Impl::ViewMapping, T0, T1, T2>::type::Rank; + ViewTraits, T0, T1, T2>::type::rank; Kokkos::Array subviewBegins; @@ -1426,7 +1426,7 @@ KOKKOS_INLINE_FUNCTION constexpr size_t rank = Kokkos::Impl::ViewMapping< void /* deduce subview type from source view traits */ , - ViewTraits, T0, T1, T2, T3>::type::Rank; + ViewTraits, T0, T1, T2, T3>::type::rank; Kokkos::Array subviewBegins; size_t counter = 0; @@ -1473,7 +1473,7 @@ KOKKOS_INLINE_FUNCTION constexpr size_t rank = Kokkos::Impl::ViewMapping< void /* deduce subview type from source view traits */ , - ViewTraits, T0, T1, T2, T3, T4>::type::Rank; + ViewTraits, T0, T1, T2, T3, T4>::type::rank; Kokkos::Array subviewBegins; size_t counter = 0; @@ -1525,7 +1525,7 @@ KOKKOS_INLINE_FUNCTION constexpr size_t rank = Kokkos::Impl::ViewMapping< void /* deduce subview type from source view traits */ , - ViewTraits, T0, T1, T2, T3, T4, T5>::type::Rank; + ViewTraits, T0, T1, T2, T3, T4, T5>::type::rank; Kokkos::Array subviewBegins; @@ 
-1580,7 +1580,7 @@ KOKKOS_INLINE_FUNCTION constexpr size_t rank = Kokkos::Impl::ViewMapping< void /* deduce subview type from source view traits */ , - ViewTraits, T0, T1, T2, T3, T4, T5, T6>::type::Rank; + ViewTraits, T0, T1, T2, T3, T4, T5, T6>::type::rank; Kokkos::Array subviewBegins; @@ -1639,7 +1639,7 @@ KOKKOS_INLINE_FUNCTION constexpr size_t rank = Kokkos::Impl::ViewMapping< void /* deduce subview type from source view traits */ , - ViewTraits, T0, T1, T2, T3, T4, T5, T6, T7>::type::Rank; + ViewTraits, T0, T1, T2, T3, T4, T5, T6, T7>::type::rank; Kokkos::Array subviewBegins; diff --git a/containers/src/Kokkos_ScatterView.hpp b/containers/src/Kokkos_ScatterView.hpp index dbcab7c7e5..527ab36aae 100644 --- a/containers/src/Kokkos_ScatterView.hpp +++ b/containers/src/Kokkos_ScatterView.hpp @@ -1019,7 +1019,7 @@ class ScatterView::value_type subview() const { return Kokkos::Impl::Experimental::Slice< - Kokkos::LayoutRight, internal_view_type::Rank, + Kokkos::LayoutRight, internal_view_type::rank, internal_view_type>::get(internal_view, 0); } From 8487a9669eb4beb70c82ed579c12a487e0c36245 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Wed, 8 Feb 2023 21:45:25 -0800 Subject: [PATCH 213/496] View::{R -> r}ank in unit tests --- containers/unit_tests/TestDynViewAPI.hpp | 6 +++--- core/unit_test/TestAggregate.hpp | 4 ++-- core/unit_test/TestViewMapping_a.hpp | 6 +++--- core/unit_test/TestViewSubview.hpp | 6 +++--- 4 files changed, 11 insertions(+), 11 deletions(-) diff --git a/containers/unit_tests/TestDynViewAPI.hpp b/containers/unit_tests/TestDynViewAPI.hpp index c7a5b13dbb..4ecb6cf25c 100644 --- a/containers/unit_tests/TestDynViewAPI.hpp +++ b/containers/unit_tests/TestDynViewAPI.hpp @@ -1200,19 +1200,19 @@ class TestDynViewAPI { View7 vtest1("vtest1", 2, 2, 2, 2, 2, 2, 2); dView0 dfromv1(vtest1); - ASSERT_EQ(dfromv1.rank(), vtest1.Rank); + ASSERT_EQ(dfromv1.rank(), vtest1.rank); ASSERT_EQ(dfromv1.extent(0), vtest1.extent(0)); ASSERT_EQ(dfromv1.extent(1), 
vtest1.extent(1)); ASSERT_EQ(dfromv1.use_count(), vtest1.use_count()); dView0 dfromv2(vcast); - ASSERT_EQ(dfromv2.rank(), vcast.Rank); + ASSERT_EQ(dfromv2.rank(), vcast.rank); ASSERT_EQ(dfromv2.extent(0), vcast.extent(0)); ASSERT_EQ(dfromv2.extent(1), vcast.extent(1)); ASSERT_EQ(dfromv2.use_count(), vcast.use_count()); dView0 dfromv3 = vcast1; - ASSERT_EQ(dfromv3.rank(), vcast1.Rank); + ASSERT_EQ(dfromv3.rank(), vcast1.rank); ASSERT_EQ(dfromv3.extent(0), vcast1.extent(0)); ASSERT_EQ(dfromv3.extent(1), vcast1.extent(1)); ASSERT_EQ(dfromv3.use_count(), vcast1.use_count()); diff --git a/core/unit_test/TestAggregate.hpp b/core/unit_test/TestAggregate.hpp index 23cc5860ac..4f67b2eddc 100644 --- a/core/unit_test/TestAggregate.hpp +++ b/core/unit_test/TestAggregate.hpp @@ -56,8 +56,8 @@ void TestViewAggregate() { ""); static_assert(std::is_same::value, ""); - static_assert(a32_type::Rank == 2, ""); - static_assert(a32_flat_type::Rank == 3, ""); + static_assert(a32_type::rank == 2, ""); + static_assert(a32_flat_type::rank == 3, ""); a32_type x("test", 4, 5); a32_flat_type y(x); diff --git a/core/unit_test/TestViewMapping_a.hpp b/core/unit_test/TestViewMapping_a.hpp index e85dfd0472..0f24f715b9 100644 --- a/core/unit_test/TestViewMapping_a.hpp +++ b/core/unit_test/TestViewMapping_a.hpp @@ -713,7 +713,7 @@ void test_view_mapping() { typename Space::memory_space>::value)); ASSERT_TRUE((std::is_same::value)); - ASSERT_EQ(T::Rank, 1); + ASSERT_EQ(T::rank, 1); ASSERT_TRUE((std::is_same::value)); ASSERT_TRUE((std::is_same::value)); @@ -734,7 +734,7 @@ void test_view_mapping() { typename Space::memory_space>::value)); ASSERT_TRUE((std::is_same::value)); - ASSERT_EQ(C::Rank, 1); + ASSERT_EQ(C::rank, 1); ASSERT_EQ(vr1.extent(0), size_t(N)); @@ -781,7 +781,7 @@ void test_view_mapping() { ASSERT_TRUE((std::is_same::value)); ASSERT_TRUE((std::is_same::value)); - ASSERT_EQ(T::Rank, 1); + ASSERT_EQ(T::rank, 1); ASSERT_EQ(vr1.extent(0), size_t(N)); diff --git 
a/core/unit_test/TestViewSubview.hpp b/core/unit_test/TestViewSubview.hpp index f33b5611bf..54ac0036de 100644 --- a/core/unit_test/TestViewSubview.hpp +++ b/core/unit_test/TestViewSubview.hpp @@ -865,7 +865,7 @@ struct FillView_3D { using exec_t = typename Space::execution_space; using view_t = Kokkos::View; using rank_t = Kokkos::Rank< - view_t::Rank, + view_t::rank, std::is_same::value ? Kokkos::Iterate::Left : Kokkos::Iterate::Right, std::is_same::value ? Kokkos::Iterate::Left @@ -893,7 +893,7 @@ struct FillView_4D { using exec_t = typename Space::execution_space; using view_t = Kokkos::View; using rank_t = Kokkos::Rank< - view_t::Rank, + view_t::rank, std::is_same::value ? Kokkos::Iterate::Left : Kokkos::Iterate::Right, std::is_same::value ? Kokkos::Iterate::Left @@ -922,7 +922,7 @@ struct FillView_5D { using exec_t = typename Space::execution_space; using view_t = Kokkos::View; using rank_t = Kokkos::Rank< - view_t::Rank, + view_t::rank, std::is_same::value ? Kokkos::Iterate::Left : Kokkos::Iterate::Right, std::is_same::value ? 
Kokkos::Iterate::Left From 05416c9847e1332a831889c25a3f65b41718478e Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Wed, 8 Feb 2023 22:17:08 -0800 Subject: [PATCH 214/496] View::{R -> r}ank in perf tests --- core/perf_test/PerfTestBlasKernels.hpp | 28 +++++++++++++------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/core/perf_test/PerfTestBlasKernels.hpp b/core/perf_test/PerfTestBlasKernels.hpp index 1d7073fe5a..5e6e52f115 100644 --- a/core/perf_test/PerfTestBlasKernels.hpp +++ b/core/perf_test/PerfTestBlasKernels.hpp @@ -25,8 +25,8 @@ template struct Dot { using execution_space = typename Type::execution_space; - static_assert(static_cast(Type::Rank) == static_cast(1), - "Dot static_assert Fail: Rank != 1"); + static_assert(static_cast(Type::rank) == static_cast(1), + "Dot static_assert Fail: rank != 1"); using value_type = double; @@ -56,8 +56,8 @@ template struct DotSingle { using execution_space = typename Type::execution_space; - static_assert(static_cast(Type::Rank) == static_cast(1), - "DotSingle static_assert Fail: Rank != 1"); + static_assert(static_cast(Type::rank) == static_cast(1), + "DotSingle static_assert Fail: rank != 1"); using value_type = double; @@ -88,13 +88,13 @@ template struct Scale { using execution_space = typename VectorType::execution_space; - static_assert(static_cast(ScalarType::Rank) == + static_assert(static_cast(ScalarType::rank) == static_cast(0), - "Scale static_assert Fail: ScalarType::Rank != 0"); + "Scale static_assert Fail: ScalarType::rank != 0"); - static_assert(static_cast(VectorType::Rank) == + static_assert(static_cast(VectorType::rank) == static_cast(1), - "Scale static_assert Fail: VectorType::Rank != 1"); + "Scale static_assert Fail: VectorType::rank != 1"); #if 1 typename ScalarType::const_type alpha; @@ -115,17 +115,17 @@ template struct AXPBY { using execution_space = typename VectorType::execution_space; - static_assert(static_cast(ScalarType::Rank) == + 
static_assert(static_cast(ScalarType::rank) == static_cast(0), - "AXPBY static_assert Fail: ScalarType::Rank != 0"); + "AXPBY static_assert Fail: ScalarType::rank != 0"); - static_assert(static_cast(ConstVectorType::Rank) == + static_assert(static_cast(ConstVectorType::rank) == static_cast(1), - "AXPBY static_assert Fail: ConstVectorType::Rank != 1"); + "AXPBY static_assert Fail: ConstVectorType::rank != 1"); - static_assert(static_cast(VectorType::Rank) == + static_assert(static_cast(VectorType::rank) == static_cast(1), - "AXPBY static_assert Fail: VectorType::Rank != 1"); + "AXPBY static_assert Fail: VectorType::rank != 1"); #if 1 typename ScalarType::const_type alpha, beta; From e348b6972eea21f1aff221f82b5b2217f59dc817 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Wed, 8 Feb 2023 22:11:58 -0800 Subject: [PATCH 215/496] Deprecate View::Rank --- core/src/Kokkos_View.hpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/core/src/Kokkos_View.hpp b/core/src/Kokkos_View.hpp index c81c83576c..c742ca5bda 100644 --- a/core/src/Kokkos_View.hpp +++ b/core/src/Kokkos_View.hpp @@ -644,7 +644,10 @@ class View : public ViewTraits { //---------------------------------------- // Domain rank and extents - enum { Rank = map_type::Rank }; +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 + enum {Rank KOKKOS_DEPRECATED_WITH_COMMENT("Use rank instead.") = + map_type::Rank}; +#endif /** \brief rank() to be implemented */ From 2e53f1c3d3e6364dd6a846307cf5180cc16e2daf Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Thu, 16 Feb 2023 09:58:14 -0500 Subject: [PATCH 216/496] Add Impl::integral_constant --- core/src/impl/Kokkos_Utilities.hpp | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/core/src/impl/Kokkos_Utilities.hpp b/core/src/impl/Kokkos_Utilities.hpp index 71035dfeaa..67021c134c 100644 --- a/core/src/impl/Kokkos_Utilities.hpp +++ b/core/src/impl/Kokkos_Utilities.hpp @@ -29,6 +29,23 @@ namespace Kokkos { namespace Impl { +// same as 
std::integral_constant but with __host__ __device__ annotations on +// the implicit conversion function and the call operator +template +struct integral_constant { + using value_type = T; + using type = integral_constant; + static constexpr T value = v; + KOKKOS_FUNCTION constexpr operator value_type() const noexcept { + return value; + } + KOKKOS_FUNCTION constexpr value_type operator()() const noexcept { + return value; + } +}; + +//============================================================================== + template struct always_true : std::true_type {}; From 314b96614af25dedf9eb41713cb595131f4bf739 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Thu, 16 Feb 2023 17:15:52 -0500 Subject: [PATCH 217/496] Add View::rank[_dynamic] static constexpr data members --- core/src/Kokkos_View.hpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/core/src/Kokkos_View.hpp b/core/src/Kokkos_View.hpp index c742ca5bda..fe5d267f09 100644 --- a/core/src/Kokkos_View.hpp +++ b/core/src/Kokkos_View.hpp @@ -34,6 +34,7 @@ static_assert(false, #include #include +#include // Impl::integral_constant #ifdef KOKKOS_ENABLE_IMPL_MDSPAN #include @@ -644,6 +645,11 @@ class View : public ViewTraits { //---------------------------------------- // Domain rank and extents + static constexpr Impl::integral_constant + rank = {}; + static constexpr Impl::integral_constant + rank_dynamic = {}; #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 enum {Rank KOKKOS_DEPRECATED_WITH_COMMENT("Use rank instead.") = map_type::Rank}; From 10fae1f383213b9cba377e5b40ec6c45d8b687ad Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Thu, 16 Feb 2023 17:17:46 -0500 Subject: [PATCH 218/496] Fixup update Kokkos::rank(View) free function and drop outdated comment --- core/src/Kokkos_View.hpp | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/core/src/Kokkos_View.hpp b/core/src/Kokkos_View.hpp index fe5d267f09..e576c8d42a 100644 --- a/core/src/Kokkos_View.hpp +++ b/core/src/Kokkos_View.hpp @@ 
-655,12 +655,6 @@ class View : public ViewTraits { map_type::Rank}; #endif - /** \brief rank() to be implemented - */ - // KOKKOS_INLINE_FUNCTION - // static - // constexpr unsigned rank() { return map_type::rank; } - template KOKKOS_INLINE_FUNCTION constexpr std::enable_if_t< std::is_integral::value, size_t> @@ -1699,14 +1693,10 @@ class View : public ViewTraits { } }; -/** \brief Temporary free function rank() - * until rank() is implemented - * in the View - */ template KOKKOS_INLINE_FUNCTION constexpr unsigned rank(const View& V) { - return V.rank; -} // Temporary until added to view + return V.rank(); +} namespace Impl { From a7daa592b9ae453f26a0c31de6f763a7673d7a13 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Thu, 16 Feb 2023 08:59:37 -0500 Subject: [PATCH 219/496] Fix printing extents and rank in error message when copying views --- core/src/Kokkos_CopyViews.hpp | 48 +++++++++++++++++------------------ 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/core/src/Kokkos_CopyViews.hpp b/core/src/Kokkos_CopyViews.hpp index 64539d8ed8..7bc07ff54c 100644 --- a/core/src/Kokkos_CopyViews.hpp +++ b/core/src/Kokkos_CopyViews.hpp @@ -1636,19 +1636,19 @@ inline void deep_copy( "match: "); message += dst.label(); message += "("; - for (int r = 0; r < dst_type::rank - 1; r++) { - message += std::to_string(dst.extent(r)); + message += std::to_string(dst.extent(0)); + for (size_t r = 1; r < dst_type::rank; r++) { message += ","; + message += std::to_string(dst.extent(r)); } - message += std::to_string(dst.extent(dst_type::rank - 1)); message += ") "; message += src.label(); message += "("; - for (int r = 0; r < src_type::rank - 1; r++) { - message += std::to_string(src.extent(r)); + message += std::to_string(src.extent(0)); + for (size_t r = 1; r < src_type::rank; r++) { message += ","; + message += std::to_string(src.extent(r)); } - message += std::to_string(src.extent(src_type::rank - 1)); message += ") "; Kokkos::Impl::throw_runtime_exception(message); @@ 
-1719,19 +1719,19 @@ inline void deep_copy( "Deprecation Error: Kokkos::deep_copy extents of views don't match: "); message += dst.label(); message += "("; - for (int r = 0; r < dst_type::rank - 1; r++) { - message += std::to_string(dst.extent(r)); + message += std::to_string(dst.extent(0)); + for (size_t r = 1; r < dst_type::rank; r++) { message += ","; + message += std::to_string(dst.extent(r)); } - message += std::to_string(dst.extent(dst_type::rank - 1)); message += ") "; message += src.label(); message += "("; - for (int r = 0; r < src_type::rank - 1; r++) { - message += std::to_string(src.extent(r)); + message += std::to_string(src.extent(0)); + for (size_t r = 1; r < src_type::rank; r++) { message += ","; + message += std::to_string(src.extent(r)); } - message += std::to_string(src.extent(src_type::rank - 1)); message += ") "; Kokkos::Impl::throw_runtime_exception(message); @@ -2800,19 +2800,19 @@ inline void deep_copy( "match: "); message += dst.label(); message += "("; - for (int r = 0; r < dst_type::rank - 1; r++) { - message += std::to_string(dst.extent(r)); + message += std::to_string(dst.extent(0)); + for (size_t r = 1; r < dst_type::rank; r++) { message += ","; + message += std::to_string(dst.extent(r)); } - message += std::to_string(dst.extent(dst_type::rank - 1)); message += ") "; message += src.label(); message += "("; - for (int r = 0; r < src_type::rank - 1; r++) { - message += std::to_string(src.extent(r)); + message += std::to_string(src.extent(0)); + for (size_t r = 1; r < src_type::rank; r++) { message += ","; + message += std::to_string(src.extent(r)); } - message += std::to_string(src.extent(src_type::rank - 1)); message += ") "; Kokkos::Impl::throw_runtime_exception(message); @@ -2869,19 +2869,19 @@ inline void deep_copy( "Deprecation Error: Kokkos::deep_copy extents of views don't match: "); message += dst.label(); message += "("; - for (int r = 0; r < dst_type::rank - 1; r++) { - message += std::to_string(dst.extent(r)); + message += 
std::to_string(dst.extent(0)); + for (size_t r = 1; r < dst_type::rank; r++) { message += ","; + message += std::to_string(dst.extent(r)); } - message += std::to_string(dst.extent(dst_type::rank - 1)); message += ") "; message += src.label(); message += "("; - for (int r = 0; r < src_type::rank - 1; r++) { - message += std::to_string(src.extent(r)); + message += std::to_string(src.extent(0)); + for (size_t r = 1; r < src_type::rank; r++) { message += ","; + message += std::to_string(src.extent(r)); } - message += std::to_string(src.extent(src_type::rank - 1)); message += ") "; Kokkos::Impl::throw_runtime_exception(message); From 4286774d1bd4a1b24599735b49f7a1cd3fbd6ba5 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Thu, 16 Feb 2023 11:39:40 -0500 Subject: [PATCH 220/496] Fix warning comparison of integers of different signs --- containers/src/Kokkos_OffsetView.hpp | 2 +- core/unit_test/TestViewAPI_e.hpp | 4 ++-- core/unit_test/TestViewMapping_a.hpp | 6 +++--- core/unit_test/default/TestDefaultDeviceTypeViewAPI.cpp | 4 ++-- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/containers/src/Kokkos_OffsetView.hpp b/containers/src/Kokkos_OffsetView.hpp index 173e30b1f3..ed5d7574bb 100644 --- a/containers/src/Kokkos_OffsetView.hpp +++ b/containers/src/Kokkos_OffsetView.hpp @@ -827,7 +827,7 @@ class OffsetView : public ViewTraits { "Incompatible OffsetView copy construction"); Mapping::assign(m_map, aview.impl_map(), m_track); - for (int i = 0; i < aview.rank; ++i) { + for (size_t i = 0; i < aview.rank; ++i) { m_begins[i] = 0; } } diff --git a/core/unit_test/TestViewAPI_e.hpp b/core/unit_test/TestViewAPI_e.hpp index df66396ab8..2e416d0320 100644 --- a/core/unit_test/TestViewAPI_e.hpp +++ b/core/unit_test/TestViewAPI_e.hpp @@ -100,7 +100,7 @@ void test_left_stride(Extents... 
extents) { size_t expected_stride = 1; size_t all_strides[view_type::rank + 1]; view.stride(all_strides); - for (int i = 0; i < view_type::rank; ++i) { + for (size_t i = 0; i < view_type::rank; ++i) { ASSERT_EQ(view.stride(i), expected_stride); ASSERT_EQ(all_strides[i], expected_stride); expected_stride *= view.extent(i); @@ -115,7 +115,7 @@ void test_right_stride(Extents... extents) { size_t expected_stride = 1; size_t all_strides[view_type::rank + 1]; view.stride(all_strides); - for (int ri = 0; ri < view_type::rank; ++ri) { + for (size_t ri = 0; ri < view_type::rank; ++ri) { auto i = view_type::rank - 1 - ri; ASSERT_EQ(view.stride(i), expected_stride); ASSERT_EQ(all_strides[i], expected_stride); diff --git a/core/unit_test/TestViewMapping_a.hpp b/core/unit_test/TestViewMapping_a.hpp index 0f24f715b9..9173f0d431 100644 --- a/core/unit_test/TestViewMapping_a.hpp +++ b/core/unit_test/TestViewMapping_a.hpp @@ -713,7 +713,7 @@ void test_view_mapping() { typename Space::memory_space>::value)); ASSERT_TRUE((std::is_same::value)); - ASSERT_EQ(T::rank, 1); + ASSERT_EQ(T::rank, size_t(1)); ASSERT_TRUE((std::is_same::value)); ASSERT_TRUE((std::is_same::value)); @@ -734,7 +734,7 @@ void test_view_mapping() { typename Space::memory_space>::value)); ASSERT_TRUE((std::is_same::value)); - ASSERT_EQ(C::rank, 1); + ASSERT_EQ(C::rank, size_t(1)); ASSERT_EQ(vr1.extent(0), size_t(N)); @@ -781,7 +781,7 @@ void test_view_mapping() { ASSERT_TRUE((std::is_same::value)); ASSERT_TRUE((std::is_same::value)); - ASSERT_EQ(T::rank, 1); + ASSERT_EQ(T::rank, size_t(1)); ASSERT_EQ(vr1.extent(0), size_t(N)); diff --git a/core/unit_test/default/TestDefaultDeviceTypeViewAPI.cpp b/core/unit_test/default/TestDefaultDeviceTypeViewAPI.cpp index 96fffa0dc7..f179fe85ed 100644 --- a/core/unit_test/default/TestDefaultDeviceTypeViewAPI.cpp +++ b/core/unit_test/default/TestDefaultDeviceTypeViewAPI.cpp @@ -113,10 +113,10 @@ TYPED_TEST(TestViewAPI, sizes) { static_assert(view_t::rank == 
TestFixture::expected_rank, "TestViewAPI: Error: rank mismatch"); size_t expected_span = 1; - for (int r = 0; r < view_t::rank; r++) expected_span *= this->all_sizes[r]; + for (size_t r = 0; r < view_t::rank; r++) expected_span *= this->all_sizes[r]; EXPECT_EQ(expected_span, a.span()); - for (int r = 0; r < view_t::rank; r++) { + for (size_t r = 0; r < view_t::rank; r++) { EXPECT_EQ(this->all_sizes[r], a.extent(r)); EXPECT_EQ(this->all_sizes[r], size_t(a.extent_int(r))); } From c43e45ecfdbe518b1fe8a008f9e71958a7675f36 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Mon, 13 Feb 2023 11:37:52 -0500 Subject: [PATCH 221/496] Remove Aligned memory trait when creating subviews --- core/src/Kokkos_MemoryTraits.hpp | 24 ++++++----- core/src/Kokkos_View.hpp | 60 ++++++++++++++++++---------- core/src/impl/Kokkos_ViewMapping.hpp | 30 +++----------- core/unit_test/TestViewSubview.hpp | 24 +++++++++++ 4 files changed, 83 insertions(+), 55 deletions(-) diff --git a/core/src/Kokkos_MemoryTraits.hpp b/core/src/Kokkos_MemoryTraits.hpp index 762e1a4a5d..fc5fc971b6 100644 --- a/core/src/Kokkos_MemoryTraits.hpp +++ b/core/src/Kokkos_MemoryTraits.hpp @@ -48,17 +48,19 @@ template struct MemoryTraits { //! 
Tag this class as a kokkos memory traits: using memory_traits = MemoryTraits; - enum : bool { - is_unmanaged = (unsigned(0) != (T & unsigned(Kokkos::Unmanaged))) - }; - enum : bool { - is_random_access = (unsigned(0) != (T & unsigned(Kokkos::RandomAccess))) - }; - enum : bool { is_atomic = (unsigned(0) != (T & unsigned(Kokkos::Atomic))) }; - enum : bool { - is_restrict = (unsigned(0) != (T & unsigned(Kokkos::Restrict))) - }; - enum : bool { is_aligned = (unsigned(0) != (T & unsigned(Kokkos::Aligned))) }; + + static constexpr unsigned value = T; + + static constexpr bool is_unmanaged = + (unsigned(0) != (T & unsigned(Kokkos::Unmanaged))); + static constexpr bool is_random_access = + (unsigned(0) != (T & unsigned(Kokkos::RandomAccess))); + static constexpr bool is_atomic = + (unsigned(0) != (T & unsigned(Kokkos::Atomic))); + static constexpr bool is_restrict = + (unsigned(0) != (T & unsigned(Kokkos::Restrict))); + static constexpr bool is_aligned = + (unsigned(0) != (T & unsigned(Kokkos::Aligned))); }; } // namespace Kokkos diff --git a/core/src/Kokkos_View.hpp b/core/src/Kokkos_View.hpp index c742ca5bda..3a7f2237b8 100644 --- a/core/src/Kokkos_View.hpp +++ b/core/src/Kokkos_View.hpp @@ -34,6 +34,7 @@ static_assert(false, #include #include +#include #ifdef KOKKOS_ENABLE_IMPL_MDSPAN #include @@ -1745,45 +1746,64 @@ void apply_to_view_of_static_rank(Function&& f, View a) { //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- -template -using Subview = - typename Kokkos::Impl::ViewMapping::type; +namespace Impl { +template +struct TypeListToViewTraits; + +template +struct TypeListToViewTraits> { + using type = ViewTraits; +}; + +// It is not safe to assume that subviews of views with the Aligned memory trait +// are also aligned. Hence, just remove that attribute for subviews. 
+template +struct RemoveAlignedMemoryTrait { + private: + using type_list_in = Kokkos::Impl::type_list; + using memory_traits = typename ViewTraits::memory_traits; + using type_list_in_wo_memory_traits = + typename Kokkos::Impl::type_list_remove_first::type; + using new_memory_traits = + Kokkos::MemoryTraits; + using new_type_list = typename Kokkos::Impl::concat_type_list< + type_list_in_wo_memory_traits, + Kokkos::Impl::type_list>::type; + + public: + using type = typename TypeListToViewTraits::type; +}; +} // namespace Impl template -KOKKOS_INLINE_FUNCTION - typename Kokkos::Impl::ViewMapping, Args...>::type - subview(const View& src, Args... args) { +KOKKOS_INLINE_FUNCTION auto subview(const View& src, Args... args) { static_assert(View::rank == sizeof...(Args), "subview requires one argument for each source View rank"); return typename Kokkos::Impl::ViewMapping< void /* deduce subview type from source view traits */ , - ViewTraits, Args...>::type(src, args...); + typename Impl::RemoveAlignedMemoryTrait::type, + Args...>::type(src, args...); } template -KOKKOS_INLINE_FUNCTION typename Kokkos::Impl::ViewMapping< - void /* deduce subview type from source view traits */ - , - ViewTraits, Args...>::template apply::type -subview(const View& src, Args... args) { +KOKKOS_INLINE_FUNCTION auto subview(const View& src, Args... 
args) { static_assert(View::rank == sizeof...(Args), "subview requires one argument for each source View rank"); + static_assert(Kokkos::is_memory_traits::value); return typename Kokkos::Impl::ViewMapping< void /* deduce subview type from source view traits */ , - ViewTraits, - Args...>::template apply::type(src, args...); + typename Impl::RemoveAlignedMemoryTrait::type, + Args...>::type(src, args...); } +template +using Subview = decltype(subview(std::declval(), std::declval()...)); + } /* namespace Kokkos */ //---------------------------------------------------------------------------- diff --git a/core/src/impl/Kokkos_ViewMapping.hpp b/core/src/impl/Kokkos_ViewMapping.hpp index 9dc83437e9..fb590820d7 100644 --- a/core/src/impl/Kokkos_ViewMapping.hpp +++ b/core/src/impl/Kokkos_ViewMapping.hpp @@ -2712,14 +2712,8 @@ struct ViewDataHandle< Traits, std::enable_if_t<(std::is_void::value && (!Traits::memory_traits::is_aligned) && - Traits::memory_traits::is_restrict -#ifdef KOKKOS_ENABLE_CUDA - && (!(std::is_same::value || - std::is_same::value)) -#endif - && (!Traits::memory_traits::is_atomic))>> { + Traits::memory_traits::is_restrict && + (!Traits::memory_traits::is_atomic))>> { using value_type = typename Traits::value_type; using handle_type = typename Traits::value_type* KOKKOS_RESTRICT; using return_type = typename Traits::value_type& KOKKOS_RESTRICT; @@ -2742,14 +2736,8 @@ struct ViewDataHandle< Traits, std::enable_if_t<(std::is_void::value && Traits::memory_traits::is_aligned && - (!Traits::memory_traits::is_restrict) -#ifdef KOKKOS_ENABLE_CUDA - && (!(std::is_same::value || - std::is_same::value)) -#endif - && (!Traits::memory_traits::is_atomic))>> { + (!Traits::memory_traits::is_restrict) && + (!Traits::memory_traits::is_atomic))>> { using value_type = typename Traits::value_type; // typedef work-around for intel compilers error #3186: expected typedef // declaration @@ -2787,14 +2775,8 @@ struct ViewDataHandle< Traits, std::enable_if_t<(std::is_void::value 
&& Traits::memory_traits::is_aligned && - Traits::memory_traits::is_restrict -#ifdef KOKKOS_ENABLE_CUDA - && (!(std::is_same::value || - std::is_same::value)) -#endif - && (!Traits::memory_traits::is_atomic))>> { + Traits::memory_traits::is_restrict && + (!Traits::memory_traits::is_atomic))>> { using value_type = typename Traits::value_type; // typedef work-around for intel compilers error #3186: expected typedef // declaration diff --git a/core/unit_test/TestViewSubview.hpp b/core/unit_test/TestViewSubview.hpp index 54ac0036de..b36290a325 100644 --- a/core/unit_test/TestViewSubview.hpp +++ b/core/unit_test/TestViewSubview.hpp @@ -2120,6 +2120,13 @@ struct TestSubviewMemoryTraitsConstruction { std::pair range(3, 5); auto sv = Kokkos::subview(v, range); + if constexpr (memory_traits_type::is_aligned) + static_assert(decltype(sv)::memory_traits::value + Kokkos::Aligned == + memory_traits_type::value); + else + static_assert(decltype(sv)::memory_traits::value == + memory_traits_type::value); + ASSERT_EQ(2u, sv.size()); EXPECT_EQ(3., sv[0]); EXPECT_EQ(4., sv[1]); @@ -2132,6 +2139,7 @@ inline void test_subview_memory_traits_construction() { // RandomAccess (2) // Atomic (4) // Restricted (8) + // Aligned (16) TestSubviewMemoryTraitsConstruction<0>()(); TestSubviewMemoryTraitsConstruction<1>()(); TestSubviewMemoryTraitsConstruction<2>()(); @@ -2148,6 +2156,22 @@ inline void test_subview_memory_traits_construction() { TestSubviewMemoryTraitsConstruction<13>()(); TestSubviewMemoryTraitsConstruction<14>()(); TestSubviewMemoryTraitsConstruction<15>()(); + TestSubviewMemoryTraitsConstruction<16>()(); + TestSubviewMemoryTraitsConstruction<17>()(); + TestSubviewMemoryTraitsConstruction<18>()(); + TestSubviewMemoryTraitsConstruction<19>()(); + TestSubviewMemoryTraitsConstruction<20>()(); + TestSubviewMemoryTraitsConstruction<21>()(); + TestSubviewMemoryTraitsConstruction<22>()(); + TestSubviewMemoryTraitsConstruction<23>()(); + TestSubviewMemoryTraitsConstruction<24>()(); + 
TestSubviewMemoryTraitsConstruction<25>()(); + TestSubviewMemoryTraitsConstruction<26>()(); + TestSubviewMemoryTraitsConstruction<27>()(); + TestSubviewMemoryTraitsConstruction<28>()(); + TestSubviewMemoryTraitsConstruction<29>()(); + TestSubviewMemoryTraitsConstruction<30>()(); + TestSubviewMemoryTraitsConstruction<31>()(); } //---------------------------------------------------------------------------- From 6e36acf3827c0cc6d26c6bfdc39bb30e23b29791 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Wed, 15 Feb 2023 11:50:24 -0500 Subject: [PATCH 222/496] Add comment in test --- core/unit_test/TestViewSubview.hpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/core/unit_test/TestViewSubview.hpp b/core/unit_test/TestViewSubview.hpp index b36290a325..165fe85b38 100644 --- a/core/unit_test/TestViewSubview.hpp +++ b/core/unit_test/TestViewSubview.hpp @@ -2120,6 +2120,8 @@ struct TestSubviewMemoryTraitsConstruction { std::pair range(3, 5); auto sv = Kokkos::subview(v, range); + // check that the subview memory traits are the same as the original view + // (with the Aligned trait stripped). if constexpr (memory_traits_type::is_aligned) static_assert(decltype(sv)::memory_traits::value + Kokkos::Aligned == memory_traits_type::value); From 0b943434d0e32637ccc089487618fe32efd37f1a Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Wed, 15 Feb 2023 15:36:32 -0500 Subject: [PATCH 223/496] MemoryTraits::value -> MemoryTraits::impl_value --- core/src/Kokkos_MemoryTraits.hpp | 2 +- core/src/Kokkos_View.hpp | 2 +- core/unit_test/TestViewSubview.hpp | 8 ++++---- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/core/src/Kokkos_MemoryTraits.hpp b/core/src/Kokkos_MemoryTraits.hpp index fc5fc971b6..c145d04a42 100644 --- a/core/src/Kokkos_MemoryTraits.hpp +++ b/core/src/Kokkos_MemoryTraits.hpp @@ -49,7 +49,7 @@ struct MemoryTraits { //! 
Tag this class as a kokkos memory traits: using memory_traits = MemoryTraits; - static constexpr unsigned value = T; + static constexpr unsigned impl_value = T; static constexpr bool is_unmanaged = (unsigned(0) != (T & unsigned(Kokkos::Unmanaged))); diff --git a/core/src/Kokkos_View.hpp b/core/src/Kokkos_View.hpp index 3a7f2237b8..6af892e72a 100644 --- a/core/src/Kokkos_View.hpp +++ b/core/src/Kokkos_View.hpp @@ -1766,7 +1766,7 @@ struct RemoveAlignedMemoryTrait { typename Kokkos::Impl::type_list_remove_first::type; using new_memory_traits = - Kokkos::MemoryTraits; + Kokkos::MemoryTraits; using new_type_list = typename Kokkos::Impl::concat_type_list< type_list_in_wo_memory_traits, Kokkos::Impl::type_list>::type; diff --git a/core/unit_test/TestViewSubview.hpp b/core/unit_test/TestViewSubview.hpp index 165fe85b38..cd3031b0b5 100644 --- a/core/unit_test/TestViewSubview.hpp +++ b/core/unit_test/TestViewSubview.hpp @@ -2123,11 +2123,11 @@ struct TestSubviewMemoryTraitsConstruction { // check that the subview memory traits are the same as the original view // (with the Aligned trait stripped). 
if constexpr (memory_traits_type::is_aligned) - static_assert(decltype(sv)::memory_traits::value + Kokkos::Aligned == - memory_traits_type::value); + static_assert(decltype(sv)::memory_traits::impl_value + Kokkos::Aligned == + memory_traits_type::impl_value); else - static_assert(decltype(sv)::memory_traits::value == - memory_traits_type::value); + static_assert(decltype(sv)::memory_traits::impl_value == + memory_traits_type::impl_value); ASSERT_EQ(2u, sv.size()); EXPECT_EQ(3., sv[0]); From 86bbae342c321ad6e44f1c098ac82459bb37be62 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Wed, 15 Feb 2023 17:26:04 -0500 Subject: [PATCH 224/496] Deprecate subview overload taking a template argument for MemoryTraits --- core/src/Kokkos_View.hpp | 5 ++++- core/unit_test/TestViewMapping_subview.hpp | 3 +-- core/unit_test/TestViewSubview.hpp | 12 ++++++++---- 3 files changed, 13 insertions(+), 7 deletions(-) diff --git a/core/src/Kokkos_View.hpp b/core/src/Kokkos_View.hpp index 6af892e72a..7006e1d445 100644 --- a/core/src/Kokkos_View.hpp +++ b/core/src/Kokkos_View.hpp @@ -1788,8 +1788,10 @@ KOKKOS_INLINE_FUNCTION auto subview(const View& src, Args... args) { Args...>::type(src, args...); } +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 template -KOKKOS_INLINE_FUNCTION auto subview(const View& src, Args... args) { +KOKKOS_DEPRECATED KOKKOS_INLINE_FUNCTION auto subview(const View& src, + Args... args) { static_assert(View::rank == sizeof...(Args), "subview requires one argument for each source View rank"); static_assert(Kokkos::is_memory_traits::value); @@ -1800,6 +1802,7 @@ KOKKOS_INLINE_FUNCTION auto subview(const View& src, Args... 
args) { typename Impl::RemoveAlignedMemoryTrait::type, Args...>::type(src, args...); } +#endif template using Subview = decltype(subview(std::declval(), std::declval()...)); diff --git a/core/unit_test/TestViewMapping_subview.hpp b/core/unit_test/TestViewMapping_subview.hpp index 069ad09da9..888abf4ca8 100644 --- a/core/unit_test/TestViewMapping_subview.hpp +++ b/core/unit_test/TestViewMapping_subview.hpp @@ -100,8 +100,7 @@ struct TestViewMappingSubview { KOKKOS_INLINE_FUNCTION void operator()(const int, long& error_count) const { - auto Ad = Kokkos::subview( - Aa, Kokkos::pair(1, AN - 1)); + auto Ad = Kokkos::subview(Aa, Kokkos::pair(1, AN - 1)); for (int i = 1; i < AN - 1; ++i) if (&Aa[i] != &Ab[i - 1]) ++error_count; diff --git a/core/unit_test/TestViewSubview.hpp b/core/unit_test/TestViewSubview.hpp index cd3031b0b5..a3757930e7 100644 --- a/core/unit_test/TestViewSubview.hpp +++ b/core/unit_test/TestViewSubview.hpp @@ -2110,18 +2110,22 @@ void test_unmanaged_subview_reset() { template MTF> struct TestSubviewMemoryTraitsConstruction { void operator()() const noexcept { - using view_type = Kokkos::View; - using size_type = view_type::size_type; using memory_traits_type = Kokkos::MemoryTraits; + using view_type = + Kokkos::View; + using size_type = typename view_type::size_type; - view_type v("v", 7); + Kokkos::View v_original("v", 7); + view_type v(v_original.data(), v_original.size()); for (size_type i = 0; i != v.size(); ++i) v[i] = static_cast(i); std::pair range(3, 5); - auto sv = Kokkos::subview(v, range); + auto sv = Kokkos::subview(v, range); // check that the subview memory traits are the same as the original view // (with the Aligned trait stripped). 
+ static_assert(decltype(v)::memory_traits::impl_value == + memory_traits_type::impl_value); if constexpr (memory_traits_type::is_aligned) static_assert(decltype(sv)::memory_traits::impl_value + Kokkos::Aligned == memory_traits_type::impl_value); From c4b81ec486cac144f5a783fa973e266e9ba8e370 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Thu, 16 Feb 2023 08:52:31 -0500 Subject: [PATCH 225/496] Try fixing Cuda 11 CI --- core/unit_test/TestViewSubview.hpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/core/unit_test/TestViewSubview.hpp b/core/unit_test/TestViewSubview.hpp index a3757930e7..36c0863583 100644 --- a/core/unit_test/TestViewSubview.hpp +++ b/core/unit_test/TestViewSubview.hpp @@ -2124,13 +2124,15 @@ struct TestSubviewMemoryTraitsConstruction { // check that the subview memory traits are the same as the original view // (with the Aligned trait stripped). - static_assert(decltype(v)::memory_traits::impl_value == + using view_memory_traits = typename decltype(v)::memory_traits; + using subview_memory_traits = typename decltype(sv)::memory_traits; + static_assert(view_memory_traits::impl_value == memory_traits_type::impl_value); if constexpr (memory_traits_type::is_aligned) - static_assert(decltype(sv)::memory_traits::impl_value + Kokkos::Aligned == + static_assert(subview_memory_traits::impl_value + Kokkos::Aligned == memory_traits_type::impl_value); else - static_assert(decltype(sv)::memory_traits::impl_value == + static_assert(subview_memory_traits::impl_value == memory_traits_type::impl_value); ASSERT_EQ(2u, sv.size()); From 4ca0340796846257dc2c9ee9c0ea78f89ca59561 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Thu, 16 Feb 2023 12:48:23 -0500 Subject: [PATCH 226/496] Add comment in test --- core/unit_test/TestViewSubview.hpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/core/unit_test/TestViewSubview.hpp b/core/unit_test/TestViewSubview.hpp index 36c0863583..386887d923 100644 --- a/core/unit_test/TestViewSubview.hpp 
+++ b/core/unit_test/TestViewSubview.hpp @@ -2115,6 +2115,9 @@ struct TestSubviewMemoryTraitsConstruction { Kokkos::View; using size_type = typename view_type::size_type; + // Create a managed View first and then apply the desired memory traits to + // an unmanaged version of it since a managed View can't use the Unmanaged + // trait. Kokkos::View v_original("v", 7); view_type v(v_original.data(), v_original.size()); for (size_type i = 0; i != v.size(); ++i) v[i] = static_cast(i); From 60ba1e1eeb921455dd3a6926d8dce188ff55362a Mon Sep 17 00:00:00 2001 From: Mikael Simberg Date: Fri, 17 Feb 2023 12:13:46 +0100 Subject: [PATCH 227/496] Add one more digit for KOKKOS_COMPILER_NVHPC version components --- core/src/Kokkos_Macros.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/core/src/Kokkos_Macros.hpp b/core/src/Kokkos_Macros.hpp index 7e9e23b64f..4f5bda88ed 100644 --- a/core/src/Kokkos_Macros.hpp +++ b/core/src/Kokkos_Macros.hpp @@ -163,8 +163,8 @@ #endif #if defined(__NVCOMPILER) -#define KOKKOS_COMPILER_NVHPC \ - __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ * 10 + \ +#define KOKKOS_COMPILER_NVHPC \ + __NVCOMPILER_MAJOR__ * 10000 + __NVCOMPILER_MINOR__ * 100 + \ __NVCOMPILER_PATCHLEVEL__ #endif From d3eac2b1667d7884790a4560f0f0e204d024ce2f Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Thu, 16 Feb 2023 17:11:12 -0500 Subject: [PATCH 228/496] Cleanup prefer {traits:: -> }rank[_dynamic] --- core/src/Kokkos_View.hpp | 199 +++++++++++++++++++-------------------- 1 file changed, 96 insertions(+), 103 deletions(-) diff --git a/core/src/Kokkos_View.hpp b/core/src/Kokkos_View.hpp index e576c8d42a..8d2f421693 100644 --- a/core/src/Kokkos_View.hpp +++ b/core/src/Kokkos_View.hpp @@ -814,14 +814,14 @@ class View : public ViewTraits { template static KOKKOS_FUNCTION void check_access_member_function_valid_args(Is...) 
{ - static_assert(traits::rank <= sizeof...(Is), ""); + static_assert(rank <= sizeof...(Is), ""); static_assert(sizeof...(Is) <= 8, ""); static_assert(Kokkos::Impl::are_integral::value, ""); } template static KOKKOS_FUNCTION void check_operator_parens_valid_args(Is...) { - static_assert(traits::rank == sizeof...(Is), ""); + static_assert(rank == sizeof...(Is), ""); static_assert(Kokkos::Impl::are_integral::value, ""); } @@ -830,22 +830,22 @@ class View : public ViewTraits { // Rank 1 default map operator() template - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::always_true::value && // - (1 == traits::rank) && is_default_map && !is_layout_stride), - reference_type> - operator()(I0 i0) const { + KOKKOS_FORCEINLINE_FUNCTION + std::enable_if_t<(Kokkos::Impl::always_true::value && // + (1 == rank) && is_default_map && !is_layout_stride), + reference_type> + operator()(I0 i0) const { check_operator_parens_valid_args(i0); KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0) return m_map.m_impl_handle[i0]; } template - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::always_true::value && // - (1 == traits::rank) && is_default_map && is_layout_stride), - reference_type> - operator()(I0 i0) const { + KOKKOS_FORCEINLINE_FUNCTION + std::enable_if_t<(Kokkos::Impl::always_true::value && // + (1 == rank) && is_default_map && is_layout_stride), + reference_type> + operator()(I0 i0) const { check_operator_parens_valid_args(i0); KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0) return m_map.m_impl_handle[m_map.m_impl_offset.m_stride.S0 * i0]; @@ -856,8 +856,7 @@ class View : public ViewTraits { template KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - ((1 == traits::rank) && Kokkos::Impl::are_integral::value && - !is_default_map), + ((1 == rank) && Kokkos::Impl::are_integral::value && !is_default_map), reference_type> operator[](I0 i0) const { KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0) @@ -865,21 +864,21 @@ class View : public ViewTraits 
{ } template - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - ((1 == traits::rank) && Kokkos::Impl::are_integral::value && - is_default_map && !is_layout_stride), - reference_type> - operator[](I0 i0) const { + KOKKOS_FORCEINLINE_FUNCTION + std::enable_if_t<((1 == rank) && Kokkos::Impl::are_integral::value && + is_default_map && !is_layout_stride), + reference_type> + operator[](I0 i0) const { KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0) return m_map.m_impl_handle[i0]; } template - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - ((1 == traits::rank) && Kokkos::Impl::are_integral::value && - is_default_map && is_layout_stride), - reference_type> - operator[](I0 i0) const { + KOKKOS_FORCEINLINE_FUNCTION + std::enable_if_t<((1 == rank) && Kokkos::Impl::are_integral::value && + is_default_map && is_layout_stride), + reference_type> + operator[](I0 i0) const { KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0) return m_map.m_impl_handle[m_map.m_impl_offset.m_stride.S0 * i0]; } @@ -888,59 +887,55 @@ class View : public ViewTraits { // Rank 2 default map operator() template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::always_true::value && // - (2 == traits::rank) && is_default_map && - is_layout_left && (traits::rank_dynamic == 0)), - reference_type> - operator()(I0 i0, I1 i1) const { + KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< + (Kokkos::Impl::always_true::value && // + (2 == rank) && is_default_map && is_layout_left && (rank_dynamic == 0)), + reference_type> + operator()(I0 i0, I1 i1) const { check_operator_parens_valid_args(i0, i1); KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1) return m_map.m_impl_handle[i0 + m_map.m_impl_offset.m_dim.N0 * i1]; } template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::always_true::value && // - (2 == traits::rank) && is_default_map && - is_layout_left && (traits::rank_dynamic != 0)), - reference_type> - operator()(I0 i0, I1 i1) const { + KOKKOS_FORCEINLINE_FUNCTION 
std::enable_if_t< + (Kokkos::Impl::always_true::value && // + (2 == rank) && is_default_map && is_layout_left && (rank_dynamic != 0)), + reference_type> + operator()(I0 i0, I1 i1) const { check_operator_parens_valid_args(i0, i1); KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1) return m_map.m_impl_handle[i0 + m_map.m_impl_offset.m_stride * i1]; } template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::always_true::value && // - (2 == traits::rank) && is_default_map && - is_layout_right && (traits::rank_dynamic == 0)), - reference_type> - operator()(I0 i0, I1 i1) const { + KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< + (Kokkos::Impl::always_true::value && // + (2 == rank) && is_default_map && is_layout_right && (rank_dynamic == 0)), + reference_type> + operator()(I0 i0, I1 i1) const { check_operator_parens_valid_args(i0, i1); KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1) return m_map.m_impl_handle[i1 + m_map.m_impl_offset.m_dim.N1 * i0]; } template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::always_true::value && // - (2 == traits::rank) && is_default_map && - is_layout_right && (traits::rank_dynamic != 0)), - reference_type> - operator()(I0 i0, I1 i1) const { + KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< + (Kokkos::Impl::always_true::value && // + (2 == rank) && is_default_map && is_layout_right && (rank_dynamic != 0)), + reference_type> + operator()(I0 i0, I1 i1) const { check_operator_parens_valid_args(i0, i1); KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1) return m_map.m_impl_handle[i1 + m_map.m_impl_offset.m_stride * i0]; } template - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::always_true::value && // - (2 == traits::rank) && is_default_map && is_layout_stride), - reference_type> - operator()(I0 i0, I1 i1) const { + KOKKOS_FORCEINLINE_FUNCTION + std::enable_if_t<(Kokkos::Impl::always_true::value && // + (2 == rank) && is_default_map && is_layout_stride), + reference_type> + 
operator()(I0 i0, I1 i1) const { check_operator_parens_valid_args(i0, i1); KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1) return m_map.m_impl_handle[i0 * m_map.m_impl_offset.m_stride.S0 + @@ -951,12 +946,11 @@ class View : public ViewTraits { // have "inlined" versions above template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::always_true::value && // - (2 != traits::rank) && (1 != traits::rank) && - (0 != traits::rank) && is_default_map), - reference_type> - operator()(Is... indices) const { + KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< + (Kokkos::Impl::always_true::value && // + (2 != rank) && (1 != rank) && (0 != rank) && is_default_map), + reference_type> + operator()(Is... indices) const { check_operator_parens_valid_args(indices...); KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, indices...) return m_map.m_impl_handle[m_map.m_impl_offset(indices...)]; @@ -965,7 +959,7 @@ class View : public ViewTraits { template KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t<(Kokkos::Impl::always_true::value && // - ((0 == traits::rank) || !is_default_map)), + ((0 == rank) || !is_default_map)), reference_type> operator()(Is... indices) const { check_operator_parens_valid_args(indices...); @@ -978,8 +972,7 @@ class View : public ViewTraits { template KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::always_true::value && (0 == traits::rank)), - reference_type> + (Kokkos::Impl::always_true::value && (0 == rank)), reference_type> access(Is... extra) const { check_access_member_function_valid_args(extra...); KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, extra...) @@ -992,7 +985,7 @@ class View : public ViewTraits { template KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t<(Kokkos::Impl::always_true::value && - (1 == traits::rank) && !is_default_map), + (1 == rank) && !is_default_map), reference_type> access(I0 i0, Is... 
extra) const { check_access_member_function_valid_args(i0, extra...); @@ -1001,22 +994,22 @@ class View : public ViewTraits { } template - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::always_true::value && (1 == traits::rank) && - is_default_map && !is_layout_stride), - reference_type> - access(I0 i0, Is... extra) const { + KOKKOS_FORCEINLINE_FUNCTION + std::enable_if_t<(Kokkos::Impl::always_true::value && + (1 == rank) && is_default_map && !is_layout_stride), + reference_type> + access(I0 i0, Is... extra) const { check_access_member_function_valid_args(i0, extra...); KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, extra...) return m_map.m_impl_handle[i0]; } template - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::always_true::value && (1 == traits::rank) && - is_default_map && is_layout_stride), - reference_type> - access(I0 i0, Is... extra) const { + KOKKOS_FORCEINLINE_FUNCTION + std::enable_if_t<(Kokkos::Impl::always_true::value && + (1 == rank) && is_default_map && is_layout_stride), + reference_type> + access(I0 i0, Is... extra) const { check_access_member_function_valid_args(i0, extra...); KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, extra...) return m_map.m_impl_handle[m_map.m_impl_offset.m_stride.S0 * i0]; @@ -1028,7 +1021,7 @@ class View : public ViewTraits { template KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t<(Kokkos::Impl::always_true::value && - (2 == traits::rank) && !is_default_map), + (2 == rank) && !is_default_map), reference_type> access(I0 i0, I1 i1, Is... 
extra) const { check_access_member_function_valid_args(i0, i1, extra...); @@ -1038,8 +1031,8 @@ class View : public ViewTraits { template KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::always_true::value && (2 == traits::rank) && - is_default_map && is_layout_left && (traits::rank_dynamic == 0)), + (Kokkos::Impl::always_true::value && (2 == rank) && + is_default_map && is_layout_left && (rank_dynamic == 0)), reference_type> access(I0 i0, I1 i1, Is... extra) const { check_access_member_function_valid_args(i0, i1, extra...); @@ -1049,8 +1042,8 @@ class View : public ViewTraits { template KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::always_true::value && (2 == traits::rank) && - is_default_map && is_layout_left && (traits::rank_dynamic != 0)), + (Kokkos::Impl::always_true::value && (2 == rank) && + is_default_map && is_layout_left && (rank_dynamic != 0)), reference_type> access(I0 i0, I1 i1, Is... extra) const { check_access_member_function_valid_args(i0, i1, extra...); @@ -1060,8 +1053,8 @@ class View : public ViewTraits { template KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::always_true::value && (2 == traits::rank) && - is_default_map && is_layout_right && (traits::rank_dynamic == 0)), + (Kokkos::Impl::always_true::value && (2 == rank) && + is_default_map && is_layout_right && (rank_dynamic == 0)), reference_type> access(I0 i0, I1 i1, Is... extra) const { check_access_member_function_valid_args(i0, i1, extra...); @@ -1071,8 +1064,8 @@ class View : public ViewTraits { template KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::always_true::value && (2 == traits::rank) && - is_default_map && is_layout_right && (traits::rank_dynamic != 0)), + (Kokkos::Impl::always_true::value && (2 == rank) && + is_default_map && is_layout_right && (rank_dynamic != 0)), reference_type> access(I0 i0, I1 i1, Is... 
extra) const { check_access_member_function_valid_args(i0, i1, extra...); @@ -1081,11 +1074,11 @@ class View : public ViewTraits { } template - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::always_true::value && (2 == traits::rank) && - is_default_map && is_layout_stride), - reference_type> - access(I0 i0, I1 i1, Is... extra) const { + KOKKOS_FORCEINLINE_FUNCTION + std::enable_if_t<(Kokkos::Impl::always_true::value && + (2 == rank) && is_default_map && is_layout_stride), + reference_type> + access(I0 i0, I1 i1, Is... extra) const { check_access_member_function_valid_args(i0, i1, extra...); KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, extra...) return m_map.m_impl_handle[i0 * m_map.m_impl_offset.m_stride.S0 + @@ -1098,7 +1091,7 @@ class View : public ViewTraits { template KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t<(Kokkos::Impl::always_true::value && - (3 == traits::rank) && is_default_map), + (3 == rank) && is_default_map), reference_type> access(I0 i0, I1 i1, I2 i2, Is... extra) const { check_access_member_function_valid_args(i0, i1, i2, extra...); @@ -1109,7 +1102,7 @@ class View : public ViewTraits { template KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t<(Kokkos::Impl::always_true::value && - (3 == traits::rank) && !is_default_map), + (3 == rank) && !is_default_map), reference_type> access(I0 i0, I1 i1, I2 i2, Is... extra) const { check_access_member_function_valid_args(i0, i1, i2, extra...); @@ -1122,8 +1115,8 @@ class View : public ViewTraits { template KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::always_true::value && - (4 == traits::rank) && is_default_map), + (Kokkos::Impl::always_true::value && (4 == rank) && + is_default_map), reference_type> access(I0 i0, I1 i1, I2 i2, I3 i3, Is... 
extra) const { check_access_member_function_valid_args(i0, i1, i2, i3, extra...); @@ -1133,8 +1126,8 @@ class View : public ViewTraits { template KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::always_true::value && - (4 == traits::rank) && !is_default_map), + (Kokkos::Impl::always_true::value && (4 == rank) && + !is_default_map), reference_type> access(I0 i0, I1 i1, I2 i2, I3 i3, Is... extra) const { check_access_member_function_valid_args(i0, i1, i2, i3, extra...); @@ -1149,7 +1142,7 @@ class View : public ViewTraits { typename... Is> KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< (Kokkos::Impl::always_true::value && - (5 == traits::rank) && is_default_map), + (5 == rank) && is_default_map), reference_type> access(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, Is... extra) const { check_access_member_function_valid_args(i0, i1, i2, i3, i4, extra...); @@ -1162,7 +1155,7 @@ class View : public ViewTraits { typename... Is> KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< (Kokkos::Impl::always_true::value && - (5 == traits::rank) && !is_default_map), + (5 == rank) && !is_default_map), reference_type> access(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, Is... extra) const { check_access_member_function_valid_args(i0, i1, i2, i3, i4, extra...); @@ -1178,7 +1171,7 @@ class View : public ViewTraits { typename I5, typename... Is> KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< (Kokkos::Impl::always_true::value && - (6 == traits::rank) && is_default_map), + (6 == rank) && is_default_map), reference_type> access(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, I5 i5, Is... extra) const { check_access_member_function_valid_args(i0, i1, i2, i3, i4, i5, extra...); @@ -1191,7 +1184,7 @@ class View : public ViewTraits { typename I5, typename... Is> KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< (Kokkos::Impl::always_true::value && - (6 == traits::rank) && !is_default_map), + (6 == rank) && !is_default_map), reference_type> access(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, I5 i5, Is... 
extra) const { check_access_member_function_valid_args(i0, i1, i2, i3, i4, i5, extra...); @@ -1207,7 +1200,7 @@ class View : public ViewTraits { typename I5, typename I6, typename... Is> KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< (Kokkos::Impl::always_true::value && - (7 == traits::rank) && is_default_map), + (7 == rank) && is_default_map), reference_type> access(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, I5 i5, I6 i6, Is... extra) const { check_access_member_function_valid_args(i0, i1, i2, i3, i4, i5, i6, @@ -1221,7 +1214,7 @@ class View : public ViewTraits { typename I5, typename I6, typename... Is> KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< (Kokkos::Impl::always_true::value && - (7 == traits::rank) && !is_default_map), + (7 == rank) && !is_default_map), reference_type> access(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, I5 i5, I6 i6, Is... extra) const { check_access_member_function_valid_args(i0, i1, i2, i3, i4, i5, i6, @@ -1239,7 +1232,7 @@ class View : public ViewTraits { KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t<(Kokkos::Impl::always_true::value && - (8 == traits::rank) && is_default_map), + (8 == rank) && is_default_map), reference_type> access(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, I5 i5, I6 i6, I7 i7, Is... extra) const { @@ -1256,7 +1249,7 @@ class View : public ViewTraits { KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t<(Kokkos::Impl::always_true::value && - (8 == traits::rank) && !is_default_map), + (8 == rank) && !is_default_map), reference_type> access(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, I5 i5, I6 i6, I7 i7, Is... 
extra) const { @@ -1421,7 +1414,7 @@ class View : public ViewTraits { const std::string& alloc_name = Impl::get_property(prop_copy); Impl::runtime_check_rank( - traits::rank, traits::rank_dynamic, + rank, rank_dynamic, std::is_same::value, i0, i1, i2, i3, i4, i5, i6, i7, alloc_name); @@ -1637,7 +1630,7 @@ class View : public ViewTraits { arg_N0, arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7); if (std::is_void::value && - num_passed_args != traits::rank_dynamic) { + num_passed_args != rank_dynamic) { Kokkos::abort( "Kokkos::View::shmem_size() rank_dynamic != number of arguments.\n"); } From 2969679b7819c78c79b3faac1328c38ba4be36ce Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Thu, 16 Feb 2023 18:36:27 -0500 Subject: [PATCH 229/496] Fix MSVC CI build Co-Authored-By: Christian Trott --- algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp | 2 +- core/src/Kokkos_View.hpp | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp b/algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp index 52e7625e4d..0376100410 100644 --- a/algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp +++ b/algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp @@ -29,7 +29,7 @@ struct is_admissible_to_kokkos_std_algorithms : std::false_type {}; template struct is_admissible_to_kokkos_std_algorithms< - T, std::enable_if_t< ::Kokkos::is_view::value && T::rank == 1 && + T, std::enable_if_t< ::Kokkos::is_view::value && T::rank() == 1 && (std::is_same::value || std::is_same { template KOKKOS_FUNCTION std::enable_if_t< - N == View::rank && + N == View::rank() && std::is_same::specialize, void>::value, View> as_view_of_rank_n(View v) { @@ -1716,7 +1716,7 @@ as_view_of_rank_n(View v) { // never be called template KOKKOS_FUNCTION std::enable_if_t< - N != View::rank && + N != View::rank() && std::is_same::specialize, void>::value, View::value_type, N>::type, Args...>> From 
25ff05beece8bcfa1d741821e044a6220e54cdb2 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Fri, 17 Feb 2023 13:32:47 -0500 Subject: [PATCH 230/496] Fix warning pointless comparison of unsigned integer with zero --- .../default/TestDefaultDeviceTypeViewAPI.cpp | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/core/unit_test/default/TestDefaultDeviceTypeViewAPI.cpp b/core/unit_test/default/TestDefaultDeviceTypeViewAPI.cpp index f179fe85ed..d81c71499f 100644 --- a/core/unit_test/default/TestDefaultDeviceTypeViewAPI.cpp +++ b/core/unit_test/default/TestDefaultDeviceTypeViewAPI.cpp @@ -113,11 +113,17 @@ TYPED_TEST(TestViewAPI, sizes) { static_assert(view_t::rank == TestFixture::expected_rank, "TestViewAPI: Error: rank mismatch"); size_t expected_span = 1; - for (size_t r = 0; r < view_t::rank; r++) expected_span *= this->all_sizes[r]; + // avoid pointless comparison of unsigned integer with zero warning + if constexpr (view_t::rank > 0) { + for (size_t r = 0; r < view_t::rank; r++) + expected_span *= this->all_sizes[r]; + } EXPECT_EQ(expected_span, a.span()); - for (size_t r = 0; r < view_t::rank; r++) { - EXPECT_EQ(this->all_sizes[r], a.extent(r)); - EXPECT_EQ(this->all_sizes[r], size_t(a.extent_int(r))); + if constexpr (view_t::rank > 0) { + for (size_t r = 0; r < view_t::rank; r++) { + EXPECT_EQ(this->all_sizes[r], a.extent(r)); + EXPECT_EQ(this->all_sizes[r], size_t(a.extent_int(r))); + } } } From 3be7ae202398a5aa98d0cac80f904f348ab876b7 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Sat, 18 Feb 2023 21:09:20 -0500 Subject: [PATCH 231/496] Add compile-only test for View::rank[_dynamic] --- core/unit_test/CMakeLists.txt | 1 + core/unit_test/TestViewRank.cpp | 63 +++++++++++++++++++++++++++++++++ 2 files changed, 64 insertions(+) create mode 100644 core/unit_test/TestViewRank.cpp diff --git a/core/unit_test/CMakeLists.txt b/core/unit_test/CMakeLists.txt index ec748cfb03..6807c843c3 100644 --- a/core/unit_test/CMakeLists.txt +++ 
b/core/unit_test/CMakeLists.txt @@ -78,6 +78,7 @@ SET(COMPILE_ONLY_SOURCES TestInterOp.cpp TestStringManipulation.cpp TestVersionMacros.cpp + TestViewRank.cpp TestViewTypeTraits.cpp TestTypeList.cpp view/TestExtentsDatatypeConversion.cpp diff --git a/core/unit_test/TestViewRank.cpp b/core/unit_test/TestViewRank.cpp new file mode 100644 index 0000000000..7ea11afca3 --- /dev/null +++ b/core/unit_test/TestViewRank.cpp @@ -0,0 +1,63 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include + +namespace { + +template +constexpr bool test_view_rank_and_dynamic_rank() { + static_assert(View::rank == Rank); + static_assert(View::rank() == Rank); + static_assert(View::rank_dynamic == RankDynamic); + static_assert(View::rank_dynamic() == RankDynamic); + static_assert(std::is_convertible_v); + static_assert(std::is_same_v); + static_assert(std::is_convertible_v); + static_assert(std::is_same_v); + auto rank = View::rank; // not an integral type in contrast to Kokkos version + // less than 4.0.01 + static_assert(!std::is_integral_v); + auto rank_preferred = View::rank(); // since 4.0.01 + static_assert(std::is_same_v); + (void)rank; + (void)rank_preferred; + return true; +} + +// clang-format off +static_assert(test_view_rank_and_dynamic_rank, 0, 0>()); + +static_assert(test_view_rank_and_dynamic_rank, 1, 0>()); +static_assert(test_view_rank_and_dynamic_rank, 1, 1>()); + +static_assert(test_view_rank_and_dynamic_rank, 2, 0>()); +static_assert(test_view_rank_and_dynamic_rank, 2, 1>()); 
+static_assert(test_view_rank_and_dynamic_rank, 2, 2>()); + +static_assert(test_view_rank_and_dynamic_rank, 3, 0>()); +static_assert(test_view_rank_and_dynamic_rank, 3, 1>()); +static_assert(test_view_rank_and_dynamic_rank, 3, 2>()); +static_assert(test_view_rank_and_dynamic_rank, 3, 3>()); + +static_assert(test_view_rank_and_dynamic_rank, 4, 0>()); +static_assert(test_view_rank_and_dynamic_rank, 4, 1>()); +static_assert(test_view_rank_and_dynamic_rank, 4, 2>()); +static_assert(test_view_rank_and_dynamic_rank, 4, 3>()); +static_assert(test_view_rank_and_dynamic_rank, 4, 4>()); +//clang-format on + +} // namespace From b3a8182ae5c8bae402002c7f79323447fdd6d219 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Mon, 20 Feb 2023 22:17:13 -0500 Subject: [PATCH 232/496] Backport function templates from standard library header --- core/src/Kokkos_BitManipulation.hpp | 158 ++++++++++++ core/src/Kokkos_Core.hpp | 1 + core/unit_test/CMakeLists.txt | 1 + core/unit_test/TestBitManipulation.cpp | 333 +++++++++++++++++++++++++ 4 files changed, 493 insertions(+) create mode 100644 core/src/Kokkos_BitManipulation.hpp create mode 100644 core/unit_test/TestBitManipulation.cpp diff --git a/core/src/Kokkos_BitManipulation.hpp b/core/src/Kokkos_BitManipulation.hpp new file mode 100644 index 0000000000..66088922a5 --- /dev/null +++ b/core/src/Kokkos_BitManipulation.hpp @@ -0,0 +1,158 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_BIT_MANIPULATION_HPP +#define KOKKOS_BIT_MANIPULATION_HPP + +#include +#include + +namespace Kokkos::Impl { + +template +KOKKOS_FUNCTION constexpr int countl_zero_fallback(T x) { + // From Hacker's Delight (2nd edition) section 5-3 + unsigned int y = 0; + using ::Kokkos::Experimental::digits_v; + int n = digits_v; + int c = digits_v / 2; + do { + y = x >> c; + if (y != 0) { + n -= c; + x = y; + } + c >>= 1; + } while (c != 0); + return n - static_cast(x); +} + +template +KOKKOS_FUNCTION constexpr int countr_zero_fallback(T x) { + using ::Kokkos::Experimental::digits_v; + return digits_v - countl_zero_fallback(static_cast( + static_cast(~x) & static_cast(x - 1))); +} + +template +KOKKOS_FUNCTION constexpr int popcount_fallback(T x) { + int c = 0; + for (; x != 0; x &= x - 1) { + ++c; + } + return c; +} + +template +inline constexpr bool is_standard_unsigned_integer_type_v = + std::is_same_v || std::is_same_v || + std::is_same_v || std::is_same_v || + std::is_same_v; + +} // namespace Kokkos::Impl + +namespace Kokkos { + +// +template +KOKKOS_FUNCTION constexpr std::enable_if_t< + Impl::is_standard_unsigned_integer_type_v, int> +countl_zero(T x) noexcept { + using ::Kokkos::Experimental::digits_v; + if (x == 0) return digits_v; + // TODO use compiler intrinsics when available + return Impl::countl_zero_fallback(x); +} + +template +KOKKOS_FUNCTION constexpr std::enable_if_t< + Impl::is_standard_unsigned_integer_type_v, int> +countl_one(T x) noexcept { + using ::Kokkos::Experimental::digits_v; + using ::Kokkos::Experimental::finite_max_v; + if (x == finite_max_v) return digits_v; + return countl_zero(static_cast(~x)); +} + +template +KOKKOS_FUNCTION constexpr std::enable_if_t< + Impl::is_standard_unsigned_integer_type_v, int> +countr_zero(T x) noexcept { + using ::Kokkos::Experimental::digits_v; + if (x == 0) return digits_v; + // TODO use compiler intrinsics when available + 
return Impl::countr_zero_fallback(x); +} + +template +KOKKOS_FUNCTION constexpr std::enable_if_t< + Impl::is_standard_unsigned_integer_type_v, int> +countr_one(T x) noexcept { + using ::Kokkos::Experimental::digits_v; + using ::Kokkos::Experimental::finite_max_v; + if (x == finite_max_v) return digits_v; + return countr_zero(static_cast(~x)); +} + +template +KOKKOS_FUNCTION constexpr std::enable_if_t< + Impl::is_standard_unsigned_integer_type_v, int> +popcount(T x) noexcept { + if (x == 0) return 0; + // TODO use compiler intrinsics when available + return Impl::popcount_fallback(x); +} +// + +// +template +KOKKOS_FUNCTION constexpr std::enable_if_t< + Impl::is_standard_unsigned_integer_type_v, bool> +has_single_bit(T x) noexcept { + return x != 0 && (((x & (x - 1)) == 0)); +} + +template +KOKKOS_FUNCTION constexpr std::enable_if_t< + Impl::is_standard_unsigned_integer_type_v, T> +bit_ceil(T x) noexcept { + if (x <= 1) return 1; + using ::Kokkos::Experimental::digits_v; + return T{1} << (digits_v - countl_zero(static_cast(x - 1))); +} + +template +KOKKOS_FUNCTION constexpr std::enable_if_t< + Impl::is_standard_unsigned_integer_type_v, T> +bit_floor(T x) noexcept { + if (x == 0) return 0; + using ::Kokkos::Experimental::digits_v; + return T{1} << (digits_v - 1 - countl_zero(x)); +} + +template +KOKKOS_FUNCTION constexpr std::enable_if_t< + Impl::is_standard_unsigned_integer_type_v, T> +bit_width(T x) noexcept { + if (x == 0) return 0; + using ::Kokkos::Experimental::digits_v; + return digits_v - countl_zero(x); +} +// + +} // namespace Kokkos + +#endif diff --git a/core/src/Kokkos_Core.hpp b/core/src/Kokkos_Core.hpp index cf898a71e7..86ed0ff9ea 100644 --- a/core/src/Kokkos_Core.hpp +++ b/core/src/Kokkos_Core.hpp @@ -53,6 +53,7 @@ #include #include #include +#include #include #include #include diff --git a/core/unit_test/CMakeLists.txt b/core/unit_test/CMakeLists.txt index ec748cfb03..875a12df1b 100644 --- a/core/unit_test/CMakeLists.txt +++ 
b/core/unit_test/CMakeLists.txt @@ -75,6 +75,7 @@ SET(COMPILE_ONLY_SOURCES TestArray.cpp TestCreateMirror.cpp TestDetectionIdiom.cpp + TestBitManipulation.cpp TestInterOp.cpp TestStringManipulation.cpp TestVersionMacros.cpp diff --git a/core/unit_test/TestBitManipulation.cpp b/core/unit_test/TestBitManipulation.cpp new file mode 100644 index 0000000000..f0baeef35e --- /dev/null +++ b/core/unit_test/TestBitManipulation.cpp @@ -0,0 +1,333 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include + +struct X { + constexpr bool did_not_match() { return true; } +}; + +#define TEST_BIT_MANIPULATION(FUNC) \ + constexpr X test_##FUNC(...) 
{ return {}; } \ + static_assert(test_##FUNC((unsigned char)0)); \ + static_assert(test_##FUNC((unsigned short)0)); \ + static_assert(test_##FUNC((unsigned int)0)); \ + static_assert(test_##FUNC((unsigned long)0)); \ + static_assert(test_##FUNC((unsigned long long)0)); \ + static_assert(test_##FUNC((bool)0).did_not_match()); \ + static_assert(test_##FUNC((int)0).did_not_match()); \ + static_assert(test_##FUNC((float)0).did_not_match()); \ + static_assert(test_##FUNC((void*)0).did_not_match()) + +// +template +constexpr auto test_countl_zero(UInt x) -> decltype(Kokkos::countl_zero(x)) { + using Kokkos::countl_zero; + + static_assert(noexcept(countl_zero(x))); + static_assert(std::is_same_v); + + constexpr auto dig = Kokkos::Experimental::digits_v; + constexpr auto max = Kokkos::Experimental::finite_max_v; + + static_assert(countl_zero(UInt(0)) == dig); + static_assert(countl_zero(UInt(1)) == dig - 1); + static_assert(countl_zero(UInt(2)) == dig - 2); + static_assert(countl_zero(UInt(3)) == dig - 2); + static_assert(countl_zero(UInt(4)) == dig - 3); + static_assert(countl_zero(UInt(5)) == dig - 3); + static_assert(countl_zero(UInt(6)) == dig - 3); + static_assert(countl_zero(UInt(7)) == dig - 3); + static_assert(countl_zero(UInt(8)) == dig - 4); + static_assert(countl_zero(UInt(9)) == dig - 4); + static_assert(countl_zero(UInt(127)) == dig - 7); + static_assert(countl_zero(UInt(128)) == dig - 8); + static_assert(countl_zero(max) == 0); + + return true; +} + +TEST_BIT_MANIPULATION(countl_zero); + +template +constexpr auto test_countl_one(UInt x) -> decltype(Kokkos::countl_one(x)) { + using Kokkos::countl_one; + + static_assert(noexcept(countl_one(x))); + static_assert(std::is_same_v); + + constexpr auto dig = Kokkos::Experimental::digits_v; + constexpr auto max = Kokkos::Experimental::finite_max_v; + + static_assert(countl_one(UInt(0)) == 0); + static_assert(countl_one(UInt(1)) == 0); + static_assert(countl_one(UInt(2)) == 0); + static_assert(countl_one(UInt(3)) == 
0); + static_assert(countl_one(UInt(4)) == 0); + static_assert(countl_one(UInt(5)) == 0); + static_assert(countl_one(UInt(6)) == 0); + static_assert(countl_one(UInt(7)) == 0); + static_assert(countl_one(UInt(8)) == 0); + static_assert(countl_one(UInt(9)) == 0); + static_assert(countl_one(UInt(100)) == 0); + static_assert(countl_one(max) == dig); + static_assert(countl_one(UInt(max - 1)) == dig - 1); + static_assert(countl_one(UInt(max - 2)) == dig - 2); + static_assert(countl_one(UInt(max - 3)) == dig - 2); + static_assert(countl_one(UInt(max - 4)) == dig - 3); + static_assert(countl_one(UInt(max - 5)) == dig - 3); + static_assert(countl_one(UInt(max - 6)) == dig - 3); + static_assert(countl_one(UInt(max - 7)) == dig - 3); + static_assert(countl_one(UInt(max - 8)) == dig - 4); + static_assert(countl_one(UInt(max - 9)) == dig - 4); + static_assert(countl_one(UInt(max - 126)) == dig - 7); + static_assert(countl_one(UInt(max - 127)) == dig - 7); + static_assert(countl_one(UInt(max - 128)) == dig - 8); + static_assert(countl_one(UInt(UInt(1) << (dig - 1))) == 1); + static_assert(countl_one(UInt(UInt(3) << (dig - 2))) == 2); + static_assert(countl_one(UInt(UInt(7) << (dig - 3))) == 3); + static_assert(countl_one(UInt(UInt(255) << (dig - 8))) == 8); + + return true; +} + +TEST_BIT_MANIPULATION(countl_one); + +template +constexpr auto test_countr_zero(UInt x) -> decltype(Kokkos::countr_zero(x)) { + using Kokkos::countr_zero; + + static_assert(noexcept(countr_zero(x))); + static_assert(std::is_same_v); + + constexpr auto dig = Kokkos::Experimental::digits_v; + constexpr auto max = Kokkos::Experimental::finite_max_v; + + static_assert(countr_zero(UInt(0)) == dig); + static_assert(countr_zero(UInt(1)) == 0); + static_assert(countr_zero(UInt(2)) == 1); + static_assert(countr_zero(UInt(3)) == 0); + static_assert(countr_zero(UInt(4)) == 2); + static_assert(countr_zero(UInt(5)) == 0); + static_assert(countr_zero(UInt(6)) == 1); + static_assert(countr_zero(UInt(7)) == 0); + 
static_assert(countr_zero(UInt(8)) == 3); + static_assert(countr_zero(UInt(9)) == 0); + static_assert(countr_zero(UInt(126)) == 1); + static_assert(countr_zero(UInt(127)) == 0); + static_assert(countr_zero(UInt(128)) == 7); + static_assert(countr_zero(UInt(129)) == 0); + static_assert(countr_zero(UInt(130)) == 1); + static_assert(countr_zero(max) == 0); + + return true; +} + +TEST_BIT_MANIPULATION(countr_zero); + +template +constexpr auto test_countr_one(UInt x) -> decltype(Kokkos::countr_one(x)) { + using Kokkos::countr_one; + + static_assert(noexcept(countr_one(x))); + static_assert(std::is_same_v); + + constexpr auto dig = Kokkos::Experimental::digits_v; + constexpr auto max = Kokkos::Experimental::finite_max_v; + + static_assert(countr_one(UInt(0)) == 0); + static_assert(countr_one(UInt(1)) == 1); + static_assert(countr_one(UInt(2)) == 0); + static_assert(countr_one(UInt(3)) == 2); + static_assert(countr_one(UInt(4)) == 0); + static_assert(countr_one(UInt(5)) == 1); + static_assert(countr_one(UInt(6)) == 0); + static_assert(countr_one(UInt(7)) == 3); + static_assert(countr_one(UInt(8)) == 0); + static_assert(countr_one(UInt(9)) == 1); + static_assert(countr_one(UInt(126)) == 0); + static_assert(countr_one(UInt(127)) == 7); + static_assert(countr_one(UInt(128)) == 0); + static_assert(countr_one(UInt(max - 1)) == 0); + static_assert(countr_one(max) == dig); + + return true; +} + +TEST_BIT_MANIPULATION(countr_one); + +template +constexpr auto test_popcount(UInt x) -> decltype(Kokkos::popcount(x)) { + using Kokkos::popcount; + + static_assert(noexcept(popcount(x))); + static_assert(std::is_same_v); + + constexpr auto dig = Kokkos::Experimental::digits_v; + constexpr auto max = Kokkos::Experimental::finite_max_v; + + static_assert(popcount(UInt(0)) == 0); + static_assert(popcount(UInt(1)) == 1); + static_assert(popcount(UInt(2)) == 1); + static_assert(popcount(UInt(3)) == 2); + static_assert(popcount(UInt(4)) == 1); + static_assert(popcount(UInt(5)) == 2); + 
static_assert(popcount(UInt(6)) == 2); + static_assert(popcount(UInt(7)) == 3); + static_assert(popcount(UInt(8)) == 1); + static_assert(popcount(UInt(9)) == 2); + static_assert(popcount(UInt(127)) == 7); + static_assert(popcount(max) == dig); + static_assert(popcount(UInt(max - 1)) == dig - 1); + + return true; +} + +TEST_BIT_MANIPULATION(popcount); +// + +// +template +constexpr auto test_has_single_bit(UInt x) + -> decltype(Kokkos::has_single_bit(x)) { + using Kokkos::has_single_bit; + + static_assert(noexcept(has_single_bit(x))); + static_assert(std::is_same_v); + + static_assert(!has_single_bit(UInt(0))); + static_assert(has_single_bit(UInt(1))); + static_assert(has_single_bit(UInt(2))); + static_assert(!has_single_bit(UInt(3))); + static_assert(has_single_bit(UInt(4))); + static_assert(!has_single_bit(UInt(5))); + static_assert(!has_single_bit(UInt(6))); + static_assert(!has_single_bit(UInt(7))); + static_assert(has_single_bit(UInt(8))); + static_assert(!has_single_bit(UInt(9))); + + constexpr auto max = Kokkos::Experimental::finite_max_v; + static_assert(!has_single_bit(max)); + constexpr UInt one = 1; + static_assert(has_single_bit(UInt(one << 0))); + static_assert(has_single_bit(UInt(one << 1))); + static_assert(has_single_bit(UInt(one << 2))); + static_assert(has_single_bit(UInt(one << 3))); + static_assert(has_single_bit(UInt(one << 4))); + static_assert(has_single_bit(UInt(one << 5))); + static_assert(has_single_bit(UInt(one << 6))); + static_assert(has_single_bit(UInt(one << 7))); + + return true; +} + +TEST_BIT_MANIPULATION(has_single_bit); + +template +constexpr auto test_bit_floor(UInt x) -> decltype(Kokkos::bit_floor(x)) { + using Kokkos::bit_floor; + + static_assert(noexcept(bit_floor(x))); + static_assert(std::is_same_v); + + constexpr auto max = Kokkos::Experimental::finite_max_v; + + static_assert(bit_floor(UInt(0)) == 0); + static_assert(bit_floor(UInt(1)) == 1); + static_assert(bit_floor(UInt(2)) == 2); + static_assert(bit_floor(UInt(3)) == 
2); + static_assert(bit_floor(UInt(4)) == 4); + static_assert(bit_floor(UInt(5)) == 4); + static_assert(bit_floor(UInt(6)) == 4); + static_assert(bit_floor(UInt(7)) == 4); + static_assert(bit_floor(UInt(8)) == 8); + static_assert(bit_floor(UInt(9)) == 8); + static_assert(bit_floor(UInt(125)) == 64); + static_assert(bit_floor(UInt(126)) == 64); + static_assert(bit_floor(UInt(127)) == 64); + static_assert(bit_floor(UInt(128)) == 128); + static_assert(bit_floor(UInt(129)) == 128); + static_assert(bit_floor(max) == UInt(max - (max >> 1))); + + return true; +} + +TEST_BIT_MANIPULATION(bit_floor); + +template +constexpr auto test_bit_ceil(UInt x) -> decltype(Kokkos::bit_ceil(x)) { + using Kokkos::bit_ceil; + + static_assert(noexcept(bit_ceil(x))); + static_assert(std::is_same_v); + + static_assert(bit_ceil(UInt(0)) == 1); + static_assert(bit_ceil(UInt(1)) == 1); + static_assert(bit_ceil(UInt(2)) == 2); + static_assert(bit_ceil(UInt(3)) == 4); + static_assert(bit_ceil(UInt(4)) == 4); + static_assert(bit_ceil(UInt(5)) == 8); + static_assert(bit_ceil(UInt(6)) == 8); + static_assert(bit_ceil(UInt(7)) == 8); + static_assert(bit_ceil(UInt(8)) == 8); + static_assert(bit_ceil(UInt(9)) == 16); + static_assert(bit_ceil(UInt(60)) == 64); + static_assert(bit_ceil(UInt(61)) == 64); + static_assert(bit_ceil(UInt(62)) == 64); + static_assert(bit_ceil(UInt(63)) == 64); + static_assert(bit_ceil(UInt(64)) == 64); + static_assert(bit_ceil(UInt(65)) == 128); + static_assert(bit_ceil(UInt(66)) == 128); + static_assert(bit_ceil(UInt(67)) == 128); + static_assert(bit_ceil(UInt(68)) == 128); + static_assert(bit_ceil(UInt(69)) == 128); + + return true; +} + +TEST_BIT_MANIPULATION(bit_ceil); + +template +constexpr auto test_bit_width(UInt x) -> decltype(Kokkos::bit_width(x)) { + using Kokkos::bit_width; + + static_assert(noexcept(bit_width(x))); + static_assert(std::is_same_v); + + constexpr auto dig = Kokkos::Experimental::digits_v; + constexpr auto max = Kokkos::Experimental::finite_max_v; + + 
static_assert(bit_width(UInt(0)) == 0); + static_assert(bit_width(UInt(1)) == 1); + static_assert(bit_width(UInt(2)) == 2); + static_assert(bit_width(UInt(3)) == 2); + static_assert(bit_width(UInt(4)) == 3); + static_assert(bit_width(UInt(5)) == 3); + static_assert(bit_width(UInt(6)) == 3); + static_assert(bit_width(UInt(7)) == 3); + static_assert(bit_width(UInt(8)) == 4); + static_assert(bit_width(UInt(9)) == 4); + + static_assert(bit_width(UInt(max - 1)) == dig); + static_assert(bit_width(max) == dig); + + return true; +} + +TEST_BIT_MANIPULATION(bit_width); +// + +#undef TEST_BIT_MANIPULATION From c4a5ad08fa0a2e52006ac80f694395030d5df1b9 Mon Sep 17 00:00:00 2001 From: Mikael Simberg Date: Thu, 16 Feb 2023 14:56:48 +0100 Subject: [PATCH 233/496] Update HPX::print_configuration - Update title from "HPX backend" to "Host Parallel Execution Space" to be consistent with other backends - Add KOKKOS_ENABLE_IMPL_HPX_ASYNC_DISPATCH - Print HPX build information and number of worker threads in "HPX Runtime Configuration" --- core/src/HPX/Kokkos_HPX.cpp | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/core/src/HPX/Kokkos_HPX.cpp b/core/src/HPX/Kokkos_HPX.cpp index c4204f7402..19978c7cf3 100644 --- a/core/src/HPX/Kokkos_HPX.cpp +++ b/core/src/HPX/Kokkos_HPX.cpp @@ -30,6 +30,7 @@ #include #include #include +#include #include #include @@ -88,10 +89,18 @@ hpx::condition_variable_any HPX::m_active_parallel_region_count_cond; HPX::instance_data HPX::m_default_instance_data; void HPX::print_configuration(std::ostream &os, const bool) const { - os << "HPX backend\n"; - os << "HPX Execution Space:\n"; + os << "Host Parallel Execution Space\n"; os << " KOKKOS_ENABLE_HPX: yes\n"; + os << "HPX Options:\n"; +#if defined(KOKKOS_ENABLE_IMPL_HPX_ASYNC_DISPATCH) + os << " KOKKOS_ENABLE_IMPL_HPX_ASYNC_DISPATCH: yes\n"; +#else + os << " KOKKOS_ENABLE_IMPL_HPX_ASYNC_DISPATCH: no\n"; +#endif os << "\nHPX Runtime Configuration:\n"; + os << "Worker threads: " << 
hpx::get_num_worker_threads() << '\n'; + os << hpx::complete_version() << '\n'; + os << hpx::configuration_string() << '\n'; } void HPX::impl_decrement_active_parallel_region_count() { From dff272ff15cf145ad111a8b5c11f5eb68929b213 Mon Sep 17 00:00:00 2001 From: Etienne Malaboeuf Date: Wed, 22 Feb 2023 09:17:53 +0100 Subject: [PATCH 234/496] Fix CMake deduplication issue when linking with hip::device Fixes #5898 Signed-off-by: Etienne Malaboeuf --- cmake/kokkos_arch.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/kokkos_arch.cmake b/cmake/kokkos_arch.cmake index 993e177629..e320d204da 100644 --- a/cmake/kokkos_arch.cmake +++ b/cmake/kokkos_arch.cmake @@ -216,7 +216,7 @@ GLOBAL_SET(KOKKOS_AMDGPU_OPTIONS) IF(KOKKOS_ENABLE_HIP) SET(AMDGPU_ARCH_FLAG "--offload-arch") IF(NOT KOKKOS_CXX_COMPILER_ID STREQUAL HIPCC) - GLOBAL_APPEND(KOKKOS_AMDGPU_OPTIONS -x hip) + GLOBAL_APPEND(KOKKOS_AMDGPU_OPTIONS -xhip) IF(DEFINED ENV{ROCM_PATH}) GLOBAL_APPEND(KOKKOS_AMDGPU_OPTIONS --rocm-path=$ENV{ROCM_PATH}) ENDIF() From b166d77516f6a5b42a4df5ad0dec2c72ac71eaed Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Mon, 20 Feb 2023 22:23:52 -0500 Subject: [PATCH 235/496] Add `Experimental::*_builtin` counterpart to the bit manipulation template functions --- core/src/Kokkos_BitManipulation.hpp | 200 +++++++++ core/unit_test/CMakeLists.txt | 1 + .../unit_test/TestBitManipulationBuiltins.hpp | 419 ++++++++++++++++++ 3 files changed, 620 insertions(+) create mode 100644 core/unit_test/TestBitManipulationBuiltins.hpp diff --git a/core/src/Kokkos_BitManipulation.hpp b/core/src/Kokkos_BitManipulation.hpp index 66088922a5..a0bbac31ae 100644 --- a/core/src/Kokkos_BitManipulation.hpp +++ b/core/src/Kokkos_BitManipulation.hpp @@ -155,4 +155,204 @@ bit_width(T x) noexcept { } // namespace Kokkos +namespace Kokkos::Impl { + +#if defined(KOKKOS_COMPILER_CLANG) || defined(KOKKOS_COMPILER_GCC) +#define KOKKOS_IMPL_USE_GCC_BUILT_IN_FUNCTIONS +#endif + +template 
+KOKKOS_IMPL_DEVICE_FUNCTION + std::enable_if_t, int> + countl_zero_builtin_device(T x) noexcept { +#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) + if constexpr (sizeof(T) == sizeof(long long int)) { + return __clzll(reinterpret_cast(x)); + } else if constexpr (sizeof(T) == sizeof(int)) { + return __clz(reinterpret_cast(x)); + } else { + using ::Kokkos::Experimental::digits_v; + constexpr int shift = digits_v - digits_v; + return __clz(x) - shift; + } +#elif defined(KOKKOS_ENABLE_SYCL) + return sycl::clz(x); +#else + return countl_zero_fallback(x); +#endif +} + +template +KOKKOS_IMPL_HOST_FUNCTION + std::enable_if_t, int> + countl_zero_builtin_host(T x) noexcept { + using ::Kokkos::Experimental::digits_v; + if (x == 0) return digits_v; +#ifdef KOKKOS_IMPL_USE_GCC_BUILT_IN_FUNCTIONS + if constexpr (std::is_same_v) { + return __builtin_clzll(x); + } else if constexpr (std::is_same_v) { + return __builtin_clzl(x); + } else if constexpr (std::is_same_v) { + return __builtin_clz(x); + } else { + constexpr int shift = digits_v - digits_v; + return __builtin_clz(x) - shift; + } +#else + return countl_zero_fallback(x); +#endif +} + +template +KOKKOS_IMPL_DEVICE_FUNCTION + std::enable_if_t, int> + countr_zero_builtin_device(T x) noexcept { + using ::Kokkos::Experimental::digits_v; + if (x == 0) return digits_v; +#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) + if constexpr (sizeof(T) == sizeof(long long int)) { + return __ffsll(reinterpret_cast(x)) - 1; + } else { + return __ffs(reinterpret_cast(x)) - 1; + } +#elif defined(KOKKOS_ENABLE_SYCL) + return sycl::ctz(x); +#else + return countr_zero_fallback(x); +#endif +} + +template +KOKKOS_IMPL_HOST_FUNCTION + std::enable_if_t, int> + countr_zero_builtin_host(T x) noexcept { + using ::Kokkos::Experimental::digits_v; + if (x == 0) return digits_v; +#ifdef KOKKOS_IMPL_USE_GCC_BUILT_IN_FUNCTIONS + if constexpr (std::is_same_v) { + return __builtin_ctzll(x); + } else if constexpr (std::is_same_v) { + 
return __builtin_ctzl(x); + } else { + return __builtin_ctz(x); + } +#else + return countr_zero_fallback(x); +#endif +} + +template +KOKKOS_IMPL_DEVICE_FUNCTION + std::enable_if_t, int> + popcount_builtin_device(T x) noexcept { +#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) + if constexpr (sizeof(T) == sizeof(long long int)) { + return __popcll(x); + } else { + return __popc(x); + } +#elif defined(KOKKOS_ENABLE_SYCL) + return sycl::popcount(x); +#else + return popcount_fallback(x); +#endif +} + +template +KOKKOS_IMPL_HOST_FUNCTION + std::enable_if_t, int> + popcount_builtin_host(T x) noexcept { +#ifdef KOKKOS_IMPL_USE_GCC_BUILT_IN_FUNCTIONS + if constexpr (std::is_same_v) { + return __builtin_popcountll(x); + } else if constexpr (std::is_same_v) { + return __builtin_popcountl(x); + } else { + return __builtin_popcount(x); + } +#else + return popcount_fallback(x); +#endif +} + +#undef KOKKOS_IMPL_USE_GCC_BUILT_IN_FUNCTIONS + +} // namespace Kokkos::Impl + +namespace Kokkos::Experimental { + +template +KOKKOS_FUNCTION std::enable_if_t< + ::Kokkos::Impl::is_standard_unsigned_integer_type_v, int> +countl_zero_builtin(T x) noexcept { + KOKKOS_IF_ON_DEVICE((return ::Kokkos::Impl::countl_zero_builtin_device(x);)) + KOKKOS_IF_ON_HOST((return ::Kokkos::Impl::countl_zero_builtin_host(x);)) +} + +template +KOKKOS_FUNCTION std::enable_if_t< + ::Kokkos::Impl::is_standard_unsigned_integer_type_v, int> +countl_one_builtin(T x) noexcept { + if (x == finite_max_v) return digits_v; + return countl_zero_builtin(static_cast(~x)); +} + +template +KOKKOS_FUNCTION std::enable_if_t< + ::Kokkos::Impl::is_standard_unsigned_integer_type_v, int> +countr_zero_builtin(T x) noexcept { + KOKKOS_IF_ON_DEVICE((return ::Kokkos::Impl::countr_zero_builtin_device(x);)) + KOKKOS_IF_ON_HOST((return ::Kokkos::Impl::countr_zero_builtin_host(x);)) +} + +template +KOKKOS_FUNCTION std::enable_if_t< + ::Kokkos::Impl::is_standard_unsigned_integer_type_v, int> +countr_one_builtin(T x) noexcept { 
+ if (x == finite_max_v) return digits_v; + return countr_zero_builtin(static_cast(~x)); +} + +template +KOKKOS_FUNCTION std::enable_if_t< + ::Kokkos::Impl::is_standard_unsigned_integer_type_v, int> +popcount_builtin(T x) noexcept { + KOKKOS_IF_ON_DEVICE((return ::Kokkos::Impl::popcount_builtin_device(x);)) + KOKKOS_IF_ON_HOST((return ::Kokkos::Impl::popcount_builtin_host(x);)) +} + +template +KOKKOS_FUNCTION std::enable_if_t< + ::Kokkos::Impl::is_standard_unsigned_integer_type_v, bool> +has_single_bit_builtin(T x) noexcept { + return has_single_bit(x); // no benefit to call the _builtin variant +} + +template +KOKKOS_FUNCTION + std::enable_if_t<::Kokkos::Impl::is_standard_unsigned_integer_type_v, T> + bit_ceil_builtin(T x) noexcept { + if (x <= 1) return 1; + return T{1} << (digits_v - countl_zero_builtin(static_cast(x - 1))); +} + +template +KOKKOS_FUNCTION + std::enable_if_t<::Kokkos::Impl::is_standard_unsigned_integer_type_v, T> + bit_floor_builtin(T x) noexcept { + if (x == 0) return 0; + return T{1} << (digits_v - 1 - countl_zero_builtin(x)); +} + +template +KOKKOS_FUNCTION + std::enable_if_t<::Kokkos::Impl::is_standard_unsigned_integer_type_v, T> + bit_width_builtin(T x) noexcept { + if (x == 0) return 0; + return digits_v - countl_zero_builtin(x); +} + +} // namespace Kokkos::Experimental + #endif diff --git a/core/unit_test/CMakeLists.txt b/core/unit_test/CMakeLists.txt index 875a12df1b..d2531a3fc7 100644 --- a/core/unit_test/CMakeLists.txt +++ b/core/unit_test/CMakeLists.txt @@ -112,6 +112,7 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;OpenMPTarget;OpenACC;HIP;SYCL) # file. That then exceeded the shell command line max length. 
set(${Tag}_SOURCES1A) foreach(Name + BitManipulationBuiltins AtomicOperations_int AtomicOperations_unsignedint AtomicOperations_longint diff --git a/core/unit_test/TestBitManipulationBuiltins.hpp b/core/unit_test/TestBitManipulationBuiltins.hpp new file mode 100644 index 0000000000..418cbd193b --- /dev/null +++ b/core/unit_test/TestBitManipulationBuiltins.hpp @@ -0,0 +1,419 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include + +#include + +// clang-format off +template +struct type_helper; +#define DEFINE_TYPE_NAME(T) \ +template <> struct type_helper { static char const * name() { return #T; } }; +DEFINE_TYPE_NAME(unsigned char) +DEFINE_TYPE_NAME(unsigned short) +DEFINE_TYPE_NAME(unsigned int) +DEFINE_TYPE_NAME(unsigned long) +DEFINE_TYPE_NAME(unsigned long long) +#undef DEFINE_TYPE_NAME +// clang-format on + +#define DEFINE_BIT_MANIPULATION_FUNCTION_EVAL(FUNC) \ + struct BitManipFunction_##FUNC { \ + template \ + static KOKKOS_FUNCTION auto eval_constexpr(T x) { \ + return Kokkos::FUNC(x); \ + } \ + template \ + static KOKKOS_FUNCTION auto eval_builtin(T x) { \ + return Kokkos::Experimental::FUNC##_builtin(x); \ + } \ + static char const* name() { return #FUNC; } \ + } + +DEFINE_BIT_MANIPULATION_FUNCTION_EVAL(countl_zero); +DEFINE_BIT_MANIPULATION_FUNCTION_EVAL(countl_one); +DEFINE_BIT_MANIPULATION_FUNCTION_EVAL(countr_zero); +DEFINE_BIT_MANIPULATION_FUNCTION_EVAL(countr_one); +DEFINE_BIT_MANIPULATION_FUNCTION_EVAL(popcount); 
+DEFINE_BIT_MANIPULATION_FUNCTION_EVAL(has_single_bit); +DEFINE_BIT_MANIPULATION_FUNCTION_EVAL(bit_ceil); +DEFINE_BIT_MANIPULATION_FUNCTION_EVAL(bit_floor); +DEFINE_BIT_MANIPULATION_FUNCTION_EVAL(bit_width); + +template +struct TestBitManipFunction { + Arg val_[N]; + TestBitManipFunction(const Arg (&val)[N]) { + std::copy(val, val + N, val_); + run(); + } + void run() const { + int errors = 0; + Kokkos::parallel_reduce(Kokkos::RangePolicy(0, N), *this, errors); + ASSERT_EQ(errors, 0) << "Failed check no error for " << Func::name() << "(" + << type_helper::name() << ")"; + } + KOKKOS_FUNCTION void operator()(int i, int& e) const { + if (Func::eval_builtin(val_[i]) != Func::eval_constexpr(val_[i])) { + ++e; + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "value at %x which is %d was expected to be %d\n", (unsigned)val_[i], + (int)Func::eval_builtin(val_[i]), (int)Func::eval_constexpr(val_[i])); + } + } +}; + +template +void do_test_bit_manip_function(const Arg (&x)[N]) { + (void)std::initializer_list{ + (TestBitManipFunction(x), 0)...}; +} + +#define TEST_BIT_MANIP_FUNCTION(FUNC) \ + do_test_bit_manip_function + +template +void test_bit_manip_countl_zero() { + using Kokkos::Experimental::countl_zero_builtin; + static_assert(noexcept(countl_zero_builtin(UInt()))); + static_assert(std::is_same_v); + constexpr auto max = Kokkos::Experimental::finite_max_v; + TEST_BIT_MANIP_FUNCTION(countl_zero) + ({ + UInt(0), + UInt(1), + UInt(2), + UInt(3), + UInt(4), + UInt(5), + UInt(6), + UInt(7), + UInt(8), + UInt(9), + UInt(127), + UInt(128), + UInt(max), + }); +} + +TEST(TEST_CATEGORY, bit_manip_countl_zero) { + test_bit_manip_countl_zero(); + test_bit_manip_countl_zero(); + test_bit_manip_countl_zero(); + test_bit_manip_countl_zero(); + test_bit_manip_countl_zero(); +} + +template +void test_bit_manip_countl_one() { + using Kokkos::Experimental::countl_one_builtin; + static_assert(noexcept(countl_one_builtin(UInt()))); + static_assert(std::is_same_v); + constexpr auto dig = 
Kokkos::Experimental::digits_v; + constexpr auto max = Kokkos::Experimental::finite_max_v; + TEST_BIT_MANIP_FUNCTION(countl_one) + ({ + // clang-format off + UInt(0), + UInt(1), + UInt(2), + UInt(3), + UInt(4), + UInt(5), + UInt(6), + UInt(7), + UInt(8), + UInt(9), + UInt(100), + UInt(127), + UInt(128), + UInt(max), + UInt(max - 1), + UInt(max - 2), + UInt(max - 3), + UInt(max - 4), + UInt(max - 5), + UInt(max - 6), + UInt(max - 7), + UInt(max - 8), + UInt(max - 9), + UInt(max - 126), + UInt(max - 127), + UInt(max - 128), + UInt(UInt(1) << (dig - 1)), + UInt(UInt(3) << (dig - 2)), + UInt(UInt(7) << (dig - 3)), + UInt(UInt(255) << (dig - 8)), + // clang-format on + }); +} + +TEST(TEST_CATEGORY, bit_manip_countl_one) { + test_bit_manip_countl_one(); + test_bit_manip_countl_one(); + test_bit_manip_countl_one(); + test_bit_manip_countl_one(); + test_bit_manip_countl_one(); +} + +template +void test_bit_manip_countr_zero() { + using Kokkos::Experimental::countr_zero_builtin; + static_assert(noexcept(countr_zero_builtin(UInt()))); + static_assert(std::is_same_v); + constexpr auto max = Kokkos::Experimental::finite_max_v; + TEST_BIT_MANIP_FUNCTION(countr_zero) + ({ + UInt(0), + UInt(1), + UInt(2), + UInt(3), + UInt(4), + UInt(5), + UInt(6), + UInt(7), + UInt(8), + UInt(9), + UInt(126), + UInt(127), + UInt(128), + UInt(129), + UInt(130), + UInt(max), + }); +} + +TEST(TEST_CATEGORY, bit_manip_countr_zero) { + test_bit_manip_countr_zero(); + test_bit_manip_countr_zero(); + test_bit_manip_countr_zero(); + test_bit_manip_countr_zero(); + test_bit_manip_countr_zero(); +} + +template +void test_bit_manip_countr_one() { + using Kokkos::Experimental::countr_one_builtin; + static_assert(noexcept(countr_one_builtin(UInt()))); + static_assert(std::is_same_v); + constexpr auto max = Kokkos::Experimental::finite_max_v; + TEST_BIT_MANIP_FUNCTION(countr_one) + ({ + UInt(0), + UInt(1), + UInt(2), + UInt(3), + UInt(4), + UInt(5), + UInt(6), + UInt(7), + UInt(8), + UInt(9), + UInt(126), + 
UInt(127), + UInt(128), + UInt(max - 1), + UInt(max), + }); +} + +TEST(TEST_CATEGORY, bit_manip_countr_one) { + test_bit_manip_countr_one(); + test_bit_manip_countr_one(); + test_bit_manip_countr_one(); + test_bit_manip_countr_one(); + test_bit_manip_countr_one(); +} + +template +void test_bit_manip_popcount() { + using Kokkos::Experimental::popcount_builtin; + static_assert(noexcept(popcount_builtin(UInt()))); + static_assert(std::is_same_v); + constexpr auto max = Kokkos::Experimental::finite_max_v; + TEST_BIT_MANIP_FUNCTION(popcount) + ({ + UInt(0), + UInt(1), + UInt(2), + UInt(3), + UInt(4), + UInt(5), + UInt(6), + UInt(7), + UInt(8), + UInt(9), + UInt(127), + UInt(max), + UInt(max - 1), + }); +} + +TEST(TEST_CATEGORY, bit_manip_popcount) { + test_bit_manip_popcount(); + test_bit_manip_popcount(); + test_bit_manip_popcount(); + test_bit_manip_popcount(); + test_bit_manip_popcount(); +} + +template +void test_bit_manip_has_single_bit() { + using Kokkos::Experimental::has_single_bit_builtin; + static_assert(noexcept(has_single_bit_builtin(UInt()))); + static_assert(std::is_same_v); + constexpr auto max = Kokkos::Experimental::finite_max_v; + constexpr UInt one = 1; + TEST_BIT_MANIP_FUNCTION(has_single_bit) + ({ + // clang-format off + UInt(0), + UInt(1), + UInt(2), + UInt(3), + UInt(4), + UInt(5), + UInt(6), + UInt(7), + UInt(8), + UInt(9), + UInt(max), + UInt(one << 0), + UInt(one << 1), + UInt(one << 2), + UInt(one << 3), + UInt(one << 4), + UInt(one << 5), + UInt(one << 6), + UInt(one << 7), + // clang-format on + }); +} + +TEST(TEST_CATEGORY, bit_manip_has_single_bit) { + test_bit_manip_has_single_bit(); + test_bit_manip_has_single_bit(); + test_bit_manip_has_single_bit(); + test_bit_manip_has_single_bit(); + test_bit_manip_has_single_bit(); +} + +template +void test_bit_manip_bit_floor() { + using Kokkos::Experimental::bit_floor_builtin; + static_assert(noexcept(bit_floor_builtin(UInt()))); + static_assert(std::is_same_v); + constexpr auto max = 
Kokkos::Experimental::finite_max_v; + TEST_BIT_MANIP_FUNCTION(bit_floor) + ({ + UInt(0), + UInt(1), + UInt(2), + UInt(3), + UInt(4), + UInt(5), + UInt(6), + UInt(7), + UInt(8), + UInt(9), + UInt(125), + UInt(126), + UInt(127), + UInt(128), + UInt(129), + UInt(max), + }); +} + +TEST(TEST_CATEGORY, bit_manip_bit_floor) { + test_bit_manip_bit_floor(); + test_bit_manip_bit_floor(); + test_bit_manip_bit_floor(); + test_bit_manip_bit_floor(); + test_bit_manip_bit_floor(); +} + +template +void test_bit_manip_bit_ceil() { + using Kokkos::Experimental::bit_ceil_builtin; + static_assert(noexcept(bit_ceil_builtin(UInt()))); + static_assert(std::is_same_v); + TEST_BIT_MANIP_FUNCTION(bit_ceil) + ({ + // clang-format off + UInt(0), + UInt(1), + UInt(2), + UInt(3), + UInt(4), + UInt(5), + UInt(6), + UInt(7), + UInt(8), + UInt(9), + UInt(60), + UInt(61), + UInt(62), + UInt(63), + UInt(64), + UInt(65), + UInt(66), + UInt(67), + UInt(68), + UInt(69), + // clang-format on + }); +} + +TEST(TEST_CATEGORY, bit_manip_bit_ceil) { + test_bit_manip_bit_ceil(); + test_bit_manip_bit_ceil(); + test_bit_manip_bit_ceil(); + test_bit_manip_bit_ceil(); + test_bit_manip_bit_ceil(); +} + +template +void test_bit_manip_bit_width() { + using Kokkos::Experimental::bit_width_builtin; + static_assert(noexcept(bit_width_builtin(UInt()))); + static_assert(std::is_same_v); + constexpr auto max = Kokkos::Experimental::finite_max_v; + TEST_BIT_MANIP_FUNCTION(bit_width) + ({ + UInt(0), + UInt(1), + UInt(2), + UInt(3), + UInt(4), + UInt(5), + UInt(6), + UInt(7), + UInt(8), + UInt(9), + UInt(max - 1), + UInt(max), + }); +} + +TEST(TEST_CATEGORY, bit_manip_bit_width) { + test_bit_manip_bit_width(); + test_bit_manip_bit_width(); + test_bit_manip_bit_width(); + test_bit_manip_bit_width(); + test_bit_manip_bit_width(); +} From 75a3e80efe09d7fb4214f22443672d1f21700103 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Wed, 22 Feb 2023 10:20:12 -0500 Subject: [PATCH 236/496] Disable uchar test to work around broken 
sycl::ctz on NVIDIA GPUs --- core/unit_test/TestBitManipulationBuiltins.hpp | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/core/unit_test/TestBitManipulationBuiltins.hpp b/core/unit_test/TestBitManipulationBuiltins.hpp index 418cbd193b..29bce726fd 100644 --- a/core/unit_test/TestBitManipulationBuiltins.hpp +++ b/core/unit_test/TestBitManipulationBuiltins.hpp @@ -198,7 +198,11 @@ void test_bit_manip_countr_zero() { } TEST(TEST_CATEGORY, bit_manip_countr_zero) { - test_bit_manip_countr_zero(); +#if defined(KOKKOS_ENABLE_SYCL) && \ + !defined(KOKKOS_ARCH_INTEL_GPU) // FIXME_SYCL returns wrong result + if (!std::is_same_v) +#endif + test_bit_manip_countr_zero(); test_bit_manip_countr_zero(); test_bit_manip_countr_zero(); test_bit_manip_countr_zero(); @@ -232,7 +236,11 @@ void test_bit_manip_countr_one() { } TEST(TEST_CATEGORY, bit_manip_countr_one) { - test_bit_manip_countr_one(); +#if defined(KOKKOS_ENABLE_SYCL) && \ + !defined(KOKKOS_ARCH_INTEL_GPU) // FIXME_SYCL returns wrong result + if (!std::is_same_v) +#endif + test_bit_manip_countr_one(); test_bit_manip_countr_one(); test_bit_manip_countr_one(); test_bit_manip_countr_one(); From e53f2240136364d30c52ed536c7d9364777a1348 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Thu, 16 Feb 2023 17:11:12 -0500 Subject: [PATCH 237/496] Cleanup prefer {traits:: -> }rank[_dynamic] --- core/src/Kokkos_View.hpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/core/src/Kokkos_View.hpp b/core/src/Kokkos_View.hpp index 356a293299..321bb56e3c 100644 --- a/core/src/Kokkos_View.hpp +++ b/core/src/Kokkos_View.hpp @@ -1815,12 +1815,12 @@ KOKKOS_INLINE_FUNCTION bool operator==(const View& lhs, typename rhs_traits::array_layout>::value && std::is_same::value && - unsigned(lhs_traits::rank) == unsigned(rhs_traits::rank) && - lhs.data() == rhs.data() && lhs.span() == rhs.span() && - lhs.extent(0) == rhs.extent(0) && lhs.extent(1) == rhs.extent(1) && - lhs.extent(2) == 
rhs.extent(2) && lhs.extent(3) == rhs.extent(3) && - lhs.extent(4) == rhs.extent(4) && lhs.extent(5) == rhs.extent(5) && - lhs.extent(6) == rhs.extent(6) && lhs.extent(7) == rhs.extent(7); + lhs.rank() == rhs.rank() && lhs.data() == rhs.data() && + lhs.span() == rhs.span() && lhs.extent(0) == rhs.extent(0) && + lhs.extent(1) == rhs.extent(1) && lhs.extent(2) == rhs.extent(2) && + lhs.extent(3) == rhs.extent(3) && lhs.extent(4) == rhs.extent(4) && + lhs.extent(5) == rhs.extent(5) && lhs.extent(6) == rhs.extent(6) && + lhs.extent(7) == rhs.extent(7); } template From 5b9f300bf9fedd3350c4ef933a443536dc2532d0 Mon Sep 17 00:00:00 2001 From: Christian Trott Date: Fri, 17 Feb 2023 09:36:35 -0700 Subject: [PATCH 238/496] Fix another error with MSVC where we need to use rank() --- core/src/Kokkos_View.hpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/core/src/Kokkos_View.hpp b/core/src/Kokkos_View.hpp index 321bb56e3c..4eb1749833 100644 --- a/core/src/Kokkos_View.hpp +++ b/core/src/Kokkos_View.hpp @@ -1815,7 +1815,8 @@ KOKKOS_INLINE_FUNCTION bool operator==(const View& lhs, typename rhs_traits::array_layout>::value && std::is_same::value && - lhs.rank() == rhs.rank() && lhs.data() == rhs.data() && + View::rank() == View::rank() && + lhs.data() == rhs.data() && lhs.span() == rhs.span() && lhs.extent(0) == rhs.extent(0) && lhs.extent(1) == rhs.extent(1) && lhs.extent(2) == rhs.extent(2) && lhs.extent(3) == rhs.extent(3) && lhs.extent(4) == rhs.extent(4) && From 47844ce3dde3c4188872430383e612373fb454e6 Mon Sep 17 00:00:00 2001 From: Christian Trott Date: Wed, 22 Feb 2023 09:28:18 -0700 Subject: [PATCH 239/496] Fix more rank style changes in MSVC/CUDA build --- containers/src/Kokkos_DynRankView.hpp | 4 ++-- containers/src/Kokkos_OffsetView.hpp | 2 +- core/src/Kokkos_View.hpp | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/containers/src/Kokkos_DynRankView.hpp b/containers/src/Kokkos_DynRankView.hpp index 0ff6b43f50..5b47323cb7 100644 
--- a/containers/src/Kokkos_DynRankView.hpp +++ b/containers/src/Kokkos_DynRankView.hpp @@ -346,7 +346,7 @@ class ViewMapping< dst.m_map.m_impl_handle = Kokkos::Impl::ViewDataHandle::assign( src.m_map.m_impl_handle, src.m_track.m_tracker); dst.m_track.assign(src.m_track.m_tracker, DstTraits::is_managed); - dst.m_rank = src.rank; + dst.m_rank = Kokkos::View::rank(); } }; @@ -1025,7 +1025,7 @@ class DynRankView : public ViewTraits { // Copy/Assign View to DynRankView template KOKKOS_INLINE_FUNCTION DynRankView(const View& rhs) - : m_track(), m_map(), m_rank(rhs.rank) { + : m_track(), m_map(), m_rank(View::rank()) { using SrcTraits = typename View::traits; using Mapping = Kokkos::Impl::ViewMapping { "Incompatible OffsetView copy construction"); Mapping::assign(m_map, aview.impl_map(), m_track); - for (size_t i = 0; i < aview.rank; ++i) { + for (size_t i = 0; i < View::rank(); ++i) { m_begins[i] = 0; } } diff --git a/core/src/Kokkos_View.hpp b/core/src/Kokkos_View.hpp index 4eb1749833..6c528d2b1b 100644 --- a/core/src/Kokkos_View.hpp +++ b/core/src/Kokkos_View.hpp @@ -1688,7 +1688,7 @@ class View : public ViewTraits { template KOKKOS_INLINE_FUNCTION constexpr unsigned rank(const View& V) { - return V.rank(); + return View::rank(); } namespace Impl { From 40750093ede88a49eaab0a84bf5fe1d0cb2a7996 Mon Sep 17 00:00:00 2001 From: Christian Trott Date: Wed, 22 Feb 2023 09:28:46 -0700 Subject: [PATCH 240/496] Work around a failing CTAD occurance on MSVC/CUDA --- core/unit_test/cuda/TestCuda_Spaces.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/core/unit_test/cuda/TestCuda_Spaces.cpp b/core/unit_test/cuda/TestCuda_Spaces.cpp index 407aa60a0a..d71984a75a 100644 --- a/core/unit_test/cuda/TestCuda_Spaces.cpp +++ b/core/unit_test/cuda/TestCuda_Spaces.cpp @@ -374,7 +374,8 @@ template View create_view() { using execution_space = typename View::execution_space; View view("", 10); - InitFunctor iota(view); + // MSVC+CUDA errors on CTAD here + InitFunctor 
iota(view); Kokkos::parallel_for("test_view_subview_const_randomaccess", Kokkos::RangePolicy(0, view.extent(0)), iota); From 43ec33eabc7e6136d2ee90fd70ede7afb45be514 Mon Sep 17 00:00:00 2001 From: Christian Trott Date: Wed, 22 Feb 2023 09:29:50 -0700 Subject: [PATCH 241/496] Work around a bug in MSVC/CUDA in a function. Nesting the functions in some partial specialized classes seems to make the issue go away ... --- core/src/impl/Kokkos_ViewCtor.hpp | 46 +++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/core/src/impl/Kokkos_ViewCtor.hpp b/core/src/impl/Kokkos_ViewCtor.hpp index 92c17d0cf5..25ab290f94 100644 --- a/core/src/impl/Kokkos_ViewCtor.hpp +++ b/core/src/impl/Kokkos_ViewCtor.hpp @@ -234,6 +234,8 @@ struct ViewCtorProp : public ViewCtorProp... { } }; + +#if !defined(KOKKOS_COMPILER_MSVC) || !defined(KOKKOS_COMPILER_NVCC) template auto with_properties_if_unset(const ViewCtorProp &view_ctor_prop) { return view_ctor_prop; @@ -274,6 +276,50 @@ auto with_properties_if_unset(const ViewCtorProp &view_ctor_prop, #endif #endif } +#else + +template +struct WithPropertiesIfUnset; + +template +struct WithPropertiesIfUnset { + static constexpr auto apply_prop(const ViewCtorP &view_ctor_prop) { + return view_ctor_prop; + } +}; + +template +struct WithPropertiesIfUnset, Property, Properties...> { + static constexpr auto apply_prop(const ViewCtorProp &view_ctor_prop, const Property& prop, const Properties&... 
properties) { + if constexpr ((is_execution_space::value && + !ViewCtorProp::has_execution_space) || + (is_memory_space::value && + !ViewCtorProp::has_memory_space) || + (is_view_label::value && + !ViewCtorProp::has_label) || + (std::is_same_v && + ViewCtorProp::initialize)) { + using NewViewCtorProp = ViewCtorProp; + NewViewCtorProp new_view_ctor_prop(view_ctor_prop); + static_cast &>(new_view_ctor_prop).value = + prop; + return WithPropertiesIfUnset::apply_prop(new_view_ctor_prop, properties...); + } else + return WithPropertiesIfUnset, Properties...>::apply_prop( + view_ctor_prop, properties...); + + } +}; + + +template +auto with_properties_if_unset(const ViewCtorProp &view_ctor_prop, const Properties& ... properties) { + return WithPropertiesIfUnset, Properties...>::apply_prop( + view_ctor_prop, properties...); +} + + +#endif struct ExecutionSpaceTag {}; struct MemorySpaceTag {}; From c74aa417d2da22c4a9a0ffdd60715e3baf22157d Mon Sep 17 00:00:00 2001 From: Christian Trott Date: Wed, 22 Feb 2023 09:31:15 -0700 Subject: [PATCH 242/496] Split math function test further, to work around compilation issue with MSVC/CUDA --- core/unit_test/CMakeLists.txt | 3 +++ core/unit_test/TestMathematicalFunctions.hpp | 8 ++++++- core/unit_test/TestMathematicalFunctions1.hpp | 2 ++ core/unit_test/TestMathematicalFunctions2.hpp | 2 ++ core/unit_test/TestMathematicalFunctions3.hpp | 21 +++++++++++++++++++ 5 files changed, 35 insertions(+), 1 deletion(-) create mode 100644 core/unit_test/TestMathematicalFunctions3.hpp diff --git a/core/unit_test/CMakeLists.txt b/core/unit_test/CMakeLists.txt index 588fb219f8..dc374711e6 100644 --- a/core/unit_test/CMakeLists.txt +++ b/core/unit_test/CMakeLists.txt @@ -140,6 +140,7 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;OpenMPTarget;OpenACC;HIP;SYCL) MathematicalConstants MathematicalFunctions1 MathematicalFunctions2 + MathematicalFunctions3 MDRange_a MDRange_b MDRange_c @@ -524,6 +525,7 @@ IF(KOKKOS_ENABLE_OPENACC AND KOKKOS_CXX_COMPILER_ID 
STREQUAL NVHPC) ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_MathematicalFunctions.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_MathematicalFunctions1.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_MathematicalFunctions2.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_MathematicalFunctions3.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_SubView_a.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_SubView_b.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_SubView_c01.cpp @@ -1125,6 +1127,7 @@ KOKKOS_ADD_EXECUTABLE_AND_TEST( # This test is not properly set up to run within Trilinos if (NOT KOKKOS_HAS_TRILINOS) + SET_SOURCE_FILES_PROPERTIES(UnitTest_DeviceAndThreads.cpp PROPERTIES LANGUAGE ${KOKKOS_COMPILE_LANGUAGE}) add_executable(KokkosCore_UnitTest_DeviceAndThreads UnitTest_DeviceAndThreads.cpp) target_link_libraries(KokkosCore_UnitTest_DeviceAndThreads Kokkos::kokkoscore) find_package(Python3 COMPONENTS Interpreter) diff --git a/core/unit_test/TestMathematicalFunctions.hpp b/core/unit_test/TestMathematicalFunctions.hpp index dfcd2340a2..fe7074d5a1 100644 --- a/core/unit_test/TestMathematicalFunctions.hpp +++ b/core/unit_test/TestMathematicalFunctions.hpp @@ -311,7 +311,7 @@ struct math_function_name; }; \ constexpr char math_function_name::name[] -#ifndef KOKKOS_MATHEMATICAL_FUNCTIONS_SKIP_1 +#ifndef KOKKOS_MATHEMATICAL_FUNCTIONS_SKIP_3 // Generally the expected ULP error should come from here: // https://www.gnu.org/software/libc/manual/html_node/Errors-in-Math-Functions.html // For now 1s largely seem to work ... 
@@ -327,7 +327,9 @@ DEFINE_UNARY_FUNCTION_EVAL(log, 2); DEFINE_UNARY_FUNCTION_EVAL(log10, 2); DEFINE_UNARY_FUNCTION_EVAL(log2, 2); DEFINE_UNARY_FUNCTION_EVAL(log1p, 2); +#endif +#ifndef KOKKOS_MATHEMATICAL_FUNCTIONS_SKIP_1 DEFINE_UNARY_FUNCTION_EVAL(sqrt, 2); DEFINE_UNARY_FUNCTION_EVAL(cbrt, 2); @@ -729,7 +731,9 @@ TEST(TEST_CATEGORY, mathematical_functions_fma) { do_test_math_ternary_function(2.l, 3.l, 4.l); #endif } +#endif +#ifndef KOKKOS_MATHEMATICAL_FUNCTIONS_SKIP_3 TEST(TEST_CATEGORY, mathematical_functions_exponential_functions) { TEST_MATH_FUNCTION(exp)({-9, -8, -7, -6, -5, 4, 3, 2, 1, 0}); TEST_MATH_FUNCTION(exp)({-9l, -8l, -7l, -6l, -5l, 4l, 3l, 2l, 1l, 0l}); @@ -821,7 +825,9 @@ TEST(TEST_CATEGORY, mathematical_functions_exponential_functions) { TEST_MATH_FUNCTION(log1p)({1234.l, 567.l, 89.l, -.007l}); #endif } +#endif +#ifndef KOKKOS_MATHEMATICAL_FUNCTIONS_SKIP_1 TEST(TEST_CATEGORY, mathematical_functions_hyperbolic_functions) { TEST_MATH_FUNCTION(sinh)({-3, -2, -1, 0, 1}); TEST_MATH_FUNCTION(sinh)({-3l, -2l, -1l, 0l, 1l}); diff --git a/core/unit_test/TestMathematicalFunctions1.hpp b/core/unit_test/TestMathematicalFunctions1.hpp index d902a04422..7452d45e42 100644 --- a/core/unit_test/TestMathematicalFunctions1.hpp +++ b/core/unit_test/TestMathematicalFunctions1.hpp @@ -15,5 +15,7 @@ //@HEADER #define KOKKOS_MATHEMATICAL_FUNCTIONS_SKIP_2 +#define KOKKOS_MATHEMATICAL_FUNCTIONS_SKIP_3 #include "TestMathematicalFunctions.hpp" #undef KOKKOS_MATHEMATICAL_FUNCTIONS_SKIP_2 +#undef KOKKOS_MATHEMATICAL_FUNCTIONS_SKIP_3 diff --git a/core/unit_test/TestMathematicalFunctions2.hpp b/core/unit_test/TestMathematicalFunctions2.hpp index 58572ebe6f..72f792b089 100644 --- a/core/unit_test/TestMathematicalFunctions2.hpp +++ b/core/unit_test/TestMathematicalFunctions2.hpp @@ -15,5 +15,7 @@ //@HEADER #define KOKKOS_MATHEMATICAL_FUNCTIONS_SKIP_1 +#define KOKKOS_MATHEMATICAL_FUNCTIONS_SKIP_3 #include "TestMathematicalFunctions.hpp" #undef KOKKOS_MATHEMATICAL_FUNCTIONS_SKIP_1 
+#undef KOKKOS_MATHEMATICAL_FUNCTIONS_SKIP_3 diff --git a/core/unit_test/TestMathematicalFunctions3.hpp b/core/unit_test/TestMathematicalFunctions3.hpp new file mode 100644 index 0000000000..3d7b356367 --- /dev/null +++ b/core/unit_test/TestMathematicalFunctions3.hpp @@ -0,0 +1,21 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#define KOKKOS_MATHEMATICAL_FUNCTIONS_SKIP_1 +#define KOKKOS_MATHEMATICAL_FUNCTIONS_SKIP_2 +#include "TestMathematicalFunctions.hpp" +#undef KOKKOS_MATHEMATICAL_FUNCTIONS_SKIP_1 +#undef KOKKOS_MATHEMATICAL_FUNCTIONS_SKIP_2 From 66e1437039e9e28f23b6427d6499b9fae69fffbe Mon Sep 17 00:00:00 2001 From: Christian Trott Date: Wed, 22 Feb 2023 11:34:49 -0700 Subject: [PATCH 243/496] Apply clang-format --- core/src/Kokkos_View.hpp | 11 +++++------ core/src/impl/Kokkos_ViewCtor.hpp | 23 ++++++++++++----------- 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/core/src/Kokkos_View.hpp b/core/src/Kokkos_View.hpp index 6c528d2b1b..8da99326d1 100644 --- a/core/src/Kokkos_View.hpp +++ b/core/src/Kokkos_View.hpp @@ -1816,12 +1816,11 @@ KOKKOS_INLINE_FUNCTION bool operator==(const View& lhs, std::is_same::value && View::rank() == View::rank() && - lhs.data() == rhs.data() && - lhs.span() == rhs.span() && lhs.extent(0) == rhs.extent(0) && - lhs.extent(1) == rhs.extent(1) && lhs.extent(2) == rhs.extent(2) && - lhs.extent(3) == rhs.extent(3) && lhs.extent(4) == rhs.extent(4) && - lhs.extent(5) == rhs.extent(5) && lhs.extent(6) == rhs.extent(6) && - 
lhs.extent(7) == rhs.extent(7); + lhs.data() == rhs.data() && lhs.span() == rhs.span() && + lhs.extent(0) == rhs.extent(0) && lhs.extent(1) == rhs.extent(1) && + lhs.extent(2) == rhs.extent(2) && lhs.extent(3) == rhs.extent(3) && + lhs.extent(4) == rhs.extent(4) && lhs.extent(5) == rhs.extent(5) && + lhs.extent(6) == rhs.extent(6) && lhs.extent(7) == rhs.extent(7); } template diff --git a/core/src/impl/Kokkos_ViewCtor.hpp b/core/src/impl/Kokkos_ViewCtor.hpp index 25ab290f94..e1b8ba86a5 100644 --- a/core/src/impl/Kokkos_ViewCtor.hpp +++ b/core/src/impl/Kokkos_ViewCtor.hpp @@ -234,7 +234,6 @@ struct ViewCtorProp : public ViewCtorProp... { } }; - #if !defined(KOKKOS_COMPILER_MSVC) || !defined(KOKKOS_COMPILER_NVCC) template auto with_properties_if_unset(const ViewCtorProp &view_ctor_prop) { @@ -278,7 +277,7 @@ auto with_properties_if_unset(const ViewCtorProp &view_ctor_prop, } #else -template +template struct WithPropertiesIfUnset; template @@ -290,7 +289,9 @@ struct WithPropertiesIfUnset { template struct WithPropertiesIfUnset, Property, Properties...> { - static constexpr auto apply_prop(const ViewCtorProp &view_ctor_prop, const Property& prop, const Properties&... properties) { + static constexpr auto apply_prop(const ViewCtorProp &view_ctor_prop, + const Property &prop, + const Properties &... 
properties) { if constexpr ((is_execution_space::value && !ViewCtorProp::has_execution_space) || (is_memory_space::value && @@ -303,22 +304,22 @@ struct WithPropertiesIfUnset, Property, Properties...> { NewViewCtorProp new_view_ctor_prop(view_ctor_prop); static_cast &>(new_view_ctor_prop).value = prop; - return WithPropertiesIfUnset::apply_prop(new_view_ctor_prop, properties...); + return WithPropertiesIfUnset::apply_prop( + new_view_ctor_prop, properties...); } else - return WithPropertiesIfUnset, Properties...>::apply_prop( - view_ctor_prop, properties...); - + return WithPropertiesIfUnset, + Properties...>::apply_prop(view_ctor_prop, + properties...); } }; - -template -auto with_properties_if_unset(const ViewCtorProp &view_ctor_prop, const Properties& ... properties) { +template +auto with_properties_if_unset(const ViewCtorProp &view_ctor_prop, + const Properties &... properties) { return WithPropertiesIfUnset, Properties...>::apply_prop( view_ctor_prop, properties...); } - #endif struct ExecutionSpaceTag {}; From 89213174099dfab25791ea5192a31a863e0828c8 Mon Sep 17 00:00:00 2001 From: Christian Trott Date: Wed, 22 Feb 2023 11:40:22 -0700 Subject: [PATCH 244/496] Silence unused parameter warning --- core/src/Kokkos_View.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/src/Kokkos_View.hpp b/core/src/Kokkos_View.hpp index 8da99326d1..1e399f9c59 100644 --- a/core/src/Kokkos_View.hpp +++ b/core/src/Kokkos_View.hpp @@ -1687,7 +1687,7 @@ class View : public ViewTraits { }; template -KOKKOS_INLINE_FUNCTION constexpr unsigned rank(const View& V) { +KOKKOS_INLINE_FUNCTION constexpr unsigned rank(const View&) { return View::rank(); } From 4ec9fb6656a0b01b29d0b6279210ed020e188afb Mon Sep 17 00:00:00 2001 From: Richard Berger Date: Thu, 16 Feb 2023 11:42:09 -0700 Subject: [PATCH 245/496] Add AMD ROCm support to hpcbind --- bin/hpcbind | 85 +++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 67 insertions(+), 18 deletions(-) diff 
--git a/bin/hpcbind b/bin/hpcbind index cb2af2c4b5..32503c7efc 100755 --- a/bin/hpcbind +++ b/bin/hpcbind @@ -36,8 +36,14 @@ fi ################################################################################ declare -i HPCBIND_HAS_NVIDIA=0 type nvidia-smi >/dev/null 2>&1 -HPCBIND_HAS_NVIDIA=$((!$?)) +HPCBIND_HAS_NVIDIA=$((! $?)) +################################################################################ +# Check if rocm-smi exist +################################################################################ +declare -i HPCBIND_HAS_AMD=0 +type rocm-smi >/dev/null 2>&1 +HPCBIND_HAS_AMD=$((! $?)) ################################################################################ # Get visible gpu @@ -45,11 +51,30 @@ HPCBIND_HAS_NVIDIA=$((!$?)) declare -i NUM_GPUS=0 HPCBIND_VISIBLE_GPUS="" if [[ ${HPCBIND_HAS_NVIDIA} -eq 1 ]]; then - NUM_GPUS=$(nvidia-smi -L | wc -l); - HPCBIND_HAS_NVIDIA=$((!$?)) + nvidia-smi >/dev/null 2>&1 + HPCBIND_HAS_NVIDIA=$((! $?)) if [[ ${HPCBIND_HAS_NVIDIA} -eq 1 ]]; then - GPU_LIST="$( seq 0 $((NUM_GPUS-1)) )" - HPCBIND_VISIBLE_GPUS=${CUDA_VISIBLE_DEVICES:-${GPU_LIST}} + NUM_GPUS=$(nvidia-smi -L | wc -l); + HPCBIND_HAS_NVIDIA=$((! $?)) + if [[ ${HPCBIND_HAS_NVIDIA} -eq 1 ]]; then + GPU_LIST="$( seq 0 $((NUM_GPUS-1)) )" + HPCBIND_VISIBLE_GPUS=${CUDA_VISIBLE_DEVICES:-${GPU_LIST}} + fi + fi +fi + +if [[ ${HPCBIND_HAS_AMD} -eq 1 ]]; then + # rocm-smi doesn't have an error code if there is no hardware + # check for /sys/module/amdgpu/initstate instead + stat /sys/module/amdgpu/initstate >/dev/null 2>&1 + HPCBIND_HAS_AMD=$((! $?)) + if [[ ${HPCBIND_HAS_AMD} -eq 1 ]]; then + NUM_GPUS=$(rocm-smi -i --csv | sed '/^$/d' | tail -n +2 | wc -l); + HPCBIND_HAS_AMD=$((! $?)) + if [[ ${HPCBIND_HAS_AMD} -eq 1 ]]; then + GPU_LIST="$( seq 0 $((NUM_GPUS-1)) )" + HPCBIND_VISIBLE_GPUS=${ROCR_VISIBLE_DEVICES:-${GPU_LIST}} + fi fi fi @@ -101,8 +126,8 @@ fi function show_help { local cmd=$(basename "$0") echo "Usage: ${cmd} -- command ..." 
- echo " Set the process mask, OMP environment variables and CUDA environment" - echo " variables to sane values if possible. Uses hwloc and nvidia-smi if" + echo " Set the process mask, OMP environment variables and CUDA/ROCm environment" + echo " variables to sane values if possible. Uses hwloc and nvidia-smi/rocm-smi if" echo " available. Will preserve the current process binding, so it is safe" echo " to use with a queuing system or mpiexec." echo "" @@ -116,10 +141,10 @@ function show_help { echo " --distribute-partition=I" echo " Use the i'th partition (zero based)" echo " --visible-gpus= Comma separated list of gpu ids" - echo " Default: CUDA_VISIBLE_DEVICES or all gpus in" + echo " Default: CUDA_VISIBLE_DEVICES/ROCR_VISIBLE_DEVICES or all gpus in" echo " sequential order" echo " --ignore-queue Ignore queue job id when choosing visible GPU and partition" - echo " --no-gpu-mapping Do not set CUDA_VISIBLE_DEVICES" + echo " --no-gpu-mapping Do not set CUDA_VISIBLE_DEVICES/ROCR_VISIBLE_DEVICES" echo " --openmp=M.m Set env variables for the given OpenMP version" echo " Default: 4.0" echo " --openmp-ratio=N/D Ratio of the cpuset to use for OpenMP" @@ -525,13 +550,24 @@ fi ################################################################################ if [[ ${HPCBIND_ENABLE_GPU_MAPPING} -eq 1 ]]; then - if [[ ${HPCBIND_QUEUE_MAPPING} -eq 0 ]]; then - declare -i GPU_ID=$((HPCBIND_PARTITION % NUM_GPUS)) - export CUDA_VISIBLE_DEVICES="${HPCBIND_VISIBLE_GPUS[${GPU_ID}]}" - else - declare -i MY_TASK_ID=$((HPCBIND_QUEUE_RANK * HPCBIND_DISTRIBUTE + HPCBIND_PARTITION)) - declare -i GPU_ID=$((MY_TASK_ID % NUM_GPUS)) - export CUDA_VISIBLE_DEVICES="${HPCBIND_VISIBLE_GPUS[${GPU_ID}]}" + if [[ ${HPCBIND_HAS_NVIDIA} -eq 1 ]]; then + if [[ ${HPCBIND_QUEUE_MAPPING} -eq 0 ]]; then + declare -i GPU_ID=$((HPCBIND_PARTITION % NUM_GPUS)) + export CUDA_VISIBLE_DEVICES="${HPCBIND_VISIBLE_GPUS[${GPU_ID}]}" + else + declare -i MY_TASK_ID=$((HPCBIND_QUEUE_RANK * HPCBIND_DISTRIBUTE + 
HPCBIND_PARTITION)) + declare -i GPU_ID=$((MY_TASK_ID % NUM_GPUS)) + export CUDA_VISIBLE_DEVICES="${HPCBIND_VISIBLE_GPUS[${GPU_ID}]}" + fi + elif [[ ${HPCBIND_HAS_AMD} -eq 1 ]]; then + if [[ ${HPCBIND_QUEUE_MAPPING} -eq 0 ]]; then + declare -i GPU_ID=$((HPCBIND_PARTITION % NUM_GPUS)) + export ROCR_VISIBLE_DEVICES="${HPCBIND_VISIBLE_GPUS[${GPU_ID}]}" + else + declare -i MY_TASK_ID=$((HPCBIND_QUEUE_RANK * HPCBIND_DISTRIBUTE + HPCBIND_PARTITION)) + declare -i GPU_ID=$((MY_TASK_ID % NUM_GPUS)) + export ROCR_VISIBLE_DEVICES="${HPCBIND_VISIBLE_GPUS[${GPU_ID}]}" + fi fi fi @@ -541,6 +577,7 @@ fi export HPCBIND_HWLOC_VERSION=${HPCBIND_HWLOC_VERSION} export HPCBIND_HAS_HWLOC=${HPCBIND_HAS_HWLOC} export HPCBIND_HAS_NVIDIA=${HPCBIND_HAS_NVIDIA} +export HPCBIND_HAS_AMD=${HPCBIND_HAS_AMD} export HPCBIND_NUM_PUS=${HPCBIND_NUM_PUS} export HPCBIND_NUM_CORES=${HPCBIND_NUM_CORES} export HPCBIND_NUM_NUMAS=${HPCBIND_NUM_NUMAS} @@ -555,8 +592,14 @@ else export HPCBIND_HWLOC_PARENT_CPUSET="${HPCBIND_HWLOC_PARENT_CPUSET}" fi export HPCBIND_HWLOC_PROC_BIND="${HPCBIND_PROC_BIND}" -export HPCBIND_NVIDIA_ENABLE_GPU_MAPPING=${HPCBIND_ENABLE_GPU_MAPPING} -export HPCBIND_NVIDIA_VISIBLE_GPUS=$(echo "${HPCBIND_VISIBLE_GPUS[*]}" | tr ' ' ',') +if [[ ${HPCBIND_HAS_NVIDIA} -eq 1 ]]; then + export HPCBIND_NVIDIA_ENABLE_GPU_MAPPING=${HPCBIND_ENABLE_GPU_MAPPING} + export HPCBIND_NVIDIA_VISIBLE_GPUS=$(echo "${HPCBIND_VISIBLE_GPUS[*]}" | tr ' ' ',') +fi +if [[ ${HPCBIND_HAS_AMD} -eq 1 ]]; then + export HPCBIND_AMD_ENABLE_GPU_MAPPING=${HPCBIND_ENABLE_GPU_MAPPING} + export HPCBIND_AMD_VISIBLE_GPUS=$(echo "${HPCBIND_VISIBLE_GPUS[*]}" | tr ' ' ',') +fi export HPCBIND_OPENMP_VERSION="${HPCBIND_OPENMP_VERSION}" if [[ "${HPCBIND_QUEUE_NAME}" != "" ]]; then export HPCBIND_QUEUE_RANK=${HPCBIND_QUEUE_RANK} @@ -580,6 +623,9 @@ if [[ ${HPCBIND_TEE} -eq 0 || ${HPCBIND_VERBOSE} -eq 0 ]]; then echo "${TMP_ENV}" | grep -E "^HWLOC_" >> ${HPCBIND_LOG} echo "[CUDA]" >> ${HPCBIND_LOG} echo "${TMP_ENV}" | grep -E "^CUDA_" >> 
${HPCBIND_LOG} + echo "[ROCM]" >> ${HPCBIND_LOG} + echo "${TMP_ENV}" | grep -E "^ROCM_" >> ${HPCBIND_LOG} + echo "${TMP_ENV}" | grep -E "^ROCR_" >> ${HPCBIND_LOG} echo "[OPENMP]" >> ${HPCBIND_LOG} echo "${TMP_ENV}" | grep -E "^OMP_" >> ${HPCBIND_LOG} echo "[GOMP] (gcc, g++, and gfortran)" >> ${HPCBIND_LOG} @@ -602,6 +648,9 @@ else echo "${TMP_ENV}" | grep -E "^HWLOC_" > >(tee -a ${HPCBIND_LOG}) echo "[CUDA]" > >(tee -a ${HPCBIND_LOG}) echo "${TMP_ENV}" | grep -E "^CUDA_" > >(tee -a ${HPCBIND_LOG}) + echo "[ROCM]" > >(tee -a ${HPCBIND_LOG}) + echo "${TMP_ENV}" | grep -E "^ROCM_" > >(tee -a ${HPCBIND_LOG}) + echo "${TMP_ENV}" | grep -E "^ROCR_" > >(tee -a ${HPCBIND_LOG}) echo "[OPENMP]" > >(tee -a ${HPCBIND_LOG}) echo "${TMP_ENV}" | grep -E "^OMP_" > >(tee -a ${HPCBIND_LOG}) echo "[GOMP] (gcc, g++, and gfortran)" > >(tee -a ${HPCBIND_LOG}) From 22ee14e0ba0fb74c8a22dfe666e6ae3ed73e0571 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Wed, 22 Feb 2023 22:14:35 -0500 Subject: [PATCH 246/496] Implement `rot{l,r}` function templates --- core/src/Kokkos_BitManipulation.hpp | 70 +++++++ core/unit_test/TestBitManipulation.cpp | 99 ++++++++++ .../unit_test/TestBitManipulationBuiltins.hpp | 173 ++++++++++++++++++ 3 files changed, 342 insertions(+) diff --git a/core/src/Kokkos_BitManipulation.hpp b/core/src/Kokkos_BitManipulation.hpp index a0bbac31ae..1f173224e2 100644 --- a/core/src/Kokkos_BitManipulation.hpp +++ b/core/src/Kokkos_BitManipulation.hpp @@ -153,6 +153,32 @@ bit_width(T x) noexcept { } // +// +template +[[nodiscard]] KOKKOS_FUNCTION constexpr std::enable_if_t< + Impl::is_standard_unsigned_integer_type_v, T> +rotl(T x, int s) noexcept { + using Experimental::digits_v; + constexpr auto dig = digits_v; + int const rem = s % dig; + if (rem == 0) return x; + if (rem > 0) return (x << rem) | (x >> ((dig - rem) % dig)); + return (x >> -rem) | (x << ((dig + rem) % dig)); // rotr(x, -rem) +} + +template +[[nodiscard]] KOKKOS_FUNCTION constexpr std::enable_if_t< + 
Impl::is_standard_unsigned_integer_type_v, T> +rotr(T x, int s) noexcept { + using Experimental::digits_v; + constexpr auto dig = digits_v; + int const rem = s % dig; + if (rem == 0) return x; + if (rem > 0) return (x >> rem) | (x << ((dig - rem) % dig)); + return (x << -rem) | (x >> ((dig + rem) % dig)); // rotl(x, -rem) +} +// + } // namespace Kokkos namespace Kokkos::Impl { @@ -278,6 +304,34 @@ KOKKOS_IMPL_HOST_FUNCTION #undef KOKKOS_IMPL_USE_GCC_BUILT_IN_FUNCTIONS +template +KOKKOS_FUNCTION T rotl_builtin_host(T x, int s) noexcept { + return rotl(x, s); +} + +template +KOKKOS_FUNCTION T rotl_builtin_device(T x, int s) noexcept { +#ifdef KOKKOS_ENABLE_SYCL + return sycl::rotate(x, s); +#else + return rotl(x, s); +#endif +} + +template +KOKKOS_FUNCTION T rotr_builtin_host(T x, int s) noexcept { + return rotr(x, s); +} + +template +KOKKOS_FUNCTION T rotr_builtin_device(T x, int s) noexcept { +#ifdef KOKKOS_ENABLE_SYCL + return sycl::rotate(x, -s); +#else + return rotr(x, s); +#endif +} + } // namespace Kokkos::Impl namespace Kokkos::Experimental { @@ -353,6 +407,22 @@ KOKKOS_FUNCTION return digits_v - countl_zero_builtin(x); } +template +[[nodiscard]] KOKKOS_FUNCTION + std::enable_if_t<::Kokkos::Impl::is_standard_unsigned_integer_type_v, T> + rotl_builtin(T x, int s) noexcept { + KOKKOS_IF_ON_DEVICE((return ::Kokkos::Impl::rotl_builtin_device(x, s);)) + KOKKOS_IF_ON_HOST((return ::Kokkos::Impl::rotl_builtin_host(x, s);)) +} + +template +[[nodiscard]] KOKKOS_FUNCTION + std::enable_if_t<::Kokkos::Impl::is_standard_unsigned_integer_type_v, T> + rotr_builtin(T x, int s) noexcept { + KOKKOS_IF_ON_DEVICE((return ::Kokkos::Impl::rotr_builtin_device(x, s);)) + KOKKOS_IF_ON_HOST((return ::Kokkos::Impl::rotr_builtin_host(x, s);)) +} + } // namespace Kokkos::Experimental #endif diff --git a/core/unit_test/TestBitManipulation.cpp b/core/unit_test/TestBitManipulation.cpp index f0baeef35e..13987ecf66 100644 --- a/core/unit_test/TestBitManipulation.cpp +++ 
b/core/unit_test/TestBitManipulation.cpp @@ -32,6 +32,105 @@ struct X { static_assert(test_##FUNC((float)0).did_not_match()); \ static_assert(test_##FUNC((void*)0).did_not_match()) +// +template +constexpr auto test_rotl(UInt x) -> decltype(Kokkos::rotl(x, 0)) { + using Kokkos::rotl; + + static_assert(noexcept(rotl(x, 0))); + static_assert(std::is_same_v); + + constexpr auto dig = Kokkos::Experimental::digits_v; + constexpr auto max = Kokkos::Experimental::finite_max_v; + + static_assert(rotl(UInt(0), 0) == 0); + static_assert(rotl(UInt(0), 1) == 0); + static_assert(rotl(UInt(0), 4) == 0); + static_assert(rotl(UInt(0), 8) == 0); + static_assert(rotl(max, 0) == max); + static_assert(rotl(max, 1) == max); + static_assert(rotl(max, 4) == max); + static_assert(rotl(max, 8) == max); + static_assert(rotl(UInt(1), 0) == UInt(1) << 0); + static_assert(rotl(UInt(1), 1) == UInt(1) << 1); + static_assert(rotl(UInt(1), 4) == UInt(1) << 4); + static_assert(rotl(UInt(1), dig) == UInt(1)); + static_assert(rotl(UInt(7), dig) == UInt(7)); + static_assert(rotl(UInt(6), dig - 1) == UInt(3)); + static_assert(rotl(UInt(3), 6) == UInt(3) << 6); + + static_assert(rotl(UInt(max - 1), 0) == UInt(max - 1)); + static_assert(rotl(UInt(max - 1), 1) == UInt(max - 2)); + static_assert(rotl(UInt(max - 1), 2) == UInt(max - 4)); + static_assert(rotl(UInt(max - 1), 3) == UInt(max - 8)); + static_assert(rotl(UInt(max - 1), 4) == UInt(max - 16)); + static_assert(rotl(UInt(max - 1), 5) == UInt(max - 32)); + static_assert(rotl(UInt(max - 1), 6) == UInt(max - 64)); + static_assert(rotl(UInt(max - 1), 7) == UInt(max - 128)); + static_assert(rotl(UInt(1), 0) == UInt(1)); + static_assert(rotl(UInt(1), 1) == UInt(2)); + static_assert(rotl(UInt(1), 2) == UInt(4)); + static_assert(rotl(UInt(1), 3) == UInt(8)); + static_assert(rotl(UInt(1), 4) == UInt(16)); + static_assert(rotl(UInt(1), 5) == UInt(32)); + static_assert(rotl(UInt(1), 6) == UInt(64)); + static_assert(rotl(UInt(1), 7) == UInt(128)); + + return 
true; +} + +TEST_BIT_MANIPULATION(rotl); + +template +constexpr auto test_rotr(UInt x) -> decltype(Kokkos::rotr(x, 0)) { + using Kokkos::rotr; + + static_assert(noexcept(rotr(x, 0))); + static_assert(std::is_same_v); + + constexpr auto dig = Kokkos::Experimental::digits_v; + constexpr auto max = Kokkos::Experimental::finite_max_v; + constexpr auto highbit = rotr(UInt(1), 1); + + static_assert(rotr(UInt(0), 0) == 0); + static_assert(rotr(UInt(0), 1) == 0); + static_assert(rotr(UInt(0), 4) == 0); + static_assert(rotr(UInt(0), 8) == 0); + static_assert(rotr(max, 0) == max); + static_assert(rotr(max, 1) == max); + static_assert(rotr(max, 4) == max); + static_assert(rotr(max, 8) == max); + static_assert(rotr(UInt(128), 0) == UInt(128) >> 0); + static_assert(rotr(UInt(128), 1) == UInt(128) >> 1); + static_assert(rotr(UInt(128), 4) == UInt(128) >> 4); + static_assert(rotr(UInt(1), dig) == UInt(1)); + static_assert(rotr(UInt(7), dig) == UInt(7)); + static_assert(rotr(UInt(6), dig - 1) == UInt(12)); + static_assert(rotr(UInt(36), dig - 2) == UInt(144)); + + static_assert(rotr(UInt(max - 1), 0) == UInt(max - 1)); + static_assert(rotr(UInt(max - 1), 1) == UInt(max - highbit)); + static_assert(rotr(UInt(max - 1), 2) == UInt(max - (highbit >> 1))); + static_assert(rotr(UInt(max - 1), 3) == UInt(max - (highbit >> 2))); + static_assert(rotr(UInt(max - 1), 4) == UInt(max - (highbit >> 3))); + static_assert(rotr(UInt(max - 1), 5) == UInt(max - (highbit >> 4))); + static_assert(rotr(UInt(max - 1), 6) == UInt(max - (highbit >> 5))); + static_assert(rotr(UInt(max - 1), 7) == UInt(max - (highbit >> 6))); + static_assert(rotr(UInt(128), 0) == UInt(128)); + static_assert(rotr(UInt(128), 1) == UInt(64)); + static_assert(rotr(UInt(128), 2) == UInt(32)); + static_assert(rotr(UInt(128), 3) == UInt(16)); + static_assert(rotr(UInt(128), 4) == UInt(8)); + static_assert(rotr(UInt(128), 5) == UInt(4)); + static_assert(rotr(UInt(128), 6) == UInt(2)); + static_assert(rotr(UInt(128), 7) == UInt(1)); 
+ + return true; +} + +TEST_BIT_MANIPULATION(rotr); +// + // template constexpr auto test_countl_zero(UInt x) -> decltype(Kokkos::countl_zero(x)) { diff --git a/core/unit_test/TestBitManipulationBuiltins.hpp b/core/unit_test/TestBitManipulationBuiltins.hpp index 29bce726fd..9ab4e6e15f 100644 --- a/core/unit_test/TestBitManipulationBuiltins.hpp +++ b/core/unit_test/TestBitManipulationBuiltins.hpp @@ -54,6 +54,8 @@ DEFINE_BIT_MANIPULATION_FUNCTION_EVAL(bit_ceil); DEFINE_BIT_MANIPULATION_FUNCTION_EVAL(bit_floor); DEFINE_BIT_MANIPULATION_FUNCTION_EVAL(bit_width); +#undef DEFINE_BIT_MANIPULATION_FUNCTION_EVAL + template struct TestBitManipFunction { Arg val_[N]; @@ -425,3 +427,174 @@ TEST(TEST_CATEGORY, bit_manip_bit_width) { test_bit_manip_bit_width(); test_bit_manip_bit_width(); } + +#undef TEST_BIT_MANIP_FUNCTION + +#define DEFINE_BIT_ROTATE_FUNCTION_EVAL(FUNC) \ + struct BitRotateFunction_##FUNC { \ + template \ + static KOKKOS_FUNCTION auto eval_constexpr(T x, int s) { \ + return Kokkos::FUNC(x, s); \ + } \ + template \ + static KOKKOS_FUNCTION auto eval_builtin(T x, int s) { \ + return Kokkos::Experimental::FUNC##_builtin(x, s); \ + } \ + static char const* name() { return #FUNC; } \ + } + +DEFINE_BIT_ROTATE_FUNCTION_EVAL(rotl); +DEFINE_BIT_ROTATE_FUNCTION_EVAL(rotr); + +#undef DEFINE_BIT_ROTATE_FUNCTION_EVAL + +template +struct P { + using type = T; + T x; + int s; +}; + +template +struct TestBitRotateFunction { + Arg val_[N]; + TestBitRotateFunction(const Arg (&val)[N]) { + std::copy(val, val + N, val_); + run(); + } + void run() const { + int errors = 0; + Kokkos::parallel_reduce(Kokkos::RangePolicy(0, N), *this, errors); + ASSERT_EQ(errors, 0) << "Failed check no error for " << Func::name() << "(" + << type_helper::name() << ", int)"; + } + KOKKOS_FUNCTION void operator()(int i, int& e) const { + if (Func::eval_builtin(val_[i].x, val_[i].s) != + Func::eval_constexpr(val_[i].x, val_[i].s)) { + ++e; + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "value at %x rotated by %d 
which is %x was expected to be %x\n", + (unsigned)val_[i].x, val_[i].s, + (unsigned)Func::eval_builtin(val_[i].x, val_[i].s), + (unsigned)Func::eval_constexpr(val_[i].x, val_[i].s)); + } + } +}; + +template +void do_test_bit_rotate_function(const Arg (&x)[N]) { + (void)std::initializer_list{ + (TestBitRotateFunction(x), 0)...}; +} + +#define TEST_BIT_ROTATE_FUNCTION(FUNC) \ + do_test_bit_rotate_function + +template +void test_bit_manip_rotl() { + using Kokkos::Experimental::rotl_builtin; + static_assert(noexcept(rotl_builtin(UInt(), 0))); + static_assert(std::is_same_v); + constexpr auto dig = Kokkos::Experimental::digits_v; + constexpr auto max = Kokkos::Experimental::finite_max_v; + TEST_BIT_ROTATE_FUNCTION(rotl) + ({ + // clang-format off + P{UInt(0), 0}, + P{UInt(0), 1}, + P{UInt(0), 4}, + P{UInt(0), 8}, + P{max, 0}, + P{max, 1}, + P{max, 4}, + P{max, 8}, + P{UInt(1), 0}, + P{UInt(1), 1}, + P{UInt(1), 4}, + P{UInt(1), dig}, + P{UInt(7), dig}, + P{UInt(6), dig - 1}, + P{UInt(3), 6}, + P{UInt(max - 1), 0}, + P{UInt(max - 1), 1}, + P{UInt(max - 1), 2}, + P{UInt(max - 1), 3}, + P{UInt(max - 1), 4}, + P{UInt(max - 1), 5}, + P{UInt(max - 1), 6}, + P{UInt(max - 1), 7}, + P{UInt(1), 0}, + P{UInt(1), 1}, + P{UInt(1), 2}, + P{UInt(1), 3}, + P{UInt(1), 4}, + P{UInt(1), 5}, + P{UInt(1), 6}, + P{UInt(1), 7}, + // clang-format on + }); +} + +TEST(TEST_CATEGORY, bit_manip_rotl) { + test_bit_manip_rotl(); + test_bit_manip_rotl(); + test_bit_manip_rotl(); + test_bit_manip_rotl(); + test_bit_manip_rotl(); +} + +template +void test_bit_manip_rotr() { + using Kokkos::rotr; + using Kokkos::Experimental::rotr_builtin; + static_assert(noexcept(rotr_builtin(UInt(), 0))); + static_assert(std::is_same_v); + constexpr auto dig = Kokkos::Experimental::digits_v; + constexpr auto max = Kokkos::Experimental::finite_max_v; + TEST_BIT_ROTATE_FUNCTION(rotr) + ({ + // clang-format off + P{UInt(0), 0}, + P{UInt(0), 1}, + P{UInt(0), 4}, + P{UInt(0), 8}, + P{max, 0}, + P{max, 1}, + P{max, 4}, + 
P{max, 8}, + P{UInt(128), 0}, + P{UInt(128), 1}, + P{UInt(128), 4}, + P{UInt(1), dig}, + P{UInt(7), dig}, + P{UInt(6), dig - 1}, + P{UInt(36), dig - 2}, + P{UInt(max - 1), 0}, + P{UInt(max - 1), 1}, + P{UInt(max - 1), 2}, + P{UInt(max - 1), 3}, + P{UInt(max - 1), 4}, + P{UInt(max - 1), 5}, + P{UInt(max - 1), 6}, + P{UInt(max - 1), 7}, + P{UInt(128), 0}, + P{UInt(128), 1}, + P{UInt(128), 2}, + P{UInt(128), 3}, + P{UInt(128), 4}, + P{UInt(128), 5}, + P{UInt(128), 6}, + P{UInt(128), 0}, + // clang-format on + }); +} + +TEST(TEST_CATEGORY, bit_manip_rotr) { + test_bit_manip_rotr(); + test_bit_manip_rotr(); + test_bit_manip_rotr(); + test_bit_manip_rotr(); + test_bit_manip_rotr(); +} + +#undef TEST_BIT_ROTATE_FUNCTION From ba195725cd24b37e151b87019474ef7fc22e46c1 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Thu, 23 Feb 2023 07:52:31 -0500 Subject: [PATCH 247/496] Use CombinedFunctorReducerType in ParallelReduce (#5874) * Use CombinedFunctorReducerType in ParallelReduce * Convert Serial backend * execute() might not be const * Fix compiling with SYCL * Fix OpenMPTarget * Improve Cuda and HIP * Fix compiling with tuning * Improve CUDA and HIP * Try fixing HIP * Implement Cuda * Fix Kokkos_Tools_Generic.hpp * Slightly improve setting up of joint functor reducer object * Address review comments * Avoid CUDA warning and simplify * Try to workaround CUDA RDC CI failure --- core/src/Cuda/Kokkos_Cuda_GraphNodeKernel.hpp | 12 +- .../src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp | 118 +++----- core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp | 129 +++------ core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp | 258 +++++------------- core/src/Cuda/Kokkos_Cuda_Team.hpp | 4 +- core/src/HIP/Kokkos_HIP_Parallel_MDRange.hpp | 4 +- core/src/HIP/Kokkos_HIP_Parallel_Range.hpp | 10 +- core/src/HIP/Kokkos_HIP_Parallel_Team.hpp | 6 +- core/src/HIP/Kokkos_HIP_Team.hpp | 4 +- core/src/HPX/Kokkos_HPX.hpp | 20 +- core/src/Kokkos_Core_fwd.hpp | 10 + core/src/Kokkos_ExecPolicy.hpp | 3 +- 
core/src/Kokkos_GraphNode.hpp | 23 +- core/src/Kokkos_Parallel_Reduce.hpp | 150 +++++++++- .../Kokkos_OpenACC_ParallelReduce_MDRange.hpp | 2 +- .../Kokkos_OpenACC_ParallelReduce_Range.hpp | 2 +- .../Kokkos_OpenACC_ParallelReduce_Team.hpp | 2 +- core/src/OpenMP/Kokkos_OpenMP_Parallel.hpp | 14 +- ...kkos_OpenMPTarget_ParallelReduce_Range.hpp | 4 +- ...okkos_OpenMPTarget_ParallelReduce_Team.hpp | 2 +- ...Kokkos_OpenMPTarget_ParallelScan_Range.hpp | 4 +- .../Kokkos_OpenMPTarget_Parallel_Common.hpp | 14 +- core/src/SYCL/Kokkos_SYCL_Parallel_Reduce.hpp | 8 +- core/src/SYCL/Kokkos_SYCL_Parallel_Scan.hpp | 6 +- core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp | 4 +- .../Serial/Kokkos_Serial_Parallel_MDRange.hpp | 63 ++--- .../Serial/Kokkos_Serial_Parallel_Range.hpp | 74 ++--- .../Serial/Kokkos_Serial_Parallel_Team.hpp | 72 ++--- .../Kokkos_Threads_Parallel_MDRange.hpp | 4 +- .../Threads/Kokkos_Threads_Parallel_Range.hpp | 10 +- .../Threads/Kokkos_Threads_Parallel_Team.hpp | 4 +- core/src/impl/Kokkos_FunctorAnalysis.hpp | 51 +++- core/src/impl/Kokkos_Tools_Generic.hpp | 32 ++- core/unit_test/TestFunctorAnalysis.hpp | 6 +- core/unit_test/hip/TestHIP_ScanUnit.cpp | 2 +- .../Test05_ParallelReduce_RangePolicy.hpp | 2 +- 36 files changed, 520 insertions(+), 613 deletions(-) diff --git a/core/src/Cuda/Kokkos_Cuda_GraphNodeKernel.hpp b/core/src/Cuda/Kokkos_Cuda_GraphNodeKernel.hpp index b7df78a338..86649fa015 100644 --- a/core/src/Cuda/Kokkos_Cuda_GraphNodeKernel.hpp +++ b/core/src/Cuda/Kokkos_Cuda_GraphNodeKernel.hpp @@ -36,6 +36,13 @@ namespace Kokkos { namespace Impl { +// FIXME Remove once all backends implement the new reduce interface +template +struct PatternImplSpecializationFromTag< + Kokkos::ParallelReduceTag, CombinedFunctorReducer, PolicyType, Kokkos::Cuda> + : type_identity< + ParallelReduce> {}; + template class GraphNodeKernelImpl @@ -133,8 +140,9 @@ template struct get_graph_node_kernel_type : type_identity> {}; + CombinedFunctorReducer, + Kokkos::ParallelReduceTag>> 
{}; //============================================================================== // {{{1 diff --git a/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp b/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp index 0015d1ea14..efd5157ff0 100644 --- a/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp +++ b/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp @@ -188,11 +188,13 @@ class ParallelFor, Kokkos::Cuda> { : m_functor(arg_functor), m_rp(arg_policy) {} }; -template -class ParallelReduce, ReducerType, - Kokkos::Cuda> { +template +class ParallelReduce, Kokkos::Cuda> { public: - using Policy = Kokkos::MDRangePolicy; + using Policy = Kokkos::MDRangePolicy; + using FunctorType = typename CombinedFunctorReducerType::functor_type; + using ReducerType = typename CombinedFunctorReducerType::reducer_type; private: using array_index_type = typename Policy::array_index_type; @@ -202,22 +204,10 @@ class ParallelReduce, ReducerType, using Member = typename Policy::member_type; using LaunchBounds = typename Policy::launch_bounds; - using ReducerConditional = - Kokkos::Impl::if_c::value, - FunctorType, ReducerType>; - using ReducerTypeFwd = typename ReducerConditional::type; - using WorkTagFwd = - typename Kokkos::Impl::if_c::value, - WorkTag, void>::type; - - using Analysis = - Kokkos::Impl::FunctorAnalysis; - public: - using pointer_type = typename Analysis::pointer_type; - using value_type = typename Analysis::value_type; - using reference_type = typename Analysis::reference_type; + using pointer_type = typename ReducerType::pointer_type; + using value_type = typename ReducerType::value_type; + using reference_type = typename ReducerType::reference_type; using functor_type = FunctorType; using size_type = Cuda::size_type; using reducer_type = ReducerType; @@ -225,9 +215,8 @@ class ParallelReduce, ReducerType, // Algorithmic constraints: blockSize is a power of two AND blockDim.y == // blockDim.z == 1 - const FunctorType m_functor; + const CombinedFunctorReducerType m_functor_reducer; 
const Policy m_policy; // used for workrange and nwork - const ReducerType m_reducer; const pointer_type m_result_ptr; const bool m_result_ptr_device_accessible; size_type* m_scratch_space; @@ -241,7 +230,7 @@ class ParallelReduce, ReducerType, // Shall we use the shfl based reduction or not (only use it for static sized // types of more than 128bit static constexpr bool UseShflReduction = false; - //((sizeof(value_type)>2*sizeof(double)) && Analysis::StaticValueSize) + //((sizeof(value_type)>2*sizeof(double)) && ReducerType::static_value_size()) // Some crutch to do function overloading public: @@ -253,24 +242,22 @@ class ParallelReduce, ReducerType, inline __device__ void exec_range(reference_type update) const { Kokkos::Impl::Reduce::DeviceIterateTile(m_policy, m_functor, - update) + reference_type>( + m_policy, m_functor_reducer.get_functor(), update) .exec_range(); } inline __device__ void operator()() const { - typename Analysis::Reducer final_reducer( - &ReducerConditional::select(m_functor, m_reducer)); - const integral_nonzero_constant - word_count(Analysis::value_size( - ReducerConditional::select(m_functor, m_reducer)) / + const integral_nonzero_constant< + size_type, ReducerType::static_value_size() / sizeof(size_type)> + word_count(m_functor_reducer.get_reducer().value_size() / sizeof(size_type)); { - reference_type value = final_reducer.init(reinterpret_cast( - kokkos_impl_cuda_shared_memory() + - threadIdx.y * word_count.value)); + reference_type value = + m_functor_reducer.get_reducer().init(reinterpret_cast( + kokkos_impl_cuda_shared_memory() + + threadIdx.y * word_count.value)); // Number of blocks is bounded so that the reduction can be limited to two // passes. Each thread block is given an approximately equal amount of @@ -284,7 +271,7 @@ class ParallelReduce, ReducerType, // Reduce with final value at blockDim.y - 1 location. 
// Problem: non power-of-two blockDim if (cuda_single_inter_block_reduce_scan( - final_reducer, blockIdx.x, gridDim.x, + m_functor_reducer.get_reducer(), blockIdx.x, gridDim.x, kokkos_impl_cuda_shared_memory(), m_scratch_space, m_scratch_flags)) { // This is the final block with the final result at the final threads' @@ -297,7 +284,8 @@ class ParallelReduce, ReducerType, : (m_unified_space ? m_unified_space : m_scratch_space); if (threadIdx.y == 0) { - final_reducer.final(reinterpret_cast(shared)); + m_functor_reducer.get_reducer().final( + reinterpret_cast(shared)); } if (CudaTraits::WarpSize < word_count.value) { @@ -316,7 +304,9 @@ class ParallelReduce, ReducerType, int shmem_size = cuda_single_inter_block_reduce_scan_shmem( f, n); - using closure_type = Impl::ParallelReduce; + using closure_type = + Impl::ParallelReduce, + Policy, Kokkos::Cuda>; cudaFuncAttributes attr = CudaParallelLaunch::get_cuda_func_attributes(); @@ -337,17 +327,15 @@ class ParallelReduce, ReducerType, } inline void execute() { - typename Analysis::Reducer final_reducer( - &ReducerConditional::select(m_functor, m_reducer)); - const auto nwork = m_policy.m_num_tiles; if (nwork) { int block_size = m_policy.m_prod_tile_dims; // CONSTRAINT: Algorithm requires block_size >= product of tile dimensions // Nearest power of two - int exponent_pow_two = std::ceil(std::log2(block_size)); - block_size = std::pow(2, exponent_pow_two); - int suggested_blocksize = local_block_size(m_functor); + int exponent_pow_two = std::ceil(std::log2(block_size)); + block_size = std::pow(2, exponent_pow_two); + int suggested_blocksize = + local_block_size(m_functor_reducer.get_functor()); block_size = (block_size > suggested_blocksize) ? 
block_size @@ -355,14 +343,12 @@ class ParallelReduce, ReducerType, // than or equal to 512 m_scratch_space = cuda_internal_scratch_space( - m_policy.space(), Analysis::value_size(ReducerConditional::select( - m_functor, m_reducer)) * + m_policy.space(), m_functor_reducer.get_reducer().value_size() * block_size /* block_size == max block_count */); m_scratch_flags = cuda_internal_scratch_flags(m_policy.space(), sizeof(size_type)); m_unified_space = cuda_internal_scratch_unified( - m_policy.space(), Analysis::value_size(ReducerConditional::select( - m_functor, m_reducer))); + m_policy.space(), m_functor_reducer.get_reducer().value_size()); // REQUIRED ( 1 , N , 1 ) const dim3 block(1, block_size, 1); @@ -374,8 +360,8 @@ class ParallelReduce, ReducerType, UseShflReduction ? 0 : cuda_single_inter_block_reduce_scan_shmem(m_functor, - block.y); + WorkTag>( + m_functor_reducer.get_functor(), block.y); CudaParallelLaunch( *this, grid, block, shmem, @@ -389,14 +375,12 @@ class ParallelReduce, ReducerType, "Kokkos::Impl::ParallelReduce::execute: " "Result Not Device Accessible"); - const int count = Analysis::value_count( - ReducerConditional::select(m_functor, m_reducer)); + const int count = m_functor_reducer.get_reducer().value_count(); for (int i = 0; i < count; ++i) { m_result_ptr[i] = pointer_type(m_unified_space)[i]; } } else { - const int size = Analysis::value_size( - ReducerConditional::select(m_functor, m_reducer)); + const int size = m_functor_reducer.get_reducer().value_size(); DeepCopy(m_policy.space(), m_result_ptr, m_scratch_space, size); } @@ -405,19 +389,16 @@ class ParallelReduce, ReducerType, } else { if (m_result_ptr) { // TODO @graph We need to effectively insert this in to the graph - final_reducer.init(m_result_ptr); + m_functor_reducer.get_reducer().init(m_result_ptr); } } } template - ParallelReduce( - const FunctorType& arg_functor, const Policy& arg_policy, - const ViewType& arg_result, - std::enable_if_t::value, void*> = nullptr) - : 
m_functor(arg_functor), + ParallelReduce(const CombinedFunctorReducerType& arg_functor_reducer, + const Policy& arg_policy, const ViewType& arg_result) + : m_functor_reducer(arg_functor_reducer), m_policy(arg_policy), - m_reducer(InvalidType()), m_result_ptr(arg_result.data()), m_result_ptr_device_accessible( MemorySpaceAccess, ReducerType, m_scratch_space(nullptr), m_scratch_flags(nullptr), m_unified_space(nullptr) { - check_reduced_view_shmem_size(m_policy, m_functor); - } - - ParallelReduce(const FunctorType& arg_functor, const Policy& arg_policy, - const ReducerType& reducer) - : m_functor(arg_functor), - m_policy(arg_policy), - m_reducer(reducer), - m_result_ptr(reducer.view().data()), - m_result_ptr_device_accessible( - MemorySpaceAccess::accessible), - m_scratch_space(nullptr), - m_scratch_flags(nullptr), - m_unified_space(nullptr) { - check_reduced_view_shmem_size(m_policy, m_functor); + check_reduced_view_shmem_size(m_policy, + m_functor_reducer.get_functor()); } }; } // namespace Impl diff --git a/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp b/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp index c5e89fc3da..620ef67927 100644 --- a/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp +++ b/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp @@ -114,11 +114,13 @@ class ParallelFor, Kokkos::Cuda> { : m_functor(arg_functor), m_policy(arg_policy) {} }; -template -class ParallelReduce, ReducerType, +template +class ParallelReduce, Kokkos::Cuda> { public: - using Policy = Kokkos::RangePolicy; + using Policy = Kokkos::RangePolicy; + using FunctorType = typename CombinedFunctorReducerType::functor_type; + using ReducerType = typename CombinedFunctorReducerType::reducer_type; private: using WorkRange = typename Policy::WorkRange; @@ -126,22 +128,10 @@ class ParallelReduce, ReducerType, using Member = typename Policy::member_type; using LaunchBounds = typename Policy::launch_bounds; - using ReducerConditional = - Kokkos::Impl::if_c::value, - FunctorType, ReducerType>; - using 
ReducerTypeFwd = typename ReducerConditional::type; - using WorkTagFwd = - typename Kokkos::Impl::if_c::value, - WorkTag, void>::type; - - using Analysis = - Kokkos::Impl::FunctorAnalysis; - public: - using pointer_type = typename Analysis::pointer_type; - using value_type = typename Analysis::value_type; - using reference_type = typename Analysis::reference_type; + using pointer_type = typename ReducerType::pointer_type; + using value_type = typename ReducerType::value_type; + using reference_type = typename ReducerType::reference_type; using functor_type = FunctorType; // Conditionally set word_size_type to int16_t or int8_t if value_type is // smaller than int32_t (Kokkos::Cuda::size_type) @@ -165,9 +155,8 @@ class ParallelReduce, ReducerType, // Algorithmic constraints: blockSize is a power of two AND blockDim.y == // blockDim.z == 1 - const FunctorType m_functor; + const CombinedFunctorReducerType m_functor_reducer; const Policy m_policy; - const ReducerType m_reducer; const pointer_type m_result_ptr; const bool m_result_ptr_device_accessible; const bool m_result_ptr_host_accessible; @@ -179,7 +168,7 @@ class ParallelReduce, ReducerType, // FIXME_CUDA Shall we use the shfl based reduction or not (only use it for // static sized types of more than 128bit: - // sizeof(value_type)>2*sizeof(double)) && Analysis::StaticValueSize) + // sizeof(value_type)>2*sizeof(double)) && ReducerType::static_value_size()) static constexpr bool UseShflReduction = false; public: @@ -189,29 +178,27 @@ class ParallelReduce, ReducerType, template __device__ inline std::enable_if_t::value> exec_range( const Member& i, reference_type update) const { - m_functor(i, update); + m_functor_reducer.get_functor()(i, update); } template __device__ inline std::enable_if_t::value> exec_range( const Member& i, reference_type update) const { - m_functor(TagType(), i, update); + m_functor_reducer.get_functor()(TagType(), i, update); } __device__ inline void operator()() const { - typename 
Analysis::Reducer final_reducer( - &ReducerConditional::select(m_functor, m_reducer)); - - const integral_nonzero_constant - word_count(Analysis::value_size( - ReducerConditional::select(m_functor, m_reducer)) / + const integral_nonzero_constant + word_count(m_functor_reducer.get_reducer().value_size() / sizeof(word_size_type)); { - reference_type value = final_reducer.init(reinterpret_cast( - kokkos_impl_cuda_shared_memory() + - threadIdx.y * word_count.value)); + reference_type value = + m_functor_reducer.get_reducer().init(reinterpret_cast( + kokkos_impl_cuda_shared_memory() + + threadIdx.y * word_count.value)); // Number of blocks is bounded so that the reduction can be limited to two // passes. Each thread block is given an approximately equal amount of @@ -233,7 +220,7 @@ class ParallelReduce, ReducerType, bool do_final_reduction = true; if (!zero_length) do_final_reduction = cuda_single_inter_block_reduce_scan( - final_reducer, blockIdx.x, gridDim.x, + m_functor_reducer.get_reducer(), blockIdx.x, gridDim.x, kokkos_impl_cuda_shared_memory(), m_scratch_space, m_scratch_flags); @@ -250,7 +237,8 @@ class ParallelReduce, ReducerType, : (m_unified_space ? 
m_unified_space : m_scratch_space); if (threadIdx.y == 0) { - final_reducer.final(reinterpret_cast(shared)); + m_functor_reducer.get_reducer().final( + reinterpret_cast(shared)); } if (CudaTraits::WarpSize < word_count.value) { @@ -269,7 +257,9 @@ class ParallelReduce, ReducerType, int shmem_size = cuda_single_inter_block_reduce_scan_shmem( f, n); - using closure_type = Impl::ParallelReduce; + using closure_type = + Impl::ParallelReduce, + Policy, Kokkos::Cuda>; cudaFuncAttributes attr = CudaParallelLaunch::get_cuda_func_attributes(); @@ -290,24 +280,20 @@ class ParallelReduce, ReducerType, } inline void execute() { - typename Analysis::Reducer final_reducer( - &ReducerConditional::select(m_functor, m_reducer)); - const index_type nwork = m_policy.end() - m_policy.begin(); - const bool need_device_set = Analysis::has_init_member_function || - Analysis::has_final_member_function || + const bool need_device_set = ReducerType::has_init_member_function() || + ReducerType::has_final_member_function() || !m_result_ptr_host_accessible || Policy::is_graph_kernel::value || !std::is_same::value; if ((nwork > 0) || need_device_set) { - const int block_size = local_block_size(m_functor); + const int block_size = local_block_size(m_functor_reducer.get_functor()); KOKKOS_ASSERT(block_size > 0); // TODO: down casting these uses more space than required? 
m_scratch_space = (word_size_type*)cuda_internal_scratch_space( - m_policy.space(), Analysis::value_size(ReducerConditional::select( - m_functor, m_reducer)) * + m_policy.space(), m_functor_reducer.get_reducer().value_size() * block_size /* block_size == max block_count */); // Intentionally do not downcast to word_size_type since we use Cuda @@ -316,8 +302,7 @@ class ParallelReduce, ReducerType, sizeof(Cuda::size_type)); m_unified_space = reinterpret_cast(cuda_internal_scratch_unified( - m_policy.space(), Analysis::value_size(ReducerConditional::select( - m_functor, m_reducer)))); + m_policy.space(), m_functor_reducer.get_reducer().value_size())); // REQUIRED ( 1 , N , 1 ) dim3 block(1, block_size, 1); @@ -330,8 +315,8 @@ class ParallelReduce, ReducerType, UseShflReduction ? 0 : cuda_single_inter_block_reduce_scan_shmem(m_functor, - block.y); + WorkTag>( + m_functor_reducer.get_functor(), block.y); if ((nwork == 0) #ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION @@ -354,14 +339,12 @@ class ParallelReduce, ReducerType, "Kokkos::Impl::ParallelReduce::execute: " "Result " "Not Device Accessible"); - const int count = Analysis::value_count( - ReducerConditional::select(m_functor, m_reducer)); + const int count = m_functor_reducer.get_reducer().value_count(); for (int i = 0; i < count; ++i) { m_result_ptr[i] = pointer_type(m_unified_space)[i]; } } else { - const int size = Analysis::value_size( - ReducerConditional::select(m_functor, m_reducer)); + const int size = m_functor_reducer.get_reducer().value_size(); DeepCopy(m_policy.space(), m_result_ptr, m_scratch_space, size); } @@ -370,19 +353,16 @@ class ParallelReduce, ReducerType, } else { if (m_result_ptr) { // TODO @graph We need to effectively insert this in to the graph - final_reducer.init(m_result_ptr); + m_functor_reducer.get_reducer().init(m_result_ptr); } } } template - ParallelReduce( - const FunctorType& arg_functor, const Policy& arg_policy, - const ViewType& arg_result, - std::enable_if_t::value, void*> = 
nullptr) - : m_functor(arg_functor), + ParallelReduce(const CombinedFunctorReducerType& arg_functor_reducer, + const Policy& arg_policy, const ViewType& arg_result) + : m_functor_reducer(arg_functor_reducer), m_policy(arg_policy), - m_reducer(InvalidType()), m_result_ptr(arg_result.data()), m_result_ptr_device_accessible( MemorySpaceAccess, ReducerType, m_scratch_space(nullptr), m_scratch_flags(nullptr), m_unified_space(nullptr) { - check_reduced_view_shmem_size(m_policy, m_functor); - } - - ParallelReduce(const FunctorType& arg_functor, const Policy& arg_policy, - const ReducerType& reducer) - : m_functor(arg_functor), - m_policy(arg_policy), - m_reducer(reducer), - m_result_ptr(reducer.view().data()), - m_result_ptr_device_accessible( - MemorySpaceAccess::accessible), - m_result_ptr_host_accessible( - MemorySpaceAccess::accessible), - m_scratch_space(nullptr), - m_scratch_flags(nullptr), - m_unified_space(nullptr) { - check_reduced_view_shmem_size(m_policy, m_functor); + check_reduced_view_shmem_size(m_policy, + m_functor_reducer.get_functor()); } }; @@ -484,7 +445,7 @@ class ParallelScan, Kokkos::Cuda> { //---------------------------------------- __device__ inline void initial() const { - typename Analysis::Reducer final_reducer(&m_functor); + typename Analysis::Reducer final_reducer(m_functor); const integral_nonzero_constant @@ -524,7 +485,7 @@ class ParallelScan, Kokkos::Cuda> { //---------------------------------------- __device__ inline void final() const { - typename Analysis::Reducer final_reducer(&m_functor); + typename Analysis::Reducer final_reducer(m_functor); const integral_nonzero_constant @@ -794,7 +755,7 @@ class ParallelScanWithTotal, //---------------------------------------- __device__ inline void initial() const { - typename Analysis::Reducer final_reducer(&m_functor); + typename Analysis::Reducer final_reducer(m_functor); const integral_nonzero_constant @@ -834,7 +795,7 @@ class ParallelScanWithTotal, 
//---------------------------------------- __device__ inline void final() const { - typename Analysis::Reducer final_reducer(&m_functor); + typename Analysis::Reducer final_reducer(m_functor); const integral_nonzero_constant diff --git a/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp b/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp index 5855e38847..74fe87a65a 100644 --- a/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp +++ b/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp @@ -115,12 +115,10 @@ class TeamPolicyInternal using functor_analysis_type = Impl::FunctorAnalysis; - using reducer_type = typename Impl::ParallelReduceReturnValue< - void, typename functor_analysis_type::value_type, - FunctorType>::reducer_type; - using closure_type = - Impl::ParallelReduce, - reducer_type>; + using closure_type = Impl::ParallelReduce< + CombinedFunctorReducer, + TeamPolicy, Kokkos::Cuda>; return internal_team_size_max(f); } @@ -128,8 +126,8 @@ class TeamPolicyInternal inline int team_size_max(const FunctorType& f, const ReducerType& /*r*/, const ParallelReduceTag&) const { using closure_type = - Impl::ParallelReduce, - ReducerType>; + Impl::ParallelReduce, + TeamPolicy, Kokkos::Cuda>; return internal_team_size_max(f); } @@ -156,12 +154,10 @@ class TeamPolicyInternal using functor_analysis_type = Impl::FunctorAnalysis; - using reducer_type = typename Impl::ParallelReduceReturnValue< - void, typename functor_analysis_type::value_type, - FunctorType>::reducer_type; - using closure_type = - Impl::ParallelReduce, - reducer_type>; + using closure_type = Impl::ParallelReduce< + CombinedFunctorReducer, + TeamPolicy, Kokkos::Cuda>; return internal_team_size_recommended(f); } @@ -169,8 +165,8 @@ class TeamPolicyInternal int team_size_recommended(const FunctorType& f, const ReducerType&, const ParallelReduceTag&) const { using closure_type = - Impl::ParallelReduce, - ReducerType>; + Impl::ParallelReduce, + TeamPolicy, Kokkos::Cuda>; return internal_team_size_recommended(f); } @@ -607,32 +603,22 @@ 
class ParallelFor, } }; -template -class ParallelReduce, - ReducerType, Kokkos::Cuda> { +template +class ParallelReduce, Kokkos::Cuda> { public: - using Policy = TeamPolicy; + using Policy = TeamPolicy; + using FunctorType = typename CombinedFunctorReducerType::functor_type; + using ReducerType = typename CombinedFunctorReducerType::reducer_type; private: using Member = typename Policy::member_type; using WorkTag = typename Policy::work_tag; using LaunchBounds = typename Policy::launch_bounds; - using ReducerConditional = - Kokkos::Impl::if_c::value, - FunctorType, ReducerType>; - using ReducerTypeFwd = typename ReducerConditional::type; - using WorkTagFwd = - typename Kokkos::Impl::if_c::value, - WorkTag, void>::type; - - using Analysis = - Kokkos::Impl::FunctorAnalysis; - - using pointer_type = typename Analysis::pointer_type; - using reference_type = typename Analysis::reference_type; - using value_type = typename Analysis::value_type; + using pointer_type = typename ReducerType::pointer_type; + using reference_type = typename ReducerType::reference_type; + using value_type = typename ReducerType::value_type; public: using functor_type = FunctorType; @@ -640,7 +626,7 @@ class ParallelReduce, using reducer_type = ReducerType; static constexpr bool UseShflReduction = - (true && (Analysis::StaticValueSize != 0)); + (true && (ReducerType::static_value_size() != 0)); private: struct ShflReductionTag {}; @@ -654,9 +640,8 @@ class ParallelReduce, // [ team shared space ] // - const FunctorType m_functor; + const CombinedFunctorReducerType m_functor_reducer; const Policy m_policy; - const ReducerType m_reducer; const pointer_type m_result_ptr; const bool m_result_ptr_device_accessible; const bool m_result_ptr_host_accessible; @@ -678,13 +663,13 @@ class ParallelReduce, template __device__ inline std::enable_if_t::value> exec_team( const Member& member, reference_type update) const { - m_functor(member, update); + m_functor_reducer.get_functor()(member, update); } 
template __device__ inline std::enable_if_t::value> exec_team( const Member& member, reference_type update) const { - m_functor(TagType(), member, update); + m_functor_reducer.get_functor()(TagType(), member, update); } public: @@ -706,18 +691,14 @@ class ParallelReduce, } __device__ inline void run(SHMEMReductionTag&, const int& threadid) const { - typename Analysis::Reducer final_reducer( - &ReducerConditional::select(m_functor, m_reducer)); - - const integral_nonzero_constant - word_count(Analysis::value_size( - ReducerConditional::select(m_functor, m_reducer)) / + const integral_nonzero_constant< + size_type, ReducerType::static_value_size() / sizeof(size_type)> + word_count(m_functor_reducer.get_reducer().value_size() / sizeof(size_type)); - reference_type value = - final_reducer.init(kokkos_impl_cuda_shared_memory() + - threadIdx.y * word_count.value); + reference_type value = m_functor_reducer.get_reducer().init( + kokkos_impl_cuda_shared_memory() + + threadIdx.y * word_count.value); // Iterate this block through the league const int int_league_size = (int)m_league_size; @@ -738,7 +719,7 @@ class ParallelReduce, bool do_final_reduction = true; if (!zero_length) do_final_reduction = cuda_single_inter_block_reduce_scan( - final_reducer, blockIdx.x, gridDim.x, + m_functor_reducer.get_reducer(), blockIdx.x, gridDim.x, kokkos_impl_cuda_shared_memory(), m_scratch_space, m_scratch_flags); @@ -754,7 +735,8 @@ class ParallelReduce, : (m_unified_space ? 
m_unified_space : m_scratch_space); if (threadIdx.y == 0) { - final_reducer.final(reinterpret_cast(shared)); + m_functor_reducer.get_reducer().final( + reinterpret_cast(shared)); } if (CudaTraits::WarpSize < word_count.value) { @@ -768,11 +750,8 @@ class ParallelReduce, } __device__ inline void run(ShflReductionTag, const int& threadid) const { - typename Analysis::Reducer final_reducer( - &ReducerConditional::select(m_functor, m_reducer)); - value_type value; - final_reducer.init(&value); + m_functor_reducer.get_reducer().init(&value); // Iterate this block through the league const int int_league_size = (int)m_league_size; @@ -795,29 +774,26 @@ class ParallelReduce, : m_scratch_space); value_type init; - final_reducer.init(&init); + m_functor_reducer.get_reducer().init(&init); if (int_league_size == 0) { - final_reducer.final(&value); + m_functor_reducer.get_reducer().final(&value); *result = value; - } else if (Impl::cuda_inter_block_reduction(value, init, final_reducer, - m_scratch_space, result, - m_scratch_flags, blockDim.y)) { + } else if (Impl::cuda_inter_block_reduction( + value, init, m_functor_reducer.get_reducer(), + m_scratch_space, result, m_scratch_flags, blockDim.y)) { const unsigned id = threadIdx.y * blockDim.x + threadIdx.x; if (id == 0) { - final_reducer.final(&value); + m_functor_reducer.get_reducer().final(&value); *result = value; } } } inline void execute() { - typename Analysis::Reducer final_reducer( - &ReducerConditional::select(m_functor, m_reducer)); - const bool is_empty_range = m_league_size == 0 || m_team_size == 0; - const bool need_device_set = Analysis::has_init_member_function || - Analysis::has_final_member_function || + const bool need_device_set = ReducerType::has_init_member_function() || + ReducerType::has_final_member_function() || !m_result_ptr_host_accessible || Policy::is_graph_kernel::value || !std::is_same::value; @@ -827,14 +803,12 @@ class ParallelReduce, : std::min(int(m_league_size), m_team_size)); m_scratch_space = 
cuda_internal_scratch_space( - m_policy.space(), Analysis::value_size(ReducerConditional::select( - m_functor, m_reducer)) * - block_count); + m_policy.space(), + m_functor_reducer.get_reducer().value_size() * block_count); m_scratch_flags = cuda_internal_scratch_flags(m_policy.space(), sizeof(size_type)); m_unified_space = cuda_internal_scratch_unified( - m_policy.space(), Analysis::value_size(ReducerConditional::select( - m_functor, m_reducer))); + m_policy.space(), m_functor_reducer.get_reducer().value_size()); dim3 block(m_vector_size, m_team_size, 1); dim3 grid(block_count, 1, 1); @@ -861,14 +835,12 @@ class ParallelReduce, if (m_result_ptr) { if (m_unified_space) { - const int count = Analysis::value_count( - ReducerConditional::select(m_functor, m_reducer)); + const int count = m_functor_reducer.get_reducer().value_count(); for (int i = 0; i < count; ++i) { m_result_ptr[i] = pointer_type(m_unified_space)[i]; } } else { - const int size = Analysis::value_size( - ReducerConditional::select(m_functor, m_reducer)); + const int size = m_functor_reducer.get_reducer().value_size(); DeepCopy(m_result_ptr, m_scratch_space, size); } } @@ -876,19 +848,16 @@ class ParallelReduce, } else { if (m_result_ptr) { // TODO @graph We need to effectively insert this in to the graph - final_reducer.init(m_result_ptr); + m_functor_reducer.get_reducer().init(m_result_ptr); } } } template - ParallelReduce( - const FunctorType& arg_functor, const Policy& arg_policy, - const ViewType& arg_result, - std::enable_if_t::value, void*> = nullptr) - : m_functor(arg_functor), + ParallelReduce(const CombinedFunctorReducerType& arg_functor_reducer, + const Policy& arg_policy, const ViewType& arg_result) + : m_functor_reducer(arg_functor_reducer), m_policy(arg_policy), - m_reducer(InvalidType()), m_result_ptr(arg_result.data()), m_result_ptr_device_accessible( MemorySpaceAccess, m_team_size >= 0 ? 
m_team_size : Kokkos::Impl::cuda_get_opt_block_size( - internal_space_instance, attr, m_functor, m_vector_size, + internal_space_instance, attr, + m_functor_reducer.get_functor(), m_vector_size, m_policy.team_scratch_size(0), m_policy.thread_scratch_size(0)) / m_vector_size; @@ -924,12 +894,12 @@ class ParallelReduce, UseShflReduction ? 0 : cuda_single_inter_block_reduce_scan_shmem(arg_functor, - m_team_size); + WorkTag>( + arg_functor_reducer.get_functor(), m_team_size); m_shmem_begin = sizeof(double) * (m_team_size + 2); - m_shmem_size = - m_policy.scratch_size(0, m_team_size) + - FunctorTeamShmemSize::value(arg_functor, m_team_size); + m_shmem_size = m_policy.scratch_size(0, m_team_size) + + FunctorTeamShmemSize::value( + arg_functor_reducer.get_functor(), m_team_size); m_scratch_size[0] = m_shmem_size; m_scratch_size[1] = m_policy.scratch_size(1, m_team_size); m_scratch_locks = internal_space_instance->m_scratch_locks; @@ -979,113 +949,9 @@ class ParallelReduce, } if (int(m_team_size) > - arg_policy.team_size_max(m_functor, m_reducer, ParallelReduceTag())) { - Kokkos::Impl::throw_runtime_exception( - std::string("Kokkos::Impl::ParallelReduce< Cuda > requested too " - "large team size.")); - } - } - - ParallelReduce(const FunctorType& arg_functor, const Policy& arg_policy, - const ReducerType& reducer) - : m_functor(arg_functor), - m_policy(arg_policy), - m_reducer(reducer), - m_result_ptr(reducer.view().data()), - m_result_ptr_device_accessible( - MemorySpaceAccess::accessible), - m_result_ptr_host_accessible( - MemorySpaceAccess::accessible), - m_scratch_space(nullptr), - m_scratch_flags(nullptr), - m_unified_space(nullptr), - m_team_begin(0), - m_shmem_begin(0), - m_shmem_size(0), - m_scratch_ptr{nullptr, nullptr}, - m_league_size(arg_policy.league_size()), - m_team_size(arg_policy.team_size()), - m_vector_size(arg_policy.impl_vector_length()) { - auto internal_space_instance = - m_policy.space().impl_internal_space_instance(); - cudaFuncAttributes attr = - 
CudaParallelLaunch::get_cuda_func_attributes(); - - // Valid team size not provided, deduce team size - m_team_size = - m_team_size >= 0 - ? m_team_size - : Kokkos::Impl::cuda_get_opt_block_size( - internal_space_instance, attr, m_functor, m_vector_size, - m_policy.team_scratch_size(0), - m_policy.thread_scratch_size(0)) / - m_vector_size; - - m_team_begin = - UseShflReduction - ? 0 - : cuda_single_inter_block_reduce_scan_shmem(arg_functor, - m_team_size); - m_shmem_begin = sizeof(double) * (m_team_size + 2); - m_shmem_size = - m_policy.scratch_size(0, m_team_size) + - FunctorTeamShmemSize::value(arg_functor, m_team_size); - m_scratch_size[0] = m_shmem_size; - m_scratch_size[1] = m_policy.scratch_size(1, m_team_size); - m_scratch_locks = internal_space_instance->m_scratch_locks; - m_num_scratch_locks = internal_space_instance->m_num_scratch_locks; - if (m_team_size <= 0) { - m_scratch_ptr[1] = nullptr; - } else { - m_scratch_pool_id = internal_space_instance->acquire_team_scratch_space(); - m_scratch_ptr[1] = internal_space_instance->resize_team_scratch_space( - m_scratch_pool_id, - static_cast(m_scratch_size[1]) * - (std::min( - static_cast(Cuda().concurrency() / - (m_team_size * m_vector_size)), - static_cast(m_league_size)))); - } - - // The global parallel_reduce does not support vector_length other than 1 at - // the moment - if ((arg_policy.impl_vector_length() > 1) && !UseShflReduction) - Impl::throw_runtime_exception( - "Kokkos::parallel_reduce with a TeamPolicy using a vector length of " - "greater than 1 is not currently supported for CUDA for dynamic " - "sized reduction types."); - - if ((m_team_size < 32) && !UseShflReduction) - Impl::throw_runtime_exception( - "Kokkos::parallel_reduce with a TeamPolicy using a team_size smaller " - "than 32 is not currently supported with CUDA for dynamic sized " - "reduction types."); - - // Functor's reduce memory, team scan memory, and team shared memory depend - // upon team size. 
- - const int shmem_size_total = m_team_begin + m_shmem_begin + m_shmem_size; - - if ((!Kokkos::Impl::is_integral_power_of_two(m_team_size) && - !UseShflReduction) || - internal_space_instance->m_maxShmemPerBlock < shmem_size_total) { - Kokkos::Impl::throw_runtime_exception( - std::string("Kokkos::Impl::ParallelReduce< Cuda > bad team size")); - } - - size_type team_size_max = - Kokkos::Impl::cuda_get_max_block_size( - internal_space_instance, attr, m_functor, m_vector_size, - m_policy.team_scratch_size(0), m_policy.thread_scratch_size(0)) / - m_vector_size; - - if ((int)m_team_size > (int)team_size_max) { + arg_policy.team_size_max(m_functor_reducer.get_functor(), + m_functor_reducer.get_reducer(), + ParallelReduceTag())) { Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::Impl::ParallelReduce< Cuda > requested too " "large team size.")); diff --git a/core/src/Cuda/Kokkos_Cuda_Team.hpp b/core/src/Cuda/Kokkos_Cuda_Team.hpp index fc3f46bce6..f726cdc3fb 100644 --- a/core/src/Cuda/Kokkos_Cuda_Team.hpp +++ b/core/src/Cuda/Kokkos_Cuda_Team.hpp @@ -198,7 +198,7 @@ class CudaTeamMember { KOKKOS_IF_ON_DEVICE( (typename Impl::FunctorAnalysis, ReducerType>::Reducer - wrapped_reducer(&reducer); + wrapped_reducer(reducer); cuda_intra_block_reduction(value, wrapped_reducer, blockDim.y); reducer.reference() = value;)) } @@ -228,7 +228,7 @@ class CudaTeamMember { Impl::CudaJoinFunctor cuda_join_functor; typename Impl::FunctorAnalysis< Impl::FunctorPatternInterface::SCAN, TeamPolicy, - Impl::CudaJoinFunctor>::Reducer reducer(&cuda_join_functor); + Impl::CudaJoinFunctor>::Reducer reducer(cuda_join_functor); Impl::cuda_intra_block_reduce_scan(reducer, base_data + 1); if (global_accum) { diff --git a/core/src/HIP/Kokkos_HIP_Parallel_MDRange.hpp b/core/src/HIP/Kokkos_HIP_Parallel_MDRange.hpp index 10ec301d15..5f7919fbae 100644 --- a/core/src/HIP/Kokkos_HIP_Parallel_MDRange.hpp +++ b/core/src/HIP/Kokkos_HIP_Parallel_MDRange.hpp @@ -219,7 +219,7 @@ class ParallelReduce, 
ReducerType, inline __device__ void operator()() const { typename Analysis::Reducer final_reducer( - &ReducerConditional::select(m_functor, m_reducer)); + ReducerConditional::select(m_functor, m_reducer)); const integral_nonzero_constant @@ -292,7 +292,7 @@ class ParallelReduce, ReducerType, inline void execute() { typename Analysis::Reducer final_reducer( - &ReducerConditional::select(m_functor, m_reducer)); + ReducerConditional::select(m_functor, m_reducer)); using ClosureType = ParallelReduce; diff --git a/core/src/HIP/Kokkos_HIP_Parallel_Range.hpp b/core/src/HIP/Kokkos_HIP_Parallel_Range.hpp index d8c52aa95f..e27b778c74 100644 --- a/core/src/HIP/Kokkos_HIP_Parallel_Range.hpp +++ b/core/src/HIP/Kokkos_HIP_Parallel_Range.hpp @@ -180,7 +180,7 @@ class ParallelReduce, ReducerType, sizeof(size_type)); typename Analysis::Reducer final_reducer( - &ReducerConditional::select(m_functor, m_reducer)); + ReducerConditional::select(m_functor, m_reducer)); { reference_type value = final_reducer.init(reinterpret_cast( ::Kokkos::kokkos_impl_hip_shared_memory() + @@ -235,7 +235,7 @@ class ParallelReduce, ReducerType, __device__ inline void run(ShflReductionTag) const { typename Analysis::Reducer final_reducer( - &ReducerConditional::select(m_functor, m_reducer)); + ReducerConditional::select(m_functor, m_reducer)); value_type value; final_reducer.init(&value); @@ -296,7 +296,7 @@ class ParallelReduce, ReducerType, inline void execute() { typename Analysis::Reducer final_reducer( - &ReducerConditional::select(m_functor, m_reducer)); + ReducerConditional::select(m_functor, m_reducer)); const index_type nwork = m_policy.end() - m_policy.begin(); const bool need_device_set = Analysis::has_init_member_function || @@ -456,7 +456,7 @@ class ParallelScanHIPBase { //---------------------------------------- __device__ inline void initial() const { - typename Analysis::Reducer final_reducer(&m_functor); + typename Analysis::Reducer final_reducer(m_functor); const integral_nonzero_constant 
@@ -494,7 +494,7 @@ class ParallelScanHIPBase { //---------------------------------------- __device__ inline void final() const { - typename Analysis::Reducer final_reducer(&m_functor); + typename Analysis::Reducer final_reducer(m_functor); const integral_nonzero_constant diff --git a/core/src/HIP/Kokkos_HIP_Parallel_Team.hpp b/core/src/HIP/Kokkos_HIP_Parallel_Team.hpp index f823514042..13772dc903 100644 --- a/core/src/HIP/Kokkos_HIP_Parallel_Team.hpp +++ b/core/src/HIP/Kokkos_HIP_Parallel_Team.hpp @@ -674,7 +674,7 @@ class ParallelReduce, __device__ inline void run(SHMEMReductionTag, int const threadid) const { typename analysis::Reducer final_reducer( - &reducer_conditional::select(m_functor, m_reducer)); + reducer_conditional::select(m_functor, m_reducer)); integral_nonzero_constant const @@ -723,7 +723,7 @@ class ParallelReduce, __device__ inline void run(ShflReductionTag, int const threadid) const { typename analysis::Reducer final_reducer( - &reducer_conditional::select(m_functor, m_reducer)); + reducer_conditional::select(m_functor, m_reducer)); value_type value; final_reducer.init(&value); @@ -754,7 +754,7 @@ class ParallelReduce, inline void execute() { typename analysis::Reducer final_reducer( - &reducer_conditional::select(m_functor, m_reducer)); + reducer_conditional::select(m_functor, m_reducer)); const bool is_empty_range = m_league_size == 0 || m_team_size == 0; const bool need_device_set = analysis::has_init_member_function || diff --git a/core/src/HIP/Kokkos_HIP_Team.hpp b/core/src/HIP/Kokkos_HIP_Team.hpp index 5b9faba786..197c8c1882 100644 --- a/core/src/HIP/Kokkos_HIP_Team.hpp +++ b/core/src/HIP/Kokkos_HIP_Team.hpp @@ -182,7 +182,7 @@ class HIPTeamMember { #ifdef __HIP_DEVICE_COMPILE__ typename Kokkos::Impl::FunctorAnalysis< FunctorPatternInterface::REDUCE, TeamPolicy, ReducerType>::Reducer - wrapped_reducer(&reducer); + wrapped_reducer(reducer); hip_intra_block_shuffle_reduction(value, wrapped_reducer, blockDim.y); reducer.reference() = value; 
#else @@ -219,7 +219,7 @@ class HIPTeamMember { Impl::HIPJoinFunctor hip_join_functor; typename Kokkos::Impl::FunctorAnalysis< FunctorPatternInterface::REDUCE, TeamPolicy, - Impl::HIPJoinFunctor>::Reducer reducer(&hip_join_functor); + Impl::HIPJoinFunctor>::Reducer reducer(hip_join_functor); Impl::hip_intra_block_reduce_scan(reducer, base_data + 1); if (global_accum) { diff --git a/core/src/HPX/Kokkos_HPX.hpp b/core/src/HPX/Kokkos_HPX.hpp index e1abaf8837..dff7c53adb 100644 --- a/core/src/HPX/Kokkos_HPX.hpp +++ b/core/src/HPX/Kokkos_HPX.hpp @@ -1026,7 +1026,7 @@ class ParallelReduce, ReducerType, buffer.resize(num_worker_threads, value_size); typename Analysis::Reducer final_reducer( - &ReducerConditional::select(m_functor, m_reducer)); + ReducerConditional::select(m_functor, m_reducer)); for (int t = 0; t < num_worker_threads; ++t) { final_reducer.init(reinterpret_cast(buffer.get(t))); @@ -1052,7 +1052,7 @@ class ParallelReduce, ReducerType, void finalize() const { hpx_thread_buffer &buffer = m_policy.space().impl_get_buffer(); typename Analysis::Reducer final_reducer( - &ReducerConditional::select(m_functor, m_reducer)); + ReducerConditional::select(m_functor, m_reducer)); const int num_worker_threads = m_policy.space().concurrency(); for (int i = 1; i < num_worker_threads; ++i) { final_reducer.join(reinterpret_cast(buffer.get(0)), @@ -1078,7 +1078,7 @@ class ParallelReduce, ReducerType, if (m_policy.end() <= m_policy.begin()) { if (m_result_ptr) { typename Analysis::Reducer final_reducer( - &ReducerConditional::select(m_functor, m_reducer)); + ReducerConditional::select(m_functor, m_reducer)); final_reducer.init(m_result_ptr); final_reducer.final(m_result_ptr); @@ -1151,7 +1151,7 @@ class ParallelReduce, ReducerType, const int num_worker_threads = m_policy.space().concurrency(); typename Analysis::Reducer final_reducer( - &ReducerConditional::select(m_iter.m_func, m_reducer)); + ReducerConditional::select(m_iter.m_func, m_reducer)); hpx_thread_buffer &buffer = 
m_iter.m_rp.space().impl_get_buffer(); buffer.resize(num_worker_threads, value_size); @@ -1175,7 +1175,7 @@ class ParallelReduce, ReducerType, void finalize() const { hpx_thread_buffer &buffer = m_iter.m_rp.space().impl_get_buffer(); typename Analysis::Reducer final_reducer( - &ReducerConditional::select(m_iter.m_func, m_reducer)); + ReducerConditional::select(m_iter.m_func, m_reducer)); const int num_worker_threads = m_policy.space().concurrency(); for (int i = 1; i < num_worker_threads; ++i) { final_reducer.join(reinterpret_cast(buffer.get(0)), @@ -1287,7 +1287,7 @@ class ParallelScan, const std::size_t value_size = Analysis::value_size(m_functor); hpx_thread_buffer &buffer = m_policy.space().impl_get_buffer(); - typename Analysis::Reducer final_reducer(&m_functor); + typename Analysis::Reducer final_reducer(m_functor); barrier_type &barrier = *static_cast(buffer.get_extra_space()); reference_type update_sum = @@ -1390,7 +1390,7 @@ class ParallelScanWithTotal, const std::size_t value_size = Analysis::value_size(m_functor); hpx_thread_buffer &buffer = m_policy.space().impl_get_buffer(); - typename Analysis::Reducer final_reducer(&m_functor); + typename Analysis::Reducer final_reducer(m_functor); barrier_type &barrier = *static_cast(buffer.get_extra_space()); reference_type update_sum = @@ -1554,7 +1554,7 @@ class ParallelReduce, hpx_thread_buffer &buffer = m_policy.space().impl_get_buffer(); buffer.resize(num_worker_threads, value_size + m_shared); typename Analysis::Reducer final_reducer( - &ReducerConditional::select(m_functor, m_reducer)); + ReducerConditional::select(m_functor, m_reducer)); for (int t = 0; t < num_worker_threads; ++t) { final_reducer.init(reinterpret_cast(buffer.get(t))); @@ -1586,7 +1586,7 @@ class ParallelReduce, void finalize() const { hpx_thread_buffer &buffer = m_policy.space().impl_get_buffer(); typename Analysis::Reducer final_reducer( - &ReducerConditional::select(m_functor, m_reducer)); + ReducerConditional::select(m_functor, 
m_reducer)); const int num_worker_threads = m_policy.space().concurrency(); const pointer_type ptr = reinterpret_cast(buffer.get(0)); for (int t = 1; t < num_worker_threads; ++t) { @@ -1609,7 +1609,7 @@ class ParallelReduce, if (m_policy.league_size() * m_policy.team_size() == 0) { if (m_result_ptr) { typename Analysis::Reducer final_reducer( - &ReducerConditional::select(m_functor, m_reducer)); + ReducerConditional::select(m_functor, m_reducer)); final_reducer.init(m_result_ptr); final_reducer.final(m_result_ptr); } diff --git a/core/src/Kokkos_Core_fwd.hpp b/core/src/Kokkos_Core_fwd.hpp index 99986ef64e..6546c875ed 100644 --- a/core/src/Kokkos_Core_fwd.hpp +++ b/core/src/Kokkos_Core_fwd.hpp @@ -330,6 +330,16 @@ template ::execution_space> class ParallelReduce; +// FIXME Remove once all backends implement the new interface +template +class CombinedFunctorReducer; + +// FIXME Remove once all backends implement the new interface +template +class ParallelReduceWrapper; + /// \class ParallelScan /// \brief Implementation detail of parallel_scan. 
/// diff --git a/core/src/Kokkos_ExecPolicy.hpp b/core/src/Kokkos_ExecPolicy.hpp index 357c2572a5..b141d7c692 100644 --- a/core/src/Kokkos_ExecPolicy.hpp +++ b/core/src/Kokkos_ExecPolicy.hpp @@ -1079,9 +1079,10 @@ template struct PatternImplSpecializationFromTag : type_identity> {}; +// FIXME Drop "Wrapper" when all backends implement the new reduce interface template struct PatternImplSpecializationFromTag - : type_identity> {}; + : type_identity> {}; template struct PatternImplSpecializationFromTag diff --git a/core/src/Kokkos_GraphNode.hpp b/core/src/Kokkos_GraphNode.hpp index 1cfd2b382b..c35fe30e76 100644 --- a/core/src/Kokkos_GraphNode.hpp +++ b/core/src/Kokkos_GraphNode.hpp @@ -376,14 +376,29 @@ class GraphNodeRef { auto policy = Experimental::require((Policy &&) arg_policy, Kokkos::Impl::KernelInGraphProperty{}); + using passed_reducer_type = typename return_value_adapter::reducer_type; + + using reducer_selector = Kokkos::Impl::if_c< + std::is_same::value, functor_type, + passed_reducer_type>; + using analysis = Kokkos::Impl::FunctorAnalysis< + Kokkos::Impl::FunctorPatternInterface::REDUCE, Policy, + typename reducer_selector::type>; + typename analysis::Reducer final_reducer( + reducer_selector::select(functor, return_value)); + Kokkos::Impl::CombinedFunctorReducer + functor_reducer(functor, final_reducer); + using next_policy_t = decltype(policy); - using next_kernel_t = Kokkos::Impl::GraphNodeKernelImpl< - ExecutionSpace, next_policy_t, functor_type, Kokkos::ParallelReduceTag, - typename return_value_adapter::reducer_type>; + using next_kernel_t = + Kokkos::Impl::GraphNodeKernelImpl; return this->_then_kernel(next_kernel_t{ std::move(arg_name), graph_impl_ptr->get_execution_space(), - (Functor &&) functor, (Policy &&) policy, + functor_reducer, (Policy &&) policy, return_value_adapter::return_value(return_value, functor)}); } diff --git a/core/src/Kokkos_Parallel_Reduce.hpp b/core/src/Kokkos_Parallel_Reduce.hpp index 68a2155bbd..6b1fb55b7f 100644 --- 
a/core/src/Kokkos_Parallel_Reduce.hpp +++ b/core/src/Kokkos_Parallel_Reduce.hpp @@ -1372,6 +1372,117 @@ StdPartitionPoint(View, Properties...> const&) namespace Kokkos { namespace Impl { +template +class CombinedFunctorReducer { + public: + using functor_type = FunctorType; + using reducer_type = FunctorAnalysisReducerType; + CombinedFunctorReducer(const FunctorType& functor, + const FunctorAnalysisReducerType& reducer) + : m_functor(functor), m_reducer(reducer) {} + KOKKOS_FUNCTION const FunctorType& get_functor() const { return m_functor; } + KOKKOS_FUNCTION const FunctorAnalysisReducerType& get_reducer() const { + return m_reducer; + } + + private: + FunctorType m_functor; + FunctorAnalysisReducerType m_reducer; +}; +template +class CombinedFunctorReducer< + FunctorType, FunctorAnalysisReducerType, + std::enable_if_t>> { + public: + using functor_type = FunctorType; + using reducer_type = FunctorAnalysisReducerType; + CombinedFunctorReducer(const FunctorType& functor, + const FunctorAnalysisReducerType&) + : m_reducer(functor) {} + KOKKOS_FUNCTION const FunctorType& get_functor() const { + return m_reducer.get_functor(); + } + KOKKOS_FUNCTION const FunctorAnalysisReducerType& get_reducer() const { + return m_reducer; + } + + private: + FunctorAnalysisReducerType m_reducer; +}; + +// FIXME Remove once all backends implement the new interface +template +struct implements_new_reduce_interface : std::false_type {}; + +#ifdef KOKKOS_ENABLE_SERIAL +template <> +struct implements_new_reduce_interface : std::true_type {}; +#endif + +#ifdef KOKKOS_ENABLE_CUDA +template <> +struct implements_new_reduce_interface : std::true_type {}; +#endif + +template +class ParallelReduceWrapper { + using functor_type = typename CombinedFunctorReducerType::functor_type; + using helper_reducer_type = + typename CombinedFunctorReducerType::reducer_type::functor_type; + + static constexpr bool has_reducer = + !std::is_same_v; + + using reducer_type = + std::conditional_t; + + public: + 
using wrapped_type = Impl::ParallelReduce; + + private: + wrapped_type m_parallel_reduce; + + public: + template + ParallelReduceWrapper( + const CombinedFunctorReducerType& combined_functor_reducer, + const PolicyType& policy, const ReturnValue& return_value) + : m_parallel_reduce( + combined_functor_reducer.get_functor(), policy, + Kokkos::Impl::if_c:: + select(combined_functor_reducer.get_reducer().get_functor(), + return_value)) {} + + void execute() { m_parallel_reduce.execute(); } +}; + +template +class ParallelReduceWrapper< + CombinedFunctorReducerType, PolicyType, ExecutionSpaceType, + std::enable_if_t< + implements_new_reduce_interface::value>> { + public: + using wrapped_type = Impl::ParallelReduce; + + private: + wrapped_type m_parallel_reduce; + + public: + template + ParallelReduceWrapper( + const CombinedFunctorReducerType& combined_functor_reducer, + const PolicyType& policy, const ReturnValue& return_value) + : m_parallel_reduce(combined_functor_reducer, policy, return_value) {} + + void execute() { m_parallel_reduce.execute(); } +}; + template struct ParallelReduceReturnValue; @@ -1437,12 +1548,12 @@ template struct ParallelReduceReturnValue< std::enable_if_t::value>, ReturnType, FunctorType> { - using return_type = ReturnType; + using return_type = typename ReturnType::result_view_type; using reducer_type = ReturnType; using value_type = typename return_type::value_type; - static return_type return_value(ReturnType& return_val, const FunctorType&) { - return return_val; + static auto return_value(ReturnType& return_val, const FunctorType&) { + return return_val.view(); } }; @@ -1490,24 +1601,35 @@ struct ParallelReduceAdaptor { const PolicyType& policy, const FunctorType& functor, ReturnType& return_value) { - uint64_t kpID = 0; + using PassedReducerType = typename return_value_adapter::reducer_type; + uint64_t kpID = 0; PolicyType inner_policy = policy; - Kokkos::Tools::Impl::begin_parallel_reduce< - typename 
return_value_adapter::reducer_type>(inner_policy, functor, - label, kpID); - + Kokkos::Tools::Impl::begin_parallel_reduce( + inner_policy, functor, label, kpID); + + using ReducerSelector = + Kokkos::Impl::if_c::value, + FunctorType, PassedReducerType>; + using Analysis = + FunctorAnalysis; Kokkos::Impl::shared_allocation_tracking_disable(); - Impl::ParallelReduce - closure(functor, inner_policy, + CombinedFunctorReducer functor_reducer( + functor, typename Analysis::Reducer( + ReducerSelector::select(functor, return_value))); + + // FIXME Remove "Wrapper" once all backends implement the new interface + Impl::ParallelReduceWrapper::execution_space> + closure(functor_reducer, inner_policy, return_value_adapter::return_value(return_value, functor)); Kokkos::Impl::shared_allocation_tracking_enable(); closure.execute(); - Kokkos::Tools::Impl::end_parallel_reduce< - typename return_value_adapter::reducer_type>(inner_policy, functor, - label, kpID); + Kokkos::Tools::Impl::end_parallel_reduce( + inner_policy, functor, label, kpID); } static constexpr bool is_array_reduction = diff --git a/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_MDRange.hpp b/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_MDRange.hpp index c4b7b6bdec..f36b95665c 100644 --- a/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_MDRange.hpp +++ b/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_MDRange.hpp @@ -86,7 +86,7 @@ class Kokkos::Impl::ParallelReduce, ValueType val; typename Analysis::Reducer final_reducer( - &ReducerConditional::select(m_functor, m_reducer)); + ReducerConditional::select(m_functor, m_reducer)); final_reducer.init(&val); Kokkos::Experimental::Impl::OpenACCParallelReduceMDRangeHelper( diff --git a/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_Range.hpp b/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_Range.hpp index 278d9c1d5a..a0b380dbbf 100644 --- a/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_Range.hpp +++ b/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_Range.hpp @@ -84,7 +84,7 
@@ class Kokkos::Impl::ParallelReduce, ValueType val; typename Analysis::Reducer final_reducer( - &ReducerConditional::select(m_functor, m_reducer)); + ReducerConditional::select(m_functor, m_reducer)); final_reducer.init(&val); Kokkos::Experimental::Impl::OpenACCParallelReduceHelper( diff --git a/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_Team.hpp b/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_Team.hpp index 199a2786ee..b9576f2ea7 100644 --- a/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_Team.hpp +++ b/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_Team.hpp @@ -69,7 +69,7 @@ class Kokkos::Impl::ParallelReduce, ReducerType, public: inline void execute() const { typename Analysis::Reducer final_reducer( - &ReducerConditional::select(m_functor, m_reducer)); + ReducerConditional::select(m_functor, m_reducer)); if (m_policy.end() <= m_policy.begin()) { if (m_result_ptr) { @@ -550,7 +550,7 @@ class ParallelReduce, ReducerType, ); typename Analysis::Reducer final_reducer( - &ReducerConditional::select(m_iter.m_func, m_reducer)); + ReducerConditional::select(m_iter.m_func, m_reducer)); #ifndef KOKKOS_COMPILER_INTEL if (execute_in_serial(m_iter.m_rp.space())) { @@ -749,7 +749,7 @@ class ParallelScan, ); if (execute_in_serial(m_policy.space())) { - typename Analysis::Reducer final_reducer(&m_functor); + typename Analysis::Reducer final_reducer(m_functor); reference_type update = final_reducer.init( pointer_type(m_instance->get_thread_data(0)->pool_reduce_local())); @@ -763,7 +763,7 @@ class ParallelScan, #pragma omp parallel num_threads(m_instance->thread_pool_size()) { HostThreadTeamData& data = *(m_instance->get_thread_data()); - typename Analysis::Reducer final_reducer(&m_functor); + typename Analysis::Reducer final_reducer(m_functor); const WorkRange range(m_policy, omp_get_thread_num(), omp_get_num_threads()); @@ -881,7 +881,7 @@ class ParallelScanWithTotal, ); if (execute_in_serial(m_policy.space())) { - typename Analysis::Reducer final_reducer(&m_functor); + 
typename Analysis::Reducer final_reducer(m_functor); reference_type update = final_reducer.init( pointer_type(m_instance->get_thread_data(0)->pool_reduce_local())); @@ -899,7 +899,7 @@ class ParallelScanWithTotal, #pragma omp parallel num_threads(m_instance->thread_pool_size()) { HostThreadTeamData& data = *(m_instance->get_thread_data()); - typename Analysis::Reducer final_reducer(&m_functor); + typename Analysis::Reducer final_reducer(m_functor); const WorkRange range(m_policy, omp_get_thread_num(), omp_get_num_threads()); @@ -1202,7 +1202,7 @@ class ParallelReduce, enum { is_dynamic = std::is_same::value }; typename Analysis::Reducer final_reducer( - &ReducerConditional::select(m_functor, m_reducer)); + ReducerConditional::select(m_functor, m_reducer)); if (m_policy.league_size() == 0 || m_policy.team_size() == 0) { if (m_result_ptr) { diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Range.hpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Range.hpp index 1ac46b9919..1a574f16be 100644 --- a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Range.hpp +++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Range.hpp @@ -100,7 +100,7 @@ class ParallelReduce, ReducerType, } template - ParallelReduce(const FunctorType& arg_functor, Policy& arg_policy, + ParallelReduce(const FunctorType& arg_functor, const Policy& arg_policy, const ViewType& arg_result_view, std::enable_if_t::value && !Kokkos::is_reducer::value, @@ -114,7 +114,7 @@ class ParallelReduce, ReducerType, typename ViewType::memory_space>::accessible), m_result_ptr_num_elems(arg_result_view.size()) {} - ParallelReduce(const FunctorType& arg_functor, Policy& arg_policy, + ParallelReduce(const FunctorType& arg_functor, const Policy& arg_policy, const ReducerType& reducer) : m_functor(arg_functor), m_policy(arg_policy), diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Team.hpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Team.hpp 
index 236c6d6f7a..39d452864a 100644 --- a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Team.hpp +++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Team.hpp @@ -525,7 +525,7 @@ class ParallelReduce, FunctorTeamShmemSize::value( arg_functor, arg_policy.team_size())) {} - ParallelReduce(const FunctorType& arg_functor, Policy& arg_policy, + ParallelReduce(const FunctorType& arg_functor, const Policy& arg_policy, const ReducerType& reducer) : m_result_ptr_on_device( MemorySpaceAccess, FunctorType a_functor(m_functor); #pragma omp target teams distribute map(to : a_functor) num_teams(nteams) for (idx_type team_id = 0; team_id < n_chunks; ++team_id) { - typename Analysis::Reducer final_reducer(&a_functor); + typename Analysis::Reducer final_reducer(a_functor); #pragma omp parallel num_threads(team_size) { const idx_type local_offset = team_id * chunk_size; @@ -120,7 +120,7 @@ class ParallelScan, : a_functor) num_teams(nteams) \ thread_limit(team_size) for (idx_type team_id = 0; team_id < n_chunks; ++team_id) { - typename Analysis::Reducer final_reducer(&a_functor); + typename Analysis::Reducer final_reducer(a_functor); #pragma omp parallel num_threads(team_size) { const idx_type local_offset = team_id * chunk_size; diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_Common.hpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_Common.hpp index 75b17b7235..49e7b33264 100644 --- a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_Common.hpp +++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_Common.hpp @@ -222,7 +222,7 @@ struct ParallelReduceSpecialize, #pragma omp target map(to : f) is_device_ptr(scratch_ptr) { - typename FunctorAnalysis::Reducer final_reducer(&f); + typename FunctorAnalysis::Reducer final_reducer(f); // Enter this loop if the functor has an `init` if constexpr (HasInit) { // The `init` routine needs to be called on the device since it might @@ -257,7 +257,7 @@ struct ParallelReduceSpecialize, map(to \ : f) 
is_device_ptr(scratch_ptr) { - typename FunctorAnalysis::Reducer final_reducer(&f); + typename FunctorAnalysis::Reducer final_reducer(f); #pragma omp parallel { const int team_num = omp_get_team_num(); @@ -304,7 +304,7 @@ struct ParallelReduceSpecialize, is_device_ptr(scratch_ptr) for (int i = 0; i < max_teams - tree_neighbor_offset; i += 2 * tree_neighbor_offset) { - typename FunctorAnalysis::Reducer final_reducer(&f); + typename FunctorAnalysis::Reducer final_reducer(f); ValueType* team_scratch = scratch_ptr; const int team_offset = max_team_threads * value_count; final_reducer.join( @@ -575,7 +575,7 @@ struct ParallelReduceSpecialize, // device members. #pragma omp target map(to : f) is_device_ptr(scratch_ptr) { - typename FunctorAnalysis::Reducer final_reducer(&f); + typename FunctorAnalysis::Reducer final_reducer(f); final_reducer.init(scratch_ptr); final_reducer.final(scratch_ptr); } @@ -586,7 +586,7 @@ struct ParallelReduceSpecialize, static_cast(scratch_ptr)[i] = ValueType(); } - typename FunctorAnalysis::Reducer final_reducer(&f); + typename FunctorAnalysis::Reducer final_reducer(f); final_reducer.final(static_cast(scratch_ptr)); } } @@ -616,7 +616,7 @@ struct ParallelReduceSpecialize, const int num_teams = omp_get_num_teams(); ValueType* team_scratch = static_cast(scratch_ptr) + team_num * team_size * value_count; - typename FunctorAnalysis::Reducer final_reducer(&f); + typename FunctorAnalysis::Reducer final_reducer(f); ReferenceType result = final_reducer.init(&team_scratch[0]); for (int league_id = team_num; league_id < league_size; @@ -642,7 +642,7 @@ struct ParallelReduceSpecialize, i += 2 * tree_neighbor_offset) { ValueType* team_scratch = static_cast(scratch_ptr); const int team_offset = team_size * value_count; - typename FunctorAnalysis::Reducer final_reducer(&f); + typename FunctorAnalysis::Reducer final_reducer(f); final_reducer.join( &team_scratch[i * team_offset], &team_scratch[(i + tree_neighbor_offset) * team_offset]); diff --git 
a/core/src/SYCL/Kokkos_SYCL_Parallel_Reduce.hpp b/core/src/SYCL/Kokkos_SYCL_Parallel_Reduce.hpp index 5144e57a71..77a3e71d06 100644 --- a/core/src/SYCL/Kokkos_SYCL_Parallel_Reduce.hpp +++ b/core/src/SYCL/Kokkos_SYCL_Parallel_Reduce.hpp @@ -265,7 +265,7 @@ class ParallelReduce, ReducerType, const auto& selected_reducer = ReducerConditional::select( static_cast(functor), static_cast(reducer_wrapper.get_functor())); - typename Analysis::Reducer final_reducer(&selected_reducer); + typename Analysis::Reducer final_reducer(selected_reducer); reference_type update = final_reducer.init(results_ptr); if (size == 1) { if constexpr (std::is_void::value) @@ -311,7 +311,7 @@ class ParallelReduce, ReducerType, static_cast(functor), static_cast( reducer_wrapper.get_functor())); - typename Analysis::Reducer final_reducer(&selected_reducer); + typename Analysis::Reducer final_reducer(selected_reducer); using index_type = typename Policy::index_type; const auto upper_bound = std::min( @@ -629,7 +629,7 @@ class ParallelReduce, ReducerType, const auto& selected_reducer = ReducerConditional::select( static_cast(functor), static_cast(reducer_wrapper.get_functor())); - typename Analysis::Reducer final_reducer(&selected_reducer); + typename Analysis::Reducer final_reducer(selected_reducer); reference_type update = final_reducer.init(results_ptr); if (size == 1) { @@ -671,7 +671,7 @@ class ParallelReduce, ReducerType, const auto& selected_reducer = ReducerConditional::select( static_cast(functor), static_cast(reducer_wrapper.get_functor())); - typename Analysis::Reducer final_reducer(&selected_reducer); + typename Analysis::Reducer final_reducer(selected_reducer); // In the first iteration, we call functor to initialize the local // memory. 
Otherwise, the local memory is initialized with the diff --git a/core/src/SYCL/Kokkos_SYCL_Parallel_Scan.hpp b/core/src/SYCL/Kokkos_SYCL_Parallel_Scan.hpp index 76c73b3452..bc62ecc452 100644 --- a/core/src/SYCL/Kokkos_SYCL_Parallel_Scan.hpp +++ b/core/src/SYCL/Kokkos_SYCL_Parallel_Scan.hpp @@ -145,7 +145,7 @@ class ParallelScanSYCLBase { sycl::nd_range<1>(n_wgroups * wgroup_size, wgroup_size), [=](sycl::nd_item<1> item) { const FunctorType& functor = functor_wrapper.get_functor(); - typename Analysis::Reducer final_reducer(&functor); + typename Analysis::Reducer final_reducer(functor); const auto local_id = item.get_local_linear_id(); const auto global_id = item.get_global_linear_id(); @@ -178,7 +178,7 @@ class ParallelScanSYCLBase { [=](sycl::nd_item<1> item) { const auto global_id = item.get_global_linear_id(); const FunctorType& functor = functor_wrapper.get_functor(); - typename Analysis::Reducer final_reducer(&functor); + typename Analysis::Reducer final_reducer(functor); if (global_id < size) final_reducer.join(&global_mem[global_id], &group_results[item.get_group_linear_id()]); @@ -208,7 +208,7 @@ class ParallelScanSYCLBase { const typename Policy::index_type id = static_cast(item.get_id()) + begin; const FunctorType& functor = functor_wrapper.get_functor(); - typename Analysis::Reducer final_reducer(&functor); + typename Analysis::Reducer final_reducer(functor); value_type update{}; final_reducer.init(&update); diff --git a/core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp b/core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp index 489180361f..59e9a7d515 100644 --- a/core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp +++ b/core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp @@ -608,7 +608,7 @@ class ParallelReduce, static_cast(functor), static_cast( reducer_wrapper.get_functor())); - typename Analysis::Reducer final_reducer(&selected_reducer); + typename Analysis::Reducer final_reducer(selected_reducer); reference_type update = final_reducer.init(results_ptr); if (size == 1) { @@ 
-670,7 +670,7 @@ class ParallelReduce, static_cast(functor), static_cast( reducer_wrapper.get_functor())); - typename Analysis::Reducer final_reducer(&selected_reducer); + typename Analysis::Reducer final_reducer(selected_reducer); if constexpr (Analysis::StaticValueSize == 0) { reference_type update = diff --git a/core/src/Serial/Kokkos_Serial_Parallel_MDRange.hpp b/core/src/Serial/Kokkos_Serial_Parallel_MDRange.hpp index afdecd2f05..0d71437060 100644 --- a/core/src/Serial/Kokkos_Serial_Parallel_MDRange.hpp +++ b/core/src/Serial/Kokkos_Serial_Parallel_MDRange.hpp @@ -58,29 +58,20 @@ class ParallelFor, : m_iter(arg_policy, arg_functor) {} }; -template -class ParallelReduce, ReducerType, - Kokkos::Serial> { +template +class ParallelReduce, Kokkos::Serial> { private: using MDRangePolicy = Kokkos::MDRangePolicy; using Policy = typename MDRangePolicy::impl_range_policy; + using FunctorType = typename CombinedFunctorReducerType::functor_type; + using ReducerType = typename CombinedFunctorReducerType::reducer_type; using WorkTag = typename MDRangePolicy::work_tag; - using ReducerConditional = - Kokkos::Impl::if_c::value, - FunctorType, ReducerType>; - using ReducerTypeFwd = typename ReducerConditional::type; - using WorkTagFwd = - std::conditional_t::value, WorkTag, - void>; - - using Analysis = FunctorAnalysis; - - using pointer_type = typename Analysis::pointer_type; - using value_type = typename Analysis::value_type; - using reference_type = typename Analysis::reference_type; + using pointer_type = typename ReducerType::pointer_type; + using value_type = typename ReducerType::value_type; + using reference_type = typename ReducerType::reference_type; using iterate_type = typename Kokkos::Impl::HostIterateTile, ReducerType, return 1024; } inline void execute() const { - const size_t pool_reduce_size = Analysis::value_size( - ReducerConditional::select(m_iter.m_func, m_reducer)); + const size_t pool_reduce_size = m_reducer.value_size(); const size_t team_reduce_size = 0; 
// Never shrinks const size_t team_shared_size = 0; // Never shrinks const size_t thread_local_size = 0; // Never shrinks @@ -128,45 +118,28 @@ class ParallelReduce, ReducerType, : pointer_type( internal_instance->m_thread_team_data.pool_reduce_local()); - typename Analysis::Reducer final_reducer( - &ReducerConditional::select(m_iter.m_func, m_reducer)); - - reference_type update = final_reducer.init(ptr); + reference_type update = m_reducer.init(ptr); this->exec(update); - final_reducer.final(ptr); + m_reducer.final(ptr); } - template - ParallelReduce(const FunctorType& arg_functor, + template + ParallelReduce(const CombinedFunctorReducerType& arg_functor_reducer, const MDRangePolicy& arg_policy, - const HostViewType& arg_result_view, - std::enable_if_t::value && - !Kokkos::is_reducer::value, - void*> = nullptr) - : m_iter(arg_policy, arg_functor), - m_reducer(InvalidType()), + const ViewType& arg_result_view) + : m_iter(arg_policy, arg_functor_reducer.get_functor()), + m_reducer(arg_functor_reducer.get_reducer()), m_result_ptr(arg_result_view.data()) { - static_assert(Kokkos::is_view::value, + static_assert(Kokkos::is_view::value, "Kokkos::Serial reduce result must be a View"); static_assert( - Kokkos::Impl::MemorySpaceAccess::accessible, "Kokkos::Serial reduce result must be a View in HostSpace"); } - - inline ParallelReduce(const FunctorType& arg_functor, - MDRangePolicy arg_policy, const ReducerType& reducer) - : m_iter(arg_policy, arg_functor), - m_reducer(reducer), - m_result_ptr(reducer.view().data()) { - /*static_assert( std::is_same< typename ViewType::memory_space - , Kokkos::HostSpace >::value - , "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace" - );*/ - } }; } // namespace Impl diff --git a/core/src/Serial/Kokkos_Serial_Parallel_Range.hpp b/core/src/Serial/Kokkos_Serial_Parallel_Range.hpp index f35c13170b..01089677a2 100644 --- a/core/src/Serial/Kokkos_Serial_Parallel_Range.hpp +++ 
b/core/src/Serial/Kokkos_Serial_Parallel_Range.hpp @@ -58,31 +58,20 @@ class ParallelFor, Kokkos::Serial> { /*--------------------------------------------------------------------------*/ -template -class ParallelReduce, ReducerType, +template +class ParallelReduce, Kokkos::Serial> { private: - using Policy = Kokkos::RangePolicy; - using WorkTag = typename Policy::work_tag; - - using ReducerConditional = - Kokkos::Impl::if_c::value, - FunctorType, ReducerType>; - - using ReducerTypeFwd = typename ReducerConditional::type; - using WorkTagFwd = - std::conditional_t::value, WorkTag, - void>; + using Policy = Kokkos::RangePolicy; + using WorkTag = typename Policy::work_tag; + using FunctorType = typename CombinedFunctorReducerType::functor_type; + using ReducerType = typename CombinedFunctorReducerType::reducer_type; - using Analysis = - FunctorAnalysis; - - using pointer_type = typename Analysis::pointer_type; - using reference_type = typename Analysis::reference_type; + using pointer_type = typename ReducerType::pointer_type; + using reference_type = typename ReducerType::reference_type; - const FunctorType m_functor; + const CombinedFunctorReducerType m_functor_reducer; const Policy m_policy; - const ReducerType m_reducer; const pointer_type m_result_ptr; template @@ -90,7 +79,7 @@ class ParallelReduce, ReducerType, reference_type update) const { const typename Policy::member_type e = m_policy.end(); for (typename Policy::member_type i = m_policy.begin(); i < e; ++i) { - m_functor(i, update); + m_functor_reducer.get_functor()(i, update); } } @@ -101,14 +90,14 @@ class ParallelReduce, ReducerType, const typename Policy::member_type e = m_policy.end(); for (typename Policy::member_type i = m_policy.begin(); i < e; ++i) { - m_functor(t, i, update); + m_functor_reducer.get_functor()(t, i, update); } } public: inline void execute() const { const size_t pool_reduce_size = - Analysis::value_size(ReducerConditional::select(m_functor, m_reducer)); + 
m_functor_reducer.get_reducer().value_size(); const size_t team_reduce_size = 0; // Never shrinks const size_t team_shared_size = 0; // Never shrinks const size_t thread_local_size = 0; // Never shrinks @@ -127,46 +116,27 @@ class ParallelReduce, ReducerType, : pointer_type( internal_instance->m_thread_team_data.pool_reduce_local()); - typename Analysis::Reducer final_reducer( - &ReducerConditional::select(m_functor, m_reducer)); - - reference_type update = final_reducer.init(ptr); + reference_type update = m_functor_reducer.get_reducer().init(ptr); this->template exec(update); - final_reducer.final(ptr); + m_functor_reducer.get_reducer().final(ptr); } - template - ParallelReduce(const FunctorType& arg_functor, const Policy& arg_policy, - const HostViewType& arg_result_view, - std::enable_if_t::value && - !Kokkos::is_reducer::value, - void*> = nullptr) - : m_functor(arg_functor), + template + ParallelReduce(const CombinedFunctorReducerType& arg_functor_reducer, + const Policy& arg_policy, const ViewType& arg_result_view) + : m_functor_reducer(arg_functor_reducer), m_policy(arg_policy), - m_reducer(InvalidType()), m_result_ptr(arg_result_view.data()) { - static_assert(Kokkos::is_view::value, + static_assert(Kokkos::is_view::value, "Kokkos::Serial reduce result must be a View"); static_assert( - Kokkos::Impl::MemorySpaceAccess::accessible, "Kokkos::Serial reduce result must be a View in HostSpace"); } - - inline ParallelReduce(const FunctorType& arg_functor, Policy arg_policy, - const ReducerType& reducer) - : m_functor(arg_functor), - m_policy(arg_policy), - m_reducer(reducer), - m_result_ptr(reducer.view().data()) { - /*static_assert( std::is_same< typename ViewType::memory_space - , Kokkos::HostSpace >::value - , "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace" - );*/ - } }; /*--------------------------------------------------------------------------*/ @@ -221,7 +191,7 @@ class ParallelScan, pool_reduce_size, team_reduce_size, 
team_shared_size, thread_local_size); - typename Analysis::Reducer final_reducer(&m_functor); + typename Analysis::Reducer final_reducer(m_functor); reference_type update = final_reducer.init(pointer_type( internal_instance->m_thread_team_data.pool_reduce_local())); @@ -286,7 +256,7 @@ class ParallelScanWithTotal, pool_reduce_size, team_reduce_size, team_shared_size, thread_local_size); - typename Analysis::Reducer final_reducer(&m_functor); + typename Analysis::Reducer final_reducer(m_functor); reference_type update = final_reducer.init(pointer_type( internal_instance->m_thread_team_data.pool_reduce_local())); diff --git a/core/src/Serial/Kokkos_Serial_Parallel_Team.hpp b/core/src/Serial/Kokkos_Serial_Parallel_Team.hpp index c5156f1f7f..8b9a9349ab 100644 --- a/core/src/Serial/Kokkos_Serial_Parallel_Team.hpp +++ b/core/src/Serial/Kokkos_Serial_Parallel_Team.hpp @@ -268,35 +268,25 @@ class ParallelFor, /*--------------------------------------------------------------------------*/ -template -class ParallelReduce, - ReducerType, Kokkos::Serial> { +template +class ParallelReduce, Kokkos::Serial> { private: - enum { TEAM_REDUCE_SIZE = 512 }; + static constexpr int TEAM_REDUCE_SIZE = 512; - using Policy = TeamPolicyInternal; + using Policy = TeamPolicyInternal; + using FunctorType = typename CombinedFunctorReducerType::functor_type; + using ReducerType = typename CombinedFunctorReducerType::reducer_type; using Member = typename Policy::member_type; using WorkTag = typename Policy::work_tag; - using ReducerConditional = - Kokkos::Impl::if_c::value, - FunctorType, ReducerType>; - using ReducerTypeFwd = typename ReducerConditional::type; - using WorkTagFwd = - std::conditional_t::value, WorkTag, - void>; - - using Analysis = - FunctorAnalysis; - - using pointer_type = typename Analysis::pointer_type; - using reference_type = typename Analysis::reference_type; + using pointer_type = typename ReducerType::pointer_type; + using reference_type = typename 
ReducerType::reference_type; - const FunctorType m_functor; + const CombinedFunctorReducerType m_functor_reducer; const Policy m_policy; const int m_league; - const ReducerType m_reducer; pointer_type m_result_ptr; size_t m_shared; @@ -304,7 +294,7 @@ class ParallelReduce, inline std::enable_if_t::value> exec( HostThreadTeamData& data, reference_type update) const { for (int ileague = 0; ileague < m_league; ++ileague) { - m_functor(Member(data, ileague, m_league), update); + m_functor_reducer.get_functor()(Member(data, ileague, m_league), update); } } @@ -314,14 +304,15 @@ class ParallelReduce, const TagType t{}; for (int ileague = 0; ileague < m_league; ++ileague) { - m_functor(t, Member(data, ileague, m_league), update); + m_functor_reducer.get_functor()(t, Member(data, ileague, m_league), + update); } } public: inline void execute() const { const size_t pool_reduce_size = - Analysis::value_size(ReducerConditional::select(m_functor, m_reducer)); + m_functor_reducer.get_reducer().value_size(); const size_t team_reduce_size = TEAM_REDUCE_SIZE; const size_t team_shared_size = m_shared; @@ -341,29 +332,23 @@ class ParallelReduce, : pointer_type( internal_instance->m_thread_team_data.pool_reduce_local()); - typename Analysis::Reducer final_reducer( - &ReducerConditional::select(m_functor, m_reducer)); - - reference_type update = final_reducer.init(ptr); + reference_type update = m_functor_reducer.get_reducer().init(ptr); this->template exec(internal_instance->m_thread_team_data, update); - final_reducer.final(ptr); + m_functor_reducer.get_reducer().final(ptr); } template - ParallelReduce(const FunctorType& arg_functor, const Policy& arg_policy, - const ViewType& arg_result, - std::enable_if_t::value && - !Kokkos::is_reducer::value, - void*> = nullptr) - : m_functor(arg_functor), + ParallelReduce(const CombinedFunctorReducerType& arg_functor_reducer, + const Policy& arg_policy, const ViewType& arg_result) + : m_functor_reducer(arg_functor_reducer), 
m_policy(arg_policy), m_league(arg_policy.league_size()), - m_reducer(InvalidType()), m_result_ptr(arg_result.data()), m_shared(arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + - FunctorTeamShmemSize::value(m_functor, 1)) { + FunctorTeamShmemSize::value( + m_functor_reducer.get_functor(), 1)) { static_assert(Kokkos::is_view::value, "Reduction result on Kokkos::Serial must be a Kokkos::View"); @@ -373,21 +358,6 @@ class ParallelReduce, "Reduction result on Kokkos::Serial must be a Kokkos::View in " "HostSpace"); } - - inline ParallelReduce(const FunctorType& arg_functor, Policy arg_policy, - const ReducerType& reducer) - : m_functor(arg_functor), - m_policy(arg_policy), - m_league(arg_policy.league_size()), - m_reducer(reducer), - m_result_ptr(reducer.view().data()), - m_shared(arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + - FunctorTeamShmemSize::value(arg_functor, 1)) { - /*static_assert( std::is_same< typename ViewType::memory_space - , Kokkos::HostSpace >::value - , "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace" - );*/ - } }; } // namespace Impl diff --git a/core/src/Threads/Kokkos_Threads_Parallel_MDRange.hpp b/core/src/Threads/Kokkos_Threads_Parallel_MDRange.hpp index 35392e3bfb..90c46248b8 100644 --- a/core/src/Threads/Kokkos_Threads_Parallel_MDRange.hpp +++ b/core/src/Threads/Kokkos_Threads_Parallel_MDRange.hpp @@ -165,7 +165,7 @@ class ParallelReduce, ReducerType, exec.pool_rank(), exec.pool_size()); typename Analysis::Reducer reducer( - &ReducerConditional::select(self.m_iter.m_func, self.m_reducer)); + ReducerConditional::select(self.m_iter.m_func, self.m_reducer)); self.exec_range( range.begin(), range.end(), @@ -189,7 +189,7 @@ class ParallelReduce, ReducerType, long work_index = exec.get_work_index(); typename Analysis::Reducer reducer( - &ReducerConditional::select(self.m_iter.m_func, self.m_reducer)); + ReducerConditional::select(self.m_iter.m_func, self.m_reducer)); reference_type update = 
reducer.init(static_cast(exec.reduce_memory())); diff --git a/core/src/Threads/Kokkos_Threads_Parallel_Range.hpp b/core/src/Threads/Kokkos_Threads_Parallel_Range.hpp index 7d3527facd..c8d28e8fe3 100644 --- a/core/src/Threads/Kokkos_Threads_Parallel_Range.hpp +++ b/core/src/Threads/Kokkos_Threads_Parallel_Range.hpp @@ -183,7 +183,7 @@ class ParallelReduce, ReducerType, const WorkRange range(self.m_policy, exec.pool_rank(), exec.pool_size()); typename Analysis::Reducer reducer( - &ReducerConditional::select(self.m_functor, self.m_reducer)); + ReducerConditional::select(self.m_functor, self.m_reducer)); ParallelReduce::template exec_range( self.m_functor, range.begin(), range.end(), @@ -206,7 +206,7 @@ class ParallelReduce, ReducerType, long work_index = exec.get_work_index(); typename Analysis::Reducer reducer( - &ReducerConditional::select(self.m_functor, self.m_reducer)); + ReducerConditional::select(self.m_functor, self.m_reducer)); reference_type update = reducer.init(static_cast(exec.reduce_memory())); @@ -231,7 +231,7 @@ class ParallelReduce, ReducerType, if (m_policy.end() <= m_policy.begin()) { if (m_result_ptr) { typename Analysis::Reducer final_reducer( - &ReducerConditional::select(m_functor, m_reducer)); + ReducerConditional::select(m_functor, m_reducer)); final_reducer.init(m_result_ptr); final_reducer.final(m_result_ptr); } @@ -337,7 +337,7 @@ class ParallelScan, const WorkRange range(self.m_policy, exec.pool_rank(), exec.pool_size()); - typename Analysis::Reducer final_reducer(&self.m_functor); + typename Analysis::Reducer final_reducer(self.m_functor); reference_type update = final_reducer.init(static_cast(exec.reduce_memory())); @@ -417,7 +417,7 @@ class ParallelScanWithTotal, const WorkRange range(self.m_policy, exec.pool_rank(), exec.pool_size()); - typename Analysis::Reducer final_reducer(&self.m_functor); + typename Analysis::Reducer final_reducer(self.m_functor); reference_type update = final_reducer.init(static_cast(exec.reduce_memory())); diff 
--git a/core/src/Threads/Kokkos_Threads_Parallel_Team.hpp b/core/src/Threads/Kokkos_Threads_Parallel_Team.hpp index a602078c52..28ee3551ef 100644 --- a/core/src/Threads/Kokkos_Threads_Parallel_Team.hpp +++ b/core/src/Threads/Kokkos_Threads_Parallel_Team.hpp @@ -161,7 +161,7 @@ class ParallelReduce, const ParallelReduce &self = *((const ParallelReduce *)arg); typename Analysis::Reducer reducer( - &ReducerConditional::select(self.m_functor, self.m_reducer)); + ReducerConditional::select(self.m_functor, self.m_reducer)); ParallelReduce::template exec_team( self.m_functor, Member(&exec, self.m_policy, self.m_shared), @@ -175,7 +175,7 @@ class ParallelReduce, if (m_policy.league_size() * m_policy.team_size() == 0) { if (m_result_ptr) { typename Analysis::Reducer final_reducer( - &ReducerConditional::select(m_functor, m_reducer)); + ReducerConditional::select(m_functor, m_reducer)); final_reducer.init(m_result_ptr); final_reducer.final(m_result_ptr); } diff --git a/core/src/impl/Kokkos_FunctorAnalysis.hpp b/core/src/impl/Kokkos_FunctorAnalysis.hpp index 5ef017a004..4bd6c79c82 100644 --- a/core/src/impl/Kokkos_FunctorAnalysis.hpp +++ b/core/src/impl/Kokkos_FunctorAnalysis.hpp @@ -320,13 +320,15 @@ struct FunctorAnalysis { private: template - KOKKOS_INLINE_FUNCTION static constexpr std::enable_if_t + KOKKOS_INLINE_FUNCTION static constexpr std::enable_if_t get_length(FF const& f) { return f.value_count; } template - KOKKOS_INLINE_FUNCTION static constexpr std::enable_if_t + KOKKOS_INLINE_FUNCTION static constexpr std::enable_if_t get_length(FF const&) { return candidate_is_void ? 0 : 1; } @@ -337,12 +339,12 @@ struct FunctorAnalysis { !candidate_is_void && !candidate_is_array ? 
sizeof(ValueType) : 0 }; - KOKKOS_FORCEINLINE_FUNCTION static constexpr unsigned value_count( + KOKKOS_FORCEINLINE_FUNCTION static constexpr unsigned int value_count( const Functor& f) { return FunctorAnalysis::template get_length(f); } - KOKKOS_FORCEINLINE_FUNCTION static constexpr unsigned value_size( + KOKKOS_FORCEINLINE_FUNCTION static constexpr unsigned int value_size( const Functor& f) { return FunctorAnalysis::template get_length(f) * sizeof(ValueType); @@ -351,13 +353,13 @@ struct FunctorAnalysis { //---------------------------------------- template - KOKKOS_FORCEINLINE_FUNCTION static constexpr unsigned value_count( + KOKKOS_FORCEINLINE_FUNCTION static constexpr unsigned int value_count( const Unknown&) { return candidate_is_void ? 0 : 1; } template - KOKKOS_FORCEINLINE_FUNCTION static constexpr unsigned value_size( + KOKKOS_FORCEINLINE_FUNCTION static constexpr unsigned int value_size( const Unknown&) { return candidate_is_void ? 0 : sizeof(ValueType); } @@ -903,12 +905,12 @@ struct FunctorAnalysis { struct Reducer { private: - Functor const* const m_functor; + Functor m_functor; template KOKKOS_INLINE_FUNCTION constexpr std::enable_if_t len() const noexcept { - return m_functor->value_count; + return m_functor.value_count; } template @@ -924,6 +926,28 @@ struct FunctorAnalysis { using reference_type = FunctorAnalysis::reference_type; using functor_type = Functor; // Adapts a functor + static constexpr bool has_join_member_function() { + return DeduceJoin<>::value; + } + static constexpr bool has_init_member_function() { + return DeduceInit<>::value; + } + static constexpr bool has_final_member_function() { + return DeduceFinal<>::value; + } + + KOKKOS_FUNCTION unsigned int value_size() const { + return FunctorAnalysis::value_size(m_functor); + } + + KOKKOS_FUNCTION unsigned int value_count() const { + return FunctorAnalysis::value_count(m_functor); + } + + KOKKOS_FUNCTION static constexpr unsigned int static_value_size() { + return StaticValueSize; + } + 
template KOKKOS_INLINE_FUNCTION static std::enable_if_t reference(ValueType* dst) noexcept { @@ -948,20 +972,23 @@ struct FunctorAnalysis { KOKKOS_INLINE_FUNCTION void join(ValueType* dst, ValueType const* src) const noexcept { - DeduceJoin<>::join(m_functor, dst, src); + DeduceJoin<>::join(&m_functor, dst, src); } KOKKOS_INLINE_FUNCTION reference_type init(ValueType* const dst) const noexcept { - DeduceInit<>::init(m_functor, dst); + DeduceInit<>::init(&m_functor, dst); return reference(dst); } KOKKOS_INLINE_FUNCTION void final(ValueType* dst) const noexcept { - DeduceFinal<>::final(m_functor, dst); + DeduceFinal<>::final(&m_functor, dst); } + KOKKOS_INLINE_FUNCTION + const Functor& get_functor() const { return m_functor; } + Reducer(Reducer const&) = default; Reducer(Reducer&&) = default; Reducer& operator=(Reducer const&) = delete; @@ -969,7 +996,7 @@ struct FunctorAnalysis { ~Reducer() = default; KOKKOS_INLINE_FUNCTION explicit constexpr Reducer( - Functor const* arg_functor) noexcept + Functor const& arg_functor) noexcept : m_functor(arg_functor) {} }; }; diff --git a/core/src/impl/Kokkos_Tools_Generic.hpp b/core/src/impl/Kokkos_Tools_Generic.hpp index 4ccb64ce4f..3d88da8f02 100644 --- a/core/src/impl/Kokkos_Tools_Generic.hpp +++ b/core/src/impl/Kokkos_Tools_Generic.hpp @@ -18,6 +18,7 @@ #define KOKKOS_IMPL_KOKKOS_TOOLS_GENERIC_HPP #include +#include #include #include @@ -99,9 +100,12 @@ struct SimpleTeamSizeCalculator { const Functor& functor, const Kokkos::ParallelReduceTag&) { using exec_space = typename Policy::execution_space; - using driver = - Kokkos::Impl::ParallelReduce; + using analysis = Kokkos::Impl::FunctorAnalysis< + Kokkos::Impl::FunctorPatternInterface::REDUCE, Policy, Functor>; + using driver = typename Kokkos::Impl::ParallelReduceWrapper< + Kokkos::Impl::CombinedFunctorReducer, + Policy, exec_space>::wrapped_type; return driver::max_tile_size_product(policy, functor); } }; @@ -120,7 +124,12 @@ struct ComplexReducerSizeCalculator { using 
value_type = typename ReducerType::value_type; value_type value; ReducerType reducer_example = ReducerType(value); - return policy.team_size_max(functor, reducer_example, tag); + + using Analysis = Kokkos::Impl::FunctorAnalysis< + Kokkos::Impl::FunctorPatternInterface::REDUCE, Policy, ReducerType>; + typename Analysis::Reducer final_reducer(reducer_example); + + return policy.team_size_max(functor, final_reducer, tag); } template int get_recommended_team_size(const Policy& policy, const Functor& functor, @@ -128,15 +137,24 @@ struct ComplexReducerSizeCalculator { using value_type = typename ReducerType::value_type; value_type value; ReducerType reducer_example = ReducerType(value); - return policy.team_size_recommended(functor, reducer_example, tag); + + using Analysis = Kokkos::Impl::FunctorAnalysis< + Kokkos::Impl::FunctorPatternInterface::REDUCE, Policy, ReducerType>; + typename Analysis::Reducer final_reducer(reducer_example); + + return policy.team_size_recommended(functor, final_reducer, tag); } template int get_mdrange_max_tile_size_product(const Policy& policy, const Functor& functor, const Kokkos::ParallelReduceTag&) { using exec_space = typename Policy::execution_space; - using driver = - Kokkos::Impl::ParallelReduce; + using Analysis = Kokkos::Impl::FunctorAnalysis< + Kokkos::Impl::FunctorPatternInterface::REDUCE, Policy, ReducerType>; + using driver = typename Kokkos::Impl::ParallelReduceWrapper< + Kokkos::Impl::CombinedFunctorReducer, + Policy, exec_space>::wrapped_type; return driver::max_tile_size_product(policy, functor); } }; diff --git a/core/unit_test/TestFunctorAnalysis.hpp b/core/unit_test/TestFunctorAnalysis.hpp index 9ebb9c066a..414f1e5d37 100644 --- a/core/unit_test/TestFunctorAnalysis.hpp +++ b/core/unit_test/TestFunctorAnalysis.hpp @@ -59,7 +59,7 @@ void test_functor_analysis() { static_assert(!A01::has_init_member_function, ""); static_assert(!A01::has_final_member_function, ""); static_assert(A01::StaticValueSize == 0, ""); - 
ASSERT_EQ(R01(&c01).length(), 0); + ASSERT_EQ(R01(c01).length(), 0); //------------------------------ auto c02 = KOKKOS_LAMBDA(int, double&){}; @@ -78,7 +78,7 @@ void test_functor_analysis() { static_assert(!A02::has_init_member_function, ""); static_assert(!A02::has_final_member_function, ""); static_assert(A02::StaticValueSize == sizeof(double), ""); - ASSERT_EQ(R02(&c02).length(), 1); + ASSERT_EQ(R02(c02).length(), 1); //------------------------------ @@ -106,7 +106,7 @@ void test_functor_analysis() { static_assert(!A03::has_final_member_function, ""); static_assert( A03::StaticValueSize == sizeof(TestFunctorAnalysis_03::value_type), ""); - ASSERT_EQ(R03(&c03).length(), 1); + ASSERT_EQ(R03(c03).length(), 1); //------------------------------ } diff --git a/core/unit_test/hip/TestHIP_ScanUnit.cpp b/core/unit_test/hip/TestHIP_ScanUnit.cpp index 0b46d9742a..23c287635d 100644 --- a/core/unit_test/hip/TestHIP_ScanUnit.cpp +++ b/core/unit_test/hip/TestHIP_ScanUnit.cpp @@ -33,7 +33,7 @@ __global__ void start_intra_block_scan() DummyFunctor f; typename Kokkos::Impl::FunctorAnalysis< Kokkos::Impl::FunctorPatternInterface::SCAN, - Kokkos::RangePolicy, DummyFunctor>::Reducer reducer(&f); + Kokkos::RangePolicy, DummyFunctor>::Reducer reducer(f); Kokkos::Impl::hip_intra_block_reduce_scan(reducer, values); __syncthreads(); diff --git a/core/unit_test/incremental/Test05_ParallelReduce_RangePolicy.hpp b/core/unit_test/incremental/Test05_ParallelReduce_RangePolicy.hpp index 4235c73c8e..8175c23565 100644 --- a/core/unit_test/incremental/Test05_ParallelReduce_RangePolicy.hpp +++ b/core/unit_test/incremental/Test05_ParallelReduce_RangePolicy.hpp @@ -45,7 +45,7 @@ struct NonTrivialReduceFunctor { NonTrivialReduceFunctor(NonTrivialReduceFunctor &&) = default; NonTrivialReduceFunctor &operator=(NonTrivialReduceFunctor &&) = default; NonTrivialReduceFunctor &operator=(NonTrivialReduceFunctor const &) = default; - ~NonTrivialReduceFunctor() {} + KOKKOS_FUNCTION ~NonTrivialReduceFunctor() 
{} }; template From 0691619fd1646e3400051f1bd126736bcd000cea Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Mon, 20 Feb 2023 13:55:28 -0500 Subject: [PATCH 248/496] Convert HPX ParallelReduce --- core/src/HPX/Kokkos_HPX.hpp | 251 +++++++++++----------------- core/src/Kokkos_Parallel_Reduce.hpp | 6 + 2 files changed, 100 insertions(+), 157 deletions(-) diff --git a/core/src/HPX/Kokkos_HPX.hpp b/core/src/HPX/Kokkos_HPX.hpp index dff7c53adb..53b4ac1da3 100644 --- a/core/src/HPX/Kokkos_HPX.hpp +++ b/core/src/HPX/Kokkos_HPX.hpp @@ -992,81 +992,73 @@ class ParallelFor, namespace Kokkos { namespace Impl { -template -class ParallelReduce, ReducerType, +template +class ParallelReduce, Kokkos::Experimental::HPX> { private: - using Policy = Kokkos::RangePolicy; + using Policy = Kokkos::RangePolicy; + using FunctorType = typename CombinedFunctorReducerType::functor_type; + using ReducerType = typename CombinedFunctorReducerType::reducer_type; + using WorkTag = typename Policy::work_tag; using WorkRange = typename Policy::WorkRange; using Member = typename Policy::member_type; - using ReducerConditional = - Kokkos::Impl::if_c, FunctorType, - ReducerType>; - using ReducerTypeFwd = typename ReducerConditional::type; - using Analysis = - FunctorAnalysis; - using value_type = typename Analysis::value_type; - using pointer_type = typename Analysis::pointer_type; - using reference_type = typename Analysis::reference_type; - const FunctorType m_functor; + using value_type = typename ReducerType::value_type; + using pointer_type = typename ReducerType::pointer_type; + using reference_type = typename ReducerType::reference_type; + + const CombinedFunctorReducerType m_functor_reducer; const Policy m_policy; - const ReducerType m_reducer; const pointer_type m_result_ptr; const bool m_force_synchronous; public: void setup() const { - const std::size_t value_size = - Analysis::value_size(ReducerConditional::select(m_functor, m_reducer)); + const ReducerType &reducer = 
m_functor_reducer.get_reducer(); + const std::size_t value_size = reducer.value_size(); const int num_worker_threads = m_policy.space().concurrency(); hpx_thread_buffer &buffer = m_policy.space().impl_get_buffer(); buffer.resize(num_worker_threads, value_size); - typename Analysis::Reducer final_reducer( - ReducerConditional::select(m_functor, m_reducer)); - for (int t = 0; t < num_worker_threads; ++t) { - final_reducer.init(reinterpret_cast(buffer.get(t))); + reducer.init(reinterpret_cast(buffer.get(t))); } } void execute_range(const Member i_chunk) const { hpx_thread_buffer &buffer = m_policy.space().impl_get_buffer(); reference_type update = - Analysis::Reducer::reference(reinterpret_cast( + ReducerType::reference(reinterpret_cast( buffer.get(Kokkos::Experimental::HPX::impl_hardware_thread_id()))); const auto r = get_chunk_range(i_chunk, m_policy.begin(), m_policy.chunk_size(), m_policy.end()); for (Member i = r.begin; i < r.end; ++i) { if constexpr (std::is_same_v) { - m_functor(i, update); + m_functor_reducer.get_functor()(i, update); } else { - m_functor(WorkTag{}, i, update); + m_functor_reducer.get_functor()(WorkTag{}, i, update); } } } void finalize() const { - hpx_thread_buffer &buffer = m_policy.space().impl_get_buffer(); - typename Analysis::Reducer final_reducer( - ReducerConditional::select(m_functor, m_reducer)); + hpx_thread_buffer &buffer = m_policy.space().impl_get_buffer(); + const ReducerType &reducer = m_functor_reducer.get_reducer(); const int num_worker_threads = m_policy.space().concurrency(); for (int i = 1; i < num_worker_threads; ++i) { - final_reducer.join(reinterpret_cast(buffer.get(0)), - reinterpret_cast(buffer.get(i))); + reducer.join(reinterpret_cast(buffer.get(0)), + reinterpret_cast(buffer.get(i))); } pointer_type final_value_ptr = reinterpret_cast(buffer.get(0)); - final_reducer.final(final_value_ptr); + reducer.final(final_value_ptr); if (m_result_ptr != nullptr) { - const int n = Analysis::value_count( - 
ReducerConditional::select(m_functor, m_reducer)); + const int n = reducer.value_count(); for (int j = 0; j < n; ++j) { m_result_ptr[j] = final_value_ptr[j]; @@ -1077,11 +1069,9 @@ class ParallelReduce, ReducerType, void execute() const { if (m_policy.end() <= m_policy.begin()) { if (m_result_ptr) { - typename Analysis::Reducer final_reducer( - ReducerConditional::select(m_functor, m_reducer)); - - final_reducer.init(m_result_ptr); - final_reducer.final(m_result_ptr); + const ReducerType &reducer = m_functor_reducer.get_reducer(); + reducer.init(m_result_ptr); + reducer.final(m_result_ptr); } return; } @@ -1094,76 +1084,59 @@ class ParallelReduce, ReducerType, } template - inline ParallelReduce( - const FunctorType &arg_functor, Policy arg_policy, - const ViewType &arg_view, - std::enable_if_t::value && - !Kokkos::is_reducer::value, - void *> = nullptr) - : m_functor(arg_functor), + inline ParallelReduce(const CombinedFunctorReducerType &arg_functor_reducer, + Policy arg_policy, const ViewType &arg_view) + : m_functor_reducer(arg_functor_reducer), m_policy(arg_policy), - m_reducer(InvalidType()), m_result_ptr(arg_view.data()), m_force_synchronous(!arg_view.impl_track().has_record()) {} - - inline ParallelReduce(const FunctorType &arg_functor, Policy arg_policy, - const ReducerType &reducer) - : m_functor(arg_functor), - m_policy(arg_policy), - m_reducer(reducer), - m_result_ptr(reducer.view().data()), - m_force_synchronous(!reducer.view().impl_track().has_record()) {} }; -template -class ParallelReduce, ReducerType, +template +class ParallelReduce, Kokkos::Experimental::HPX> { private: using MDRangePolicy = Kokkos::MDRangePolicy; - using Policy = typename MDRangePolicy::impl_range_policy; - using WorkTag = typename MDRangePolicy::work_tag; - using WorkRange = typename Policy::WorkRange; - using Member = typename Policy::member_type; - using ReducerConditional = - Kokkos::Impl::if_c, FunctorType, - ReducerType>; - using ReducerTypeFwd = typename 
ReducerConditional::type; - using Analysis = FunctorAnalysis; + using FunctorType = typename CombinedFunctorReducerType::functor_type; + using ReducerType = typename CombinedFunctorReducerType::reducer_type; - using pointer_type = typename Analysis::pointer_type; - using value_type = typename Analysis::value_type; - using reference_type = typename Analysis::reference_type; + using Policy = typename MDRangePolicy::impl_range_policy; + using WorkTag = typename MDRangePolicy::work_tag; + using WorkRange = typename Policy::WorkRange; + using Member = typename Policy::member_type; + + using pointer_type = typename ReducerType::pointer_type; + using value_type = typename ReducerType::value_type; + using reference_type = typename ReducerType::reference_type; using iterate_type = typename Kokkos::Impl::HostIterateTile; const iterate_type m_iter; const Policy m_policy; - const ReducerType m_reducer; + const CombinedFunctorReducerType m_functor_reducer; const pointer_type m_result_ptr; const bool m_force_synchronous; public: void setup() const { - const std::size_t value_size = Analysis::value_size( - ReducerConditional::select(m_iter.m_func, m_reducer)); + const ReducerType &reducer = m_functor_reducer.get_reducer(); + const std::size_t value_size = reducer.value_size(); const int num_worker_threads = m_policy.space().concurrency(); - typename Analysis::Reducer final_reducer( - ReducerConditional::select(m_iter.m_func, m_reducer)); hpx_thread_buffer &buffer = m_iter.m_rp.space().impl_get_buffer(); buffer.resize(num_worker_threads, value_size); for (int t = 0; t < num_worker_threads; ++t) { - final_reducer.init(reinterpret_cast(buffer.get(t))); + reducer.init(reinterpret_cast(buffer.get(t))); } } void execute_range(const Member i_chunk) const { hpx_thread_buffer &buffer = m_iter.m_rp.space().impl_get_buffer(); reference_type update = - Analysis::Reducer::reference(reinterpret_cast( + ReducerType::reference(reinterpret_cast( 
buffer.get(Kokkos::Experimental::HPX::impl_hardware_thread_id()))); const auto r = get_chunk_range(i_chunk, m_policy.begin(), m_policy.chunk_size(), m_policy.end()); @@ -1173,23 +1146,21 @@ class ParallelReduce, ReducerType, } void finalize() const { - hpx_thread_buffer &buffer = m_iter.m_rp.space().impl_get_buffer(); - typename Analysis::Reducer final_reducer( - ReducerConditional::select(m_iter.m_func, m_reducer)); + hpx_thread_buffer &buffer = m_iter.m_rp.space().impl_get_buffer(); + ReducerType reducer = m_functor_reducer.get_reducer(); const int num_worker_threads = m_policy.space().concurrency(); for (int i = 1; i < num_worker_threads; ++i) { - final_reducer.join(reinterpret_cast(buffer.get(0)), - reinterpret_cast(buffer.get(i))); + reducer.join(reinterpret_cast(buffer.get(0)), + reinterpret_cast(buffer.get(i))); } pointer_type final_value_ptr = reinterpret_cast(buffer.get(0)); - final_reducer.final(final_value_ptr); + reducer.final(final_value_ptr); if (m_result_ptr != nullptr) { - const int n = Analysis::value_count( - ReducerConditional::select(m_iter.m_func, m_reducer)); + const int n = reducer.value_count(); for (int j = 0; j < n; ++j) { m_result_ptr[j] = final_value_ptr[j]; @@ -1206,25 +1177,14 @@ class ParallelReduce, ReducerType, } template - inline ParallelReduce( - const FunctorType &arg_functor, MDRangePolicy arg_policy, - const ViewType &arg_view, - std::enable_if_t::value && - !Kokkos::is_reducer::value, - void *> = nullptr) - : m_iter(arg_policy, arg_functor), + inline ParallelReduce(const CombinedFunctorReducerType &arg_functor_reducer, + MDRangePolicy arg_policy, const ViewType &arg_view) + : m_iter(arg_policy, arg_functor_reducer.get_functor()), m_policy(Policy(0, arg_policy.m_num_tiles).set_chunk_size(1)), - m_reducer(InvalidType()), + m_functor_reducer(arg_functor_reducer), m_result_ptr(arg_view.data()), m_force_synchronous(!arg_view.impl_track().has_record()) {} - inline ParallelReduce(const FunctorType &arg_functor, - MDRangePolicy 
arg_policy, const ReducerType &reducer) - : m_iter(arg_policy, arg_functor), - m_policy(Policy(0, arg_policy.m_num_tiles).set_chunk_size(1)), - m_reducer(reducer), - m_result_ptr(reducer.view().data()), - m_force_synchronous(!reducer.view().impl_track().has_record()) {} template static int max_tile_size_product(const Policy &, const Functor &) { /** @@ -1520,84 +1480,78 @@ class ParallelFor, arg_functor, arg_policy.team_size())) {} }; -template -class ParallelReduce, - ReducerType, Kokkos::Experimental::HPX> { +template +class ParallelReduce, + Kokkos::Experimental::HPX> { private: - using Policy = TeamPolicyInternal; + using Policy = TeamPolicyInternal; + using FunctorType = typename CombinedFunctorReducerType::functor_type; + using ReducerType = typename CombinedFunctorReducerType::reducer_type; + using Member = typename Policy::member_type; using WorkTag = typename Policy::work_tag; - using ReducerConditional = - Kokkos::Impl::if_c, FunctorType, - ReducerType>; - using ReducerTypeFwd = typename ReducerConditional::type; - using Analysis = - FunctorAnalysis; - using pointer_type = typename Analysis::pointer_type; - using reference_type = typename Analysis::reference_type; - using value_type = typename Analysis::value_type; - const FunctorType m_functor; + using pointer_type = typename ReducerType::pointer_type; + using reference_type = typename ReducerType::reference_type; + using value_type = typename ReducerType::value_type; + + const CombinedFunctorReducerType m_functor_reducer; const int m_league; const Policy m_policy; - const ReducerType m_reducer; pointer_type m_result_ptr; const std::size_t m_shared; const bool m_force_synchronous; public: void setup() const { - const std::size_t value_size = - Analysis::value_size(ReducerConditional::select(m_functor, m_reducer)); + const ReducerType &reducer = m_functor_reducer.get_reducer(); + const std::size_t value_size = reducer.value_size(); const int num_worker_threads = m_policy.space().concurrency(); 
hpx_thread_buffer &buffer = m_policy.space().impl_get_buffer(); buffer.resize(num_worker_threads, value_size + m_shared); - typename Analysis::Reducer final_reducer( - ReducerConditional::select(m_functor, m_reducer)); for (int t = 0; t < num_worker_threads; ++t) { - final_reducer.init(reinterpret_cast(buffer.get(t))); + reducer.init(reinterpret_cast(buffer.get(t))); } } void execute_range(const int i) const { - const std::size_t value_size = - Analysis::value_size(ReducerConditional::select(m_functor, m_reducer)); + const ReducerType &reducer = m_functor_reducer.get_reducer(); + const std::size_t value_size = reducer.value_size(); std::size_t t = Kokkos::Experimental::HPX::impl_hardware_thread_id(); hpx_thread_buffer &buffer = m_policy.space().impl_get_buffer(); - reference_type update = Analysis::Reducer::reference( - reinterpret_cast(buffer.get(t))); + reference_type update = + ReducerType::reference(reinterpret_cast(buffer.get(t))); const auto r = get_chunk_range(i, 0, m_policy.chunk_size(), m_policy.league_size()); char *local_buffer = static_cast(buffer.get(t)) + value_size; for (int league_rank = r.begin; league_rank < r.end; ++league_rank) { if constexpr (std::is_same_v) { - m_functor(Member(m_policy, 0, league_rank, local_buffer, m_shared), - update); + m_functor_reducer.get_functor()( + Member(m_policy, 0, league_rank, local_buffer, m_shared), update); } else { - m_functor(WorkTag{}, - Member(m_policy, 0, league_rank, local_buffer, m_shared), - update); + m_functor_reducer.get_functor()( + WorkTag{}, Member(m_policy, 0, league_rank, local_buffer, m_shared), + update); } } } void finalize() const { - hpx_thread_buffer &buffer = m_policy.space().impl_get_buffer(); - typename Analysis::Reducer final_reducer( - ReducerConditional::select(m_functor, m_reducer)); + hpx_thread_buffer &buffer = m_policy.space().impl_get_buffer(); + const ReducerType &reducer = m_functor_reducer.get_reducer(); const int num_worker_threads = m_policy.space().concurrency(); const 
pointer_type ptr = reinterpret_cast(buffer.get(0)); for (int t = 1; t < num_worker_threads; ++t) { - final_reducer.join(ptr, reinterpret_cast(buffer.get(t))); + reducer.join(ptr, reinterpret_cast(buffer.get(t))); } - final_reducer.final(ptr); + reducer.final(ptr); if (m_result_ptr) { - const int n = Analysis::value_count( - ReducerConditional::select(m_functor, m_reducer)); + const int n = reducer.value_count(); for (int j = 0; j < n; ++j) { m_result_ptr[j] = ptr[j]; @@ -1608,10 +1562,9 @@ class ParallelReduce, void execute() const { if (m_policy.league_size() * m_policy.team_size() == 0) { if (m_result_ptr) { - typename Analysis::Reducer final_reducer( - ReducerConditional::select(m_functor, m_reducer)); - final_reducer.init(m_result_ptr); - final_reducer.final(m_result_ptr); + const ReducerType &reducer = m_functor_reducer.get_reducer(); + reducer.init(m_result_ptr); + reducer.final(m_result_ptr); } return; } @@ -1624,32 +1577,16 @@ class ParallelReduce, } template - ParallelReduce(const FunctorType &arg_functor, const Policy &arg_policy, - const ViewType &arg_result, - std::enable_if_t::value && - !Kokkos::is_reducer::value, - void *> = nullptr) - : m_functor(arg_functor), + ParallelReduce(const CombinedFunctorReducerType &arg_functor_reducer, + const Policy &arg_policy, const ViewType &arg_result) + : m_functor_reducer(arg_functor_reducer), m_league(arg_policy.league_size()), m_policy(arg_policy), - m_reducer(InvalidType()), m_result_ptr(arg_result.data()), m_shared(arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + FunctorTeamShmemSize::value( - m_functor, arg_policy.team_size())), + m_functor_reducer.get_functor(), arg_policy.team_size())), m_force_synchronous(!arg_result.impl_track().has_record()) {} - - inline ParallelReduce(const FunctorType &arg_functor, Policy arg_policy, - const ReducerType &reducer) - : m_functor(arg_functor), - m_league(arg_policy.league_size()), - m_policy(arg_policy), - m_reducer(reducer), - 
m_result_ptr(reducer.view().data()), - m_shared(arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + - FunctorTeamShmemSize::value( - arg_functor, arg_policy.team_size())), - m_force_synchronous(!reducer.view().impl_track().has_record()) {} }; } // namespace Impl } // namespace Kokkos diff --git a/core/src/Kokkos_Parallel_Reduce.hpp b/core/src/Kokkos_Parallel_Reduce.hpp index 6b1fb55b7f..3e48380a53 100644 --- a/core/src/Kokkos_Parallel_Reduce.hpp +++ b/core/src/Kokkos_Parallel_Reduce.hpp @@ -1421,6 +1421,12 @@ template <> struct implements_new_reduce_interface : std::true_type {}; #endif +#ifdef KOKKOS_ENABLE_HPX +template <> +struct implements_new_reduce_interface + : std::true_type {}; +#endif + #ifdef KOKKOS_ENABLE_CUDA template <> struct implements_new_reduce_interface : std::true_type {}; From fcdedf75a438009e28be5827670190db17001167 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Thu, 23 Feb 2023 17:41:41 -0500 Subject: [PATCH 249/496] Do not bother with sycl::rotate --- core/src/Kokkos_BitManipulation.hpp | 34 ++--------------------------- 1 file changed, 2 insertions(+), 32 deletions(-) diff --git a/core/src/Kokkos_BitManipulation.hpp b/core/src/Kokkos_BitManipulation.hpp index 1f173224e2..b54be3b301 100644 --- a/core/src/Kokkos_BitManipulation.hpp +++ b/core/src/Kokkos_BitManipulation.hpp @@ -304,34 +304,6 @@ KOKKOS_IMPL_HOST_FUNCTION #undef KOKKOS_IMPL_USE_GCC_BUILT_IN_FUNCTIONS -template -KOKKOS_FUNCTION T rotl_builtin_host(T x, int s) noexcept { - return rotl(x, s); -} - -template -KOKKOS_FUNCTION T rotl_builtin_device(T x, int s) noexcept { -#ifdef KOKKOS_ENABLE_SYCL - return sycl::rotate(x, s); -#else - return rotl(x, s); -#endif -} - -template -KOKKOS_FUNCTION T rotr_builtin_host(T x, int s) noexcept { - return rotr(x, s); -} - -template -KOKKOS_FUNCTION T rotr_builtin_device(T x, int s) noexcept { -#ifdef KOKKOS_ENABLE_SYCL - return sycl::rotate(x, -s); -#else - return rotr(x, s); -#endif -} - } // namespace Kokkos::Impl namespace 
Kokkos::Experimental { @@ -411,16 +383,14 @@ template [[nodiscard]] KOKKOS_FUNCTION std::enable_if_t<::Kokkos::Impl::is_standard_unsigned_integer_type_v, T> rotl_builtin(T x, int s) noexcept { - KOKKOS_IF_ON_DEVICE((return ::Kokkos::Impl::rotl_builtin_device(x, s);)) - KOKKOS_IF_ON_HOST((return ::Kokkos::Impl::rotl_builtin_host(x, s);)) + return rotl(x, s); // no benefit to call the _builtin variant } template [[nodiscard]] KOKKOS_FUNCTION std::enable_if_t<::Kokkos::Impl::is_standard_unsigned_integer_type_v, T> rotr_builtin(T x, int s) noexcept { - KOKKOS_IF_ON_DEVICE((return ::Kokkos::Impl::rotr_builtin_device(x, s);)) - KOKKOS_IF_ON_HOST((return ::Kokkos::Impl::rotr_builtin_host(x, s);)) + return rotr(x, s); // no benefit to call the _builtin variant } } // namespace Kokkos::Experimental From b4655f90fa4b228c78f254871b0de5652ea16877 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Sun, 29 Jan 2023 20:36:06 -0500 Subject: [PATCH 250/496] Drop (unused) HBW lock array --- core/src/Kokkos_HBWSpace.hpp | 35 ----------------------------- core/src/impl/Kokkos_HBWSpace.cpp | 37 ------------------------------- 2 files changed, 72 deletions(-) diff --git a/core/src/Kokkos_HBWSpace.hpp b/core/src/Kokkos_HBWSpace.hpp index d9064a2983..369b7bafb7 100644 --- a/core/src/Kokkos_HBWSpace.hpp +++ b/core/src/Kokkos_HBWSpace.hpp @@ -31,41 +31,6 @@ namespace Kokkos { namespace Experimental { -namespace Impl { - -/// \brief Initialize lock array for arbitrary size atomics. -/// -/// Arbitrary atomics are implemented using a hash table of locks -/// where the hash value is derived from the address of the -/// object for which an atomic operation is performed. -/// This function initializes the locks to zero (unset). -void init_lock_array_hbw_space(); - -/// \brief Acquire a lock for the address -/// -/// This function tries to acquire the lock for the hash value derived -/// from the provided ptr. If the lock is successfully acquired the -/// function returns true. 
Otherwise it returns false. -bool lock_address_hbw_space(void* ptr); - -/// \brief Release lock for the address -/// -/// This function releases the lock for the hash value derived -/// from the provided ptr. This function should only be called -/// after previously successfully acquiring a lock with -/// lock_address. -void unlock_address_hbw_space(void* ptr); - -} // namespace Impl - -} // namespace Experimental - -} // namespace Kokkos - -namespace Kokkos { - -namespace Experimental { - /// \class HBWSpace /// \brief Memory management for host memory. /// diff --git a/core/src/impl/Kokkos_HBWSpace.cpp b/core/src/impl/Kokkos_HBWSpace.cpp index 7402f1a744..cd640b88cb 100644 --- a/core/src/impl/Kokkos_HBWSpace.cpp +++ b/core/src/impl/Kokkos_HBWSpace.cpp @@ -310,41 +310,4 @@ void SharedAllocationRecord:: } // namespace Impl } // namespace Kokkos -/*--------------------------------------------------------------------------*/ -/*--------------------------------------------------------------------------*/ - -namespace Kokkos { -namespace Experimental { -namespace { -const unsigned HBW_SPACE_ATOMIC_MASK = 0xFFFF; -const unsigned HBW_SPACE_ATOMIC_XOR_MASK = 0x5A39; -static int HBW_SPACE_ATOMIC_LOCKS[HBW_SPACE_ATOMIC_MASK + 1]; -} // namespace - -namespace Impl { -void init_lock_array_hbw_space() { - static int is_initialized = 0; - if (!is_initialized) - for (int i = 0; i < static_cast(HBW_SPACE_ATOMIC_MASK + 1); i++) - HBW_SPACE_ATOMIC_LOCKS[i] = 0; -} - -bool lock_address_hbw_space(void *ptr) { - return 0 == atomic_compare_exchange( - &HBW_SPACE_ATOMIC_LOCKS[((size_t(ptr) >> 2) & - HBW_SPACE_ATOMIC_MASK) ^ - HBW_SPACE_ATOMIC_XOR_MASK], - 0, 1); -} - -void unlock_address_hbw_space(void *ptr) { - atomic_exchange( - &HBW_SPACE_ATOMIC_LOCKS[((size_t(ptr) >> 2) & HBW_SPACE_ATOMIC_MASK) ^ - HBW_SPACE_ATOMIC_XOR_MASK], - 0); -} - -} // namespace Impl -} // namespace Experimental -} // namespace Kokkos #endif From 63879dbbc631f537dbf5aab1787d77a62055f129 Mon Sep 17 00:00:00 
2001 From: Damien L-G Date: Sun, 29 Jan 2023 20:38:10 -0500 Subject: [PATCH 251/496] Drop host lock array --- core/src/Kokkos_HostSpace.hpp | 31 ------------------- core/src/OpenMP/Kokkos_OpenMP_Instance.cpp | 2 -- core/src/Serial/Kokkos_Serial.cpp | 3 -- core/src/Threads/Kokkos_ThreadsExec.cpp | 3 -- core/src/impl/Kokkos_HostSpace.cpp | 36 ---------------------- 5 files changed, 75 deletions(-) diff --git a/core/src/Kokkos_HostSpace.hpp b/core/src/Kokkos_HostSpace.hpp index 4b839aca05..0c8cd43a04 100644 --- a/core/src/Kokkos_HostSpace.hpp +++ b/core/src/Kokkos_HostSpace.hpp @@ -41,37 +41,6 @@ static_assert(false, /*--------------------------------------------------------------------------*/ -namespace Kokkos { - -namespace Impl { - -/// \brief Initialize lock array for arbitrary size atomics. -/// -/// Arbitrary atomics are implemented using a hash table of locks -/// where the hash value is derived from the address of the -/// object for which an atomic operation is performed. -/// This function initializes the locks to zero (unset). -void init_lock_array_host_space(); - -/// \brief Acquire a lock for the address -/// -/// This function tries to acquire the lock for the hash value derived -/// from the provided ptr. If the lock is successfully acquired the -/// function returns true. Otherwise it returns false. -bool lock_address_host_space(void* ptr); - -/// \brief Release lock for the address -/// -/// This function releases the lock for the hash value derived -/// from the provided ptr. This function should only be called -/// after previously successfully acquiring a lock with -/// lock_address. -void unlock_address_host_space(void* ptr); - -} // namespace Impl - -} // namespace Kokkos - namespace Kokkos { /// \class HostSpace /// \brief Memory management for host memory. 
diff --git a/core/src/OpenMP/Kokkos_OpenMP_Instance.cpp b/core/src/OpenMP/Kokkos_OpenMP_Instance.cpp index e1434a4275..fbf44c0164 100644 --- a/core/src/OpenMP/Kokkos_OpenMP_Instance.cpp +++ b/core/src/OpenMP/Kokkos_OpenMP_Instance.cpp @@ -324,8 +324,6 @@ void OpenMPInternal::initialize(int thread_count) { std::cerr << " Requested: " << thread_count << " threads per process." << std::endl; } - // Init the array used for arbitrarily sized atomics - init_lock_array_host_space(); m_initialized = true; } diff --git a/core/src/Serial/Kokkos_Serial.cpp b/core/src/Serial/Kokkos_Serial.cpp index b5a1fcdd79..52e3d68bdf 100644 --- a/core/src/Serial/Kokkos_Serial.cpp +++ b/core/src/Serial/Kokkos_Serial.cpp @@ -42,9 +42,6 @@ void SerialInternal::initialize() { Impl::SharedAllocationRecord::tracking_enable(); - // Init the array of locks used for arbitrarily sized atomics - Impl::init_lock_array_host_space(); - m_is_initialized = true; } diff --git a/core/src/Threads/Kokkos_ThreadsExec.cpp b/core/src/Threads/Kokkos_ThreadsExec.cpp index 8f39c726c7..c754091e87 100644 --- a/core/src/Threads/Kokkos_ThreadsExec.cpp +++ b/core/src/Threads/Kokkos_ThreadsExec.cpp @@ -768,9 +768,6 @@ void ThreadsExec::initialize(int thread_count_arg) { << thread_count << " threads per process." 
<< std::endl; } - // Init the array for used for arbitrarily sized atomics - Impl::init_lock_array_host_space(); - Impl::SharedAllocationRecord::tracking_enable(); } diff --git a/core/src/impl/Kokkos_HostSpace.cpp b/core/src/impl/Kokkos_HostSpace.cpp index b47ce3beec..857340ae09 100644 --- a/core/src/impl/Kokkos_HostSpace.cpp +++ b/core/src/impl/Kokkos_HostSpace.cpp @@ -284,42 +284,6 @@ SharedAllocationRecord::SharedAllocationRecord( } // namespace Impl } // namespace Kokkos -/*--------------------------------------------------------------------------*/ -/*--------------------------------------------------------------------------*/ - -namespace Kokkos { -namespace { -const unsigned HOST_SPACE_ATOMIC_MASK = 0xFFFF; -const unsigned HOST_SPACE_ATOMIC_XOR_MASK = 0x5A39; -static int HOST_SPACE_ATOMIC_LOCKS[HOST_SPACE_ATOMIC_MASK + 1]; -} // namespace - -namespace Impl { -void init_lock_array_host_space() { - static int is_initialized = 0; - if (!is_initialized) - for (int i = 0; i < static_cast(HOST_SPACE_ATOMIC_MASK + 1); i++) - HOST_SPACE_ATOMIC_LOCKS[i] = 0; -} - -bool lock_address_host_space(void *ptr) { - return 0 == atomic_compare_exchange( - &HOST_SPACE_ATOMIC_LOCKS[((size_t(ptr) >> 2) & - HOST_SPACE_ATOMIC_MASK) ^ - HOST_SPACE_ATOMIC_XOR_MASK], - 0, 1); -} - -void unlock_address_host_space(void *ptr) { - atomic_exchange( - &HOST_SPACE_ATOMIC_LOCKS[((size_t(ptr) >> 2) & HOST_SPACE_ATOMIC_MASK) ^ - HOST_SPACE_ATOMIC_XOR_MASK], - 0); -} - -} // namespace Impl -} // namespace Kokkos - //============================================================================== // {{{1 From cba99e88d2c42512a3775efc49de9f33a4711cdd Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Sun, 29 Jan 2023 20:40:39 -0500 Subject: [PATCH 252/496] Remove misplaced and commented host lock array code in OpenMPTarget backend --- .../OpenMPTarget/Kokkos_OpenMPTargetSpace.cpp | 33 ------------------- .../OpenMPTarget/Kokkos_OpenMPTargetSpace.hpp | 31 ----------------- 2 files changed, 64 
deletions(-) diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.cpp b/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.cpp index de8e629831..81fbc56de0 100644 --- a/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.cpp +++ b/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.cpp @@ -165,39 +165,6 @@ SharedAllocationRecord:: } // namespace Impl } // namespace Kokkos -/*--------------------------------------------------------------------------*/ -/*--------------------------------------------------------------------------*/ -/* -namespace Kokkos { -namespace { - const unsigned HOST_SPACE_ATOMIC_MASK = 0xFFFF; - const unsigned HOST_SPACE_ATOMIC_XOR_MASK = 0x5A39; - static int HOST_SPACE_ATOMIC_LOCKS[HOST_SPACE_ATOMIC_MASK+1]; -} - -namespace Impl { -void init_lock_array_host_space() { - static int is_initialized = 0; - if(! is_initialized) - for(int i = 0; i < static_cast (HOST_SPACE_ATOMIC_MASK+1); i++) - HOST_SPACE_ATOMIC_LOCKS[i] = 0; -} - -bool lock_address_host_space(void* ptr) { - return 0 == atomic_compare_exchange( &HOST_SPACE_ATOMIC_LOCKS[ - (( size_t(ptr) >> 2 ) & HOST_SPACE_ATOMIC_MASK) ^ -HOST_SPACE_ATOMIC_XOR_MASK] , 0 , 1); -} - -void unlock_address_host_space(void* ptr) { - atomic_exchange( &HOST_SPACE_ATOMIC_LOCKS[ - (( size_t(ptr) >> 2 ) & HOST_SPACE_ATOMIC_MASK) ^ -HOST_SPACE_ATOMIC_XOR_MASK] , 0); -} - -} -}*/ - //============================================================================== // {{{1 diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.hpp b/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.hpp index ca015da379..e5b33d0982 100644 --- a/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.hpp +++ b/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.hpp @@ -35,37 +35,6 @@ static_assert(false, #include #include -/*--------------------------------------------------------------------------*/ - -namespace Kokkos { -namespace Impl { - -/// \brief Initialize lock array for arbitrary size atomics. 
-/// -/// Arbitrary atomics are implemented using a hash table of locks -/// where the hash value is derived from the address of the -/// object for which an atomic operation is performed. -/// This function initializes the locks to zero (unset). -// void init_lock_array_host_space(); - -/// \brief Acquire a lock for the address -/// -/// This function tries to acquire the lock for the hash value derived -/// from the provided ptr. If the lock is successfully acquired the -/// function returns true. Otherwise it returns false. -// bool lock_address_host_space(void* ptr); - -/// \brief Release lock for the address -/// -/// This function releases the lock for the hash value derived -/// from the provided ptr. This function should only be called -/// after previously successfully acquiring a lock with -/// lock_address. -// void unlock_address_host_space(void* ptr); - -} // namespace Impl -} // namespace Kokkos - namespace Kokkos { namespace Impl { From 70a0af5f816187a2e91ed1aab2067d3a5ce332b9 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Tue, 21 Feb 2023 10:41:47 -0500 Subject: [PATCH 253/496] Convert HIP ParallelReduce --- core/src/HIP/Kokkos_HIP_Parallel_MDRange.hpp | 100 +++----- core/src/HIP/Kokkos_HIP_Parallel_Range.hpp | 120 ++++------ core/src/HIP/Kokkos_HIP_Parallel_Team.hpp | 240 +++++-------------- core/src/Kokkos_Parallel_Reduce.hpp | 5 + 4 files changed, 143 insertions(+), 322 deletions(-) diff --git a/core/src/HIP/Kokkos_HIP_Parallel_MDRange.hpp b/core/src/HIP/Kokkos_HIP_Parallel_MDRange.hpp index 5f7919fbae..b4423a3aca 100644 --- a/core/src/HIP/Kokkos_HIP_Parallel_MDRange.hpp +++ b/core/src/HIP/Kokkos_HIP_Parallel_MDRange.hpp @@ -165,11 +165,13 @@ class ParallelFor, HIP> { }; // ParallelReduce -template -class ParallelReduce, ReducerType, - HIP> { +template +class ParallelReduce, HIP> { public: - using Policy = Kokkos::MDRangePolicy; + using Policy = Kokkos::MDRangePolicy; + using FunctorType = typename CombinedFunctorReducerType::functor_type; + 
using ReducerType = typename CombinedFunctorReducerType::reducer_type; private: using array_index_type = typename Policy::array_index_type; @@ -179,31 +181,18 @@ class ParallelReduce, ReducerType, using Member = typename Policy::member_type; using LaunchBounds = typename Policy::launch_bounds; - using ReducerConditional = - Kokkos::Impl::if_c::value, - FunctorType, ReducerType>; - using ReducerTypeFwd = typename ReducerConditional::type; - using WorkTagFwd = - typename Kokkos::Impl::if_c::value, - WorkTag, void>::type; - - using Analysis = - Kokkos::Impl::FunctorAnalysis; - public: - using pointer_type = typename Analysis::pointer_type; - using value_type = typename Analysis::value_type; - using reference_type = typename Analysis::reference_type; + using pointer_type = typename ReducerType::pointer_type; + using value_type = typename ReducerType::value_type; + using reference_type = typename ReducerType::reference_type; using functor_type = FunctorType; using size_type = HIP::size_type; // Algorithmic constraints: blockSize is a power of two AND blockDim.y == // blockDim.z == 1 - const FunctorType m_functor; + const CombinedFunctorReducerType m_functor_reducer; const Policy m_policy; // used for workrange and nwork - const ReducerType m_reducer; const pointer_type m_result_ptr; const bool m_result_ptr_device_accessible; size_type* m_scratch_space; @@ -214,21 +203,19 @@ class ParallelReduce, ReducerType, public: inline __device__ void exec_range(reference_type update) const { - DeviceIteratePattern(m_policy, m_functor, update).exec_range(); + DeviceIteratePattern(m_policy, m_functor_reducer.get_functor(), update) + .exec_range(); } inline __device__ void operator()() const { - typename Analysis::Reducer final_reducer( - ReducerConditional::select(m_functor, m_reducer)); + const ReducerType& reducer = m_functor_reducer.get_reducer(); - const integral_nonzero_constant - word_count(Analysis::value_size( - ReducerConditional::select(m_functor, m_reducer)) / - 
sizeof(size_type)); + const integral_nonzero_constant< + size_type, ReducerType::static_value_size() / sizeof(size_type)> + word_count(reducer.value_size() / sizeof(size_type)); { - reference_type value = final_reducer.init(reinterpret_cast( + reference_type value = reducer.init(reinterpret_cast( kokkos_impl_hip_shared_memory() + threadIdx.y * word_count.value)); @@ -244,7 +231,7 @@ class ParallelReduce, ReducerType, // Reduce with final value at blockDim.y - 1 location. // Problem: non power-of-two blockDim if (::Kokkos::Impl::hip_single_inter_block_reduce_scan( - final_reducer, blockIdx.x, gridDim.x, + reducer, blockIdx.x, gridDim.x, kokkos_impl_hip_shared_memory(), m_scratch_space, m_scratch_flags)) { // This is the final block with the final result at the final threads' @@ -256,7 +243,7 @@ class ParallelReduce, ReducerType, : m_scratch_space; if (threadIdx.y == 0) { - final_reducer.final(reinterpret_cast(shared)); + reducer.final(reinterpret_cast(shared)); } if (Impl::HIPTraits::WarpSize < word_count.value) { @@ -277,10 +264,9 @@ class ParallelReduce, ReducerType, return hip_single_inter_block_reduce_scan_shmem(f, n); }; - using closure_type = ParallelReduce; unsigned block_size = - Kokkos::Impl::hip_get_preferred_blocksize( + Kokkos::Impl::hip_get_preferred_blocksize( instance, shmem_functor); if (block_size == 0) { Kokkos::Impl::throw_runtime_exception( @@ -291,19 +277,17 @@ class ParallelReduce, ReducerType, } inline void execute() { - typename Analysis::Reducer final_reducer( - ReducerConditional::select(m_functor, m_reducer)); + ReducerType reducer = m_functor_reducer.get_reducer(); - using ClosureType = - ParallelReduce; const auto nwork = m_policy.m_num_tiles; if (nwork) { int block_size = m_policy.m_prod_tile_dims; // CONSTRAINT: Algorithm requires block_size >= product of tile dimensions // Nearest power of two - int exponent_pow_two = std::ceil(std::log2(block_size)); - block_size = std::pow(2, exponent_pow_two); - int suggested_blocksize = 
local_block_size(m_functor); + int exponent_pow_two = std::ceil(std::log2(block_size)); + block_size = std::pow(2, exponent_pow_two); + int suggested_blocksize = + local_block_size(m_functor_reducer.get_functor()); block_size = (block_size > suggested_blocksize) ? block_size @@ -311,8 +295,7 @@ class ParallelReduce, ReducerType, // than or equal to 512 m_scratch_space = hip_internal_scratch_space( - m_policy.space(), Analysis::value_size(ReducerConditional::select( - m_functor, m_reducer)) * + m_policy.space(), reducer.value_size() * block_size /* block_size == max block_count */); m_scratch_flags = hip_internal_scratch_flags(m_policy.space(), sizeof(size_type)); @@ -326,34 +309,31 @@ class ParallelReduce, ReducerType, const int shmem = ::Kokkos::Impl::hip_single_inter_block_reduce_scan_shmem< - false, FunctorType, WorkTag>(m_functor, block.y); + false, FunctorType, WorkTag>(m_functor_reducer.get_functor(), + block.y); - hip_parallel_launch( + hip_parallel_launch( *this, grid, block, shmem, m_policy.space().impl_internal_space_instance(), false); // copy to device and execute if (!m_result_ptr_device_accessible && m_result_ptr) { - const int size = Analysis::value_size( - ReducerConditional::select(m_functor, m_reducer)); + const int size = reducer.value_size(); DeepCopy(m_policy.space(), m_result_ptr, m_scratch_space, size); } } else { if (m_result_ptr) { - final_reducer.init(m_result_ptr); + reducer.init(m_result_ptr); } } } template - ParallelReduce( - const FunctorType& arg_functor, const Policy& arg_policy, - const ViewType& arg_result, - std::enable_if_t::value, void*> = nullptr) - : m_functor(arg_functor), + ParallelReduce(const CombinedFunctorReducerType& arg_functor_reducer, + const Policy& arg_policy, const ViewType& arg_result) + : m_functor_reducer(arg_functor_reducer), m_policy(arg_policy), - m_reducer(InvalidType()), m_result_ptr(arg_result.data()), m_result_ptr_device_accessible( MemorySpaceAccess, ReducerType, m_scratch_space(nullptr), 
m_scratch_flags(nullptr) {} - ParallelReduce(const FunctorType& arg_functor, const Policy& arg_policy, - const ReducerType& reducer) - : m_functor(arg_functor), - m_policy(arg_policy), - m_reducer(reducer), - m_result_ptr(reducer.view().data()), - m_result_ptr_device_accessible( - MemorySpaceAccess::accessible), - m_scratch_space(nullptr), - m_scratch_flags(nullptr) {} - template static int max_tile_size_product(const Policy&, const Functor&) { using closure_type = diff --git a/core/src/HIP/Kokkos_HIP_Parallel_Range.hpp b/core/src/HIP/Kokkos_HIP_Parallel_Range.hpp index e27b778c74..a646b9b6f5 100644 --- a/core/src/HIP/Kokkos_HIP_Parallel_Range.hpp +++ b/core/src/HIP/Kokkos_HIP_Parallel_Range.hpp @@ -101,11 +101,13 @@ class ParallelFor, Kokkos::HIP> { //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- -template -class ParallelReduce, ReducerType, +template +class ParallelReduce, Kokkos::HIP> { public: - using Policy = Kokkos::RangePolicy; + using Policy = Kokkos::RangePolicy; + using FunctorType = typename CombinedFunctorReducerType::functor_type; + using ReducerType = typename CombinedFunctorReducerType::reducer_type; private: using WorkRange = typename Policy::WorkRange; @@ -113,22 +115,10 @@ class ParallelReduce, ReducerType, using Member = typename Policy::member_type; using LaunchBounds = typename Policy::launch_bounds; - using ReducerConditional = - Kokkos::Impl::if_c::value, - FunctorType, ReducerType>; - using ReducerTypeFwd = typename ReducerConditional::type; - using WorkTagFwd = - typename Kokkos::Impl::if_c::value, - WorkTag, void>::type; - - using Analysis = - Kokkos::Impl::FunctorAnalysis; - public: - using pointer_type = typename Analysis::pointer_type; - using value_type = typename Analysis::value_type; - using reference_type = typename Analysis::reference_type; + using pointer_type = typename ReducerType::pointer_type; + using value_type = 
typename ReducerType::value_type; + using reference_type = typename ReducerType::reference_type; using functor_type = FunctorType; using size_type = Kokkos::HIP::size_type; using index_type = typename Policy::index_type; @@ -136,9 +126,8 @@ class ParallelReduce, ReducerType, // Algorithmic constraints: blockSize is a power of two AND blockDim.y == // blockDim.z == 1 - const FunctorType m_functor; + const CombinedFunctorReducerType m_functor_reducer; const Policy m_policy; - const ReducerType m_reducer; const pointer_type m_result_ptr; const bool m_result_ptr_device_accessible; const bool m_result_ptr_host_accessible; @@ -146,7 +135,7 @@ class ParallelReduce, ReducerType, size_type* m_scratch_flags = nullptr; static bool constexpr UseShflReduction = - static_cast(Analysis::StaticValueSize); + static_cast(ReducerType::static_value_size()); private: struct ShflReductionTag {}; @@ -156,13 +145,13 @@ class ParallelReduce, ReducerType, template __device__ inline std::enable_if_t::value> exec_range( const Member& i, reference_type update) const { - m_functor(i, update); + m_functor_reducer.get_functor()(i, update); } template __device__ inline std::enable_if_t::value> exec_range( const Member& i, reference_type update) const { - m_functor(TagType(), i, update); + m_functor_reducer.get_functor()(TagType(), i, update); } public: @@ -173,16 +162,13 @@ class ParallelReduce, ReducerType, } __device__ inline void run(SHMEMReductionTag) const { - const integral_nonzero_constant - word_count(Analysis::value_size( - ReducerConditional::select(m_functor, m_reducer)) / - sizeof(size_type)); - - typename Analysis::Reducer final_reducer( - ReducerConditional::select(m_functor, m_reducer)); + const ReducerType& reducer = m_functor_reducer.get_reducer(); + const integral_nonzero_constant< + size_type, ReducerType::static_value_size() / sizeof(size_type)> + word_count(reducer.value_size() / sizeof(size_type)); + { - reference_type value = final_reducer.init(reinterpret_cast( + 
reference_type value = reducer.init(reinterpret_cast( ::Kokkos::kokkos_impl_hip_shared_memory() + threadIdx.y * word_count.value)); @@ -205,7 +191,7 @@ class ParallelReduce, ReducerType, bool do_final_reduction = m_policy.begin() == m_policy.end(); if (!do_final_reduction) do_final_reduction = hip_single_inter_block_reduce_scan( - final_reducer, blockIdx.x, gridDim.x, + reducer, blockIdx.x, gridDim.x, ::Kokkos::kokkos_impl_hip_shared_memory(), m_scratch_space, m_scratch_flags); if (do_final_reduction) { @@ -220,7 +206,7 @@ class ParallelReduce, ReducerType, : m_scratch_space; if (threadIdx.y == 0) { - final_reducer.final(reinterpret_cast(shared)); + reducer.final(reinterpret_cast(shared)); } if (::Kokkos::Impl::HIPTraits::WarpSize < word_count.value) { @@ -234,11 +220,10 @@ class ParallelReduce, ReducerType, } __device__ inline void run(ShflReductionTag) const { - typename Analysis::Reducer final_reducer( - ReducerConditional::select(m_functor, m_reducer)); + const ReducerType& reducer = m_functor_reducer.get_reducer(); value_type value; - final_reducer.init(&value); + reducer.init(&value); // Number of blocks is bounded so that the reduction can be limited to two // passes. Each thread block is given an approximately equal amount of work // to perform. Accumulate the values for this block. The accumulation @@ -262,18 +247,18 @@ class ParallelReduce, ReducerType, (max_active_thread == 0) ? blockDim.y : max_active_thread; value_type init; - final_reducer.init(&init); + reducer.init(&init); if (m_policy.begin() == m_policy.end()) { - final_reducer.final(&value); + reducer.final(&value); pointer_type const final_result = m_result_ptr_device_accessible ? 
m_result_ptr : result; *final_result = value; } else if (Impl::hip_inter_block_shuffle_reduction<>( - value, init, final_reducer, m_scratch_space, result, + value, init, reducer, m_scratch_space, result, m_scratch_flags, max_active_thread)) { unsigned int const id = threadIdx.y * blockDim.x + threadIdx.x; if (id == 0) { - final_reducer.final(&value); + reducer.final(&value); pointer_type const final_result = m_result_ptr_device_accessible ? m_result_ptr : result; *final_result = value; @@ -288,23 +273,21 @@ class ParallelReduce, ReducerType, return hip_single_inter_block_reduce_scan_shmem(f, n); }; - using DriverType = - ParallelReduce; - return Kokkos::Impl::hip_get_preferred_blocksize( + return Kokkos::Impl::hip_get_preferred_blocksize( instance, shmem_functor); } inline void execute() { - typename Analysis::Reducer final_reducer( - ReducerConditional::select(m_functor, m_reducer)); + const ReducerType& reducer = m_functor_reducer.get_reducer(); const index_type nwork = m_policy.end() - m_policy.begin(); - const bool need_device_set = Analysis::has_init_member_function || - Analysis::has_final_member_function || + const bool need_device_set = ReducerType::has_init_member_function() || + ReducerType::has_final_member_function() || !m_result_ptr_host_accessible || !std::is_same::value; if ((nwork > 0) || need_device_set) { - const int block_size = local_block_size(m_functor); + const int block_size = local_block_size(m_functor_reducer.get_functor()); if (block_size == 0) { Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::Impl::ParallelReduce< HIP > could not find a " @@ -312,8 +295,7 @@ class ParallelReduce, ReducerType, } m_scratch_space = ::Kokkos::Impl::hip_internal_scratch_space( - m_policy.space(), Analysis::value_size(ReducerConditional::select( - m_functor, m_reducer)) * + m_policy.space(), reducer.value_size() * block_size /* block_size == max block_count */); m_scratch_flags = ::Kokkos::Impl::hip_internal_scratch_flags( m_policy.space(), 
sizeof(size_type)); @@ -333,37 +315,31 @@ class ParallelReduce, ReducerType, UseShflReduction ? 0 : hip_single_inter_block_reduce_scan_shmem(m_functor, - block.y); + WorkTag>( + m_functor_reducer.get_functor(), block.y); - using DriverType = - ParallelReduce; - Kokkos::Impl::hip_parallel_launch( + Kokkos::Impl::hip_parallel_launch( *this, grid, block, shmem, m_policy.space().impl_internal_space_instance(), false); // copy to device and execute if (!m_result_ptr_device_accessible && m_result_ptr) { - const int size = Analysis::value_size( - ReducerConditional::select(m_functor, m_reducer)); + const int size = reducer.value_size(); DeepCopy(m_policy.space(), m_result_ptr, m_scratch_space, size); } } else { if (m_result_ptr) { - final_reducer.init(m_result_ptr); + reducer.init(m_result_ptr); } } } template - ParallelReduce( - const FunctorType& arg_functor, const Policy& arg_policy, - const ViewType& arg_result, - std::enable_if_t::value, void*> = nullptr) - : m_functor(arg_functor), + ParallelReduce(const CombinedFunctorReducerType& arg_functor_reducer, + const Policy& arg_policy, const ViewType& arg_result) + : m_functor_reducer(arg_functor_reducer), m_policy(arg_policy), - m_reducer(InvalidType()), m_result_ptr(arg_result.data()), m_result_ptr_device_accessible( MemorySpaceAccess, ReducerType, m_result_ptr_host_accessible( MemorySpaceAccess::accessible) {} - - ParallelReduce(const FunctorType& arg_functor, const Policy& arg_policy, - const ReducerType& reducer) - : m_functor(arg_functor), - m_policy(arg_policy), - m_reducer(reducer), - m_result_ptr(reducer.view().data()), - m_result_ptr_device_accessible( - MemorySpaceAccess::accessible), - m_result_ptr_host_accessible( - MemorySpaceAccess::accessible) {} }; template diff --git a/core/src/HIP/Kokkos_HIP_Parallel_Team.hpp b/core/src/HIP/Kokkos_HIP_Parallel_Team.hpp index 13772dc903..9f725d28d2 100644 --- a/core/src/HIP/Kokkos_HIP_Parallel_Team.hpp +++ b/core/src/HIP/Kokkos_HIP_Parallel_Team.hpp @@ -83,12 +83,10 @@ 
class TeamPolicyInternal using functor_analysis_type = Impl::FunctorAnalysis; - using reducer_type = typename Impl::ParallelReduceReturnValue< - void, typename functor_analysis_type::value_type, - FunctorType>::reducer_type; - using closure_type = - Impl::ParallelReduce, - reducer_type>; + using closure_type = Impl::ParallelReduce< + CombinedFunctorReducer, + TeamPolicy, Kokkos::HIP>; return internal_team_size_common(f); } @@ -96,8 +94,8 @@ class TeamPolicyInternal inline int team_size_max(const FunctorType& f, const ReducerType&, const ParallelReduceTag&) const { using closure_type = - Impl::ParallelReduce, - ReducerType>; + Impl::ParallelReduce, + TeamPolicy, Kokkos::HIP>; return internal_team_size_common(f); } @@ -115,12 +113,10 @@ class TeamPolicyInternal using functor_analysis_type = Impl::FunctorAnalysis; - using reducer_type = typename Impl::ParallelReduceReturnValue< - void, typename functor_analysis_type::value_type, - FunctorType>::reducer_type; - using closure_type = - Impl::ParallelReduce, - reducer_type>; + using closure_type = Impl::ParallelReduce< + CombinedFunctorReducer, + TeamPolicy, Kokkos::HIP>; return internal_team_size_common(f); } @@ -128,8 +124,8 @@ class TeamPolicyInternal int team_size_recommended(FunctorType const& f, ReducerType const&, ParallelReduceTag const&) const { using closure_type = - Impl::ParallelReduce, - ReducerType>; + Impl::ParallelReduce, + TeamPolicy, Kokkos::HIP>; return internal_team_size_common(f); } @@ -561,37 +557,28 @@ class ParallelFor, HIP> { //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- -template -class ParallelReduce, - ReducerType, HIP> { +template +class ParallelReduce, HIP> { public: - using Policy = TeamPolicyInternal; + using Policy = TeamPolicyInternal; + using FunctorType = typename CombinedFunctorReducerType::functor_type; + using ReducerType = typename CombinedFunctorReducerType::reducer_type; 
private: using member_type = typename Policy::member_type; using work_tag = typename Policy::work_tag; using launch_bounds = typename Policy::launch_bounds; - using reducer_conditional = - Kokkos::Impl::if_c::value, - FunctorType, ReducerType>; - using reducer_type_fwd = typename reducer_conditional::type; - using work_tag_fwd = - typename Kokkos::Impl::if_c::value, - work_tag, void>::type; - - using analysis = Impl::FunctorAnalysis; - - using pointer_type = typename analysis::pointer_type; - using reference_type = typename analysis::reference_type; - using value_type = typename analysis::value_type; + using pointer_type = typename ReducerType::pointer_type; + using reference_type = typename ReducerType::reference_type; + using value_type = typename ReducerType::value_type; public: - using functor_type = FunctorType; - using size_type = HIP::size_type; + using size_type = HIP::size_type; - static int constexpr UseShflReduction = (analysis::StaticValueSize != 0); + static int constexpr UseShflReduction = + (ReducerType::static_value_size() != 0); private: struct ShflReductionTag {}; @@ -605,9 +592,8 @@ class ParallelReduce, // [ team shared space ] // - const FunctorType m_functor; + const CombinedFunctorReducerType m_functor_reducer; const Policy m_policy; - const ReducerType m_reducer; const pointer_type m_result_ptr; const bool m_result_ptr_device_accessible; const bool m_result_ptr_host_accessible; @@ -628,13 +614,13 @@ class ParallelReduce, template __device__ inline std::enable_if_t::value> exec_team( member_type const& member, reference_type update) const { - m_functor(member, update); + m_functor_reducer.get_functor()(member, update); } template __device__ inline std::enable_if_t::value> exec_team( member_type const& member, reference_type update) const { - m_functor(TagType(), member, update); + m_functor_reducer.get_functor()(TagType(), member, update); } __device__ inline void iterate_through_league(int const threadid, @@ -673,18 +659,15 @@ class 
ParallelReduce, } __device__ inline void run(SHMEMReductionTag, int const threadid) const { - typename analysis::Reducer final_reducer( - reducer_conditional::select(m_functor, m_reducer)); + const ReducerType& reducer = m_functor_reducer.get_reducer(); - integral_nonzero_constant const - word_count(analysis::value_size( - reducer_conditional::select(m_functor, m_reducer)) / - sizeof(size_type)); + word_count(reducer.value_size() / sizeof(size_type)); reference_type value = - final_reducer.init(kokkos_impl_hip_shared_memory() + - threadIdx.y * word_count.value); + reducer.init(kokkos_impl_hip_shared_memory() + + threadIdx.y * word_count.value); // Iterate this block through the league iterate_through_league(threadid, value); @@ -694,9 +677,9 @@ class ParallelReduce, if (!do_final_reduce) do_final_reduce = hip_single_inter_block_reduce_scan( - reducer_conditional::select(m_functor, m_reducer), blockIdx.x, - gridDim.x, kokkos_impl_hip_shared_memory(), - m_scratch_space, m_scratch_flags); + reducer, blockIdx.x, gridDim.x, + kokkos_impl_hip_shared_memory(), m_scratch_space, + m_scratch_flags); if (do_final_reduce) { // This is the final block with the final result at the final threads' // location @@ -708,7 +691,7 @@ class ParallelReduce, : m_scratch_space; if (threadIdx.y == 0) { - final_reducer.final(reinterpret_cast(shared)); + reducer.final(reinterpret_cast(shared)); } if (HIPTraits::WarpSize < word_count.value) { @@ -722,11 +705,10 @@ class ParallelReduce, } __device__ inline void run(ShflReductionTag, int const threadid) const { - typename analysis::Reducer final_reducer( - reducer_conditional::select(m_functor, m_reducer)); + const ReducerType& reducer = m_functor_reducer.get_reducer(); value_type value; - final_reducer.init(&value); + reducer.init(&value); // Iterate this block through the league iterate_through_league(threadid, value); @@ -737,28 +719,27 @@ class ParallelReduce, : reinterpret_cast(m_scratch_space); value_type init; - final_reducer.init(&init); 
+ reducer.init(&init); if (m_league_size == 0) { - final_reducer.final(&value); + reducer.final(&value); *result = value; } else if (Impl::hip_inter_block_shuffle_reduction( - value, init, final_reducer, m_scratch_space, result, + value, init, reducer, m_scratch_space, result, m_scratch_flags, blockDim.y)) { unsigned int const id = threadIdx.y * blockDim.x + threadIdx.x; if (id == 0) { - final_reducer.final(&value); + reducer.final(&value); *result = value; } } } inline void execute() { - typename analysis::Reducer final_reducer( - reducer_conditional::select(m_functor, m_reducer)); + const ReducerType& reducer = m_functor_reducer.get_reducer(); const bool is_empty_range = m_league_size == 0 || m_team_size == 0; - const bool need_device_set = analysis::has_init_member_function || - analysis::has_final_member_function || + const bool need_device_set = ReducerType::has_init_member_function() || + ReducerType::has_final_member_function() || !m_result_ptr_host_accessible || !std::is_same::value; if (!is_empty_range || need_device_set) { @@ -768,9 +749,7 @@ class ParallelReduce, : std::min(static_cast(m_league_size), m_team_size); m_scratch_space = hip_internal_scratch_space( - m_policy.space(), analysis::value_size(reducer_conditional::select( - m_functor, m_reducer)) * - block_count); + m_policy.space(), reducer.value_size() * block_count); m_scratch_flags = hip_internal_scratch_flags(m_policy.space(), sizeof(size_type)); @@ -782,10 +761,7 @@ class ParallelReduce, } const int shmem_size_total = m_team_begin + m_shmem_begin + m_shmem_size; - using closure_type = - ParallelReduce, - ReducerType, HIP>; - Impl::hip_parallel_launch( + Impl::hip_parallel_launch( *this, grid, block, shmem_size_total, m_policy.space().impl_internal_space_instance(), true); // copy to device and execute @@ -794,26 +770,22 @@ class ParallelReduce, m_policy.space().impl_internal_space_instance()->fence(); if (m_result_ptr) { - const int size = analysis::value_size( - 
reducer_conditional::select(m_functor, m_reducer)); + const int size = reducer.value_size(); DeepCopy(m_result_ptr, m_scratch_space, size); } } } else { if (m_result_ptr) { - final_reducer.init(m_result_ptr); + reducer.init(m_result_ptr); } } } template - ParallelReduce( - FunctorType const& arg_functor, Policy const& arg_policy, - ViewType const& arg_result, - std::enable_if_t::value, void*> = nullptr) - : m_functor(arg_functor), + ParallelReduce(CombinedFunctorReducerType const& arg_functor_reducer, + Policy const& arg_policy, ViewType const& arg_result) + : m_functor_reducer(arg_functor_reducer), m_policy(arg_policy), - m_reducer(InvalidType()), m_result_ptr(arg_result.data()), m_result_ptr_device_accessible( MemorySpaceAccess, m_policy.space().impl_internal_space_instance(); m_team_size = m_team_size >= 0 ? m_team_size : arg_policy.team_size_recommended( - arg_functor, ParallelReduceTag()); + arg_functor_reducer.get_functor(), + arg_functor_reducer.get_reducer(), + ParallelReduceTag()); m_team_begin = UseShflReduction ? 
0 : hip_single_inter_block_reduce_scan_shmem(arg_functor, - m_team_size); + work_tag>( + arg_functor_reducer.get_functor(), m_team_size); m_shmem_begin = sizeof(double) * (m_team_size + 2); - m_shmem_size = - m_policy.scratch_size(0, m_team_size) + - FunctorTeamShmemSize::value(arg_functor, m_team_size); + m_shmem_size = m_policy.scratch_size(0, m_team_size) + + FunctorTeamShmemSize::value( + arg_functor_reducer.get_functor(), m_team_size); m_scratch_size[0] = m_shmem_size; m_scratch_size[1] = m_policy.scratch_size(1, m_team_size); m_scratch_locks = internal_space_instance->m_scratch_locks; @@ -894,97 +868,9 @@ class ParallelReduce, "L0 scratch memory")); } - size_t max_size = - arg_policy.team_size_max(arg_functor, ParallelReduceTag()); - if (static_cast(m_team_size) > static_cast(max_size)) { - Kokkos::Impl::throw_runtime_exception( - std::string("Kokkos::Impl::ParallelReduce< HIP > requested too " - "large team size.")); - } - } - - ParallelReduce(FunctorType const& arg_functor, Policy const& arg_policy, - ReducerType const& reducer) - : m_functor(arg_functor), - m_policy(arg_policy), - m_reducer(reducer), - m_result_ptr(reducer.view().data()), - m_result_ptr_device_accessible( - MemorySpaceAccess::accessible), - m_result_ptr_host_accessible( - MemorySpaceAccess::accessible), - m_scratch_space(nullptr), - m_scratch_flags(nullptr), - m_team_begin(0), - m_shmem_begin(0), - m_shmem_size(0), - m_scratch_ptr{nullptr, nullptr}, - m_league_size(arg_policy.league_size()), - m_team_size(arg_policy.team_size()), - m_vector_size(arg_policy.impl_vector_length()) { - auto internal_space_instance = - m_policy.space().impl_internal_space_instance(); - m_team_size = m_team_size >= 0 - ? m_team_size - : arg_policy.team_size_recommended(arg_functor, reducer, - ParallelReduceTag()); - m_team_begin = - UseShflReduction - ? 
0 - : hip_single_inter_block_reduce_scan_shmem(arg_functor, - m_team_size); - m_shmem_begin = sizeof(double) * (m_team_size + 2); - m_shmem_size = - m_policy.scratch_size(0, m_team_size) + - FunctorTeamShmemSize::value(arg_functor, m_team_size); - m_scratch_size[0] = m_shmem_size; - m_scratch_size[1] = m_policy.scratch_size(1, m_team_size); - m_scratch_locks = internal_space_instance->m_scratch_locks; - m_num_scratch_locks = internal_space_instance->m_num_scratch_locks; - if (m_team_size <= 0) { - m_scratch_ptr[1] = nullptr; - } else { - m_scratch_pool_id = internal_space_instance->acquire_team_scratch_space(); - m_scratch_ptr[1] = internal_space_instance->resize_team_scratch_space( - m_scratch_pool_id, - static_cast(m_scratch_size[1]) * - (std::min( - static_cast(HIP().concurrency() / - (m_team_size * m_vector_size)), - static_cast(m_league_size)))); - } - - // The global parallel_reduce does not support vector_length other than 1 at - // the moment - if ((arg_policy.impl_vector_length() > 1) && !UseShflReduction) - Impl::throw_runtime_exception( - "Kokkos::parallel_reduce with a TeamPolicy using a vector length of " - "greater than 1 is not currently supported for HIP for dynamic " - "sized reduction types."); - - if ((m_team_size < HIPTraits::WarpSize) && !UseShflReduction) - Impl::throw_runtime_exception( - "Kokkos::parallel_reduce with a TeamPolicy using a team_size smaller " - "than 64 is not currently supported with HIP for dynamic sized " - "reduction types."); - - // Functor's reduce memory, team scan memory, and team shared memory depend - // upon team size. 
- - const int shmem_size_total = m_team_begin + m_shmem_begin + m_shmem_size; - if ((!Kokkos::Impl::is_integral_power_of_two(m_team_size) && - !UseShflReduction) || - internal_space_instance->m_maxShmemPerBlock < shmem_size_total) { - Kokkos::Impl::throw_runtime_exception( - std::string("Kokkos::Impl::ParallelReduce< HIP > bad team size")); - } - - size_t max_size = - arg_policy.team_size_max(arg_functor, reducer, ParallelReduceTag()); + size_t max_size = arg_policy.team_size_max( + arg_functor_reducer.get_functor(), arg_functor_reducer.get_reducer(), + ParallelReduceTag()); if (static_cast(m_team_size) > static_cast(max_size)) { Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::Impl::ParallelReduce< HIP > requested too " diff --git a/core/src/Kokkos_Parallel_Reduce.hpp b/core/src/Kokkos_Parallel_Reduce.hpp index 3e48380a53..90b367fec8 100644 --- a/core/src/Kokkos_Parallel_Reduce.hpp +++ b/core/src/Kokkos_Parallel_Reduce.hpp @@ -1432,6 +1432,11 @@ template <> struct implements_new_reduce_interface : std::true_type {}; #endif +#ifdef KOKKOS_ENABLE_HIP +template <> +struct implements_new_reduce_interface : std::true_type {}; +#endif + template class ParallelReduceWrapper { From 4f871bea833a20597a3497297f8eb954e7d0f4ff Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Thu, 23 Feb 2023 13:44:51 -0500 Subject: [PATCH 254/496] Convert HIP ParallelScan --- core/src/HIP/Kokkos_HIP_Parallel_Range.hpp | 34 +++++++++++++--------- 1 file changed, 21 insertions(+), 13 deletions(-) diff --git a/core/src/HIP/Kokkos_HIP_Parallel_Range.hpp b/core/src/HIP/Kokkos_HIP_Parallel_Range.hpp index a646b9b6f5..757250ad4d 100644 --- a/core/src/HIP/Kokkos_HIP_Parallel_Range.hpp +++ b/core/src/HIP/Kokkos_HIP_Parallel_Range.hpp @@ -393,7 +393,8 @@ class ParallelScanHIPBase { // (c) gridDim.x <= blockDim.y * blockDim.y // (d) gridDim.y == gridDim.z == 1 - const FunctorType m_functor; + const CombinedFunctorReducer + m_functor_reducer; const Policy m_policy; const pointer_type 
m_result_ptr; const bool m_result_ptr_device_accessible; @@ -406,23 +407,24 @@ class ParallelScanHIPBase { template __device__ inline std::enable_if_t::value> exec_range( const Member& i, reference_type update, const bool final_result) const { - m_functor(i, update, final_result); + m_functor_reducer.get_functor()(i, update, final_result); } template __device__ inline std::enable_if_t::value> exec_range( const Member& i, reference_type update, const bool final_result) const { - m_functor(TagType(), i, update, final_result); + m_functor_reducer.get_functor()(TagType(), i, update, final_result); } //---------------------------------------- __device__ inline void initial() const { - typename Analysis::Reducer final_reducer(m_functor); + const typename Analysis::Reducer& final_reducer = + m_functor_reducer.get_reducer(); const integral_nonzero_constant - word_count(Analysis::value_size(m_functor) / sizeof(word_size_type)); + word_count(final_reducer.value_size() / sizeof(word_size_type)); pointer_type const shared_value = reinterpret_cast( kokkos_impl_hip_shared_memory() + @@ -456,11 +458,12 @@ class ParallelScanHIPBase { //---------------------------------------- __device__ inline void final() const { - typename Analysis::Reducer final_reducer(m_functor); + const typename Analysis::Reducer& final_reducer = + m_functor_reducer.get_reducer(); const integral_nonzero_constant - word_count(Analysis::value_size(m_functor) / sizeof(word_size_type)); + word_count(final_reducer.value_size() / sizeof(word_size_type)); // Use shared memory as an exclusive scan: { 0 , value[0] , value[1] , // value[2] , ... 
} @@ -571,15 +574,17 @@ class ParallelScanHIPBase { // How many block are really needed for this much work: m_grid_x = (nwork + work_per_block - 1) / work_per_block; + const typename Analysis::Reducer& final_reducer = + m_functor_reducer.get_reducer(); m_scratch_space = reinterpret_cast(Impl::hip_internal_scratch_space( - m_policy.space(), Analysis::value_size(m_functor) * m_grid_x)); + m_policy.space(), final_reducer.value_size() * m_grid_x)); m_scratch_flags = Impl::hip_internal_scratch_flags(m_policy.space(), sizeof(size_type) * 1); dim3 grid(m_grid_x, 1, 1); dim3 block(1, block_size, 1); // REQUIRED DIMENSIONS ( 1 , N , 1 ) - const int shmem = Analysis::value_size(m_functor) * (block_size + 2); + const int shmem = final_reducer.value_size() * (block_size + 2); m_final = false; // these ones are OK to be just the base because the specializations @@ -601,7 +606,7 @@ class ParallelScanHIPBase { ParallelScanHIPBase(const FunctorType& arg_functor, const Policy& arg_policy, pointer_type arg_result_ptr, bool arg_result_ptr_device_accessible) - : m_functor(arg_functor), + : m_functor_reducer(arg_functor, typename Analysis::Reducer{arg_functor}), m_policy(arg_policy), m_result_ptr(arg_result_ptr), m_result_ptr_device_accessible(arg_result_ptr_device_accessible) {} @@ -615,7 +620,8 @@ class ParallelScan, HIP> using Base::operator(); inline void execute() { - const int block_size = static_cast(local_block_size(Base::m_functor)); + const int block_size = static_cast( + local_block_size(Base::m_functor_reducer.get_functor())); if (block_size == 0) { Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::Impl::ParallelScan< HIP > could not find a " @@ -658,7 +664,8 @@ class ParallelScanWithTotal, using Base::operator(); inline void execute() { - const int block_size = static_cast(local_block_size(Base::m_functor)); + const int block_size = static_cast( + local_block_size(Base::m_functor_reducer.get_functor())); if (block_size == 0) { 
Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::Impl::ParallelScan< HIP > could not find a " @@ -669,7 +676,8 @@ class ParallelScanWithTotal, const auto nwork = Base::m_policy.end() - Base::m_policy.begin(); if (nwork && !Base::m_result_ptr_device_accessible) { - const int size = Base::Analysis::value_size(Base::m_functor); + const int size = + Base::Analysis::value_size(Base::m_functor_reducer.get_functor()); DeepCopy( Base::m_policy.space(), Base::m_result_ptr, Base::m_scratch_space + (Base::m_grid_x - 1) * size / From 6a3b1d60f5eb947103812635ab92a979a92f61f2 Mon Sep 17 00:00:00 2001 From: Phil Miller Date: Mon, 27 Feb 2023 18:01:24 -0700 Subject: [PATCH 255/496] algorithms: Remove workaround for Intel older than the required 19.0.5 Tested on kokkos-dev-2 with sems-intel/19.0.5 RelWithDebInfo --- algorithms/src/std_algorithms/impl/Kokkos_Reverse.hpp | 9 --------- algorithms/src/std_algorithms/impl/Kokkos_SwapRanges.hpp | 9 --------- 2 files changed, 18 deletions(-) diff --git a/algorithms/src/std_algorithms/impl/Kokkos_Reverse.hpp b/algorithms/src/std_algorithms/impl/Kokkos_Reverse.hpp index a4aaba26b9..7c75899cb8 100644 --- a/algorithms/src/std_algorithms/impl/Kokkos_Reverse.hpp +++ b/algorithms/src/std_algorithms/impl/Kokkos_Reverse.hpp @@ -39,16 +39,7 @@ struct StdReverseFunctor { KOKKOS_FUNCTION void operator()(index_type i) const { - // the swap below is doing the same thing, but - // for Intel 18.0.5 does not work. - // But putting the impl directly here, it works. 
-#ifdef KOKKOS_COMPILER_INTEL - typename InputIterator::value_type tmp = std::move(m_first[i]); - m_first[i] = std::move(m_last[-i - 1]); - m_last[-i - 1] = std::move(tmp); -#else ::Kokkos::Experimental::swap(m_first[i], m_last[-i - 1]); -#endif } StdReverseFunctor(InputIterator first, InputIterator last) diff --git a/algorithms/src/std_algorithms/impl/Kokkos_SwapRanges.hpp b/algorithms/src/std_algorithms/impl/Kokkos_SwapRanges.hpp index 438acb989f..a5e4786d04 100644 --- a/algorithms/src/std_algorithms/impl/Kokkos_SwapRanges.hpp +++ b/algorithms/src/std_algorithms/impl/Kokkos_SwapRanges.hpp @@ -35,16 +35,7 @@ struct StdSwapRangesFunctor { KOKKOS_FUNCTION void operator()(IndexType i) const { - // the swap below is doing the same thing, but - // for Intel 18.0.5 does not work. - // But putting the impl directly here, it works. -#ifdef KOKKOS_COMPILER_INTEL - typename IteratorType1::value_type tmp = std::move(m_first1[i]); - m_first1[i] = std::move(m_first2[i]); - m_first2[i] = std::move(tmp); -#else ::Kokkos::Experimental::swap(m_first1[i], m_first2[i]); -#endif } KOKKOS_FUNCTION From 54e4396dce03551829e149ad5f029b5c8eeaa65c Mon Sep 17 00:00:00 2001 From: Phil Miller Date: Mon, 27 Feb 2023 18:01:24 -0700 Subject: [PATCH 256/496] containers: Remove workaround for Intel older than the required 19.0.5 and GCC < 5 Tested on kokkos-dev-2 with sems-intel/19.0.5 RelWithDebInfo --- containers/unit_tests/TestVector.hpp | 36 +--------------------------- 1 file changed, 1 insertion(+), 35 deletions(-) diff --git a/containers/unit_tests/TestVector.hpp b/containers/unit_tests/TestVector.hpp index fa59607484..a22066f753 100644 --- a/containers/unit_tests/TestVector.hpp +++ b/containers/unit_tests/TestVector.hpp @@ -49,61 +49,27 @@ struct test_vector_insert { it = a.begin(); it += 17; -// Looks like some std::vector implementations do not have the restriction -// right on the overload taking three iterators, and thus the following call -// will hit that overload and then fail to 
compile. -#if defined(KOKKOS_COMPILER_INTEL) -// And at least GCC 4.8.4 doesn't implement vector insert correct for C++11 -// Return type is void ... -#if (__GNUC__ < 5) - a.insert(it, typename Vector::size_type(n + 5), scalar_type(5)); - it_return = a.begin() + 17; -#else - it_return = a.insert(it, typename Vector::size_type(n + 5), scalar_type(5)); -#endif -#else -#if (__GNUC__ < 5) - a.insert(it, n + 5, scalar_type(5)); - it_return = a.begin() + 17; -#else it_return = a.insert(it, n + 5, scalar_type(5)); -#endif -#endif ASSERT_EQ(a.size(), n + 1 + n + 5); ASSERT_EQ(std::distance(it_return, a.begin() + 17), 0u); Vector b; -// Looks like some std::vector implementations do not have the restriction -// right on the overload taking three iterators, and thus the following call -// will hit that overload and then fail to compile. -#if defined(KOKKOS_COMPILER_INTEL) - b.insert(b.begin(), typename Vector::size_type(7), 9); -#else b.insert(b.begin(), 7, 9); -#endif ASSERT_EQ(b.size(), 7u); ASSERT_EQ(b[0], scalar_type(9)); it = a.begin(); it += 27 + n; -#if (__GNUC__ < 5) - a.insert(it, b.begin(), b.end()); - it_return = a.begin() + (27 + n); -#else it_return = a.insert(it, b.begin(), b.end()); -#endif + ASSERT_EQ(a.size(), n + 1 + n + 5 + 7); ASSERT_EQ(std::distance(it_return, a.begin() + 27 + n), 0u); // Testing insert at end via all three function interfaces a.insert(a.end(), 11); -#if defined(KOKKOS_COMPILER_INTEL) - a.insert(a.end(), typename Vector::size_type(2), 12); -#else a.insert(a.end(), 2, 12); -#endif a.insert(a.end(), b.begin(), b.end()); } From 33e5ef694ef4cd11eae1f4dd931d5d56cd84c3e3 Mon Sep 17 00:00:00 2001 From: Phil Miller Date: Mon, 27 Feb 2023 18:40:15 -0700 Subject: [PATCH 257/496] Revert "Fix intel hang" Tested Intel 19.0.5 with OpenMP (4 threads) and Serial on kokkos-dev-2 in a RelWithDebInfo build, and did not observe a hang. So, removing this workaround for older compiler versions that we no longer support. 
This reverts commit 6185021cc3f5a864b4db6f9fc2c68cfcfc04fbad. --- core/unit_test/TestMDRange.hpp | 118 --------------------------------- 1 file changed, 118 deletions(-) diff --git a/core/unit_test/TestMDRange.hpp b/core/unit_test/TestMDRange.hpp index 3e80e7a01b..e48319dbd7 100644 --- a/core/unit_test/TestMDRange.hpp +++ b/core/unit_test/TestMDRange.hpp @@ -2708,18 +2708,11 @@ struct TestMDRange_6D { const int N3, const int N4, const int N5) { #if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) { -#if defined(KOKKOS_COMPILER_INTEL) - // Launchbounds causes hang with intel compilers - using range_type = - typename Kokkos::MDRangePolicy, - Kokkos::IndexType>; -#else // Launchbounds to ensure the tile fits into a CUDA block under register // constraints using range_type = typename Kokkos::MDRangePolicy< ExecSpace, Kokkos::LaunchBounds<128, 1>, Kokkos::Rank<6>, Kokkos::IndexType>; -#endif using tile_type = typename range_type::tile_type; using point_type = typename range_type::point_type; @@ -2738,18 +2731,11 @@ struct TestMDRange_6D { #endif { -#if defined(KOKKOS_COMPILER_INTEL) - // Launchbounds causes hang with intel compilers - using range_type = - typename Kokkos::MDRangePolicy, - Kokkos::IndexType>; -#else // Launchbounds to ensure the tile fits into a CUDA block under register // constraints using range_type = typename Kokkos::MDRangePolicy< ExecSpace, Kokkos::LaunchBounds<512, 1>, Kokkos::Rank<6>, Kokkos::IndexType>; -#endif using tile_type = typename range_type::tile_type; using point_type = typename range_type::point_type; @@ -2782,18 +2768,11 @@ struct TestMDRange_6D { // Test with reducers - scalar { -#if defined(KOKKOS_COMPILER_INTEL) - // Launchbounds causes hang with intel compilers - using range_type = - typename Kokkos::MDRangePolicy, - Kokkos::IndexType>; -#else // Launchbounds to ensure the tile fits into a CUDA block under register // constraints using range_type = typename Kokkos::MDRangePolicy< ExecSpace, Kokkos::LaunchBounds<512, 1>, 
Kokkos::Rank<6>, Kokkos::IndexType>; -#endif #ifdef KOKKOS_ENABLE_SYCL range_type range({{0, 0, 0, 0, 0, 0}}, {{N0, N1, N2, N3, N4, N5}}, {{3, 3, 3, 2, 2, 2}}); @@ -2816,18 +2795,11 @@ struct TestMDRange_6D { // Test with reducers - scalar + label { -#if defined(KOKKOS_COMPILER_INTEL) - // Launchbounds causes hang with intel compilers - using range_type = - typename Kokkos::MDRangePolicy, - Kokkos::IndexType>; -#else // Launchbounds to ensure the tile fits into a CUDA block under register // constraints using range_type = typename Kokkos::MDRangePolicy< ExecSpace, Kokkos::LaunchBounds<512, 1>, Kokkos::Rank<6>, Kokkos::IndexType>; -#endif #ifdef KOKKOS_ENABLE_SYCL range_type range({{0, 0, 0, 0, 0, 0}}, {{N0, N1, N2, N3, N4, N5}}, @@ -2851,19 +2823,12 @@ struct TestMDRange_6D { // Test with reducers - scalar view { -#if defined(KOKKOS_COMPILER_INTEL) - // Launchbounds causes hang with intel compilers - using range_type = - typename Kokkos::MDRangePolicy, - Kokkos::IndexType>; -#else // Launchbounds to ensure the tile fits into a CUDA block under register // constraints using range_type = typename Kokkos::MDRangePolicy, Kokkos::IndexType, Kokkos::LaunchBounds<512, 1>>; -#endif #ifdef KOKKOS_ENABLE_SYCL range_type range({{0, 0, 0, 0, 0, 0}}, {{N0, N1, N2, N3, N4, N5}}, {{3, 3, 3, 2, 2, 2}}); @@ -2891,18 +2856,11 @@ struct TestMDRange_6D { // Test Min reducer with lambda #if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) { -#if defined(KOKKOS_COMPILER_INTEL) - // Launchbounds causes hang with intel compilers - using range_type = - typename Kokkos::MDRangePolicy, - Kokkos::IndexType>; -#else // Launchbounds to ensure the tile fits into a CUDA block under register // constraints using range_type = typename Kokkos::MDRangePolicy< ExecSpace, Kokkos::LaunchBounds<128, 1>, Kokkos::Rank<6>, Kokkos::IndexType>; -#endif range_type range({{1, 1, 1, 1, 1, 1}}, {{N0, N1, N2, N3, N4, N5}}, {{3, 3, 3, 2, 2, 1}}); @@ -2935,19 +2893,12 @@ struct TestMDRange_6D { // Tagged operator test { 
-#if defined(KOKKOS_COMPILER_INTEL) - // Launchbounds causes hang with intel compilers - using range_type = typename Kokkos::MDRangePolicy< - ExecSpace, Kokkos::Rank<6, Iterate::Default, Iterate::Default>, - Kokkos::IndexType, InitTag>; -#else // Launchbounds to ensure the tile fits into a CUDA block under register // constraints using range_type = typename Kokkos::MDRangePolicy< ExecSpace, Kokkos::LaunchBounds<512, 1>, Kokkos::Rank<6, Iterate::Default, Iterate::Default>, Kokkos::IndexType, InitTag>; -#endif using tile_type = typename range_type::tile_type; using point_type = typename range_type::point_type; @@ -2999,18 +2950,11 @@ struct TestMDRange_6D { const int N4, const int N5) { #if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) { -#if defined(KOKKOS_COMPILER_INTEL) - // Launchbounds causes hang with intel compilers - using range_type = - typename Kokkos::MDRangePolicy, - Kokkos::IndexType>; -#else // Launchbounds to ensure the tile fits into a CUDA block under register // constraints using range_type = typename Kokkos::MDRangePolicy< ExecSpace, Kokkos::LaunchBounds<128, 1>, Kokkos::Rank<6>, Kokkos::IndexType>; -#endif using tile_type = typename range_type::tile_type; using point_type = typename range_type::point_type; @@ -3059,16 +3003,10 @@ struct TestMDRange_6D { #endif { -#if defined(KOKKOS_COMPILER_INTEL) - // Launchbounds causes hang with intel compilers - using range_type = - typename Kokkos::MDRangePolicy>; -#else // Launchbounds to ensure the tile fits into a CUDA block under register // constraints using range_type = typename Kokkos::MDRangePolicy< ExecSpace, Kokkos::LaunchBounds<512, 1>, Kokkos::Rank<6>>; -#endif using point_type = typename range_type::point_type; range_type range(point_type{{0, 0, 0, 0, 0, 0}}, @@ -3101,18 +3039,11 @@ struct TestMDRange_6D { } { -#if defined(KOKKOS_COMPILER_INTEL) - // Launchbounds causes hang with intel compilers - using range_type = - typename Kokkos::MDRangePolicy, - Kokkos::IndexType, InitTag>; -#else // 
Launchbounds to ensure the tile fits into a CUDA block under register // constraints using range_type = typename Kokkos::MDRangePolicy< ExecSpace, Kokkos::LaunchBounds<512, 1>, Kokkos::Rank<6>, Kokkos::IndexType, InitTag>; -#endif using tile_type = typename range_type::tile_type; using point_type = typename range_type::point_type; @@ -3163,18 +3094,11 @@ struct TestMDRange_6D { } { -#if defined(KOKKOS_COMPILER_INTEL) - // Launchbounds causes hang with intel compilers - using range_type = - typename Kokkos::MDRangePolicy, - Kokkos::IndexType>; -#else // Launchbounds to ensure the tile fits into a CUDA block under register // constraints using range_type = typename Kokkos::MDRangePolicy< ExecSpace, Kokkos::LaunchBounds<512, 1>, Kokkos::Rank<6>, Kokkos::IndexType>; -#endif using tile_type = typename range_type::tile_type; using point_type = typename range_type::point_type; @@ -3215,19 +3139,12 @@ struct TestMDRange_6D { } { -#if defined(KOKKOS_COMPILER_INTEL) - // Launchbounds causes hang with intel compilers - using range_type = typename Kokkos::MDRangePolicy< - ExecSpace, Kokkos::Rank<6, Iterate::Default, Iterate::Default>, - Kokkos::IndexType>; -#else // Launchbounds to ensure the tile fits into a CUDA block under register // constraints using range_type = typename Kokkos::MDRangePolicy< ExecSpace, Kokkos::LaunchBounds<512, 1>, Kokkos::Rank<6, Iterate::Default, Iterate::Default>, Kokkos::IndexType>; -#endif using tile_type = typename range_type::tile_type; using point_type = typename range_type::point_type; @@ -3268,19 +3185,12 @@ struct TestMDRange_6D { } { -#if defined(KOKKOS_COMPILER_INTEL) - // Launchbounds causes hang with intel compilers - using range_type = typename Kokkos::MDRangePolicy< - ExecSpace, Kokkos::Rank<6, Iterate::Left, Iterate::Left>, - Kokkos::IndexType>; -#else // Launchbounds to ensure the tile fits into a CUDA block under register // constraints using range_type = typename Kokkos::MDRangePolicy< ExecSpace, Kokkos::LaunchBounds<512, 1>, 
Kokkos::Rank<6, Iterate::Left, Iterate::Left>, Kokkos::IndexType>; -#endif using tile_type = typename range_type::tile_type; using point_type = typename range_type::point_type; @@ -3321,19 +3231,12 @@ struct TestMDRange_6D { } { -#if defined(KOKKOS_COMPILER_INTEL) - // Launchbounds causes hang with intel compilers - using range_type = typename Kokkos::MDRangePolicy< - ExecSpace, Kokkos::Rank<6, Iterate::Left, Iterate::Right>, - Kokkos::IndexType>; -#else // Launchbounds to ensure the tile fits into a CUDA block under register // constraints using range_type = typename Kokkos::MDRangePolicy< ExecSpace, Kokkos::LaunchBounds<512, 1>, Kokkos::Rank<6, Iterate::Left, Iterate::Right>, Kokkos::IndexType>; -#endif using tile_type = typename range_type::tile_type; using point_type = typename range_type::point_type; @@ -3374,19 +3277,12 @@ struct TestMDRange_6D { } { -#if defined(KOKKOS_COMPILER_INTEL) - // Launchbounds causes hang with intel compilers - using range_type = typename Kokkos::MDRangePolicy< - ExecSpace, Kokkos::Rank<6, Iterate::Right, Iterate::Left>, - Kokkos::IndexType>; -#else // Launchbounds to ensure the tile fits into a CUDA block under register // constraints using range_type = typename Kokkos::MDRangePolicy< ExecSpace, Kokkos::LaunchBounds<512, 1>, Kokkos::Rank<6, Iterate::Right, Iterate::Left>, Kokkos::IndexType>; -#endif using tile_type = typename range_type::tile_type; using point_type = typename range_type::point_type; @@ -3427,19 +3323,12 @@ struct TestMDRange_6D { } { -#if defined(KOKKOS_COMPILER_INTEL) - // Launchbounds causes hang with intel compilers - using range_type = typename Kokkos::MDRangePolicy< - ExecSpace, Kokkos::Rank<6, Iterate::Right, Iterate::Right>, - Kokkos::IndexType>; -#else // Launchbounds to ensure the tile fits into a CUDA block under register // constraints using range_type = typename Kokkos::MDRangePolicy< ExecSpace, Kokkos::LaunchBounds<512, 1>, Kokkos::Rank<6, Iterate::Right, Iterate::Right>, Kokkos::IndexType>; -#endif 
using tile_type = typename range_type::tile_type; using point_type = typename range_type::point_type; @@ -3790,18 +3679,11 @@ struct TestMDRange_6D_NegIdx { static void test_6D_negidx(const int N0, const int N1, const int N2, const int N3, const int N4, const int N5) { { -#if defined(KOKKOS_COMPILER_INTEL) - // Launchbounds causes hang with intel compilers - using range_type = - typename Kokkos::MDRangePolicy, - Kokkos::IndexType>; -#else // Launchbounds to ensure the tile fits into a CUDA block under register // constraints using range_type = typename Kokkos::MDRangePolicy< ExecSpace, Kokkos::LaunchBounds<256, 1>, Kokkos::Rank<6>, Kokkos::IndexType>; -#endif using tile_type = typename range_type::tile_type; using point_type = typename range_type::point_type; From 12708a1e46406aa91d5eb328e216d6f9bb5727c4 Mon Sep 17 00:00:00 2001 From: Andrey Prokopenko Date: Tue, 28 Feb 2023 10:32:50 -0500 Subject: [PATCH 258/496] Use insertion sort for sort within a bin in BinSort (#5890) * Use insertion sort for sort within a bin in BinSort * Fix variable shadowing --- algorithms/src/Kokkos_Sort.hpp | 28 +++++++++++----------------- 1 file changed, 11 insertions(+), 17 deletions(-) diff --git a/algorithms/src/Kokkos_Sort.hpp b/algorithms/src/Kokkos_Sort.hpp index cb6800409a..995ac3c94e 100644 --- a/algorithms/src/Kokkos_Sort.hpp +++ b/algorithms/src/Kokkos_Sort.hpp @@ -456,24 +456,18 @@ class BinSort { void operator()(const bin_sort_bins_tag& /*tag*/, const int i) const { auto bin_size = bin_count_const(i); if (bin_size <= 1) return; - int upper_bound = bin_offsets(i) + bin_size; - bool sorted = false; - while (!sorted) { - sorted = true; - int old_idx = sort_order(bin_offsets(i)); - int new_idx = 0; - for (int k = bin_offsets(i) + 1; k < upper_bound; k++) { - new_idx = sort_order(k); - - if (!bin_op(keys_rnd, old_idx, new_idx)) { - sort_order(k - 1) = new_idx; - sort_order(k) = old_idx; - sorted = false; - } else { - old_idx = new_idx; - } + int lower_bound = bin_offsets(i); + 
int upper_bound = lower_bound + bin_size; + for (int k = lower_bound + 1; k < upper_bound; ++k) { + int old_idx = sort_order(k); + int j = k - 1; + while (j >= lower_bound) { + int new_idx = sort_order(j); + if (!bin_op(keys_rnd, old_idx, new_idx)) break; + sort_order(j + 1) = new_idx; + --j; } - upper_bound--; + sort_order(j + 1) = old_idx; } } }; From f93e48a3bf8e11d005a17937d71ac7f5420d3eca Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Thu, 23 Feb 2023 12:20:36 -0500 Subject: [PATCH 259/496] Don't call the functor's destructor on the device for Serial and Cuda --- core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp | 59 +++++++++++-------- .../Serial/Kokkos_Serial_Parallel_Range.hpp | 31 ++++++---- 2 files changed, 54 insertions(+), 36 deletions(-) diff --git a/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp b/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp index 620ef67927..904d1d670e 100644 --- a/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp +++ b/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp @@ -421,7 +421,8 @@ class ParallelScan, Kokkos::Cuda> { // (c) gridDim.x <= blockDim.y * blockDim.y // (d) gridDim.y == gridDim.z == 1 - const FunctorType m_functor; + const CombinedFunctorReducer + m_functor_reducer; const Policy m_policy; word_size_type* m_scratch_space; size_type* m_scratch_flags; @@ -433,23 +434,25 @@ class ParallelScan, Kokkos::Cuda> { template __device__ inline std::enable_if_t::value> exec_range( const Member& i, reference_type update, const bool final_result) const { - m_functor(i, update, final_result); + m_functor_reducer.get_functor()(i, update, final_result); } template __device__ inline std::enable_if_t::value> exec_range( const Member& i, reference_type update, const bool final_result) const { - m_functor(TagType(), i, update, final_result); + m_functor_reducer.get_functor()(TagType(), i, update, final_result); } //---------------------------------------- __device__ inline void initial() const { - typename Analysis::Reducer final_reducer(m_functor); + 
const typename Analysis::Reducer& final_reducer = + m_functor_reducer.get_reducer(); const integral_nonzero_constant - word_count(Analysis::value_size(m_functor) / sizeof(word_size_type)); + word_count(Analysis::value_size(m_functor_reducer.get_functor()) / + sizeof(word_size_type)); word_size_type* const shared_value = kokkos_impl_cuda_shared_memory() + @@ -485,11 +488,13 @@ class ParallelScan, Kokkos::Cuda> { //---------------------------------------- __device__ inline void final() const { - typename Analysis::Reducer final_reducer(m_functor); + const typename Analysis::Reducer& final_reducer = + m_functor_reducer.get_reducer(); const integral_nonzero_constant - word_count(Analysis::value_size(m_functor) / sizeof(word_size_type)); + word_count(Analysis::value_size(m_functor_reducer.get_functor()) / + sizeof(word_size_type)); // Use shared memory as an exclusive scan: { 0 , value[0] , value[1] , // value[2] , ... } @@ -619,7 +624,7 @@ class ParallelScan, Kokkos::Cuda> { if (nwork) { constexpr int GridMaxComputeCapability_2x = 0x0ffff; - const int block_size = local_block_size(m_functor); + const int block_size = local_block_size(m_functor_reducer.get_functor()); KOKKOS_ASSERT(block_size > 0); const int grid_max = @@ -639,13 +644,15 @@ class ParallelScan, Kokkos::Cuda> { m_scratch_space = reinterpret_cast(cuda_internal_scratch_space( - m_policy.space(), Analysis::value_size(m_functor) * grid_x)); + m_policy.space(), + Analysis::value_size(m_functor_reducer.get_functor()) * grid_x)); m_scratch_flags = cuda_internal_scratch_flags(m_policy.space(), sizeof(size_type) * 1); dim3 grid(grid_x, 1, 1); dim3 block(1, block_size, 1); // REQUIRED DIMENSIONS ( 1 , N , 1 ) - const int shmem = Analysis::value_size(m_functor) * (block_size + 2); + const int shmem = Analysis::value_size(m_functor_reducer.get_functor()) * + (block_size + 2); #ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION if (m_run_serial) { @@ -670,7 +677,7 @@ class ParallelScan, Kokkos::Cuda> { } ParallelScan(const 
FunctorType& arg_functor, const Policy& arg_policy) - : m_functor(arg_functor), + : m_functor_reducer(arg_functor, typename Analysis::Reducer{arg_functor}), m_policy(arg_policy), m_scratch_space(nullptr), m_scratch_flags(nullptr), @@ -728,7 +735,8 @@ class ParallelScanWithTotal, // (c) gridDim.x <= blockDim.y * blockDim.y // (d) gridDim.y == gridDim.z == 1 - const FunctorType m_functor; + const CombinedFunctorReducer + m_functor_reducer; const Policy m_policy; word_size_type* m_scratch_space; size_type* m_scratch_flags; @@ -743,23 +751,25 @@ class ParallelScanWithTotal, template __device__ inline std::enable_if_t::value> exec_range( const Member& i, reference_type update, const bool final_result) const { - m_functor(i, update, final_result); + m_functor_reducer.get_functor()(i, update, final_result); } template __device__ inline std::enable_if_t::value> exec_range( const Member& i, reference_type update, const bool final_result) const { - m_functor(TagType(), i, update, final_result); + m_functor_reducer.get_functor()(TagType(), i, update, final_result); } //---------------------------------------- __device__ inline void initial() const { - typename Analysis::Reducer final_reducer(m_functor); + const typename Analysis::Reducer& final_reducer = + m_functor_reducer.get_reducer(); const integral_nonzero_constant - word_count(Analysis::value_size(m_functor) / sizeof(word_size_type)); + word_count(Analysis::value_size(m_functor_reducer.get_functor()) / + sizeof(word_size_type)); word_size_type* const shared_value = kokkos_impl_cuda_shared_memory() + @@ -795,11 +805,12 @@ class ParallelScanWithTotal, //---------------------------------------- __device__ inline void final() const { - typename Analysis::Reducer final_reducer(m_functor); + const typename Analysis::Reducer& final_reducer = + m_functor_reducer.get_reducer(); const integral_nonzero_constant - word_count(Analysis::value_size(m_functor) / sizeof(word_size_type)); + word_count(final_reducer.value_size() / 
sizeof(word_size_type)); // Use shared memory as an exclusive scan: { 0 , value[0] , value[1] , // value[2] , ... } @@ -935,7 +946,7 @@ class ParallelScanWithTotal, if (nwork) { enum { GridMaxComputeCapability_2x = 0x0ffff }; - const int block_size = local_block_size(m_functor); + const int block_size = local_block_size(m_functor_reducer.get_functor()); KOKKOS_ASSERT(block_size > 0); const int grid_max = @@ -953,15 +964,17 @@ class ParallelScanWithTotal, // How many block are really needed for this much work: const int grid_x = (nwork + work_per_block - 1) / work_per_block; + const typename Analysis::Reducer& final_reducer = + m_functor_reducer.get_reducer(); m_scratch_space = reinterpret_cast(cuda_internal_scratch_space( - m_policy.space(), Analysis::value_size(m_functor) * grid_x)); + m_policy.space(), final_reducer.value_size() * grid_x)); m_scratch_flags = cuda_internal_scratch_flags(m_policy.space(), sizeof(size_type) * 1); dim3 grid(grid_x, 1, 1); dim3 block(1, block_size, 1); // REQUIRED DIMENSIONS ( 1 , N , 1 ) - const int shmem = Analysis::value_size(m_functor) * (block_size + 2); + const int shmem = final_reducer.value_size() * (block_size + 2); #ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION if (m_run_serial) { @@ -982,7 +995,7 @@ class ParallelScanWithTotal, m_policy.space() .impl_internal_space_instance()); // copy to device and execute - const int size = Analysis::value_size(m_functor); + const int size = final_reducer.value_size(); #ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION if (m_run_serial) DeepCopy(m_policy.space(), &m_returnvalue, @@ -1003,7 +1016,7 @@ class ParallelScanWithTotal, ParallelScanWithTotal(const FunctorType& arg_functor, const Policy& arg_policy, const ViewType& arg_result_view) - : m_functor(arg_functor), + : m_functor_reducer(arg_functor, typename Analysis::Reducer{arg_functor}), m_policy(arg_policy), m_scratch_space(nullptr), m_scratch_flags(nullptr), diff --git a/core/src/Serial/Kokkos_Serial_Parallel_Range.hpp 
b/core/src/Serial/Kokkos_Serial_Parallel_Range.hpp index 01089677a2..5840cc736d 100644 --- a/core/src/Serial/Kokkos_Serial_Parallel_Range.hpp +++ b/core/src/Serial/Kokkos_Serial_Parallel_Range.hpp @@ -154,7 +154,8 @@ class ParallelScan, using pointer_type = typename Analysis::pointer_type; using reference_type = typename Analysis::reference_type; - const FunctorType m_functor; + const CombinedFunctorReducer + m_functor_reducer; const Policy m_policy; template @@ -162,7 +163,7 @@ class ParallelScan, reference_type update) const { const typename Policy::member_type e = m_policy.end(); for (typename Policy::member_type i = m_policy.begin(); i < e; ++i) { - m_functor(i, update, true); + m_functor_reducer.get_functor()(i, update, true); } } @@ -172,13 +173,15 @@ class ParallelScan, const TagType t{}; const typename Policy::member_type e = m_policy.end(); for (typename Policy::member_type i = m_policy.begin(); i < e; ++i) { - m_functor(t, i, update, true); + m_functor_reducer.get_functor()(t, i, update, true); } } public: inline void execute() const { - const size_t pool_reduce_size = Analysis::value_size(m_functor); + const typename Analysis::Reducer& final_reducer = + m_functor_reducer.get_reducer(); + const size_t pool_reduce_size = final_reducer.value_size(); const size_t team_reduce_size = 0; // Never shrinks const size_t team_shared_size = 0; // Never shrinks const size_t thread_local_size = 0; // Never shrinks @@ -191,8 +194,6 @@ class ParallelScan, pool_reduce_size, team_reduce_size, team_shared_size, thread_local_size); - typename Analysis::Reducer final_reducer(m_functor); - reference_type update = final_reducer.init(pointer_type( internal_instance->m_thread_team_data.pool_reduce_local())); @@ -200,7 +201,8 @@ class ParallelScan, } inline ParallelScan(const FunctorType& arg_functor, const Policy& arg_policy) - : m_functor(arg_functor), m_policy(arg_policy) {} + : m_functor_reducer(arg_functor, typename Analysis::Reducer{arg_functor}), + m_policy(arg_policy) {} 
}; /*--------------------------------------------------------------------------*/ @@ -218,7 +220,8 @@ class ParallelScanWithTotal, using pointer_type = typename Analysis::pointer_type; using reference_type = typename Analysis::reference_type; - const FunctorType m_functor; + const CombinedFunctorReducer + m_functor_reducer; const Policy m_policy; const pointer_type m_result_ptr; @@ -227,7 +230,7 @@ class ParallelScanWithTotal, reference_type update) const { const typename Policy::member_type e = m_policy.end(); for (typename Policy::member_type i = m_policy.begin(); i < e; ++i) { - m_functor(i, update, true); + m_functor_reducer.get_functor()(i, update, true); } } @@ -237,13 +240,14 @@ class ParallelScanWithTotal, const TagType t{}; const typename Policy::member_type e = m_policy.end(); for (typename Policy::member_type i = m_policy.begin(); i < e; ++i) { - m_functor(t, i, update, true); + m_functor_reducer.get_functor()(t, i, update, true); } } public: inline void execute() { - const size_t pool_reduce_size = Analysis::value_size(m_functor); + const size_t pool_reduce_size = + m_functor_reducer.get_reducer().value_size(); const size_t team_reduce_size = 0; // Never shrinks const size_t team_shared_size = 0; // Never shrinks const size_t thread_local_size = 0; // Never shrinks @@ -256,7 +260,8 @@ class ParallelScanWithTotal, pool_reduce_size, team_reduce_size, team_shared_size, thread_local_size); - typename Analysis::Reducer final_reducer(m_functor); + const typename Analysis::Reducer& final_reducer = + m_functor_reducer.get_reducer(); reference_type update = final_reducer.init(pointer_type( internal_instance->m_thread_team_data.pool_reduce_local())); @@ -271,7 +276,7 @@ class ParallelScanWithTotal, ParallelScanWithTotal(const FunctorType& arg_functor, const Policy& arg_policy, const ViewType& arg_result_view) - : m_functor(arg_functor), + : m_functor_reducer(arg_functor, typename Analysis::Reducer{arg_functor}), m_policy(arg_policy), 
m_result_ptr(arg_result_view.data()) { static_assert( From db890c979c1ddcb6a0dc2e8db48ae2a6a8e0e190 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Thu, 23 Feb 2023 13:20:03 -0500 Subject: [PATCH 260/496] Add test case --- .../Test05_ParallelReduce_RangePolicy.hpp | 2 +- .../incremental/Test16_ParallelScan.hpp | 49 +++++++++++++++---- 2 files changed, 40 insertions(+), 11 deletions(-) diff --git a/core/unit_test/incremental/Test05_ParallelReduce_RangePolicy.hpp b/core/unit_test/incremental/Test05_ParallelReduce_RangePolicy.hpp index 8175c23565..4235c73c8e 100644 --- a/core/unit_test/incremental/Test05_ParallelReduce_RangePolicy.hpp +++ b/core/unit_test/incremental/Test05_ParallelReduce_RangePolicy.hpp @@ -45,7 +45,7 @@ struct NonTrivialReduceFunctor { NonTrivialReduceFunctor(NonTrivialReduceFunctor &&) = default; NonTrivialReduceFunctor &operator=(NonTrivialReduceFunctor &&) = default; NonTrivialReduceFunctor &operator=(NonTrivialReduceFunctor const &) = default; - KOKKOS_FUNCTION ~NonTrivialReduceFunctor() {} + ~NonTrivialReduceFunctor() {} }; template diff --git a/core/unit_test/incremental/Test16_ParallelScan.hpp b/core/unit_test/incremental/Test16_ParallelScan.hpp index 78f791914a..0b9d16f79d 100644 --- a/core/unit_test/incremental/Test16_ParallelScan.hpp +++ b/core/unit_test/incremental/Test16_ParallelScan.hpp @@ -26,11 +26,45 @@ namespace Test { using value_type = double; const int N = 10; +template +struct TrivialScanFunctor { + Kokkos::View d_data; + + KOKKOS_FUNCTION + void operator()(const int i, value_type &update_value, + const bool final) const { + const value_type val_i = d_data(i); + if (final) d_data(i) = update_value; + update_value += val_i; + } +}; + +template +struct NonTrivialScanFunctor { + Kokkos::View d_data; + + KOKKOS_FUNCTION + void operator()(const int i, value_type &update_value, + const bool final) const { + const value_type val_i = d_data(i); + if (final) d_data(i) = update_value; + update_value += val_i; + } + + 
NonTrivialScanFunctor() = default; + NonTrivialScanFunctor(NonTrivialScanFunctor const &) = default; + NonTrivialScanFunctor(NonTrivialScanFunctor &&) = default; + NonTrivialScanFunctor &operator=(NonTrivialScanFunctor &&) = default; + NonTrivialScanFunctor &operator=(NonTrivialScanFunctor const &) = default; + ~NonTrivialScanFunctor() {} +}; + template struct TestScan { // 1D View of double using View_1D = typename Kokkos::View; + template void parallel_scan() { View_1D d_data("data", N); @@ -39,15 +73,9 @@ struct TestScan { Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) { d_data(i) = i * 0.5; }); - // Exclusive parallel_scan call. - Kokkos::parallel_scan( - Kokkos::RangePolicy(0, N), - KOKKOS_LAMBDA(const int i, value_type &update_value, const bool final) { - const value_type val_i = d_data(i); - if (final) d_data(i) = update_value; - - update_value += val_i; - }); + // Exclusive parallel_scan call + Kokkos::parallel_scan(Kokkos::RangePolicy(0, N), + FunctorType{d_data}); // Copy back the data. 
auto h_data = @@ -65,7 +93,8 @@ struct TestScan { TEST(TEST_CATEGORY, IncrTest_16_parallelscan) { TestScan test; - test.parallel_scan(); + test.parallel_scan>(); + test.parallel_scan>(); } } // namespace Test From 1cf890742c70996991bc54ff7e8e5fc91e703c6a Mon Sep 17 00:00:00 2001 From: Andrey Prokopenko Date: Sat, 18 Feb 2023 12:11:01 -0500 Subject: [PATCH 261/496] Use std::sort for sorting within a bin when possible --- algorithms/src/Kokkos_Sort.hpp | 37 +++++++++++++++++++++++++--------- 1 file changed, 28 insertions(+), 9 deletions(-) diff --git a/algorithms/src/Kokkos_Sort.hpp b/algorithms/src/Kokkos_Sort.hpp index 995ac3c94e..53f230791f 100644 --- a/algorithms/src/Kokkos_Sort.hpp +++ b/algorithms/src/Kokkos_Sort.hpp @@ -456,18 +456,37 @@ class BinSort { void operator()(const bin_sort_bins_tag& /*tag*/, const int i) const { auto bin_size = bin_count_const(i); if (bin_size <= 1) return; + constexpr bool use_std_sort = +#ifdef KOKKOS_ENABLE_SERIAL + std::is_same_v || +#endif +#ifdef KOKKOS_ENABLE_OPENMP + std::is_same_v || +#endif + false; int lower_bound = bin_offsets(i); int upper_bound = lower_bound + bin_size; - for (int k = lower_bound + 1; k < upper_bound; ++k) { - int old_idx = sort_order(k); - int j = k - 1; - while (j >= lower_bound) { - int new_idx = sort_order(j); - if (!bin_op(keys_rnd, old_idx, new_idx)) break; - sort_order(j + 1) = new_idx; - --j; + if (use_std_sort && bin_size > 10) { + if constexpr (use_std_sort) { + auto& bin_op_c = bin_op; + auto& keys_rnd_c = keys_rnd; + std::sort(&sort_order(lower_bound), &sort_order(upper_bound), + [&bin_op_c, &keys_rnd_c](int p, int q) { + return bin_op_c(keys_rnd_c, p, q); + }); + } + } else { + for (int k = lower_bound + 1; k < upper_bound; ++k) { + int old_idx = sort_order(k); + int j = k - 1; + while (j >= lower_bound) { + int new_idx = sort_order(j); + if (!bin_op(keys_rnd, old_idx, new_idx)) break; + sort_order(j + 1) = new_idx; + --j; + } + sort_order(j + 1) = old_idx; } - sort_order(j + 1) = old_idx; 
} } }; From 416d7b7439f6dca7bc4c1b2c4e48c6f8e46a9d1f Mon Sep 17 00:00:00 2001 From: Seyong Lee Date: Tue, 28 Feb 2023 12:34:37 -0500 Subject: [PATCH 262/496] New OpenACC backend implementation for parallel_scan with a range policy (#5876) * Initial OpenACC parallel_scan() implementation with Range Policy, which supports general SCAN types. * Fix bugs related to race conditions + add support for parallel_scan with view data. * Apply suggestions from code review Co-authored-by: Daniel Arndt * Remove `getTotal` parameter from `OpenACCParallelScanRangePolicy()` * Update core/src/OpenACC/Kokkos_OpenACC_ParallelScan_Range.hpp Co-authored-by: Damien L-G * Add a comment about the reference to the parallel scan algorithm that OpenACCParallelScanRangePolicy() implements. Delete unnecessary comments. * Fixup following CombinedFunctorReducerType changes --------- Co-authored-by: Daniel Arndt Co-authored-by: Damien L-G Co-authored-by: Damien L-G --- .../Kokkos_OpenACC_ParallelScan_Range.hpp | 264 ++++++++++++++++++ core/src/decl/Kokkos_Declare_OPENACC.hpp | 1 + core/unit_test/CMakeLists.txt | 4 +- 3 files changed, 266 insertions(+), 3 deletions(-) create mode 100644 core/src/OpenACC/Kokkos_OpenACC_ParallelScan_Range.hpp diff --git a/core/src/OpenACC/Kokkos_OpenACC_ParallelScan_Range.hpp b/core/src/OpenACC/Kokkos_OpenACC_ParallelScan_Range.hpp new file mode 100644 index 0000000000..82401fd021 --- /dev/null +++ b/core/src/OpenACC/Kokkos_OpenACC_ParallelScan_Range.hpp @@ -0,0 +1,264 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_OPENACC_PARALLEL_SCAN_RANGE_HPP +#define KOKKOS_OPENACC_PARALLEL_SCAN_RANGE_HPP + +#include +#include +#include + +template +class Kokkos::Impl::ParallelScan, + Kokkos::Experimental::OpenACC> { + protected: + using Policy = Kokkos::RangePolicy; + using Analysis = + Kokkos::Impl::FunctorAnalysis; + using PointerType = typename Analysis::pointer_type; + using ValueType = typename Analysis::value_type; + using MemberType = typename Policy::member_type; + using IndexType = typename Policy::index_type; + Functor m_functor; + Policy m_policy; + ValueType* m_result_ptr; + bool m_result_ptr_device_accessible; + static constexpr MemberType default_scan_chunk_size = 128; + + public: + ParallelScan(Functor const& arg_functor, Policy const& arg_policy, + ValueType* arg_result_ptr = nullptr, + bool arg_result_ptr_device_accessible = false) + : m_functor(arg_functor), + m_policy(arg_policy), + m_result_ptr(arg_result_ptr), + m_result_ptr_device_accessible(arg_result_ptr_device_accessible) {} + + // This function implements the parallel scan alogithm based on the parallel + // prefix sum algorithm proposed by Hillis and Steele (doi:10.1145/7902.7903), + // which offers a shorter span and more parallelism but may not be + // work-efficient. 
+ void OpenACCParallelScanRangePolicy(const IndexType begin, + const IndexType end, IndexType chunk_size, + const int async_arg) const { + if (chunk_size > 1) { + if (!Impl::is_integral_power_of_two(chunk_size)) + Kokkos::abort( + "RangePolicy blocking granularity must be power of two to be used " + "with OpenACC parallel_scan()"); + } else { + chunk_size = default_scan_chunk_size; + } + const Kokkos::Experimental::Impl::FunctorAdapter functor( + m_functor); + const IndexType N = end - begin; + const IndexType n_chunks = (N + chunk_size - 1) / chunk_size; + Kokkos::View chunk_values( + "Kokkos::OpenACCParallelScan::chunk_values", n_chunks); + Kokkos::View offset_values( + "Kokkos::OpenACCParallelScan::offset_values", n_chunks); + Kokkos::View m_result_total( + "Kokkos::OpenACCParallelScan::m_result_total"); + std::unique_ptr element_values_owner( + new ValueType[2 * chunk_size]); + ValueType* element_values = element_values_owner.get(); + typename Analysis::Reducer final_reducer(m_functor); + +#pragma acc enter data copyin(functor, final_reducer) \ + copyin(chunk_values, offset_values) async(async_arg) + +#pragma acc parallel loop gang vector_length(chunk_size) private( \ + element_values [0:2 * chunk_size]) \ + present(functor, chunk_values, final_reducer) async(async_arg) + for (IndexType team_id = 0; team_id < n_chunks; ++team_id) { + IndexType current_step = 0; + IndexType next_step = 1; + IndexType temp; +#pragma acc loop vector + for (IndexType thread_id = 0; thread_id < chunk_size; ++thread_id) { + const IndexType local_offset = team_id * chunk_size; + const IndexType idx = local_offset + thread_id; + ValueType update; + final_reducer.init(&update); + if ((idx > 0) && (idx < N)) functor(idx - 1, update, false); + element_values[thread_id] = update; + } + for (IndexType step_size = 1; step_size < chunk_size; step_size *= 2) { +#pragma acc loop vector + for (IndexType thread_id = 0; thread_id < chunk_size; ++thread_id) { + if (thread_id < step_size) { + 
element_values[next_step * chunk_size + thread_id] = + element_values[current_step * chunk_size + thread_id]; + } else { + ValueType localValue = + element_values[current_step * chunk_size + thread_id]; + final_reducer.join(&localValue, + &element_values[current_step * chunk_size + + thread_id - step_size]); + element_values[next_step * chunk_size + thread_id] = localValue; + } + } + temp = current_step; + current_step = next_step; + next_step = temp; + } + chunk_values(team_id) = + element_values[current_step * chunk_size + chunk_size - 1]; + } + + ValueType tempValue; +#pragma acc serial loop present(chunk_values, offset_values, final_reducer) \ + async(async_arg) + for (IndexType team_id = 0; team_id < n_chunks; ++team_id) { + if (team_id == 0) { + final_reducer.init(&offset_values(0)); + final_reducer.init(&tempValue); + } else { + final_reducer.join(&tempValue, &chunk_values(team_id - 1)); + offset_values(team_id) = tempValue; + } + } + +#pragma acc parallel loop gang vector_length(chunk_size) private( \ + element_values [0:2 * chunk_size]) \ + present(functor, offset_values, final_reducer) copyin(m_result_total) \ + async(async_arg) + for (IndexType team_id = 0; team_id < n_chunks; ++team_id) { + IndexType current_step = 0; + IndexType next_step = 1; + IndexType temp; +#pragma acc loop vector + for (IndexType thread_id = 0; thread_id < chunk_size; ++thread_id) { + const IndexType local_offset = team_id * chunk_size; + const IndexType idx = local_offset + thread_id; + ValueType update; + final_reducer.init(&update); + if (thread_id == 0) { + final_reducer.join(&update, &offset_values(team_id)); + } + if ((idx > 0) && (idx < N)) functor(idx - 1, update, false); + element_values[thread_id] = update; + } + for (IndexType step_size = 1; step_size < chunk_size; step_size *= 2) { +#pragma acc loop vector + for (IndexType thread_id = 0; thread_id < chunk_size; ++thread_id) { + if (thread_id < step_size) { + element_values[next_step * chunk_size + thread_id] = + 
element_values[current_step * chunk_size + thread_id]; + } else { + ValueType localValue = + element_values[current_step * chunk_size + thread_id]; + final_reducer.join(&localValue, + &element_values[current_step * chunk_size + + thread_id - step_size]); + element_values[next_step * chunk_size + thread_id] = localValue; + } + } + temp = current_step; + current_step = next_step; + next_step = temp; + } +#pragma acc loop vector + for (IndexType thread_id = 0; thread_id < chunk_size; ++thread_id) { + const IndexType local_offset = team_id * chunk_size; + const IndexType idx = local_offset + thread_id; + ValueType update = + element_values[current_step * chunk_size + thread_id]; + if (idx < N) functor(idx, update, true); + if (idx == N - 1) { + if (m_result_ptr_device_accessible) { + *m_result_ptr = update; + } else { + m_result_total() = update; + } + } + } + } + if (!m_result_ptr_device_accessible && m_result_ptr != nullptr) { + DeepCopy(m_policy.space(), m_result_ptr, + m_result_total.data(), + sizeof(ValueType)); + } + +#pragma acc exit data delete (functor, chunk_values, offset_values, \ + final_reducer)async(async_arg) + acc_wait(async_arg); + } + + void execute() const { + const IndexType begin = m_policy.begin(); + const IndexType end = m_policy.end(); + IndexType chunk_size = m_policy.chunk_size(); + + if (end <= begin) { + if (!m_result_ptr_device_accessible && m_result_ptr != nullptr) { + *m_result_ptr = 0; + } + return; + } + + int const async_arg = m_policy.space().acc_async_queue(); + + OpenACCParallelScanRangePolicy(begin, end, chunk_size, async_arg); + } +}; + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +template +class Kokkos::Impl::ParallelScanWithTotal< + FunctorType, Kokkos::RangePolicy, ReturnType, + Kokkos::Experimental::OpenACC> + : public ParallelScan, + Kokkos::Experimental::OpenACC> { + using base_t = ParallelScan, + 
Kokkos::Experimental::OpenACC>; + using ValueType = typename base_t::ValueType; + using IndexType = typename base_t::IndexType; + + public: + void execute() const { + const IndexType begin = base_t::m_policy.begin(); + const IndexType end = base_t::m_policy.end(); + IndexType chunk_size = base_t::m_policy.chunk_size(); + + if (end <= begin) { + if (!base_t::m_result_ptr_device_accessible && + base_t::m_result_ptr != nullptr) { + *base_t::m_result_ptr = 0; + } + return; + } + + int const async_arg = base_t::m_policy.space().acc_async_queue(); + + OpenACCParallelScanRangePolicy(begin, end, chunk_size, async_arg); + } + + template + ParallelScanWithTotal(const FunctorType& arg_functor, + const typename base_t::Policy& arg_policy, + const ViewType& arg_result_view) + : base_t(arg_functor, arg_policy, arg_result_view.data(), + MemorySpaceAccess::accessible) { + } +}; + +#endif diff --git a/core/src/decl/Kokkos_Declare_OPENACC.hpp b/core/src/decl/Kokkos_Declare_OPENACC.hpp index 177af9b23d..727e551cd8 100644 --- a/core/src/decl/Kokkos_Declare_OPENACC.hpp +++ b/core/src/decl/Kokkos_Declare_OPENACC.hpp @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include diff --git a/core/unit_test/CMakeLists.txt b/core/unit_test/CMakeLists.txt index dc374711e6..d5041442aa 100644 --- a/core/unit_test/CMakeLists.txt +++ b/core/unit_test/CMakeLists.txt @@ -58,7 +58,7 @@ SET(KOKKOS_SYCL_FEATURE_LEVEL 999) SET(KOKKOS_SYCL_NAME Experimental::SYCL) SET(KOKKOS_THREADS_FEATURE_LEVEL 999) SET(KOKKOS_THREADS_NAME Threads) -SET(KOKKOS_OPENACC_FEATURE_LEVEL 14) +SET(KOKKOS_OPENACC_FEATURE_LEVEL 16) SET(KOKKOS_OPENACC_NAME Experimental::OpenACC) @@ -495,7 +495,6 @@ IF(KOKKOS_ENABLE_OPENACC AND KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_HostSharedPtr.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_HostSharedPtrAccessOnDevice.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_TeamScratch.cpp - 
${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_TestScan.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_TestTeamScan.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_TestTeamReductionScan.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_Atomics.cpp @@ -519,7 +518,6 @@ IF(KOKKOS_ENABLE_OPENACC AND KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_Reducers_e.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_ViewMapping_b.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_TeamBasic.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_Scan.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_NumericTraits.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_DeepCopyAlignment.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_MathematicalFunctions.cpp From 2a7629dd650de27382d8975cb3bd7b72e3e90eac Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Tue, 28 Feb 2023 14:31:27 -0500 Subject: [PATCH 263/496] Prefer non Impl:: atomic_{load,store} in AtomicDataElement since using relaxed memory order --- core/src/impl/Kokkos_Atomic_View.hpp | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/core/src/impl/Kokkos_Atomic_View.hpp b/core/src/impl/Kokkos_Atomic_View.hpp index 45d01c9f9f..23d4c2524c 100644 --- a/core/src/impl/Kokkos_Atomic_View.hpp +++ b/core/src/impl/Kokkos_Atomic_View.hpp @@ -39,7 +39,7 @@ class AtomicDataElement { KOKKOS_INLINE_FUNCTION const_value_type operator=(const_value_type& val) const { - Kokkos::Impl::atomic_store(ptr, val, Kokkos::Impl::memory_order_relaxed); + Kokkos::atomic_store(ptr, val); return val; } @@ -194,9 +194,7 @@ class AtomicDataElement { bool operator>(const_value_type& val) const { return *ptr > val; } KOKKOS_INLINE_FUNCTION - operator value_type() const { - return Kokkos::Impl::atomic_load(ptr, Kokkos::Impl::memory_order_relaxed); - } + operator value_type() const { return Kokkos::atomic_load(ptr); } }; template From 3bcf389e2b7170d6e2877b7724ad06c346e943ed Mon Sep 17 
00:00:00 2001 From: Damien L-G Date: Tue, 28 Feb 2023 14:34:46 -0500 Subject: [PATCH 264/496] Use directly memory order from desul in Impl:: atomic funtion templates --- core/src/Kokkos_Atomics_Desul_Wrapper.hpp | 20 ++++++++------------ core/src/impl/Kokkos_ChaseLev.hpp | 16 +++++++++------- core/src/impl/Kokkos_HostThreadTeam.hpp | 8 ++++---- 3 files changed, 21 insertions(+), 23 deletions(-) diff --git a/core/src/Kokkos_Atomics_Desul_Wrapper.hpp b/core/src/Kokkos_Atomics_Desul_Wrapper.hpp index b8697d415a..519ed3eb89 100644 --- a/core/src/Kokkos_Atomics_Desul_Wrapper.hpp +++ b/core/src/Kokkos_Atomics_Desul_Wrapper.hpp @@ -253,26 +253,22 @@ namespace Impl { using type = desul::MemoryOrderRelaxed; }; template KOKKOS_INLINE_FUNCTION - bool atomic_compare_exchange_strong(T* const dest, T& expected, const T desired, MemOrderSuccess, MemOrderFailure) { - return desul::atomic_compare_exchange_strong(dest, expected, desired, - typename KokkosToDesulMemoryOrder::type(), - typename KokkosToDesulMemoryOrder::type(), - KOKKOS_DESUL_MEM_SCOPE); - + bool atomic_compare_exchange_strong(T* const dest, T& expected, const T desired, MemOrderSuccess succ, MemOrderFailure fail) { + return desul::atomic_compare_exchange_strong(dest, expected, desired, succ, fail, KOKKOS_DESUL_MEM_SCOPE); } template KOKKOS_INLINE_FUNCTION - T atomic_load(const T* const src, MemoryOrder) { - return desul::atomic_load(src, typename KokkosToDesulMemoryOrder::type(), KOKKOS_DESUL_MEM_SCOPE); + T atomic_load(const T* const src, MemoryOrder order) { + return desul::atomic_load(src, order, KOKKOS_DESUL_MEM_SCOPE); } template KOKKOS_INLINE_FUNCTION - void atomic_store(T* const src, const T val, MemoryOrder) { - return desul::atomic_store(src, val, typename KokkosToDesulMemoryOrder::type(), KOKKOS_DESUL_MEM_SCOPE); + void atomic_store(T* const src, const T val, MemoryOrder order) { + return desul::atomic_store(src, val, order, KOKKOS_DESUL_MEM_SCOPE); } -} +} // namespace Impl -} +} // namespace Kokkos #undef 
KOKKOS_DESUL_MEM_SCOPE diff --git a/core/src/impl/Kokkos_ChaseLev.hpp b/core/src/impl/Kokkos_ChaseLev.hpp index 855654408e..d8ab77b205 100644 --- a/core/src/impl/Kokkos_ChaseLev.hpp +++ b/core/src/impl/Kokkos_ChaseLev.hpp @@ -172,7 +172,8 @@ struct ChaseLevDeque { } #else if (!Impl::atomic_compare_exchange_strong( - &m_top, t, t + 1, memory_order_seq_cst, memory_order_relaxed)) { + &m_top, t, t + 1, desul::MemoryOrderSeqCst(), + desul::MemoryOrderRelaxed())) { /* failed race, someone else stole it */ return_value = nullptr; } @@ -195,7 +196,7 @@ struct ChaseLevDeque { KOKKOS_INLINE_FUNCTION bool push(node_type& node) { auto b = m_bottom; // memory order relaxed - auto t = Impl::atomic_load(&m_top, memory_order_acquire); + auto t = Impl::atomic_load(&m_top, desul::MemoryOrderAcquire()); auto& a = m_array; if (b - t > a.size() - 1) { /* queue is full, resize */ @@ -204,7 +205,7 @@ struct ChaseLevDeque { return false; } a[b] = &node; // relaxed - Impl::atomic_store(&m_bottom, b + 1, memory_order_release); + Impl::atomic_store(&m_bottom, b + 1, desul::MemoryOrderRelease()); return true; } @@ -213,7 +214,7 @@ struct ChaseLevDeque { auto t = m_top; // TODO @tasking @memory_order DSH: atomic load acquire Kokkos::memory_fence(); // seq_cst fence, so why does the above need to be // acquire? 
- auto b = Impl::atomic_load(&m_bottom, memory_order_acquire); + auto b = Impl::atomic_load(&m_bottom, desul::MemoryOrderAcquire()); OptionalRef return_value; if (t < b) { /* Non-empty queue */ @@ -231,8 +232,9 @@ struct ChaseLevDeque { return_value = nullptr; } #else - if (!Impl::atomic_compare_exchange_strong( - &m_top, t, t + 1, memory_order_seq_cst, memory_order_relaxed)) { + if (!Impl::atomic_compare_exchange_strong(&m_top, t, t + 1, + desul::MemoryOrderSeqCst(), + desul::MemoryOrderRelaxed())) { return_value = nullptr; } #endif @@ -247,7 +249,7 @@ struct ChaseLevDeque { // essentially using the memory order in this version as a fence, which // may be unnecessary auto buffer_ptr = (node_type***)&m_array.buffer; - auto a = Impl::atomic_load(buffer_ptr, memory_order_acquire); // + auto a = Impl::atomic_load(buffer_ptr, desul::MemoryOrderAcquire()); // technically consume ordered, but acquire should be fine return_value = *static_cast(a[t % m_array->size]); // relaxed; we'd have to replace the m_array->size if we ever allow growth diff --git a/core/src/impl/Kokkos_HostThreadTeam.hpp b/core/src/impl/Kokkos_HostThreadTeam.hpp index 1fec93237a..35ced1b56c 100644 --- a/core/src/impl/Kokkos_HostThreadTeam.hpp +++ b/core/src/impl/Kokkos_HostThreadTeam.hpp @@ -481,14 +481,14 @@ class HostThreadTeamMember { // with a return value of 'true' Kokkos::Impl::atomic_store(shared_value, value, - Kokkos::Impl::memory_order_release); + desul::MemoryOrderRelease()); m_data.team_rendezvous_release(); // This thread released all other threads from 'team_rendezvous' // with a return value of 'false' } else { value = Kokkos::Impl::atomic_load(shared_value, - Kokkos::Impl::memory_order_acquire); + desul::MemoryOrderAcquire()); } })) @@ -516,7 +516,7 @@ class HostThreadTeamMember { if (1 < m_data.m_team_size) { Kokkos::Impl::atomic_store(shared_value, value, - Kokkos::Impl::memory_order_release); + desul::MemoryOrderRelease()); } m_data.team_rendezvous_release(); @@ -524,7 +524,7 @@ 
class HostThreadTeamMember { // with a return value of 'false' } else { value = Kokkos::Impl::atomic_load(shared_value, - Kokkos::Impl::memory_order_acquire); + desul::MemoryOrderAcquire()); })) KOKKOS_IF_ON_DEVICE( From 1abf65388b4ef39019d35c84a61b1190835fe4e8 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Tue, 28 Feb 2023 14:35:32 -0500 Subject: [PATCH 265/496] Drop Kokkos memory oder classes --- core/src/Kokkos_Atomics_Desul_Wrapper.hpp | 25 ----- core/src/impl/Kokkos_Atomic_Memory_Order.hpp | 105 ------------------- 2 files changed, 130 deletions(-) delete mode 100644 core/src/impl/Kokkos_Atomic_Memory_Order.hpp diff --git a/core/src/Kokkos_Atomics_Desul_Wrapper.hpp b/core/src/Kokkos_Atomics_Desul_Wrapper.hpp index 519ed3eb89..bda3783980 100644 --- a/core/src/Kokkos_Atomics_Desul_Wrapper.hpp +++ b/core/src/Kokkos_Atomics_Desul_Wrapper.hpp @@ -26,7 +26,6 @@ static_assert(false, #include #include -#include #include // clang-format off @@ -228,30 +227,6 @@ T atomic_compare_exchange(T* const dest, desul::Impl::dont_deduce_this_parameter } namespace Impl { - - template - struct KokkosToDesulMemoryOrder; - - template<> - struct KokkosToDesulMemoryOrder { - using type = desul::MemoryOrderSeqCst; - }; - template<> - struct KokkosToDesulMemoryOrder { - using type = desul::MemoryOrderAcquire; - }; - template<> - struct KokkosToDesulMemoryOrder { - using type = desul::MemoryOrderRelease; - }; - template<> - struct KokkosToDesulMemoryOrder { - using type = desul::MemoryOrderAcqRel; - }; - template<> - struct KokkosToDesulMemoryOrder { - using type = desul::MemoryOrderRelaxed; - }; template KOKKOS_INLINE_FUNCTION bool atomic_compare_exchange_strong(T* const dest, T& expected, const T desired, MemOrderSuccess succ, MemOrderFailure fail) { return desul::atomic_compare_exchange_strong(dest, expected, desired, succ, fail, KOKKOS_DESUL_MEM_SCOPE); diff --git a/core/src/impl/Kokkos_Atomic_Memory_Order.hpp b/core/src/impl/Kokkos_Atomic_Memory_Order.hpp deleted file mode 100644 
index 6d1bfb9c82..0000000000 --- a/core/src/impl/Kokkos_Atomic_Memory_Order.hpp +++ /dev/null @@ -1,105 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOS_KOKKOS_ATOMIC_MEMORY_ORDER_HPP -#define KOKKOS_KOKKOS_ATOMIC_MEMORY_ORDER_HPP - -#include - -#include - -namespace Kokkos { -namespace Impl { - -/** @file - * Provides strongly-typed analogs of the standard memory order enumerators. - * In addition to (very slightly) reducing the constant propagation burden on - * the compiler, this allows us to give compile-time errors for things that - * don't make sense, like atomic_load with memory order release. 
- */ - -struct memory_order_seq_cst_t { - using memory_order = memory_order_seq_cst_t; -#if defined(KOKKOS_ENABLE_GNU_ATOMICS) || \ - defined(KOKKOS_ENABLE_INTEL_ATOMICS) || \ - defined(KOKKOS_ENABLE_CUDA_ASM_ATOMICS) - static constexpr auto gnu_constant = __ATOMIC_SEQ_CST; -#endif - static constexpr auto std_constant = std::memory_order_seq_cst; -}; -constexpr memory_order_seq_cst_t memory_order_seq_cst = {}; - -struct memory_order_relaxed_t { - using memory_order = memory_order_relaxed_t; -#if defined(KOKKOS_ENABLE_GNU_ATOMICS) || \ - defined(KOKKOS_ENABLE_INTEL_ATOMICS) || \ - defined(KOKKOS_ENABLE_CUDA_ASM_ATOMICS) - static constexpr auto gnu_constant = __ATOMIC_RELAXED; -#endif - static constexpr auto std_constant = std::memory_order_relaxed; -}; - -// FIXME_OPENMPTARGET - The `declare target` is needed for the Intel GPUs with -// the OpenMPTarget backend -#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_COMPILER_INTEL) -#pragma omp declare target -#endif - -constexpr memory_order_relaxed_t memory_order_relaxed = {}; - -#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_COMPILER_INTEL) -#pragma omp end declare target -#endif - -struct memory_order_acquire_t { - using memory_order = memory_order_acquire_t; -#if defined(KOKKOS_ENABLE_GNU_ATOMICS) || \ - defined(KOKKOS_ENABLE_INTEL_ATOMICS) || \ - defined(KOKKOS_ENABLE_CUDA_ASM_ATOMICS) - static constexpr auto gnu_constant = __ATOMIC_ACQUIRE; -#endif - static constexpr auto std_constant = std::memory_order_acquire; -}; -constexpr memory_order_acquire_t memory_order_acquire = {}; - -struct memory_order_release_t { - using memory_order = memory_order_release_t; -#if defined(KOKKOS_ENABLE_GNU_ATOMICS) || \ - defined(KOKKOS_ENABLE_INTEL_ATOMICS) || \ - defined(KOKKOS_ENABLE_CUDA_ASM_ATOMICS) - static constexpr auto gnu_constant = __ATOMIC_RELEASE; -#endif - static constexpr auto std_constant = std::memory_order_release; -}; -constexpr memory_order_release_t memory_order_release = {}; - -struct 
memory_order_acq_rel_t { - using memory_order = memory_order_acq_rel_t; -#if defined(KOKKOS_ENABLE_GNU_ATOMICS) || \ - defined(KOKKOS_ENABLE_INTEL_ATOMICS) || \ - defined(KOKKOS_ENABLE_CUDA_ASM_ATOMICS) - static constexpr auto gnu_constant = __ATOMIC_ACQ_REL; -#endif - static constexpr auto std_constant = std::memory_order_acq_rel; -}; -constexpr memory_order_acq_rel_t memory_order_acq_rel = {}; - -// Intentionally omit consume (for now) - -} // end namespace Impl -} // end namespace Kokkos - -#endif // KOKKOS_KOKKOS_ATOMIC_MEMORY_ORDER_HPP From 569a60949aa5dc3e62c9830f16574d8ec3284ffe Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Tue, 28 Feb 2023 22:20:41 -0500 Subject: [PATCH 266/496] Export `Kokkos_CUDA_ARCHITECTURES` variable with CMake (#5919) * Export Kokkos_CUDA_ARCHITECTURES variable with CMake * Fixup set KOKKOS_CUDA_ARCHITECTURES in the parent scope as well --- cmake/KokkosConfigCommon.cmake.in | 4 ++++ cmake/kokkos_arch.cmake | 8 ++++++-- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/cmake/KokkosConfigCommon.cmake.in b/cmake/KokkosConfigCommon.cmake.in index bb5ce5ff81..b4eb684928 100644 --- a/cmake/KokkosConfigCommon.cmake.in +++ b/cmake/KokkosConfigCommon.cmake.in @@ -11,6 +11,10 @@ FOREACH(DEV ${Kokkos_DEVICES}) SET(Kokkos_ENABLE_${DEV} ON) ENDFOREACH() +IF(Kokkos_ENABLE_CUDA) + SET(Kokkos_CUDA_ARCHITECTURES @KOKKOS_CUDA_ARCHITECTURES@) +ENDIF() + IF(NOT Kokkos_FIND_QUIETLY) MESSAGE(STATUS "Enabled Kokkos devices: ${Kokkos_DEVICES}") ENDIF() diff --git a/cmake/kokkos_arch.cmake b/cmake/kokkos_arch.cmake index e320d204da..a3904889d6 100644 --- a/cmake/kokkos_arch.cmake +++ b/cmake/kokkos_arch.cmake @@ -571,10 +571,14 @@ FUNCTION(CHECK_CUDA_ARCH ARCH FLAG) MESSAGE(WARNING "Given CUDA arch ${ARCH}, but Kokkos_ENABLE_CUDA, Kokkos_ENABLE_OPENACC, and Kokkos_ENABLE_OPENMPTARGET are OFF. 
Option will be ignored.") UNSET(KOKKOS_ARCH_${ARCH} PARENT_SCOPE) ELSE() + IF(KOKKOS_ENABLE_CUDA) + STRING(REPLACE "sm_" "" CMAKE_ARCH ${FLAG}) + SET(KOKKOS_CUDA_ARCHITECTURES ${CMAKE_ARCH}) + SET(KOKKOS_CUDA_ARCHITECTURES ${CMAKE_ARCH} PARENT_SCOPE) + ENDIF() SET(KOKKOS_CUDA_ARCH_FLAG ${FLAG} PARENT_SCOPE) IF(KOKKOS_ENABLE_COMPILE_AS_CMAKE_LANGUAGE) - string(REPLACE "sm_" "" CMAKE_ARCH ${FLAG}) - SET(CMAKE_CUDA_ARCHITECTURES ${CMAKE_ARCH} PARENT_SCOPE) + SET(CMAKE_CUDA_ARCHITECTURES ${KOKKOS_CUDA_ARCHITECTURES} PARENT_SCOPE) ELSE() IF(KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) STRING(REPLACE "sm_" "cc" NVHPC_CUDA_ARCH ${FLAG}) From b9e423e3b0e54e97655fe375684c0e99234378b0 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Tue, 28 Feb 2023 22:38:21 -0500 Subject: [PATCH 267/496] Export Kokkos_HIP_ARCHITECTURES variable with CMake --- cmake/KokkosConfigCommon.cmake.in | 4 ++++ cmake/kokkos_arch.cmake | 3 +++ 2 files changed, 7 insertions(+) diff --git a/cmake/KokkosConfigCommon.cmake.in b/cmake/KokkosConfigCommon.cmake.in index b4eb684928..63949e5613 100644 --- a/cmake/KokkosConfigCommon.cmake.in +++ b/cmake/KokkosConfigCommon.cmake.in @@ -15,6 +15,10 @@ IF(Kokkos_ENABLE_CUDA) SET(Kokkos_CUDA_ARCHITECTURES @KOKKOS_CUDA_ARCHITECTURES@) ENDIF() +IF(Kokkos_ENABLE_HIP) + SET(Kokkos_HIP_ARCHITECTURES @KOKKOS_HIP_ARCHITECTURES@) +ENDIF() + IF(NOT Kokkos_FIND_QUIETLY) MESSAGE(STATUS "Enabled Kokkos devices: ${Kokkos_DEVICES}") ENDIF() diff --git a/cmake/kokkos_arch.cmake b/cmake/kokkos_arch.cmake index a3904889d6..2187f99352 100644 --- a/cmake/kokkos_arch.cmake +++ b/cmake/kokkos_arch.cmake @@ -629,6 +629,9 @@ FUNCTION(CHECK_AMDGPU_ARCH ARCH FLAG) MESSAGE(WARNING "Given AMD GPU architecture ${ARCH}, but Kokkos_ENABLE_HIP and Kokkos_ENABLE_OPENMPTARGET are OFF. 
Option will be ignored.") UNSET(KOKKOS_ARCH_${ARCH} PARENT_SCOPE) ELSE() + IF(KOKKOS_ENABLE_HIP) + SET(KOKKOS_HIP_ARCHITECTURES ${FLAG} PARENT_SCOPE) + ENDIF() SET(KOKKOS_AMDGPU_ARCH_FLAG ${FLAG} PARENT_SCOPE) GLOBAL_APPEND(KOKKOS_AMDGPU_OPTIONS "${AMDGPU_ARCH_FLAG}=${FLAG}") IF(KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE) From 310812b0d4e60539555287c56ad10247000e47fd Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Wed, 1 Mar 2023 08:50:38 -0500 Subject: [PATCH 268/496] Remove extra double quote in CUDA and HIP allocation error messages (#5926) * Remove extra double quote in CUDA allocation error msg * Remove extra double quote in HIP allocation error msg --- core/src/HIP/Kokkos_HIP_Error.hpp | 2 +- core/src/impl/Kokkos_Error.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/core/src/HIP/Kokkos_HIP_Error.hpp b/core/src/HIP/Kokkos_HIP_Error.hpp index e09382c705..43d63c090b 100644 --- a/core/src/HIP/Kokkos_HIP_Error.hpp +++ b/core/src/HIP/Kokkos_HIP_Error.hpp @@ -70,7 +70,7 @@ class HIPRawMemoryAllocationFailure : public RawMemoryAllocationFailure { void append_additional_error_information(std::ostream& o) const override { if (m_error_code != hipSuccess) { - o << " The HIP allocation returned the error code \"\"" + o << " The HIP allocation returned the error code \"" << hipGetErrorName(m_error_code) << "\"."; } } diff --git a/core/src/impl/Kokkos_Error.cpp b/core/src/impl/Kokkos_Error.cpp index efd0fb998e..5b4e1a7770 100644 --- a/core/src/impl/Kokkos_Error.cpp +++ b/core/src/impl/Kokkos_Error.cpp @@ -151,7 +151,7 @@ namespace Experimental { void CudaRawMemoryAllocationFailure::append_additional_error_information( std::ostream &o) const { if (m_error_code != cudaSuccess) { - o << " The Cuda allocation returned the error code \"\"" + o << " The Cuda allocation returned the error code \"" << cudaGetErrorName(m_error_code) << "\"."; } } From 5f8d0e3d2361f8ca9431d56ccbe9aa7487df8c6a Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Wed, 1 Mar 2023 
10:43:48 -0500 Subject: [PATCH 269/496] Update clang-format CI build (#5930) * Let clang-format build run on any node that has docker * Use ubuntu:18.04 as the base image for the clang-format CI build * Fixup LLVm download URL * Install build-essential (need gpg and xz) --- .jenkins | 2 +- scripts/docker/Dockerfile.clang | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/.jenkins b/.jenkins index 3a1dfaab7d..b7591d1a23 100644 --- a/.jenkins +++ b/.jenkins @@ -17,7 +17,7 @@ pipeline { dockerfile { filename 'Dockerfile.clang' dir 'scripts/docker' - label 'nvidia-docker || docker' + label 'nvidia-docker || rocm-docker || docker' args '-v /tmp/ccache.kokkos:/tmp/ccache' } } diff --git a/scripts/docker/Dockerfile.clang b/scripts/docker/Dockerfile.clang index 9df93b5754..5c6abc1c6d 100644 --- a/scripts/docker/Dockerfile.clang +++ b/scripts/docker/Dockerfile.clang @@ -1,8 +1,9 @@ -FROM nvidia/cuda:9.2-devel +FROM ubuntu:18.04 RUN apt-get update && apt-get install -y \ bc \ git \ + build-essential \ wget \ ccache \ && \ @@ -34,7 +35,7 @@ ENV PATH=${CMAKE_DIR}/bin:$PATH ENV LLVM_DIR=/opt/llvm RUN LLVM_VERSION=8.0.0 && \ - LLVM_URL=http://releases.llvm.org/${LLVM_VERSION}/clang+llvm-${LLVM_VERSION}-x86_64-linux-gnu-ubuntu-16.04.tar.xz && \ + LLVM_URL=https://releases.llvm.org/${LLVM_VERSION}/clang+llvm-${LLVM_VERSION}-x86_64-linux-gnu-ubuntu-18.04.tar.xz && \ LLVM_ARCHIVE=llvm-${LLVM_VERSION}.tar.xz && \ SCRATCH_DIR=/scratch && mkdir -p ${SCRATCH_DIR} && cd ${SCRATCH_DIR} && \ wget --quiet ${LLVM_URL} --output-document=${LLVM_ARCHIVE} && \ From 2bbe1dfc4b28cbe7d3e86f28f08215016209ecdc Mon Sep 17 00:00:00 2001 From: tcclevenger Date: Wed, 1 Mar 2023 09:35:01 -0700 Subject: [PATCH 270/496] Cleanup unit_test/CMakeLists.txt - Reorder lists in alphabetical order - Remove some nonexistent tests from openacc/openmptarget remove lists --- core/unit_test/CMakeLists.txt | 215 ++++++++++++++++------------------ 1 file changed, 100 insertions(+), 115 deletions(-) 
diff --git a/core/unit_test/CMakeLists.txt b/core/unit_test/CMakeLists.txt index d5041442aa..543f596b91 100644 --- a/core/unit_test/CMakeLists.txt +++ b/core/unit_test/CMakeLists.txt @@ -113,41 +113,39 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;OpenMPTarget;OpenACC;HIP;SYCL) # file. That then exceeded the shell command line max length. set(${Tag}_SOURCES1A) foreach(Name - BitManipulationBuiltins + Abort + AtomicOperations_complexdouble + AtomicOperations_complexfloat + AtomicOperations_double + AtomicOperations_float AtomicOperations_int - AtomicOperations_unsignedint AtomicOperations_longint - AtomicOperations_unsignedlongint AtomicOperations_longlongint - AtomicOperations_double - AtomicOperations_float - AtomicOperations_complexdouble - AtomicOperations_complexfloat AtomicOperations_shared - AtomicViews + AtomicOperations_unsignedint + AtomicOperations_unsignedlongint Atomics + AtomicViews + BitManipulationBuiltins BlockSizeDeduction - Concepts + CommonPolicyConstructors + CommonPolicyInterface Complex + Concepts Crs DeepCopyAlignment + ExecSpacePartitioning ExecutionSpace FunctorAnalysis + HostSharedPtr + HostSharedPtrAccessOnDevice Init JoinBackwardCompatibility LocalDeepCopy - MinMaxClamp MathematicalConstants MathematicalFunctions1 MathematicalFunctions2 MathematicalFunctions3 - MDRange_a - MDRange_b - MDRange_c - HostSharedPtr - HostSharedPtrAccessOnDevice - QuadPrecisionMath - ExecSpacePartitioning MathematicalSpecialFunctions ) set(file ${dir}/Test${Tag}_${Name}.cpp) @@ -163,31 +161,32 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;OpenMPTarget;OpenACC;HIP;SYCL) set(${Tag}_SOURCES1B) foreach(Name + MDRange_a + MDRange_b + MDRange_c MDRange_d MDRange_e MDRange_f MDRange_g + MDRangePolicyConstructors + MDSpan + MinMaxClamp NumericTraits Other + QuadPrecisionMath RangePolicy - RangePolicyRequire - CommonPolicyConstructors RangePolicyConstructors - TeamPolicyConstructors - MDRangePolicyConstructors - CommonPolicyInterface - Reductions + RangePolicyRequire + 
ReducerCTADs Reducers_a Reducers_b Reducers_c Reducers_d Reducers_e - ReducerCTADs + Reductions Reductions_DeviceView Scan SharedAlloc - TeamMDRange - ViewMapping_a ) set(file ${dir}/Test${Tag}_${Name}.cpp) # Write to a temporary intermediate file and call configure_file to avoid @@ -202,15 +201,16 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;OpenMPTarget;OpenACC;HIP;SYCL) SET(${Tag}_SOURCES2A) foreach(Name - Abort - MDSpan TeamBasic + TeamMDRange + TeamPolicyConstructors TeamReductionScan TeamScan TeamScratch TeamTeamSize TeamVectorRange UniqueToken + View_64bit ViewAPI_a ViewAPI_b ViewAPI_c @@ -221,12 +221,12 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;OpenMPTarget;OpenACC;HIP;SYCL) ViewCtorDimMatch ViewHooks ViewLayoutStrideAssignment + ViewMapping_a ViewMapping_b ViewMapping_subview ViewMemoryAccessViolation ViewOfClass ViewResize - View_64bit WorkGraph WithoutInitializing ) @@ -362,27 +362,27 @@ endif() if(Kokkos_ENABLE_OPENACC) list(REMOVE_ITEM OpenACC_SOURCES - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_AtomicOperations_complexfloat.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_AtomicOperations_complexdouble.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_AtomicOperations_complexfloat.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_CommonPolicyConstructors.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_CommonPolicyInterface.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_Crs.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_JoinBackwardCompatibility.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_LocalDeepCopy.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_MDRangePolicyConstructors.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_Other.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_RangePolicyConstructors.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_TeamMDRange.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_TeamPolicyConstructors.cpp 
${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_TeamReductionScan.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_TeamScan.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_TeamVectorRange.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_ViewAPI_e.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_ViewCopy_a.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_ViewCopy_b.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_ViewMapping_subview.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_ViewOfClass.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_WorkGraph.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_TeamVectorRange.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_JoinBackwardCompatibility.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_CommonPolicyConstructors.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_RangePolicyConstructors.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_TeamPolicyConstructors.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_MDRangePolicyConstructors.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_CommonPolicyInterface.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_TeamMDRange.cpp ) endif() @@ -407,43 +407,41 @@ ENDIF() # FIXME_OPENMPTARGET - Comment non-passing tests with the NVIDIA HPC compiler nvc++ IF(KOKKOS_ENABLE_OPENMPTARGET AND KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) list(REMOVE_ITEM OpenMPTarget_SOURCES - ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_int64_t_reduce.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_int64_t_reduce_dynamic.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_int64_t_reduce_dynamic_view.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_double_reduce.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_double_reduce_dynamic.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_TeamTeamSize.cpp - 
${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_Reductions_DeviceView.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_TeamVectorRange.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_UniqueToken.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_HostSharedPtr.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_HostSharedPtrAccessOnDevice.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_TeamScratch.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_TestScan.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_TestTeamScan.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_TestTeamReductionScan.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_Atomics.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/default/TestDefaultDeviceType_a1.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/default/TestDefaultDeviceType_b1.cpp ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_AtomicOperations.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_AtomicOperations_double.cpp ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_AtomicOperations_float.cpp ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_AtomicOperations_int.cpp ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_AtomicOperations_longint.cpp ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_AtomicOperations_longlongint.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_AtomicOperations_double.cpp ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_AtomicOperations_unsignedint.cpp ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_AtomicOperations_unsignedlongint.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_Atomics.cpp ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_AtomicViews.cpp ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_BlockSizeDeduction.cpp + 
${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_CommonPolicyConstructors.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_CommonPolicyInterface.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_DeepCopyAlignment.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_HostSharedPtr.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_HostSharedPtrAccessOnDevice.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_MathematicalFunctions.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_MDRange_a.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_MDRange_b.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_MDRange_c.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_MDRange_d.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_MDRangePolicyConstructors.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_NumericTraits.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_RangePolicy.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_RangePolicyConstructors.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_RangePolicyRequire.cpp ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_Reducers_a.cpp ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_Reducers_b.cpp ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_Reducers_c.cpp ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_Reducers_d.cpp ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_Reducers_e.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_ViewMapping_b.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_TeamBasic.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_Reductions_DeviceView.cpp ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_Scan.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_NumericTraits.cpp - 
${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_DeepCopyAlignment.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_MathematicalFunctions.cpp ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_SubView_b.cpp ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_SubView_c01.cpp ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_SubView_c02.cpp @@ -458,72 +456,68 @@ IF(KOKKOS_ENABLE_OPENMPTARGET AND KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_SubView_c11.cpp ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_SubView_c12.cpp ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_SubView_c13.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_MDRange_a.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_MDRange_b.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_MDRange_c.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_MDRange_d.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_TeamBasic.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_TeamPolicyConstructors.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_TeamScratch.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_TeamTeamSize.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_TeamVectorRange.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_UniqueToken.cpp ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_ViewAPI_a.cpp ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_ViewAPI_b.cpp ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_ViewAPI_c.cpp ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_ViewAPI_d.cpp ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_ViewAPI_f.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_ViewMapping_b.cpp ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_ViewResize.cpp - 
${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_RangePolicyRequire.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_RangePolicy.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_CommonPolicyConstructors.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_RangePolicyConstructors.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_TeamPolicyConstructors.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_MDRangePolicyConstructors.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_CommonPolicyInterface.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/default/TestDefaultDeviceType_a1.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/default/TestDefaultDeviceType_b1.cpp ) endif() # FIXME_OPENACC - Comment non-passing tests with the NVIDIA HPC compiler nvc++ IF(KOKKOS_ENABLE_OPENACC AND KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) list(REMOVE_ITEM OpenACC_SOURCES - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_int64_t_reduce.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_int64_t_reduce_dynamic.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_int64_t_reduce_dynamic_view.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_double_reduce.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_double_reduce_dynamic.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_TeamTeamSize.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_Reductions_DeviceView.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_TeamVectorRange.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_UniqueToken.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_HostSharedPtr.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_HostSharedPtrAccessOnDevice.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_TeamScratch.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_TestTeamScan.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_TestTeamReductionScan.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_Atomics.cpp + 
${CMAKE_CURRENT_SOURCE_DIR}/default/TestDefaultDeviceType_a1.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/default/TestDefaultDeviceType_b1.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_AtomicOperations.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_AtomicOperations_double.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_AtomicOperations_float.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_AtomicOperations_int.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_AtomicOperations_longint.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_AtomicOperations_longlongint.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_AtomicOperations_double.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_AtomicOperations_shared.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_AtomicOperations_unsignedint.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_AtomicOperations_unsignedlongint.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_AtomicOperations_shared.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_Atomics.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_AtomicViews.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_BlockSizeDeduction.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_Reductions.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_CommonPolicyConstructors.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_CommonPolicyInterface.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_DeepCopyAlignment.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_HostSharedPtr.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_HostSharedPtrAccessOnDevice.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_MathematicalFunctions.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_MathematicalFunctions1.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_MathematicalFunctions2.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_MathematicalFunctions3.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_MDRange_a.cpp + 
${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_MDRange_b.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_MDRange_c.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_MDRange_d.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_MDRange_e.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_MDRange_f.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_MDRange_g.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_MDRangePolicyConstructors.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_NumericTraits.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_RangePolicy.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_RangePolicyConstructors.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_RangePolicyRequire.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_Reduce.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_Reducers_a.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_Reducers_b.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_Reducers_c.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_Reducers_d.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_Reducers_e.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_ViewMapping_b.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_TeamBasic.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_NumericTraits.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_DeepCopyAlignment.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_MathematicalFunctions.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_MathematicalFunctions1.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_MathematicalFunctions2.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_MathematicalFunctions3.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_Reductions.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_Reductions_DeviceView.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_SubView_a.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_SubView_b.cpp 
${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_SubView_c01.cpp @@ -539,28 +533,19 @@ IF(KOKKOS_ENABLE_OPENACC AND KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_SubView_c11.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_SubView_c12.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_SubView_c13.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_MDRange_a.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_MDRange_b.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_MDRange_c.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_MDRange_d.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_MDRange_e.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_MDRange_f.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_MDRange_g.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_TeamBasic.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_TeamPolicyConstructors.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_TeamScratch.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_TeamTeamSize.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_TeamVectorRange.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_UniqueToken.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_ViewAPI_a.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_ViewAPI_b.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_ViewAPI_c.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_ViewAPI_d.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_ViewAPI_f.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_ViewMapping_b.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_ViewResize.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_RangePolicyRequire.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_RangePolicy.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_CommonPolicyConstructors.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_RangePolicyConstructors.cpp - 
${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_TeamPolicyConstructors.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_MDRangePolicyConstructors.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_CommonPolicyInterface.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/default/TestDefaultDeviceType_a1.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/default/TestDefaultDeviceType_b1.cpp ) endif() From b132b9b907533d9e013869b0a0975cf2d3edf6a9 Mon Sep 17 00:00:00 2001 From: Samuel Li Date: Wed, 1 Mar 2023 09:57:04 -0700 Subject: [PATCH 271/496] add cbegin() and cend() to Kokkos::Vector --- containers/src/Kokkos_Vector.hpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/containers/src/Kokkos_Vector.hpp b/containers/src/Kokkos_Vector.hpp index 29bb15f5ea..9c9cde3a55 100644 --- a/containers/src/Kokkos_Vector.hpp +++ b/containers/src/Kokkos_Vector.hpp @@ -196,10 +196,16 @@ class vector : public DualView { iterator begin() const { return DV::h_view.data(); } + const_iterator cbegin() const { return DV::h_view.data(); } + iterator end() const { return _size > 0 ? DV::h_view.data() + _size : DV::h_view.data(); } + const_iterator cend() const { + return _size > 0 ? 
DV::h_view.data() + _size : DV::h_view.data(); + } + reference front() { return DV::h_view(0); } reference back() { return DV::h_view(_size - 1); } From f419b7303d5a96dfe9450d691459a7af3789b594 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Wed, 1 Mar 2023 12:20:14 -0500 Subject: [PATCH 272/496] Add missing header include --- core/src/HIP/Kokkos_HIP_Instance.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/core/src/HIP/Kokkos_HIP_Instance.hpp b/core/src/HIP/Kokkos_HIP_Instance.hpp index 51b3f79a9d..ee0fd9f726 100644 --- a/core/src/HIP/Kokkos_HIP_Instance.hpp +++ b/core/src/HIP/Kokkos_HIP_Instance.hpp @@ -22,6 +22,7 @@ #include #include +#include #include namespace Kokkos { From 61620e83e6c4b322728ae6dfa214e2a512c6a5bb Mon Sep 17 00:00:00 2001 From: Phil Miller Date: Wed, 1 Mar 2023 13:30:31 -0700 Subject: [PATCH 273/496] Revert "Revert "Fix intel hang"" The workaround was specifically for a hang in icpc 19.0.5 compiling with -O3. It resurfaced in nightly testing. Fixes #5933 This reverts commit 33e5ef694ef4cd11eae1f4dd931d5d56cd84c3e3. 
--- core/unit_test/TestMDRange.hpp | 118 +++++++++++++++++++++++++++++++++ 1 file changed, 118 insertions(+) diff --git a/core/unit_test/TestMDRange.hpp b/core/unit_test/TestMDRange.hpp index e48319dbd7..3e80e7a01b 100644 --- a/core/unit_test/TestMDRange.hpp +++ b/core/unit_test/TestMDRange.hpp @@ -2708,11 +2708,18 @@ struct TestMDRange_6D { const int N3, const int N4, const int N5) { #if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) { +#if defined(KOKKOS_COMPILER_INTEL) + // Launchbounds causes hang with intel compilers + using range_type = + typename Kokkos::MDRangePolicy, + Kokkos::IndexType>; +#else // Launchbounds to ensure the tile fits into a CUDA block under register // constraints using range_type = typename Kokkos::MDRangePolicy< ExecSpace, Kokkos::LaunchBounds<128, 1>, Kokkos::Rank<6>, Kokkos::IndexType>; +#endif using tile_type = typename range_type::tile_type; using point_type = typename range_type::point_type; @@ -2731,11 +2738,18 @@ struct TestMDRange_6D { #endif { +#if defined(KOKKOS_COMPILER_INTEL) + // Launchbounds causes hang with intel compilers + using range_type = + typename Kokkos::MDRangePolicy, + Kokkos::IndexType>; +#else // Launchbounds to ensure the tile fits into a CUDA block under register // constraints using range_type = typename Kokkos::MDRangePolicy< ExecSpace, Kokkos::LaunchBounds<512, 1>, Kokkos::Rank<6>, Kokkos::IndexType>; +#endif using tile_type = typename range_type::tile_type; using point_type = typename range_type::point_type; @@ -2768,11 +2782,18 @@ struct TestMDRange_6D { // Test with reducers - scalar { +#if defined(KOKKOS_COMPILER_INTEL) + // Launchbounds causes hang with intel compilers + using range_type = + typename Kokkos::MDRangePolicy, + Kokkos::IndexType>; +#else // Launchbounds to ensure the tile fits into a CUDA block under register // constraints using range_type = typename Kokkos::MDRangePolicy< ExecSpace, Kokkos::LaunchBounds<512, 1>, Kokkos::Rank<6>, Kokkos::IndexType>; +#endif #ifdef KOKKOS_ENABLE_SYCL 
range_type range({{0, 0, 0, 0, 0, 0}}, {{N0, N1, N2, N3, N4, N5}}, {{3, 3, 3, 2, 2, 2}}); @@ -2795,11 +2816,18 @@ struct TestMDRange_6D { // Test with reducers - scalar + label { +#if defined(KOKKOS_COMPILER_INTEL) + // Launchbounds causes hang with intel compilers + using range_type = + typename Kokkos::MDRangePolicy, + Kokkos::IndexType>; +#else // Launchbounds to ensure the tile fits into a CUDA block under register // constraints using range_type = typename Kokkos::MDRangePolicy< ExecSpace, Kokkos::LaunchBounds<512, 1>, Kokkos::Rank<6>, Kokkos::IndexType>; +#endif #ifdef KOKKOS_ENABLE_SYCL range_type range({{0, 0, 0, 0, 0, 0}}, {{N0, N1, N2, N3, N4, N5}}, @@ -2823,12 +2851,19 @@ struct TestMDRange_6D { // Test with reducers - scalar view { +#if defined(KOKKOS_COMPILER_INTEL) + // Launchbounds causes hang with intel compilers + using range_type = + typename Kokkos::MDRangePolicy, + Kokkos::IndexType>; +#else // Launchbounds to ensure the tile fits into a CUDA block under register // constraints using range_type = typename Kokkos::MDRangePolicy, Kokkos::IndexType, Kokkos::LaunchBounds<512, 1>>; +#endif #ifdef KOKKOS_ENABLE_SYCL range_type range({{0, 0, 0, 0, 0, 0}}, {{N0, N1, N2, N3, N4, N5}}, {{3, 3, 3, 2, 2, 2}}); @@ -2856,11 +2891,18 @@ struct TestMDRange_6D { // Test Min reducer with lambda #if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) { +#if defined(KOKKOS_COMPILER_INTEL) + // Launchbounds causes hang with intel compilers + using range_type = + typename Kokkos::MDRangePolicy, + Kokkos::IndexType>; +#else // Launchbounds to ensure the tile fits into a CUDA block under register // constraints using range_type = typename Kokkos::MDRangePolicy< ExecSpace, Kokkos::LaunchBounds<128, 1>, Kokkos::Rank<6>, Kokkos::IndexType>; +#endif range_type range({{1, 1, 1, 1, 1, 1}}, {{N0, N1, N2, N3, N4, N5}}, {{3, 3, 3, 2, 2, 1}}); @@ -2893,12 +2935,19 @@ struct TestMDRange_6D { // Tagged operator test { +#if defined(KOKKOS_COMPILER_INTEL) + // Launchbounds causes hang with 
intel compilers + using range_type = typename Kokkos::MDRangePolicy< + ExecSpace, Kokkos::Rank<6, Iterate::Default, Iterate::Default>, + Kokkos::IndexType, InitTag>; +#else // Launchbounds to ensure the tile fits into a CUDA block under register // constraints using range_type = typename Kokkos::MDRangePolicy< ExecSpace, Kokkos::LaunchBounds<512, 1>, Kokkos::Rank<6, Iterate::Default, Iterate::Default>, Kokkos::IndexType, InitTag>; +#endif using tile_type = typename range_type::tile_type; using point_type = typename range_type::point_type; @@ -2950,11 +2999,18 @@ struct TestMDRange_6D { const int N4, const int N5) { #if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) { +#if defined(KOKKOS_COMPILER_INTEL) + // Launchbounds causes hang with intel compilers + using range_type = + typename Kokkos::MDRangePolicy, + Kokkos::IndexType>; +#else // Launchbounds to ensure the tile fits into a CUDA block under register // constraints using range_type = typename Kokkos::MDRangePolicy< ExecSpace, Kokkos::LaunchBounds<128, 1>, Kokkos::Rank<6>, Kokkos::IndexType>; +#endif using tile_type = typename range_type::tile_type; using point_type = typename range_type::point_type; @@ -3003,10 +3059,16 @@ struct TestMDRange_6D { #endif { +#if defined(KOKKOS_COMPILER_INTEL) + // Launchbounds causes hang with intel compilers + using range_type = + typename Kokkos::MDRangePolicy>; +#else // Launchbounds to ensure the tile fits into a CUDA block under register // constraints using range_type = typename Kokkos::MDRangePolicy< ExecSpace, Kokkos::LaunchBounds<512, 1>, Kokkos::Rank<6>>; +#endif using point_type = typename range_type::point_type; range_type range(point_type{{0, 0, 0, 0, 0, 0}}, @@ -3039,11 +3101,18 @@ struct TestMDRange_6D { } { +#if defined(KOKKOS_COMPILER_INTEL) + // Launchbounds causes hang with intel compilers + using range_type = + typename Kokkos::MDRangePolicy, + Kokkos::IndexType, InitTag>; +#else // Launchbounds to ensure the tile fits into a CUDA block under register // 
constraints using range_type = typename Kokkos::MDRangePolicy< ExecSpace, Kokkos::LaunchBounds<512, 1>, Kokkos::Rank<6>, Kokkos::IndexType, InitTag>; +#endif using tile_type = typename range_type::tile_type; using point_type = typename range_type::point_type; @@ -3094,11 +3163,18 @@ struct TestMDRange_6D { } { +#if defined(KOKKOS_COMPILER_INTEL) + // Launchbounds causes hang with intel compilers + using range_type = + typename Kokkos::MDRangePolicy, + Kokkos::IndexType>; +#else // Launchbounds to ensure the tile fits into a CUDA block under register // constraints using range_type = typename Kokkos::MDRangePolicy< ExecSpace, Kokkos::LaunchBounds<512, 1>, Kokkos::Rank<6>, Kokkos::IndexType>; +#endif using tile_type = typename range_type::tile_type; using point_type = typename range_type::point_type; @@ -3139,12 +3215,19 @@ struct TestMDRange_6D { } { +#if defined(KOKKOS_COMPILER_INTEL) + // Launchbounds causes hang with intel compilers + using range_type = typename Kokkos::MDRangePolicy< + ExecSpace, Kokkos::Rank<6, Iterate::Default, Iterate::Default>, + Kokkos::IndexType>; +#else // Launchbounds to ensure the tile fits into a CUDA block under register // constraints using range_type = typename Kokkos::MDRangePolicy< ExecSpace, Kokkos::LaunchBounds<512, 1>, Kokkos::Rank<6, Iterate::Default, Iterate::Default>, Kokkos::IndexType>; +#endif using tile_type = typename range_type::tile_type; using point_type = typename range_type::point_type; @@ -3185,12 +3268,19 @@ struct TestMDRange_6D { } { +#if defined(KOKKOS_COMPILER_INTEL) + // Launchbounds causes hang with intel compilers + using range_type = typename Kokkos::MDRangePolicy< + ExecSpace, Kokkos::Rank<6, Iterate::Left, Iterate::Left>, + Kokkos::IndexType>; +#else // Launchbounds to ensure the tile fits into a CUDA block under register // constraints using range_type = typename Kokkos::MDRangePolicy< ExecSpace, Kokkos::LaunchBounds<512, 1>, Kokkos::Rank<6, Iterate::Left, Iterate::Left>, Kokkos::IndexType>; +#endif 
using tile_type = typename range_type::tile_type; using point_type = typename range_type::point_type; @@ -3231,12 +3321,19 @@ struct TestMDRange_6D { } { +#if defined(KOKKOS_COMPILER_INTEL) + // Launchbounds causes hang with intel compilers + using range_type = typename Kokkos::MDRangePolicy< + ExecSpace, Kokkos::Rank<6, Iterate::Left, Iterate::Right>, + Kokkos::IndexType>; +#else // Launchbounds to ensure the tile fits into a CUDA block under register // constraints using range_type = typename Kokkos::MDRangePolicy< ExecSpace, Kokkos::LaunchBounds<512, 1>, Kokkos::Rank<6, Iterate::Left, Iterate::Right>, Kokkos::IndexType>; +#endif using tile_type = typename range_type::tile_type; using point_type = typename range_type::point_type; @@ -3277,12 +3374,19 @@ struct TestMDRange_6D { } { +#if defined(KOKKOS_COMPILER_INTEL) + // Launchbounds causes hang with intel compilers + using range_type = typename Kokkos::MDRangePolicy< + ExecSpace, Kokkos::Rank<6, Iterate::Right, Iterate::Left>, + Kokkos::IndexType>; +#else // Launchbounds to ensure the tile fits into a CUDA block under register // constraints using range_type = typename Kokkos::MDRangePolicy< ExecSpace, Kokkos::LaunchBounds<512, 1>, Kokkos::Rank<6, Iterate::Right, Iterate::Left>, Kokkos::IndexType>; +#endif using tile_type = typename range_type::tile_type; using point_type = typename range_type::point_type; @@ -3323,12 +3427,19 @@ struct TestMDRange_6D { } { +#if defined(KOKKOS_COMPILER_INTEL) + // Launchbounds causes hang with intel compilers + using range_type = typename Kokkos::MDRangePolicy< + ExecSpace, Kokkos::Rank<6, Iterate::Right, Iterate::Right>, + Kokkos::IndexType>; +#else // Launchbounds to ensure the tile fits into a CUDA block under register // constraints using range_type = typename Kokkos::MDRangePolicy< ExecSpace, Kokkos::LaunchBounds<512, 1>, Kokkos::Rank<6, Iterate::Right, Iterate::Right>, Kokkos::IndexType>; +#endif using tile_type = typename range_type::tile_type; using point_type = typename 
range_type::point_type; @@ -3679,11 +3790,18 @@ struct TestMDRange_6D_NegIdx { static void test_6D_negidx(const int N0, const int N1, const int N2, const int N3, const int N4, const int N5) { { +#if defined(KOKKOS_COMPILER_INTEL) + // Launchbounds causes hang with intel compilers + using range_type = + typename Kokkos::MDRangePolicy, + Kokkos::IndexType>; +#else // Launchbounds to ensure the tile fits into a CUDA block under register // constraints using range_type = typename Kokkos::MDRangePolicy< ExecSpace, Kokkos::LaunchBounds<256, 1>, Kokkos::Rank<6>, Kokkos::IndexType>; +#endif using tile_type = typename range_type::tile_type; using point_type = typename range_type::point_type; From 0e9990230ff0c253cb3442064c14c9a28a2c15a6 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Wed, 1 Mar 2023 22:01:56 -0500 Subject: [PATCH 274/496] Do not define KOKKOS_ARCH_AMPERE with Ada (compute capability 8.9) --- Makefile.kokkos | 1 - cmake/kokkos_arch.cmake | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/Makefile.kokkos b/Makefile.kokkos index d51d023d56..39064650d0 100644 --- a/Makefile.kokkos +++ b/Makefile.kokkos @@ -1052,7 +1052,6 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA_ARCH), 1) KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_86 endif ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ADA89), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMPERE") tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_ADA89") KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_89 endif diff --git a/cmake/kokkos_arch.cmake b/cmake/kokkos_arch.cmake index e0f508b99c..eeace546fc 100644 --- a/cmake/kokkos_arch.cmake +++ b/cmake/kokkos_arch.cmake @@ -868,7 +868,7 @@ IF (KOKKOS_ARCH_VOLTA70 OR KOKKOS_ARCH_VOLTA72) SET(KOKKOS_ARCH_VOLTA ON) ENDIF() -IF (KOKKOS_ARCH_AMPERE80 OR KOKKOS_ARCH_AMPERE86 OR KOKKOS_ARCH_ADA89) +IF (KOKKOS_ARCH_AMPERE80 OR KOKKOS_ARCH_AMPERE86) SET(KOKKOS_ARCH_AMPERE ON) ENDIF() From 
6dd4800e6ae1c73c8da401f238f01ab1fcc53f86 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Wed, 1 Mar 2023 22:02:55 -0500 Subject: [PATCH 275/496] Add KOKKOS_ARCH_ADA89 to print_configuration --- core/src/impl/Kokkos_Core.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/core/src/impl/Kokkos_Core.cpp b/core/src/impl/Kokkos_Core.cpp index 5e53e42659..30448d1aa4 100644 --- a/core/src/impl/Kokkos_Core.cpp +++ b/core/src/impl/Kokkos_Core.cpp @@ -743,6 +743,8 @@ void pre_initialize_internal(const Kokkos::InitializationSettings& settings) { #elif defined(KOKKOS_ARCH_AMPERE86) declare_configuration_metadata("architecture", "GPU architecture", "AMPERE86"); +#elif defined(KOKKOS_ARCH_ADA89) + declare_configuration_metadata("architecture", "GPU architecture", "ADA89"); #elif defined(KOKKOS_ARCH_HOPPER90) declare_configuration_metadata("architecture", "GPU architecture", "HOPPER90"); From 537f62e5ee2a74efffa3a66395d94803e6544b84 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Wed, 1 Mar 2023 22:07:08 -0500 Subject: [PATCH 276/496] Do not define KOKKOS_ARCH_TURING macro with generated GNU makefiles --- Makefile.kokkos | 1 - 1 file changed, 1 deletion(-) diff --git a/Makefile.kokkos b/Makefile.kokkos index 25080c66e3..a885a640ab 100644 --- a/Makefile.kokkos +++ b/Makefile.kokkos @@ -1035,7 +1035,6 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA_ARCH), 1) KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_72 endif ifeq ($(KOKKOS_INTERNAL_USE_ARCH_TURING75), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_TURING") tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_TURING75") KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_75 endif From aafe20c99346d8f0e4ac5925cca314fb7b11d1ed Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Tue, 28 Feb 2023 15:15:43 -0500 Subject: [PATCH 277/496] Drop `KOKKOS_ENABLE_*_ATOMICS` macros when printing configuration --- core/src/Cuda/Kokkos_Cuda_Instance.cpp | 8 -------- 
core/src/OpenMP/Kokkos_OpenMP.cpp | 8 -------- core/src/Serial/Kokkos_Serial.cpp | 8 -------- core/src/impl/Kokkos_Core.cpp | 20 -------------------- 4 files changed, 44 deletions(-) diff --git a/core/src/Cuda/Kokkos_Cuda_Instance.cpp b/core/src/Cuda/Kokkos_Cuda_Instance.cpp index 32102d6ad1..cb88c5adf1 100644 --- a/core/src/Cuda/Kokkos_Cuda_Instance.cpp +++ b/core/src/Cuda/Kokkos_Cuda_Instance.cpp @@ -867,14 +867,6 @@ void Cuda::print_configuration(std::ostream &os, bool /*verbose*/) const { os << "Device Execution Space:\n"; os << " KOKKOS_ENABLE_CUDA: yes\n"; - os << "Cuda Atomics:\n"; - os << " KOKKOS_ENABLE_CUDA_ATOMICS: "; -#ifdef KOKKOS_ENABLE_CUDA_ATOMICS - os << "yes\n"; -#else - os << "no\n"; -#endif - os << "Cuda Options:\n"; os << " KOKKOS_ENABLE_CUDA_LAMBDA: "; #ifdef KOKKOS_ENABLE_CUDA_LAMBDA diff --git a/core/src/OpenMP/Kokkos_OpenMP.cpp b/core/src/OpenMP/Kokkos_OpenMP.cpp index 4ad5238654..aa185a0bc0 100644 --- a/core/src/OpenMP/Kokkos_OpenMP.cpp +++ b/core/src/OpenMP/Kokkos_OpenMP.cpp @@ -57,14 +57,6 @@ void OpenMP::print_configuration(std::ostream &os, bool /*verbose*/) const { os << "Host Parallel Execution Space:\n"; os << " KOKKOS_ENABLE_OPENMP: yes\n"; - os << "OpenMP Atomics:\n"; - os << " KOKKOS_ENABLE_OPENMP_ATOMICS: "; -#ifdef KOKKOS_ENABLE_OPENMP_ATOMICS - os << "yes\n"; -#else - os << "no\n"; -#endif - os << "\nOpenMP Runtime Configuration:\n"; m_space_instance->print_configuration(os); diff --git a/core/src/Serial/Kokkos_Serial.cpp b/core/src/Serial/Kokkos_Serial.cpp index 52e3d68bdf..3d4d92e2d5 100644 --- a/core/src/Serial/Kokkos_Serial.cpp +++ b/core/src/Serial/Kokkos_Serial.cpp @@ -149,14 +149,6 @@ void Serial::print_configuration(std::ostream& os, bool /*verbose*/) const { os << "Host Serial Execution Space:\n"; os << " KOKKOS_ENABLE_SERIAL: yes\n"; - os << "Serial Atomics:\n"; - os << " KOKKOS_ENABLE_SERIAL_ATOMICS: "; -#ifdef KOKKOS_ENABLE_SERIAL_ATOMICS - os << "yes\n"; -#else - os << "no\n"; -#endif - os << "\nSerial Runtime 
Configuration:\n"; } diff --git a/core/src/impl/Kokkos_Core.cpp b/core/src/impl/Kokkos_Core.cpp index 5e53e42659..26d45057b8 100644 --- a/core/src/impl/Kokkos_Core.cpp +++ b/core/src/impl/Kokkos_Core.cpp @@ -516,26 +516,6 @@ void pre_initialize_internal(const Kokkos::InitializationSettings& settings) { declare_configuration_metadata("tools_only", "compiler_family", "msvc"); #endif -#ifdef KOKKOS_ENABLE_GNU_ATOMICS - declare_configuration_metadata("atomics", "KOKKOS_ENABLE_GNU_ATOMICS", "yes"); -#else - declare_configuration_metadata("atomics", "KOKKOS_ENABLE_GNU_ATOMICS", "no"); -#endif -#ifdef KOKKOS_ENABLE_INTEL_ATOMICS - declare_configuration_metadata("atomics", "KOKKOS_ENABLE_INTEL_ATOMICS", - "yes"); -#else - declare_configuration_metadata("atomics", "KOKKOS_ENABLE_INTEL_ATOMICS", - "no"); -#endif -#ifdef KOKKOS_ENABLE_WINDOWS_ATOMICS - declare_configuration_metadata("atomics", "KOKKOS_ENABLE_WINDOWS_ATOMICS", - "yes"); -#else - declare_configuration_metadata("atomics", "KOKKOS_ENABLE_WINDOWS_ATOMICS", - "no"); -#endif - #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP declare_configuration_metadata("vectorization", "KOKKOS_ENABLE_PRAGMA_IVDEP", "yes"); From 6d10edce73e608fe12bd6e42b64d0a455bd4c6fe Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Tue, 28 Feb 2023 15:18:17 -0500 Subject: [PATCH 278/496] Drop KOKKOS_ENABLE_CUDA_ASM* macros --- core/src/setup/Kokkos_Setup_Cuda.hpp | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/core/src/setup/Kokkos_Setup_Cuda.hpp b/core/src/setup/Kokkos_Setup_Cuda.hpp index d774914d9f..c57f690ae1 100644 --- a/core/src/setup/Kokkos_Setup_Cuda.hpp +++ b/core/src/setup/Kokkos_Setup_Cuda.hpp @@ -62,19 +62,6 @@ #undef KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA #endif // !defined(KOKKOS_ENABLE_CUDA_LAMBDA) -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 700) -// PTX atomics with memory order semantics are only available on volta and later -#if !defined(KOKKOS_DISABLE_CUDA_ASM) -#if !defined(KOKKOS_ENABLE_CUDA_ASM) -#define 
KOKKOS_ENABLE_CUDA_ASM -#if !defined(KOKKOS_DISABLE_CUDA_ASM_ATOMICS) && \ - defined(KOKKOS_ENABLE_GNU_ATOMICS) -#define KOKKOS_ENABLE_CUDA_ASM_ATOMICS -#endif -#endif -#endif -#endif - #define KOKKOS_IMPL_FORCEINLINE_FUNCTION __device__ __host__ __forceinline__ #define KOKKOS_IMPL_FORCEINLINE __forceinline__ #define KOKKOS_IMPL_INLINE_FUNCTION __device__ __host__ inline From e69b7969ae9f88589dc6cb4c41d00028b0eeb053 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Tue, 28 Feb 2023 15:19:23 -0500 Subject: [PATCH 279/496] Remove mention of the KOKKOS_ENABLE_*_ATOMICS macros in header --- core/src/Kokkos_Macros.hpp | 7 ------- 1 file changed, 7 deletions(-) diff --git a/core/src/Kokkos_Macros.hpp b/core/src/Kokkos_Macros.hpp index 4f5bda88ed..901f84367b 100644 --- a/core/src/Kokkos_Macros.hpp +++ b/core/src/Kokkos_Macros.hpp @@ -71,13 +71,6 @@ * KOKKOS_COMPILER_NVHPC * KOKKOS_COMPILER_MSVC * - * Macros for which compiler extension to use for atomics on intrinsic types - * - * KOKKOS_ENABLE_CUDA_ATOMICS - * KOKKOS_ENABLE_GNU_ATOMICS - * KOKKOS_ENABLE_INTEL_ATOMICS - * KOKKOS_ENABLE_OPENMP_ATOMICS - * * A suite of 'KOKKOS_ENABLE_PRAGMA_...' are defined for internal use. 
* * Macros for marking functions to run in an execution space: From 4bf2c5c5ffef8c3b770423bb70adaed77ddca2cc Mon Sep 17 00:00:00 2001 From: tcclevenger Date: Mon, 16 Jan 2023 08:05:29 -0700 Subject: [PATCH 280/496] RangePolicyRequire was not using require --- core/unit_test/TestRangePolicyRequire.hpp | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/core/unit_test/TestRangePolicyRequire.hpp b/core/unit_test/TestRangePolicyRequire.hpp index 749f8b97d2..5e578b2903 100644 --- a/core/unit_test/TestRangePolicyRequire.hpp +++ b/core/unit_test/TestRangePolicyRequire.hpp @@ -215,18 +215,25 @@ struct TestRangeRequire { //---------------------------------------- void test_scan() { - Kokkos::parallel_for(Kokkos::RangePolicy(0, N), - *this); + Kokkos::parallel_for( + Kokkos::Experimental::require( + Kokkos::RangePolicy(0, N), Property()), + *this); Kokkos::parallel_scan( "TestKernelScan", - Kokkos::RangePolicy(0, N), *this); + Kokkos::Experimental::require( + Kokkos::RangePolicy(0, N), + Property()), + *this); int total = 0; Kokkos::parallel_scan( "TestKernelScanWithTotal", - Kokkos::RangePolicy(0, N), *this, - total); + Kokkos::Experimental::require( + Kokkos::RangePolicy(0, N), + Property()), + *this, total); ASSERT_EQ(size_t((N - 1) * (N) / 2), size_t(total)); // sum( 0 .. 
N-1 ) } From 5c5ac72dabe3403448956b1f525d2bc7741719e2 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Thu, 2 Mar 2023 10:42:55 -0500 Subject: [PATCH 281/496] Tell when Kokkos atomics are disabled in print_configuration --- core/src/Serial/Kokkos_Serial.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/core/src/Serial/Kokkos_Serial.cpp b/core/src/Serial/Kokkos_Serial.cpp index 3d4d92e2d5..6d55dffb7c 100644 --- a/core/src/Serial/Kokkos_Serial.cpp +++ b/core/src/Serial/Kokkos_Serial.cpp @@ -149,6 +149,10 @@ void Serial::print_configuration(std::ostream& os, bool /*verbose*/) const { os << "Host Serial Execution Space:\n"; os << " KOKKOS_ENABLE_SERIAL: yes\n"; +#ifdef KOKKOS_INTERNAL_NOT_PARALLEL + os << "Kokkos atomics disabled\n"; +#endif + os << "\nSerial Runtime Configuration:\n"; } From 40c40a7c4b4300e8ae40bb3637082128ffaca78c Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Thu, 2 Mar 2023 17:07:07 +0100 Subject: [PATCH 282/496] Convert OpenMP ParallelReduce (#5893) * Convert OpenMP ParallelReduce * Reax memory space access assumption for OpenMP ParalelReduce * Improve wording in static_assert Co-authored-by: Bruno Turcksin --------- Co-authored-by: Bruno Turcksin --- core/src/Kokkos_Parallel_Reduce.hpp | 5 + core/src/OpenMP/Kokkos_OpenMP_Parallel.hpp | 306 +++++++-------------- 2 files changed, 101 insertions(+), 210 deletions(-) diff --git a/core/src/Kokkos_Parallel_Reduce.hpp b/core/src/Kokkos_Parallel_Reduce.hpp index 90b367fec8..86951d0797 100644 --- a/core/src/Kokkos_Parallel_Reduce.hpp +++ b/core/src/Kokkos_Parallel_Reduce.hpp @@ -1427,6 +1427,11 @@ struct implements_new_reduce_interface : std::true_type {}; #endif +#ifdef KOKKOS_ENABLE_OPENMP +template <> +struct implements_new_reduce_interface : std::true_type {}; +#endif + #ifdef KOKKOS_ENABLE_CUDA template <> struct implements_new_reduce_interface : std::true_type {}; diff --git a/core/src/OpenMP/Kokkos_OpenMP_Parallel.hpp b/core/src/OpenMP/Kokkos_OpenMP_Parallel.hpp index 
f4f8c99733..8cd05e7199 100644 --- a/core/src/OpenMP/Kokkos_OpenMP_Parallel.hpp +++ b/core/src/OpenMP/Kokkos_OpenMP_Parallel.hpp @@ -292,35 +292,24 @@ class ParallelFor, namespace Kokkos { namespace Impl { -template -class ParallelReduce, ReducerType, +template +class ParallelReduce, Kokkos::OpenMP> { private: - using Policy = Kokkos::RangePolicy; + using Policy = Kokkos::RangePolicy; + using FunctorType = typename CombinedFunctorReducerType::functor_type; + using ReducerType = typename CombinedFunctorReducerType::reducer_type; using WorkTag = typename Policy::work_tag; using WorkRange = typename Policy::WorkRange; using Member = typename Policy::member_type; - using ReducerConditional = - Kokkos::Impl::if_c::value, - FunctorType, ReducerType>; - using ReducerTypeFwd = typename ReducerConditional::type; - using WorkTagFwd = - std::conditional_t::value, WorkTag, - void>; - - // Static Assert WorkTag void if ReducerType not InvalidType - using Analysis = - FunctorAnalysis; - - using pointer_type = typename Analysis::pointer_type; - using reference_type = typename Analysis::reference_type; + using pointer_type = typename ReducerType::pointer_type; + using reference_type = typename ReducerType::reference_type; OpenMPInternal* m_instance; - const FunctorType m_functor; + const CombinedFunctorReducerType m_functor_reducer; const Policy m_policy; - const ReducerType m_reducer; const pointer_type m_result_ptr; template @@ -344,13 +333,12 @@ class ParallelReduce, ReducerType, public: inline void execute() const { - typename Analysis::Reducer final_reducer( - ReducerConditional::select(m_functor, m_reducer)); + const ReducerType& reducer = m_functor_reducer.get_reducer(); if (m_policy.end() <= m_policy.begin()) { if (m_result_ptr) { - final_reducer.init(m_result_ptr); - final_reducer.final(m_result_ptr); + reducer.init(m_result_ptr); + reducer.final(m_result_ptr); } return; } @@ -359,8 +347,7 @@ class ParallelReduce, ReducerType, Kokkos::Dynamic>::value }; - const size_t 
pool_reduce_bytes = - Analysis::value_size(ReducerConditional::select(m_functor, m_reducer)); + const size_t pool_reduce_bytes = reducer.value_size(); m_instance->acquire_lock(); @@ -378,12 +365,13 @@ class ParallelReduce, ReducerType, : pointer_type( m_instance->get_thread_data(0)->pool_reduce_local()); - reference_type update = final_reducer.init(ptr); + reference_type update = reducer.init(ptr); - ParallelReduce::template exec_range(m_functor, m_policy.begin(), - m_policy.end(), update); + ParallelReduce::template exec_range( + m_functor_reducer.get_functor(), m_policy.begin(), m_policy.end(), + update); - final_reducer.final(ptr); + reducer.final(ptr); return; } const int pool_size = m_instance->thread_pool_size(); @@ -399,7 +387,7 @@ class ParallelReduce, ReducerType, if (data.pool_rendezvous()) data.pool_rendezvous_release(); } - reference_type update = final_reducer.init( + reference_type update = reducer.init( reinterpret_cast(data.pool_reduce_local())); std::pair range(0, 0); @@ -409,7 +397,7 @@ class ParallelReduce, ReducerType, : data.get_work_partition(); ParallelReduce::template exec_range( - m_functor, range.first + m_policy.begin(), + m_functor_reducer.get_functor(), range.first + m_policy.begin(), range.second + m_policy.begin(), update); } while (is_dynamic && 0 <= range.first); @@ -421,16 +409,15 @@ class ParallelReduce, ReducerType, pointer_type(m_instance->get_thread_data(0)->pool_reduce_local()); for (int i = 1; i < pool_size; ++i) { - final_reducer.join( - ptr, reinterpret_cast( - m_instance->get_thread_data(i)->pool_reduce_local())); + reducer.join(ptr, + reinterpret_cast( + m_instance->get_thread_data(i)->pool_reduce_local())); } - final_reducer.final(ptr); + reducer.final(ptr); if (m_result_ptr) { - const int n = Analysis::value_count( - ReducerConditional::select(m_functor, m_reducer)); + const int n = reducer.value_count(); for (int j = 0; j < n; ++j) { m_result_ptr[j] = ptr[j]; @@ -443,16 +430,11 @@ class ParallelReduce, ReducerType, 
//---------------------------------------- template - inline ParallelReduce( - const FunctorType& arg_functor, Policy arg_policy, - const ViewType& arg_view, - std::enable_if_t::value && - !Kokkos::is_reducer::value, - void*> = nullptr) + inline ParallelReduce(const CombinedFunctorReducerType& arg_functor_reducer, + Policy arg_policy, const ViewType& arg_view) : m_instance(nullptr), - m_functor(arg_functor), + m_functor_reducer(arg_functor_reducer), m_policy(arg_policy), - m_reducer(InvalidType()), m_result_ptr(arg_view.data()) { #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 if (t_openmp_instance) { @@ -463,61 +445,31 @@ class ParallelReduce, ReducerType, #else m_instance = arg_policy.space().impl_internal_space_instance(); #endif - /*static_assert( std::is_same< typename ViewType::memory_space - , Kokkos::HostSpace >::value - , "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace" - );*/ - } - - inline ParallelReduce(const FunctorType& arg_functor, Policy arg_policy, - const ReducerType& reducer) - : m_instance(nullptr), - m_functor(arg_functor), - m_policy(arg_policy), - m_reducer(reducer), - m_result_ptr(reducer.view().data()) { -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 - if (t_openmp_instance) { - m_instance = t_openmp_instance; - } else { - m_instance = arg_policy.space().impl_internal_space_instance(); - } -#else - m_instance = arg_policy.space().impl_internal_space_instance(); -#endif - /*static_assert( std::is_same< typename ViewType::memory_space - , Kokkos::HostSpace >::value - , "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace" - );*/ + static_assert( + Kokkos::Impl::MemorySpaceAccess::accessible, + "Kokkos::OpenMP reduce result must be a View accessible from " + "HostSpace"); } }; // MDRangePolicy impl -template -class ParallelReduce, ReducerType, - Kokkos::OpenMP> { +template +class ParallelReduce, Kokkos::OpenMP> { private: using MDRangePolicy = Kokkos::MDRangePolicy; using Policy = typename 
MDRangePolicy::impl_range_policy; + using FunctorType = typename CombinedFunctorReducerType::functor_type; + using ReducerType = typename CombinedFunctorReducerType::reducer_type; using WorkTag = typename MDRangePolicy::work_tag; using WorkRange = typename Policy::WorkRange; using Member = typename Policy::member_type; - using ReducerConditional = - Kokkos::Impl::if_c::value, - FunctorType, ReducerType>; - using ReducerTypeFwd = typename ReducerConditional::type; - using WorkTagFwd = - std::conditional_t::value, WorkTag, - void>; - - using Analysis = FunctorAnalysis; - - using pointer_type = typename Analysis::pointer_type; - using value_type = typename Analysis::value_type; - using reference_type = typename Analysis::reference_type; + using pointer_type = typename ReducerType::pointer_type; + using value_type = typename ReducerType::value_type; + using reference_type = typename ReducerType::reference_type; using iterate_type = typename Kokkos::Impl::HostIterateTile, ReducerType, public: inline void execute() const { - const size_t pool_reduce_bytes = Analysis::value_size( - ReducerConditional::select(m_iter.m_func, m_reducer)); + const size_t pool_reduce_bytes = m_reducer.value_size(); m_instance->acquire_lock(); @@ -549,9 +500,6 @@ class ParallelReduce, ReducerType, 0 // thread_local_bytes ); - typename Analysis::Reducer final_reducer( - ReducerConditional::select(m_iter.m_func, m_reducer)); - #ifndef KOKKOS_COMPILER_INTEL if (execute_in_serial(m_iter.m_rp.space())) { const pointer_type ptr = @@ -560,11 +508,11 @@ class ParallelReduce, ReducerType, : pointer_type( m_instance->get_thread_data(0)->pool_reduce_local()); - reference_type update = final_reducer.init(ptr); + reference_type update = m_reducer.init(ptr); ParallelReduce::exec_range(0, m_iter.m_rp.m_num_tiles, update); - final_reducer.final(ptr); + m_reducer.final(ptr); m_instance->release_lock(); @@ -589,7 +537,7 @@ class ParallelReduce, ReducerType, if (data.pool_rendezvous()) 
data.pool_rendezvous_release(); } - reference_type update = final_reducer.init( + reference_type update = m_reducer.init( reinterpret_cast(data.pool_reduce_local())); std::pair range(0, 0); @@ -610,16 +558,15 @@ class ParallelReduce, ReducerType, pointer_type(m_instance->get_thread_data(0)->pool_reduce_local()); for (int i = 1; i < pool_size; ++i) { - final_reducer.join( - ptr, reinterpret_cast( - m_instance->get_thread_data(i)->pool_reduce_local())); + m_reducer.join(ptr, + reinterpret_cast( + m_instance->get_thread_data(i)->pool_reduce_local())); } - final_reducer.final(ptr); + m_reducer.final(ptr); if (m_result_ptr) { - const int n = Analysis::value_count( - ReducerConditional::select(m_iter.m_func, m_reducer)); + const int n = m_reducer.value_count(); for (int j = 0; j < n; ++j) { m_result_ptr[j] = ptr[j]; @@ -632,15 +579,11 @@ class ParallelReduce, ReducerType, //---------------------------------------- template - inline ParallelReduce( - const FunctorType& arg_functor, MDRangePolicy arg_policy, - const ViewType& arg_view, - std::enable_if_t::value && - !Kokkos::is_reducer::value, - void*> = nullptr) + ParallelReduce(const CombinedFunctorReducerType& arg_functor_reducer, + MDRangePolicy arg_policy, const ViewType& arg_view) : m_instance(nullptr), - m_iter(arg_policy, arg_functor), - m_reducer(InvalidType()), + m_iter(arg_policy, arg_functor_reducer.get_functor()), + m_reducer(arg_functor_reducer.get_reducer()), m_result_ptr(arg_view.data()) { #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 if (t_openmp_instance) { @@ -651,32 +594,13 @@ class ParallelReduce, ReducerType, #else m_instance = arg_policy.space().impl_internal_space_instance(); #endif - /*static_assert( std::is_same< typename ViewType::memory_space - , Kokkos::HostSpace >::value - , "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace" - );*/ + static_assert( + Kokkos::Impl::MemorySpaceAccess::accessible, + "Kokkos::OpenMP reduce result must be a View accessible from " + "HostSpace"); } - 
inline ParallelReduce(const FunctorType& arg_functor, - MDRangePolicy arg_policy, const ReducerType& reducer) - : m_instance(nullptr), - m_iter(arg_policy, arg_functor), - m_reducer(reducer), - m_result_ptr(reducer.view().data()) { -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 - if (t_openmp_instance) { - m_instance = t_openmp_instance; - } else { - m_instance = arg_policy.space().impl_internal_space_instance(); - } -#else - m_instance = arg_policy.space().impl_internal_space_instance(); -#endif - /*static_assert( std::is_same< typename ViewType::memory_space - , Kokkos::HostSpace >::value - , "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace" - );*/ - } template static int max_tile_size_product(const Policy&, const Functor&) { /** @@ -1124,38 +1048,27 @@ class ParallelFor, //---------------------------------------------------------------------------- -template -class ParallelReduce, - ReducerType, Kokkos::OpenMP> { +template +class ParallelReduce, Kokkos::OpenMP> { private: enum { TEAM_REDUCE_SIZE = 512 }; using Policy = Kokkos::Impl::TeamPolicyInternal; + using FunctorType = typename CombinedFunctorReducerType::functor_type; + using ReducerType = typename CombinedFunctorReducerType::reducer_type; using WorkTag = typename Policy::work_tag; using SchedTag = typename Policy::schedule_type::type; using Member = typename Policy::member_type; - using ReducerConditional = - Kokkos::Impl::if_c::value, - FunctorType, ReducerType>; - - using ReducerTypeFwd = typename ReducerConditional::type; - using WorkTagFwd = - std::conditional_t::value, WorkTag, - void>; - - using Analysis = - FunctorAnalysis; - - using pointer_type = typename Analysis::pointer_type; - using reference_type = typename Analysis::reference_type; + using pointer_type = typename ReducerType::pointer_type; + using reference_type = typename ReducerType::reference_type; OpenMPInternal* m_instance; - const FunctorType m_functor; + const CombinedFunctorReducerType m_functor_reducer; const Policy 
m_policy; - const ReducerType m_reducer; const pointer_type m_result_ptr; const int m_shmem_size; @@ -1201,19 +1114,17 @@ class ParallelReduce, inline void execute() const { enum { is_dynamic = std::is_same::value }; - typename Analysis::Reducer final_reducer( - ReducerConditional::select(m_functor, m_reducer)); + const ReducerType& reducer = m_functor_reducer.get_reducer(); if (m_policy.league_size() == 0 || m_policy.team_size() == 0) { if (m_result_ptr) { - final_reducer.init(m_result_ptr); - final_reducer.final(m_result_ptr); + reducer.init(m_result_ptr); + reducer.final(m_result_ptr); } return; } - const size_t pool_reduce_size = - Analysis::value_size(ReducerConditional::select(m_functor, m_reducer)); + const size_t pool_reduce_size = reducer.value_size(); const size_t team_reduce_size = TEAM_REDUCE_SIZE * m_policy.team_size(); const size_t team_shared_size = m_shmem_size + m_policy.scratch_size(1); @@ -1228,14 +1139,14 @@ class ParallelReduce, HostThreadTeamData& data = *(m_instance->get_thread_data()); pointer_type ptr = m_result_ptr ? m_result_ptr : pointer_type(data.pool_reduce_local()); - reference_type update = final_reducer.init(ptr); + reference_type update = reducer.init(ptr); const int league_rank_begin = 0; const int league_rank_end = m_policy.league_size(); ParallelReduce::template exec_team( - m_functor, data, update, league_rank_begin, league_rank_end, - m_policy.league_size()); + m_functor_reducer.get_functor(), data, update, league_rank_begin, + league_rank_end, m_policy.league_size()); - final_reducer.final(ptr); + reducer.final(ptr); m_instance->release_lock(); @@ -1263,7 +1174,7 @@ class ParallelReduce, } if (active) { - reference_type update = final_reducer.init( + reference_type update = reducer.init( reinterpret_cast(data.pool_reduce_local())); std::pair range(0, 0); @@ -1272,14 +1183,13 @@ class ParallelReduce, range = is_dynamic ? 
data.get_work_stealing_chunk() : data.get_work_partition(); - ParallelReduce::template exec_team(m_functor, data, update, - range.first, range.second, - m_policy.league_size()); + ParallelReduce::template exec_team( + m_functor_reducer.get_functor(), data, update, range.first, + range.second, m_policy.league_size()); } while (is_dynamic && 0 <= range.first); } else { - final_reducer.init( - reinterpret_cast(data.pool_reduce_local())); + reducer.init(reinterpret_cast(data.pool_reduce_local())); } data.disband_team(); @@ -1301,16 +1211,15 @@ class ParallelReduce, pointer_type(m_instance->get_thread_data(0)->pool_reduce_local()); for (int i = 1; i < pool_size; ++i) { - final_reducer.join( - ptr, reinterpret_cast( - m_instance->get_thread_data(i)->pool_reduce_local())); + reducer.join(ptr, + reinterpret_cast( + m_instance->get_thread_data(i)->pool_reduce_local())); } - final_reducer.final(ptr); + reducer.final(ptr); if (m_result_ptr) { - const int n = Analysis::value_count( - ReducerConditional::select(m_functor, m_reducer)); + const int n = reducer.value_count(); for (int j = 0; j < n; ++j) { m_result_ptr[j] = ptr[j]; @@ -1323,20 +1232,16 @@ class ParallelReduce, //---------------------------------------- template - inline ParallelReduce( - const FunctorType& arg_functor, const Policy& arg_policy, - const ViewType& arg_result, - std::enable_if_t::value && - !Kokkos::is_reducer::value, - void*> = nullptr) + inline ParallelReduce(const CombinedFunctorReducerType& arg_functor_reducer, + const Policy& arg_policy, const ViewType& arg_result) : m_instance(nullptr), - m_functor(arg_functor), + m_functor_reducer(arg_functor_reducer), m_policy(arg_policy), - m_reducer(InvalidType()), m_result_ptr(arg_result.data()), - m_shmem_size(arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + - FunctorTeamShmemSize::value( - arg_functor, arg_policy.team_size())) { + m_shmem_size( + arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + + FunctorTeamShmemSize::value( + 
arg_functor_reducer.get_functor(), arg_policy.team_size())) { #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 if (t_openmp_instance) { m_instance = t_openmp_instance; @@ -1346,31 +1251,12 @@ class ParallelReduce, #else m_instance = arg_policy.space().impl_internal_space_instance(); #endif - } - inline ParallelReduce(const FunctorType& arg_functor, Policy arg_policy, - const ReducerType& reducer) - : m_instance(nullptr), - m_functor(arg_functor), - m_policy(arg_policy), - m_reducer(reducer), - m_result_ptr(reducer.view().data()), - m_shmem_size(arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + - FunctorTeamShmemSize::value( - arg_functor, arg_policy.team_size())) { -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 - if (t_openmp_instance) { - m_instance = t_openmp_instance; - } else { - m_instance = arg_policy.space().impl_internal_space_instance(); - } -#else - m_instance = arg_policy.space().impl_internal_space_instance(); -#endif - /*static_assert( std::is_same< typename ViewType::memory_space - , Kokkos::HostSpace >::value - , "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace" - );*/ + static_assert( + Kokkos::Impl::MemorySpaceAccess::accessible, + "Kokkos::OpenMP reduce result must be a View accessible from " + "HostSpace"); } }; From ee2ddaec0c1ba0db568581dd14e61dd1026bd3fc Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Thu, 2 Mar 2023 12:33:54 -0500 Subject: [PATCH 283/496] Drop KOKKOS_ENABLE_RFO_PREFETCH macro --- core/src/Kokkos_Macros.hpp | 8 -------- 1 file changed, 8 deletions(-) diff --git a/core/src/Kokkos_Macros.hpp b/core/src/Kokkos_Macros.hpp index 901f84367b..a884c037b3 100644 --- a/core/src/Kokkos_Macros.hpp +++ b/core/src/Kokkos_Macros.hpp @@ -224,10 +224,6 @@ #endif #endif -#if defined(KOKKOS_ARCH_AVX512MIC) -#define KOKKOS_ENABLE_RFO_PREFETCH 1 -#endif - #if defined(__MIC__) // Compiling for Xeon Phi #endif @@ -269,10 +265,6 @@ //#define KOKKOS_ENABLE_PRAGMA_LOOPCOUNT 1 //#define KOKKOS_ENABLE_PRAGMA_VECTOR 1 -#if 
defined(KOKKOS_ARCH_AVX512MIC) -#define KOKKOS_ENABLE_RFO_PREFETCH 1 -#endif - #if !defined(KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION) #define KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION \ inline __attribute__((always_inline)) From 554032e781e602d1c80f110ad68c642772c8bc62 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Thu, 2 Mar 2023 15:44:45 -0500 Subject: [PATCH 284/496] Desul atomics: prefer __CLANG_RDC__ macro --- tpls/desul/include/desul/atomics/Lock_Array_HIP.hpp | 8 ++++---- tpls/desul/src/Lock_Array_HIP.cpp | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/tpls/desul/include/desul/atomics/Lock_Array_HIP.hpp b/tpls/desul/include/desul/atomics/Lock_Array_HIP.hpp index 9290aea2b3..53144fbc4c 100644 --- a/tpls/desul/include/desul/atomics/Lock_Array_HIP.hpp +++ b/tpls/desul/include/desul/atomics/Lock_Array_HIP.hpp @@ -63,12 +63,12 @@ void finalize_lock_arrays_hip(); * will use it. That is the purpose of the * ensure_hip_lock_arrays_on_device function. */ -#ifdef DESUL_HIP_RDC +#ifdef __CLANG_RDC__ extern #endif __device__ __constant__ int32_t* HIP_SPACE_ATOMIC_LOCKS_DEVICE; -#ifdef DESUL_HIP_RDC +#ifdef __CLANG_RDC__ extern #endif __device__ __constant__ int32_t* HIP_SPACE_ATOMIC_LOCKS_NODE; @@ -120,7 +120,7 @@ namespace { static int lock_array_copied = 0; } // namespace -#ifdef DESUL_HIP_RDC +#ifdef __CLANG_RDC__ inline #else inline static @@ -139,7 +139,7 @@ inline static } } // namespace Impl -#if defined(DESUL_HIP_RDC) +#if defined(__CLANG_RDC__) inline void ensure_hip_lock_arrays_on_device() {} #else static inline void ensure_hip_lock_arrays_on_device() { diff --git a/tpls/desul/src/Lock_Array_HIP.cpp b/tpls/desul/src/Lock_Array_HIP.cpp index 6191fe81e2..0611c85aeb 100644 --- a/tpls/desul/src/Lock_Array_HIP.cpp +++ b/tpls/desul/src/Lock_Array_HIP.cpp @@ -11,7 +11,7 @@ SPDX-License-Identifier: (BSD-3-Clause) #include #include -#ifdef DESUL_HIP_RDC +#ifdef __CLANG_RDC__ namespace desul { namespace Impl { __device__ __constant__ int32_t* 
HIP_SPACE_ATOMIC_LOCKS_DEVICE = nullptr; @@ -87,7 +87,7 @@ void finalize_lock_arrays_hip() { check_error_and_throw_hip(error_free2, "finalize_lock_arrays_hip: free host locks"); HIP_SPACE_ATOMIC_LOCKS_DEVICE_h = nullptr; HIP_SPACE_ATOMIC_LOCKS_NODE_h = nullptr; -#ifdef DESUL_HIP_RDC +#ifdef __CLANG_RDC__ copy_hip_lock_arrays_to_device(); #endif } From 659baf67da4be8fd2bcf8511b6908e6873d23ed1 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Thu, 2 Mar 2023 15:45:14 -0500 Subject: [PATCH 285/496] Drop DESUL_HIP_RDC compile definition --- cmake/kokkos_arch.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/kokkos_arch.cmake b/cmake/kokkos_arch.cmake index 2187f99352..5cf7fbd373 100644 --- a/cmake/kokkos_arch.cmake +++ b/cmake/kokkos_arch.cmake @@ -512,7 +512,7 @@ ENDIF() IF (KOKKOS_ENABLE_HIP) IF (KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE) COMPILER_SPECIFIC_FLAGS( - DEFAULT -fgpu-rdc -DDESUL_HIP_RDC + DEFAULT -fgpu-rdc ) ELSE() COMPILER_SPECIFIC_FLAGS( From 79f81443a06e188456027ee7b4a19c38269bbad9 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Thu, 2 Mar 2023 23:41:51 +0100 Subject: [PATCH 286/496] Fix reducer result check for Serial+HPX ParallelReduce --- core/src/HPX/Kokkos_HPX.hpp | 21 ++++++++++++++++--- .../Serial/Kokkos_Serial_Parallel_MDRange.hpp | 3 ++- .../Serial/Kokkos_Serial_Parallel_Range.hpp | 3 ++- .../Serial/Kokkos_Serial_Parallel_Team.hpp | 2 +- 4 files changed, 23 insertions(+), 6 deletions(-) diff --git a/core/src/HPX/Kokkos_HPX.hpp b/core/src/HPX/Kokkos_HPX.hpp index 53b4ac1da3..0cb6bf245a 100644 --- a/core/src/HPX/Kokkos_HPX.hpp +++ b/core/src/HPX/Kokkos_HPX.hpp @@ -1089,7 +1089,12 @@ class ParallelReduce, : m_functor_reducer(arg_functor_reducer), m_policy(arg_policy), m_result_ptr(arg_view.data()), - m_force_synchronous(!arg_view.impl_track().has_record()) {} + m_force_synchronous(!arg_view.impl_track().has_record()) { + static_assert( + Kokkos::Impl::MemorySpaceAccess::accessible, + "HPX reduce result must be a View 
accessible from HostSpace"); + } }; template @@ -1183,7 +1188,12 @@ class ParallelReduce::accessible, + "HPX reduce result must be a View accessible from HostSpace"); + } template static int max_tile_size_product(const Policy &, const Functor &) { @@ -1586,7 +1596,12 @@ class ParallelReduce::value( m_functor_reducer.get_functor(), arg_policy.team_size())), - m_force_synchronous(!arg_result.impl_track().has_record()) {} + m_force_synchronous(!arg_result.impl_track().has_record()) { + static_assert( + Kokkos::Impl::MemorySpaceAccess::accessible, + "HPX reduce result must be a View accessible from HostSpace"); + } }; } // namespace Impl } // namespace Kokkos diff --git a/core/src/Serial/Kokkos_Serial_Parallel_MDRange.hpp b/core/src/Serial/Kokkos_Serial_Parallel_MDRange.hpp index 0d71437060..16978d7895 100644 --- a/core/src/Serial/Kokkos_Serial_Parallel_MDRange.hpp +++ b/core/src/Serial/Kokkos_Serial_Parallel_MDRange.hpp @@ -138,7 +138,8 @@ class ParallelReduce::accessible, - "Kokkos::Serial reduce result must be a View in HostSpace"); + "Kokkos::Serial reduce result must be a View accessible from " + "HostSpace"); } }; diff --git a/core/src/Serial/Kokkos_Serial_Parallel_Range.hpp b/core/src/Serial/Kokkos_Serial_Parallel_Range.hpp index 01089677a2..3cfa4d781d 100644 --- a/core/src/Serial/Kokkos_Serial_Parallel_Range.hpp +++ b/core/src/Serial/Kokkos_Serial_Parallel_Range.hpp @@ -135,7 +135,8 @@ class ParallelReduce, static_assert( Kokkos::Impl::MemorySpaceAccess::accessible, - "Kokkos::Serial reduce result must be a View in HostSpace"); + "Kokkos::Serial reduce result must be a View accessible from " + "HostSpace"); } }; diff --git a/core/src/Serial/Kokkos_Serial_Parallel_Team.hpp b/core/src/Serial/Kokkos_Serial_Parallel_Team.hpp index 8b9a9349ab..0876f1af22 100644 --- a/core/src/Serial/Kokkos_Serial_Parallel_Team.hpp +++ b/core/src/Serial/Kokkos_Serial_Parallel_Team.hpp @@ -355,7 +355,7 @@ class ParallelReduce::accessible, - "Reduction result on Kokkos::Serial must be 
a Kokkos::View in " + "Kokkos::Serial reduce result must be a View accessible from " "HostSpace"); } }; From 1bfd0cc686e6776176d245b24e89d2ce4f6725cd Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Wed, 15 Feb 2023 13:48:37 -0500 Subject: [PATCH 287/496] Convert Threads ParallelReduce implementations --- core/src/Kokkos_Parallel_Reduce.hpp | 5 ++ .../Kokkos_Threads_Parallel_MDRange.hpp | 66 ++++----------- .../Threads/Kokkos_Threads_Parallel_Range.hpp | 78 +++++------------ .../Threads/Kokkos_Threads_Parallel_Team.hpp | 84 ++++++------------- 4 files changed, 73 insertions(+), 160 deletions(-) diff --git a/core/src/Kokkos_Parallel_Reduce.hpp b/core/src/Kokkos_Parallel_Reduce.hpp index 86951d0797..90dc83cb82 100644 --- a/core/src/Kokkos_Parallel_Reduce.hpp +++ b/core/src/Kokkos_Parallel_Reduce.hpp @@ -1432,6 +1432,11 @@ template <> struct implements_new_reduce_interface : std::true_type {}; #endif +#ifdef KOKKOS_ENABLE_THREADS +template <> +struct implements_new_reduce_interface : std::true_type {}; +#endif + #ifdef KOKKOS_ENABLE_CUDA template <> struct implements_new_reduce_interface : std::true_type {}; diff --git a/core/src/Threads/Kokkos_Threads_Parallel_MDRange.hpp b/core/src/Threads/Kokkos_Threads_Parallel_MDRange.hpp index 90c46248b8..e4bd6faef5 100644 --- a/core/src/Threads/Kokkos_Threads_Parallel_MDRange.hpp +++ b/core/src/Threads/Kokkos_Threads_Parallel_MDRange.hpp @@ -111,30 +111,22 @@ class ParallelFor, } }; -template -class ParallelReduce, ReducerType, - Kokkos::Threads> { +template +class ParallelReduce, Kokkos::Threads> { private: using MDRangePolicy = Kokkos::MDRangePolicy; using Policy = typename MDRangePolicy::impl_range_policy; + using FunctorType = typename CombinedFunctorReducerType::functor_type; + using ReducerType = typename CombinedFunctorReducerType::reducer_type; using WorkTag = typename MDRangePolicy::work_tag; using WorkRange = typename Policy::WorkRange; using Member = typename Policy::member_type; - using ReducerConditional = - 
Kokkos::Impl::if_c::value, - FunctorType, ReducerType>; - using ReducerTypeFwd = typename ReducerConditional::type; - using WorkTagFwd = - typename Kokkos::Impl::if_c::value, - WorkTag, void>::type; - - using Analysis = Impl::FunctorAnalysis; - using pointer_type = typename Analysis::pointer_type; - using value_type = typename Analysis::value_type; - using reference_type = typename Analysis::reference_type; + using pointer_type = typename ReducerType::pointer_type; + using value_type = typename ReducerType::value_type; + using reference_type = typename ReducerType::reference_type; using iterate_type = typename Kokkos::Impl::HostIterateTile, ReducerType, const WorkRange range(Policy(0, num_tiles).set_chunk_size(1), exec.pool_rank(), exec.pool_size()); - typename Analysis::Reducer reducer( - ReducerConditional::select(self.m_iter.m_func, self.m_reducer)); - self.exec_range( range.begin(), range.end(), - reducer.init(static_cast(exec.reduce_memory()))); + self.m_reducer.init(static_cast(exec.reduce_memory()))); - exec.fan_in_reduce(reducer); + exec.fan_in_reduce(self.m_reducer); } template @@ -188,11 +177,9 @@ class ParallelReduce, ReducerType, exec.barrier(); long work_index = exec.get_work_index(); - typename Analysis::Reducer reducer( - ReducerConditional::select(self.m_iter.m_func, self.m_reducer)); reference_type update = - reducer.init(static_cast(exec.reduce_memory())); + self.m_reducer.init(static_cast(exec.reduce_memory())); while (work_index != -1) { const Member begin = static_cast(work_index); const Member end = begin + 1 < num_tiles ? 
begin + 1 : num_tiles; @@ -200,14 +187,12 @@ class ParallelReduce, ReducerType, work_index = exec.get_work_index(); } - exec.fan_in_reduce(reducer); + exec.fan_in_reduce(self.m_reducer); } public: inline void execute() const { - ThreadsExec::resize_scratch(Analysis::value_size(ReducerConditional::select( - m_iter.m_func, m_reducer)), - 0); + ThreadsExec::resize_scratch(m_reducer.value_size(), 0); ThreadsExec::start(&ParallelReduce::exec, this); @@ -217,8 +202,7 @@ class ParallelReduce, ReducerType, const pointer_type data = (pointer_type)ThreadsExec::root_reduce_scratch(); - const unsigned n = Analysis::value_count( - ReducerConditional::select(m_iter.m_func, m_reducer)); + const unsigned n = m_reducer.value_count(); for (unsigned i = 0; i < n; ++i) { m_result_ptr[i] = data[i]; } @@ -226,14 +210,11 @@ class ParallelReduce, ReducerType, } template - ParallelReduce(const FunctorType &arg_functor, + ParallelReduce(const CombinedFunctorReducerType &arg_functor_reducer, const MDRangePolicy &arg_policy, - const HostViewType &arg_result_view, - std::enable_if_t::value && - !Kokkos::is_reducer::value, - void *> = nullptr) - : m_iter(arg_policy, arg_functor), - m_reducer(InvalidType()), + const HostViewType &arg_result_view) + : m_iter(arg_policy, arg_functor_reducer.get_functor()), + m_reducer(arg_functor_reducer.get_reducer()), m_result_ptr(arg_result_view.data()) { static_assert(Kokkos::is_view::value, "Kokkos::Threads reduce result must be a View"); @@ -243,17 +224,6 @@ class ParallelReduce, ReducerType, "Kokkos::Threads reduce result must be a View in HostSpace"); } - inline ParallelReduce(const FunctorType &arg_functor, - MDRangePolicy arg_policy, const ReducerType &reducer) - : m_iter(arg_policy, arg_functor), - m_reducer(reducer), - m_result_ptr(reducer.view().data()) { - /*static_assert( std::is_same< typename ViewType::memory_space - , Kokkos::HostSpace >::value - , "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace" - );*/ - } - template 
static int max_tile_size_product(const Policy &, const Functor &) { /** diff --git a/core/src/Threads/Kokkos_Threads_Parallel_Range.hpp b/core/src/Threads/Kokkos_Threads_Parallel_Range.hpp index c8d28e8fe3..cb089eaaab 100644 --- a/core/src/Threads/Kokkos_Threads_Parallel_Range.hpp +++ b/core/src/Threads/Kokkos_Threads_Parallel_Range.hpp @@ -116,33 +116,23 @@ class ParallelFor, : m_functor(arg_functor), m_policy(arg_policy) {} }; -template -class ParallelReduce, ReducerType, +template +class ParallelReduce, Kokkos::Threads> { private: - using Policy = Kokkos::RangePolicy; + using Policy = Kokkos::RangePolicy; + using FunctorType = typename CombinedFunctorReducerType::functor_type; + using ReducerType = typename CombinedFunctorReducerType::reducer_type; using WorkTag = typename Policy::work_tag; using WorkRange = typename Policy::WorkRange; using Member = typename Policy::member_type; - using ReducerConditional = - Kokkos::Impl::if_c::value, - FunctorType, ReducerType>; - using ReducerTypeFwd = typename ReducerConditional::type; - using WorkTagFwd = - typename Kokkos::Impl::if_c::value, - WorkTag, void>::type; + using pointer_type = typename ReducerType::pointer_type; + using reference_type = typename ReducerType::reference_type; - using Analysis = Impl::FunctorAnalysis; - - using pointer_type = typename Analysis::pointer_type; - using reference_type = typename Analysis::reference_type; - - const FunctorType m_functor; + const CombinedFunctorReducerType m_functor_reducer; const Policy m_policy; - const ReducerType m_reducer; const pointer_type m_result_ptr; template @@ -182,11 +172,10 @@ class ParallelReduce, ReducerType, const ParallelReduce &self = *((const ParallelReduce *)arg); const WorkRange range(self.m_policy, exec.pool_rank(), exec.pool_size()); - typename Analysis::Reducer reducer( - ReducerConditional::select(self.m_functor, self.m_reducer)); + const ReducerType &reducer = self.m_functor_reducer.get_reducer(); ParallelReduce::template exec_range( - 
self.m_functor, range.begin(), range.end(), + self.m_functor_reducer.get_functor(), range.begin(), range.end(), reducer.init(static_cast(exec.reduce_memory()))); exec.fan_in_reduce(reducer); @@ -204,9 +193,8 @@ class ParallelReduce, ReducerType, exec.reset_steal_target(); exec.barrier(); - long work_index = exec.get_work_index(); - typename Analysis::Reducer reducer( - ReducerConditional::select(self.m_functor, self.m_reducer)); + long work_index = exec.get_work_index(); + const ReducerType &reducer = self.m_functor_reducer.get_reducer(); reference_type update = reducer.init(static_cast(exec.reduce_memory())); @@ -218,8 +206,8 @@ class ParallelReduce, ReducerType, begin + self.m_policy.chunk_size() < self.m_policy.end() ? begin + self.m_policy.chunk_size() : self.m_policy.end(); - ParallelReduce::template exec_range(self.m_functor, begin, end, - update); + ParallelReduce::template exec_range( + self.m_functor_reducer.get_functor(), begin, end, update); work_index = exec.get_work_index(); } @@ -228,18 +216,15 @@ class ParallelReduce, ReducerType, public: inline void execute() const { + const ReducerType &reducer = m_functor_reducer.get_reducer(); + if (m_policy.end() <= m_policy.begin()) { if (m_result_ptr) { - typename Analysis::Reducer final_reducer( - ReducerConditional::select(m_functor, m_reducer)); - final_reducer.init(m_result_ptr); - final_reducer.final(m_result_ptr); + reducer.init(m_result_ptr); + reducer.final(m_result_ptr); } } else { - ThreadsExec::resize_scratch( - Analysis::value_size( - ReducerConditional::select(m_functor, m_reducer)), - 0); + ThreadsExec::resize_scratch(reducer.value_size(), 0); ThreadsExec::start(&ParallelReduce::exec, this); @@ -249,8 +234,7 @@ class ParallelReduce, ReducerType, const pointer_type data = (pointer_type)ThreadsExec::root_reduce_scratch(); - const unsigned n = Analysis::value_count( - ReducerConditional::select(m_functor, m_reducer)); + const unsigned n = m_functor_reducer.get_reducer().value_count(); for (unsigned 
i = 0; i < n; ++i) { m_result_ptr[i] = data[i]; } @@ -259,14 +243,10 @@ class ParallelReduce, ReducerType, } template - ParallelReduce(const FunctorType &arg_functor, const Policy &arg_policy, - const HostViewType &arg_result_view, - std::enable_if_t::value && - !Kokkos::is_reducer::value, - void *> = nullptr) - : m_functor(arg_functor), + ParallelReduce(const CombinedFunctorReducerType &arg_functor_reducer, + const Policy &arg_policy, const HostViewType &arg_result_view) + : m_functor_reducer(arg_functor_reducer), m_policy(arg_policy), - m_reducer(InvalidType()), m_result_ptr(arg_result_view.data()) { static_assert(Kokkos::is_view::value, "Kokkos::Threads reduce result must be a View"); @@ -275,18 +255,6 @@ class ParallelReduce, ReducerType, std::is_same::value, "Kokkos::Threads reduce result must be a View in HostSpace"); } - - inline ParallelReduce(const FunctorType &arg_functor, Policy arg_policy, - const ReducerType &reducer) - : m_functor(arg_functor), - m_policy(arg_policy), - m_reducer(reducer), - m_result_ptr(reducer.view().data()) { - /*static_assert( std::is_same< typename ViewType::memory_space - , Kokkos::HostSpace >::value - , "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace" - );*/ - } }; template diff --git a/core/src/Threads/Kokkos_Threads_Parallel_Team.hpp b/core/src/Threads/Kokkos_Threads_Parallel_Team.hpp index 28ee3551ef..3a4668f84b 100644 --- a/core/src/Threads/Kokkos_Threads_Parallel_Team.hpp +++ b/core/src/Threads/Kokkos_Threads_Parallel_Team.hpp @@ -112,31 +112,22 @@ class ParallelFor, arg_functor, m_policy.team_size())) {} }; -template -class ParallelReduce, - ReducerType, Kokkos::Threads> { +template +class ParallelReduce, Kokkos::Threads> { private: using Policy = Kokkos::Impl::TeamPolicyInternal; - using WorkTag = typename Policy::work_tag; - using Member = typename Policy::member_type; + using FunctorType = typename CombinedFunctorReducerType::functor_type; + using ReducerType = typename 
CombinedFunctorReducerType::reducer_type; + using WorkTag = typename Policy::work_tag; + using Member = typename Policy::member_type; - using ReducerConditional = - Kokkos::Impl::if_c::value, - FunctorType, ReducerType>; - using ReducerTypeFwd = typename ReducerConditional::type; - using WorkTagFwd = - typename Kokkos::Impl::if_c::value, - WorkTag, void>::type; + using pointer_type = typename ReducerType::pointer_type; + using reference_type = typename ReducerType::reference_type; - using Analysis = Impl::FunctorAnalysis; - using pointer_type = typename Analysis::pointer_type; - using reference_type = typename Analysis::reference_type; - - const FunctorType m_functor; + const CombinedFunctorReducerType m_functor_reducer; const Policy m_policy; - const ReducerType m_reducer; const pointer_type m_result_ptr; const size_t m_shared; @@ -160,29 +151,27 @@ class ParallelReduce, static void exec(ThreadsExec &exec, const void *arg) { const ParallelReduce &self = *((const ParallelReduce *)arg); - typename Analysis::Reducer reducer( - ReducerConditional::select(self.m_functor, self.m_reducer)); - ParallelReduce::template exec_team( - self.m_functor, Member(&exec, self.m_policy, self.m_shared), - reducer.init(static_cast(exec.reduce_memory()))); + self.m_functor_reducer.get_functor(), + Member(&exec, self.m_policy, self.m_shared), + self.m_functor_reducer.get_reducer().init( + static_cast(exec.reduce_memory()))); - exec.fan_in_reduce(reducer); + exec.fan_in_reduce(self.m_functor_reducer.get_reducer()); } public: inline void execute() const { + const ReducerType &reducer = m_functor_reducer.get_reducer(); + if (m_policy.league_size() * m_policy.team_size() == 0) { if (m_result_ptr) { - typename Analysis::Reducer final_reducer( - ReducerConditional::select(m_functor, m_reducer)); - final_reducer.init(m_result_ptr); - final_reducer.final(m_result_ptr); + reducer.init(m_result_ptr); + reducer.final(m_result_ptr); } } else { ThreadsExec::resize_scratch( - Analysis::value_size( - 
ReducerConditional::select(m_functor, m_reducer)), + reducer.value_size(), Policy::member_type::team_reduce_size() + m_shared); ThreadsExec::start(&ParallelReduce::exec, this); @@ -193,8 +182,7 @@ class ParallelReduce, const pointer_type data = (pointer_type)ThreadsExec::root_reduce_scratch(); - const unsigned n = Analysis::value_count( - ReducerConditional::select(m_functor, m_reducer)); + const unsigned n = reducer.value_count(); for (unsigned i = 0; i < n; ++i) { m_result_ptr[i] = data[i]; } @@ -209,39 +197,21 @@ class ParallelReduce, } if (policy.team_size() < 0) { policy.impl_set_team_size(policy.team_size_recommended( - m_functor, m_reducer, ParallelReduceTag{})); + m_functor_reducer.get_functor(), m_functor_reducer.get_reducer(), + ParallelReduceTag{})); } return policy; } template - inline ParallelReduce( - const FunctorType &arg_functor, const Policy &arg_policy, - const ViewType &arg_result, - std::enable_if_t::value && - !Kokkos::is_reducer::value, - void *> = nullptr) - : m_functor(arg_functor), + inline ParallelReduce(const CombinedFunctorReducerType &arg_functor_reducer, + const Policy &arg_policy, const ViewType &arg_result) + : m_functor_reducer(arg_functor_reducer), m_policy(fix_policy(arg_policy)), - m_reducer(InvalidType()), m_result_ptr(arg_result.data()), m_shared(m_policy.scratch_size(0) + m_policy.scratch_size(1) + FunctorTeamShmemSize::value( - arg_functor, m_policy.team_size())) {} - - inline ParallelReduce(const FunctorType &arg_functor, Policy arg_policy, - const ReducerType &reducer) - : m_functor(arg_functor), - m_policy(fix_policy(arg_policy)), - m_reducer(reducer), - m_result_ptr(reducer.view().data()), - m_shared(m_policy.scratch_size(0) + m_policy.scratch_size(1) + - FunctorTeamShmemSize::value( - arg_functor, m_policy.team_size())) { - /*static_assert( std::is_same< typename ViewType::memory_space - , Kokkos::HostSpace >::value - , "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace" - );*/ + 
arg_functor_reducer.get_functor(), m_policy.team_size())) { } }; From 9a33347f3826c4ed7a889c0a6dcb693f0871cb38 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Tue, 28 Feb 2023 18:08:19 +0100 Subject: [PATCH 288/496] Use local "reducer" variable Co-authored-by: Thomas Conrad Clevenger --- core/src/Threads/Kokkos_Threads_Parallel_Range.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/src/Threads/Kokkos_Threads_Parallel_Range.hpp b/core/src/Threads/Kokkos_Threads_Parallel_Range.hpp index cb089eaaab..662bb8a013 100644 --- a/core/src/Threads/Kokkos_Threads_Parallel_Range.hpp +++ b/core/src/Threads/Kokkos_Threads_Parallel_Range.hpp @@ -234,7 +234,7 @@ class ParallelReduce, const pointer_type data = (pointer_type)ThreadsExec::root_reduce_scratch(); - const unsigned n = m_functor_reducer.get_reducer().value_count(); + const unsigned n = reducer.value_count(); for (unsigned i = 0; i < n; ++i) { m_result_ptr[i] = data[i]; } From 7b598eb1b768e454da9b95f158034e191f583951 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Thu, 2 Mar 2023 23:40:09 +0100 Subject: [PATCH 289/496] Fix reducer result check for Threads ParallelReduce --- core/src/Threads/Kokkos_Threads_Parallel_MDRange.hpp | 6 ++++-- core/src/Threads/Kokkos_Threads_Parallel_Range.hpp | 12 +++++++----- core/src/Threads/Kokkos_Threads_Parallel_Team.hpp | 5 +++++ 3 files changed, 16 insertions(+), 7 deletions(-) diff --git a/core/src/Threads/Kokkos_Threads_Parallel_MDRange.hpp b/core/src/Threads/Kokkos_Threads_Parallel_MDRange.hpp index e4bd6faef5..29eceb36ea 100644 --- a/core/src/Threads/Kokkos_Threads_Parallel_MDRange.hpp +++ b/core/src/Threads/Kokkos_Threads_Parallel_MDRange.hpp @@ -220,8 +220,10 @@ class ParallelReduce::value, - "Kokkos::Threads reduce result must be a View in HostSpace"); + Kokkos::Impl::MemorySpaceAccess::accessible, + "Kokkos::Threads reduce result must be a View accessible from " + "HostSpace"); } template diff --git a/core/src/Threads/Kokkos_Threads_Parallel_Range.hpp 
b/core/src/Threads/Kokkos_Threads_Parallel_Range.hpp index 662bb8a013..7aefe4f13f 100644 --- a/core/src/Threads/Kokkos_Threads_Parallel_Range.hpp +++ b/core/src/Threads/Kokkos_Threads_Parallel_Range.hpp @@ -242,18 +242,20 @@ class ParallelReduce, } } - template + template ParallelReduce(const CombinedFunctorReducerType &arg_functor_reducer, - const Policy &arg_policy, const HostViewType &arg_result_view) + const Policy &arg_policy, const ViewType &arg_result_view) : m_functor_reducer(arg_functor_reducer), m_policy(arg_policy), m_result_ptr(arg_result_view.data()) { - static_assert(Kokkos::is_view::value, + static_assert(Kokkos::is_view::value, "Kokkos::Threads reduce result must be a View"); static_assert( - std::is_same::value, - "Kokkos::Threads reduce result must be a View in HostSpace"); + Kokkos::Impl::MemorySpaceAccess::accessible, + "Kokkos::Threads reduce result must be a View accessible from " + "HostSpace"); } }; diff --git a/core/src/Threads/Kokkos_Threads_Parallel_Team.hpp b/core/src/Threads/Kokkos_Threads_Parallel_Team.hpp index 3a4668f84b..28efba5ed2 100644 --- a/core/src/Threads/Kokkos_Threads_Parallel_Team.hpp +++ b/core/src/Threads/Kokkos_Threads_Parallel_Team.hpp @@ -212,6 +212,11 @@ class ParallelReduce::value( arg_functor_reducer.get_functor(), m_policy.team_size())) { + static_assert( + Kokkos::Impl::MemorySpaceAccess::accessible, + "Kokkos::Threads reduce result must be a View accessible from " + "HostSpace"); } }; From 65a6f9a3e219c1ead2bdd5de668ecd93e68b333b Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Thu, 2 Mar 2023 23:52:05 +0100 Subject: [PATCH 290/496] Add comments testing for non-device-callable destructors --- core/unit_test/incremental/Test05_ParallelReduce_RangePolicy.hpp | 1 + core/unit_test/incremental/Test16_ParallelScan.hpp | 1 + 2 files changed, 2 insertions(+) diff --git a/core/unit_test/incremental/Test05_ParallelReduce_RangePolicy.hpp b/core/unit_test/incremental/Test05_ParallelReduce_RangePolicy.hpp index 
4235c73c8e..0f56c395a9 100644 --- a/core/unit_test/incremental/Test05_ParallelReduce_RangePolicy.hpp +++ b/core/unit_test/incremental/Test05_ParallelReduce_RangePolicy.hpp @@ -45,6 +45,7 @@ struct NonTrivialReduceFunctor { NonTrivialReduceFunctor(NonTrivialReduceFunctor &&) = default; NonTrivialReduceFunctor &operator=(NonTrivialReduceFunctor &&) = default; NonTrivialReduceFunctor &operator=(NonTrivialReduceFunctor const &) = default; + // Also make sure that it's OK if the destructor is not device-callable. ~NonTrivialReduceFunctor() {} }; diff --git a/core/unit_test/incremental/Test16_ParallelScan.hpp b/core/unit_test/incremental/Test16_ParallelScan.hpp index 0b9d16f79d..1b8649ce47 100644 --- a/core/unit_test/incremental/Test16_ParallelScan.hpp +++ b/core/unit_test/incremental/Test16_ParallelScan.hpp @@ -56,6 +56,7 @@ struct NonTrivialScanFunctor { NonTrivialScanFunctor(NonTrivialScanFunctor &&) = default; NonTrivialScanFunctor &operator=(NonTrivialScanFunctor &&) = default; NonTrivialScanFunctor &operator=(NonTrivialScanFunctor const &) = default; + // Also make sure that it's OK if the destructor is not device-callable. 
~NonTrivialScanFunctor() {} }; From 5d3bcb1d7cd048bd5ec9a3d302e84845d1379a94 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Thu, 2 Mar 2023 18:42:05 -0500 Subject: [PATCH 291/496] Define KOKKOS_ARCH_NVIDIA_GPU macro when targeting an NVIDIA GPU architecture --- Makefile.kokkos | 2 ++ cmake/KokkosCore_config.h.in | 1 + cmake/kokkos_arch.cmake | 1 + 3 files changed, 4 insertions(+) diff --git a/Makefile.kokkos b/Makefile.kokkos index a885a640ab..11358b32f4 100644 --- a/Makefile.kokkos +++ b/Makefile.kokkos @@ -1055,6 +1055,8 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA_ARCH), 1) endif ifneq ($(KOKKOS_INTERNAL_USE_ARCH_NVIDIA), 0) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_NVIDIA_GPU") + KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG) ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC), 1) diff --git a/cmake/KokkosCore_config.h.in b/cmake/KokkosCore_config.h.in index c8257d8664..0b5f44b39d 100644 --- a/cmake/KokkosCore_config.h.in +++ b/cmake/KokkosCore_config.h.in @@ -108,6 +108,7 @@ #cmakedefine KOKKOS_ARCH_AMPERE86 #cmakedefine KOKKOS_ARCH_HOPPER #cmakedefine KOKKOS_ARCH_HOPPER90 +#cmakedefine KOKKOS_ARCH_NVIDIA_GPU #cmakedefine KOKKOS_ARCH_AMD_ZEN #cmakedefine KOKKOS_ARCH_AMD_ZEN2 #cmakedefine KOKKOS_ARCH_AMD_ZEN3 diff --git a/cmake/kokkos_arch.cmake b/cmake/kokkos_arch.cmake index 2187f99352..4e69092428 100644 --- a/cmake/kokkos_arch.cmake +++ b/cmake/kokkos_arch.cmake @@ -566,6 +566,7 @@ FUNCTION(CHECK_CUDA_ARCH ARCH FLAG) IF(CUDA_ARCH_ALREADY_SPECIFIED) MESSAGE(FATAL_ERROR "Multiple GPU architectures given! Already have ${CUDA_ARCH_ALREADY_SPECIFIED}, but trying to add ${ARCH}. 
If you are re-running CMake, try clearing the cache and running again.") ENDIF() + SET(KOKKOS_ARCH_NVIDIA_GPU ON PARENT_SCOPE) SET(CUDA_ARCH_ALREADY_SPECIFIED ${ARCH} PARENT_SCOPE) IF (NOT KOKKOS_ENABLE_CUDA AND NOT KOKKOS_ENABLE_OPENMPTARGET AND NOT KOKKOS_ENABLE_SYCL AND NOT KOKKOS_ENABLE_OPENACC) MESSAGE(WARNING "Given CUDA arch ${ARCH}, but Kokkos_ENABLE_CUDA, Kokkos_ENABLE_OPENACC, and Kokkos_ENABLE_OPENMPTARGET are OFF. Option will be ignored.") From f967fa921561cc77bf3d7196d2d0731e4902cc1d Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Fri, 3 Mar 2023 08:00:36 +0100 Subject: [PATCH 292/496] Provide another constructor in Test16_ParallelScan --- core/unit_test/incremental/Test16_ParallelScan.hpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/core/unit_test/incremental/Test16_ParallelScan.hpp b/core/unit_test/incremental/Test16_ParallelScan.hpp index 1b8649ce47..7fc74f9ce7 100644 --- a/core/unit_test/incremental/Test16_ParallelScan.hpp +++ b/core/unit_test/incremental/Test16_ParallelScan.hpp @@ -51,7 +51,8 @@ struct NonTrivialScanFunctor { update_value += val_i; } - NonTrivialScanFunctor() = default; + NonTrivialScanFunctor(const Kokkos::View &data) + : d_data(data) {} NonTrivialScanFunctor(NonTrivialScanFunctor const &) = default; NonTrivialScanFunctor(NonTrivialScanFunctor &&) = default; NonTrivialScanFunctor &operator=(NonTrivialScanFunctor &&) = default; From 72271273ac4d05c0642d42e6d7f075233ecca70e Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Tue, 21 Feb 2023 13:24:38 -0500 Subject: [PATCH 293/496] Convert OpenACC ParallelReduce --- core/src/Kokkos_Parallel_Reduce.hpp | 6 ++ .../Kokkos_OpenACC_ParallelReduce_MDRange.hpp | 53 +++++++---------- .../Kokkos_OpenACC_ParallelReduce_Range.hpp | 57 ++++++++----------- .../Kokkos_OpenACC_ParallelReduce_Team.hpp | 50 +++++++--------- 4 files changed, 69 insertions(+), 97 deletions(-) diff --git a/core/src/Kokkos_Parallel_Reduce.hpp b/core/src/Kokkos_Parallel_Reduce.hpp index 
86951d0797..b5b25c3009 100644 --- a/core/src/Kokkos_Parallel_Reduce.hpp +++ b/core/src/Kokkos_Parallel_Reduce.hpp @@ -1442,6 +1442,12 @@ template <> struct implements_new_reduce_interface : std::true_type {}; #endif +#ifdef KOKKOS_ENABLE_OPENACC +template <> +struct implements_new_reduce_interface + : std::true_type {}; +#endif + template class ParallelReduceWrapper { diff --git a/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_MDRange.hpp b/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_MDRange.hpp index f36b95665c..121a2cfe3f 100644 --- a/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_MDRange.hpp +++ b/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_MDRange.hpp @@ -37,40 +37,27 @@ struct OpenACCParallelReduceMDRangeHelper { }; } // namespace Kokkos::Experimental::Impl -template -class Kokkos::Impl::ParallelReduce, - ReducerType, Kokkos::Experimental::OpenACC> { - using Policy = MDRangePolicy; +template +class Kokkos::Impl::ParallelReduce, + Kokkos::Experimental::OpenACC> { + using Policy = MDRangePolicy; + using FunctorType = typename CombinedFunctorReducerType::functor_type; + using ReducerType = typename CombinedFunctorReducerType::reducer_type; - using ReducerConditional = - if_c, Functor, ReducerType>; - using ReducerTypeFwd = typename ReducerConditional::type; - using Analysis = - FunctorAnalysis; + using Pointer = typename ReducerType::pointer_type; + using ValueType = typename ReducerType::value_type; - using Pointer = typename Analysis::pointer_type; - using ValueType = typename Analysis::value_type; - - Functor m_functor; + CombinedFunctorReducerType m_functor_reducer; Policy m_policy; - ReducerType m_reducer; Pointer m_result_ptr; public: - ParallelReduce(Functor const& functor, Policy const& policy, - ReducerType const& reducer) - : m_functor(functor), - m_policy(policy), - m_reducer(reducer), - m_result_ptr(reducer.view().data()) {} - template - ParallelReduce( - const Functor& functor, const Policy& policy, const ViewType& result, - 
std::enable_if_t::value, void*> = nullptr) - : m_functor(functor), + ParallelReduce(const CombinedFunctorReducerType& functor_reducer, + const Policy& policy, const ViewType& result) + : m_functor_reducer(functor_reducer), m_policy(policy), - m_reducer(InvalidType()), m_result_ptr(result.data()) {} void execute() const { @@ -85,16 +72,18 @@ class Kokkos::Impl::ParallelReduce, } ValueType val; - typename Analysis::Reducer final_reducer( - ReducerConditional::select(m_functor, m_reducer)); - final_reducer.init(&val); + const ReducerType& reducer = m_functor_reducer.get_reducer(); + reducer.init(&val); Kokkos::Experimental::Impl::OpenACCParallelReduceMDRangeHelper( - Kokkos::Experimental::Impl::FunctorAdapter(m_functor), - std::conditional_t, ReducerType, - Sum>(val), + Kokkos::Experimental::Impl::FunctorAdapter( + m_functor_reducer.get_functor()), + std::conditional_t< + std::is_same_v, + Sum, typename ReducerType::functor_type>(val), m_policy); + reducer.final(&val); *m_result_ptr = val; } }; diff --git a/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_Range.hpp b/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_Range.hpp index a0b380dbbf..30f4797d83 100644 --- a/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_Range.hpp +++ b/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_Range.hpp @@ -38,40 +38,27 @@ struct OpenACCParallelReduceHelper { } // namespace Kokkos::Experimental::Impl -template -class Kokkos::Impl::ParallelReduce, - ReducerType, Kokkos::Experimental::OpenACC> { - using Policy = RangePolicy; - - using ReducerConditional = - if_c, Functor, ReducerType>; - using ReducerTypeFwd = typename ReducerConditional::type; - using Analysis = - FunctorAnalysis; - - using Pointer = typename Analysis::pointer_type; - using ValueType = typename Analysis::value_type; - - Functor m_functor; +template +class Kokkos::Impl::ParallelReduce, + Kokkos::Experimental::OpenACC> { + using Policy = RangePolicy; + using FunctorType = typename CombinedFunctorReducerType::functor_type; + 
using ReducerType = typename CombinedFunctorReducerType::reducer_type; + + using Pointer = typename ReducerType::pointer_type; + using ValueType = typename ReducerType::value_type; + + CombinedFunctorReducerType m_functor_reducer; Policy m_policy; - ReducerType m_reducer; Pointer m_result_ptr; public: - ParallelReduce(Functor const& functor, Policy const& policy, - ReducerType const& reducer) - : m_functor(functor), - m_policy(policy), - m_reducer(reducer), - m_result_ptr(reducer.view().data()) {} - template - ParallelReduce( - const Functor& functor, const Policy& policy, const ViewType& result, - std::enable_if_t::value, void*> = nullptr) - : m_functor(functor), + ParallelReduce(CombinedFunctorReducerType const& functor_reducer, + Policy const& policy, ViewType const& result) + : m_functor_reducer(functor_reducer), m_policy(policy), - m_reducer(InvalidType()), m_result_ptr(result.data()) {} void execute() const { @@ -83,16 +70,18 @@ class Kokkos::Impl::ParallelReduce, } ValueType val; - typename Analysis::Reducer final_reducer( - ReducerConditional::select(m_functor, m_reducer)); - final_reducer.init(&val); + ReducerType const& reducer = m_functor_reducer.get_reducer(); + reducer.init(&val); Kokkos::Experimental::Impl::OpenACCParallelReduceHelper( - Kokkos::Experimental::Impl::FunctorAdapter(m_functor), - std::conditional_t, ReducerType, - Sum>(val), + Kokkos::Experimental::Impl::FunctorAdapter( + m_functor_reducer.get_functor()), + std::conditional_t< + std::is_same_v, + Sum, typename ReducerType::functor_type>(val), m_policy); + reducer.final(&val); *m_result_ptr = val; } }; diff --git a/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_Team.hpp b/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_Team.hpp index b9576f2ea7..4276f0f167 100644 --- a/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_Team.hpp +++ b/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_Team.hpp @@ -39,26 +39,21 @@ struct OpenACCParallelReduceTeamHelper { } // namespace Kokkos::Experimental::Impl 
-template -class Kokkos::Impl::ParallelReduce +class Kokkos::Impl::ParallelReduce, - ReducerType, Kokkos::Experimental::OpenACC> { + Kokkos::Experimental::OpenACC> { private: using Policy = TeamPolicyInternal; + using FunctorType = typename CombinedFunctorReducerType::functor_type; + using ReducerType = typename CombinedFunctorReducerType::reducer_type; - using ReducerConditional = - Kokkos::Impl::if_c::value, - FunctorType, ReducerType>; - using ReducerTypeFwd = typename ReducerConditional::type; - using Analysis = - FunctorAnalysis; - using value_type = typename Analysis::value_type; - using pointer_type = typename Analysis::pointer_type; + using value_type = typename ReducerType::value_type; + using pointer_type = typename ReducerType::pointer_type; - FunctorType m_functor; + CombinedFunctorReducerType m_functor_reducer; Policy m_policy; - ReducerType m_reducer; pointer_type m_result_ptr; public: @@ -68,35 +63,28 @@ class Kokkos::Impl::ParallelReduce( - m_functor), - std::conditional_t, ReducerType, - Sum>(tmp), + m_functor_reducer.get_functor()), + std::conditional_t< + std::is_same_v, + Sum, typename ReducerType::functor_type>(tmp), m_policy); + reducer.final(&tmp); + m_result_ptr[0] = tmp; } template - ParallelReduce(const FunctorType& arg_functor, const Policy& arg_policy, - const ViewType& arg_result_view, - std::enable_if_t>* = nullptr) - : m_functor(arg_functor), + ParallelReduce(const CombinedFunctorReducerType& arg_functor_reducer, + const Policy& arg_policy, const ViewType& arg_result_view) + : m_functor_reducer(arg_functor_reducer), m_policy(arg_policy), - m_reducer(InvalidType()), m_result_ptr(arg_result_view.data()) {} - - ParallelReduce(const FunctorType& arg_functor, const Policy& arg_policy, - const ReducerType& reducer) - : m_functor(arg_functor), - m_policy(arg_policy), - m_reducer(reducer), - m_result_ptr(reducer.view().data()) {} }; namespace Kokkos { From 4dcb2946de4d2936349acddeeee2102e06d579fa Mon Sep 17 00:00:00 2001 From: Damien L-G 
Date: Thu, 2 Mar 2023 18:43:05 -0500 Subject: [PATCH 294/496] Use KOKKOS_ARCH_NVIDIA_GPU macro in SYCL, OpenACC, and OpenMPTarget backends where appropriate --- core/src/OpenACC/Kokkos_OpenACC_Traits.hpp | 3 +-- core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp | 7 ++----- core/src/SYCL/Kokkos_SYCL.cpp | 5 +---- core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp | 10 ++-------- 4 files changed, 6 insertions(+), 19 deletions(-) diff --git a/core/src/OpenACC/Kokkos_OpenACC_Traits.hpp b/core/src/OpenACC/Kokkos_OpenACC_Traits.hpp index c8a6dfec6f..97d34d19a3 100644 --- a/core/src/OpenACC/Kokkos_OpenACC_Traits.hpp +++ b/core/src/OpenACC/Kokkos_OpenACC_Traits.hpp @@ -22,8 +22,7 @@ namespace Kokkos::Experimental::Impl { struct OpenACC_Traits { -#if defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \ - defined(KOKKOS_ARCH_AMPERE) || defined(KOKKOS_ARCH_HOPPER) +#if defined(KOKKOS_ARCH_NVIDIA_GPU) static constexpr acc_device_t dev_type = acc_device_nvidia; static constexpr bool may_fallback_to_host = false; #else diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp index 564f299ab5..abe1dad73d 100644 --- a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp +++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp @@ -93,13 +93,10 @@ void OpenMPTargetInternal::impl_initialize() { // FIXME_OPENMPTARGET: Only fix the number of teams for NVIDIA architectures // from Pascal and upwards. -#if defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \ - defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE) || \ - defined(KOKKOS_ARCH_HOPPER) -#if defined(KOKKOS_COMPILER_CLANG) && (KOKKOS_COMPILER_CLANG >= 1300) +#if defined(KOKKOS_ARCH_NVIDIA_GPU) && defined(KOKKOS_COMPILER_CLANG) && \ + (KOKKOS_COMPILER_CLANG >= 1300) omp_set_num_teams(512); #endif -#endif } int OpenMPTargetInternal::impl_is_initialized() { return m_is_initialized ? 
1 : 0; diff --git a/core/src/SYCL/Kokkos_SYCL.cpp b/core/src/SYCL/Kokkos_SYCL.cpp index c665631dd6..72facc856b 100644 --- a/core/src/SYCL/Kokkos_SYCL.cpp +++ b/core/src/SYCL/Kokkos_SYCL.cpp @@ -128,10 +128,7 @@ void SYCL::impl_initialize(InitializationSettings const& settings) { // If the device id is not specified and there are no GPUs, sidestep Kokkos // device selection and use whatever is available (if no GPU architecture is // specified). -#if !defined(KOKKOS_ARCH_INTEL_GPU) && !defined(KOKKOS_ARCH_KEPLER) && \ - !defined(KOKKOS_ARCH_MAXWELL) && !defined(KOKKOS_ARCH_PASCAL) && \ - !defined(KOKKOS_ARCH_VOLTA) && !defined(KOKKOS_ARCH_TURING75) && \ - !defined(KOKKOS_ARCH_AMPERE) && !defined(KOKKOS_ARCH_HOPPER) +#if !defined(KOKKOS_ARCH_INTEL_GPU) && !defined(KOKKOS_ARCH_NVIDIA_GPU) if (!settings.has_device_id() && gpu_devices.empty()) { Impl::SYCLInternal::singleton().initialize(sycl::device()); Impl::SYCLInternal::m_syclDev = 0; diff --git a/core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp b/core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp index 59e9a7d515..be9a384c78 100644 --- a/core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp +++ b/core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp @@ -304,10 +304,7 @@ class TeamPolicyInternal return std::min({ int(m_space.impl_internal_space_instance()->m_maxWorkgroupSize), // FIXME_SYCL Avoid requesting to many registers on NVIDIA GPUs. -#if defined(KOKKOS_ARCH_KEPLER) || defined(KOKKOS_ARCH_MAXWELL) || \ - defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \ - defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE) || \ - defined(KOKKOS_ARCH_HOPPER) +#if defined(KOKKOS_ARCH_NVIDIA_GPU) 256, #endif max_threads_for_memory @@ -337,10 +334,7 @@ class TeamPolicyInternal return std::min({ int(m_space.impl_internal_space_instance()->m_maxWorkgroupSize), // FIXME_SYCL Avoid requesting to many registers on NVIDIA GPUs. 
-#if defined(KOKKOS_ARCH_KEPLER) || defined(KOKKOS_ARCH_MAXWELL) || \ - defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \ - defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE) || \ - defined(KOKKOS_ARCH_HOPPER) +#if defined(KOKKOS_ARCH_NVIDIA_GPU) 256, #endif max_threads_for_memory From 6d24bc083b637425a0b92357e34272ae9d81f323 Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Tue, 21 Feb 2023 15:45:07 -0700 Subject: [PATCH 295/496] Update changelog to 4.0.0 --- CHANGELOG.md | 103 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 103 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6ab6252ec4..f3f0086267 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,108 @@ # Change Log +## [4.0.0](https://github.com/kokkos/kokkos/tree/4.0.0) (2023-02-21) +[Full Changelog](https://github.com/kokkos/kokkos/compare/3.7.01...4.0.0) + +### Features: +- Allow value types without default constructor in `Kokkos::View` with `Kokkos::WithoutInitializing` [\#5307](https://github.com/kokkos/kokkos/pull/5307) +- `parallel_scan` with `View` as result type. [\#5146](https://github.com/kokkos/kokkos/pull/5146) +- Introduced `SharedSpace`, an alias for a `MemorySpace` that is accessible by every `ExecutionSpace`. The memory is moved and then accessed locally. [\#5289](https://github.com/kokkos/kokkos/pull/5289) +- Introduced `SharedHostPinnedSpace`, an alias for a `MemorySpace` that is accessible by every `ExecutionSpace`. The memory is pinned to the host and accessed via zero-copy access. [\#5405](https://github.com/kokkos/kokkos/pull/5405) +- Groundwork for `MDSpan` integration. [\#4973](https://github.com/kokkos/kokkos/pull/4973) and [\#5304](https://github.com/kokkos/kokkos/pull/5304) +- Introduced MD version of hierarchical parallelism: `TeamThreadMDRange`, `ThreadVectorMDRange` and `TeamVectorMDRange`. 
[\#5238](https://github.com/kokkos/kokkos/pull/5238) + +### Backend and Architecture Enhancements: + +#### CUDA: +- Allow CUDA PTX forward compatibility [\#3612](https://github.com/kokkos/kokkos/pull/3612) [\#5536](https://github.com/kokkos/kokkos/pull/5536) [\#5527](https://github.com/kokkos/kokkos/pull/5527) +- Add support for NVIDIA Hopper GPU architecture [\#5538](https://github.com/kokkos/kokkos/pull/5538) +- Don't rely on synchronization behavior of default stream in CUDA and HIP [\#5391](https://github.com/kokkos/kokkos/pull/5391) +- Improve CUDA cache config settings [\#5706](https://github.com/kokkos/kokkos/pull/5706) + +#### HIP: + - Move `HIP`, `HIPSpace`, `HIPHostPinnedSpace`, and `HIPManagedSpace` out of the `Experimental` namespace [\#5383](https://github.com/kokkos/kokkos/pull/5383) + - Don't rely on synchronization behavior of default stream in CUDA and HIP [\#5391](https://github.com/kokkos/kokkos/pull/5391) + - Export AMD architecture flag when using Trilinos [\#5528](https://github.com/kokkos/kokkos/pull/5528) + - Fix linking error (see [OLCF issue](https://docs.olcf.ornl.gov/systems/crusher_quick_start_guide.html#olcfdev-1167-kokkos-build-failures-with-prgenv-amd)) when using `amdclang`: [\#5539](https://github.com/kokkos/kokkos/pull/5539) + - Remove support for MI25 and added support for Navi 1030 [\#5522](https://github.com/kokkos/kokkos/pull/5522) + - Fix race condition when using `HSA_XNACK=1` [\#5755](https://github.com/kokkos/kokkos/pull/5755) + - Add parameter to force using GlobalMemory launch mechanism. This can be used when encountering compiler bugs with ROCm 5.3 and 5.4 [\#5796](https://github.com/kokkos/kokkos/pull/5796) + +#### SYCL: +- Delegate choice of workgroup size for `parallel_reduce` with `RangePolicy` to the compiler. 
[\#5227](https://github.com/kokkos/kokkos/pull/5227) +- SYCL `RangePolicy`: manually specify workgroup size through chunk size [\#4875](https://github.com/kokkos/kokkos/pull/4875) + +#### OpenMPTarget: +- Select the right device [\#5492](https://github.com/kokkos/kokkos/pull/5492) + +#### OpenMP: + - Add `partition_space` [\#5105](https://github.com/kokkos/kokkos/pull/5105) + +### General Enhancements +- Implement `OffsetView` constructor taking `pair`s and `ViewCtorProp` [\#5303](https://github.com/kokkos/kokkos/pull/5303) +- Promote math constants to `Kokkos::numbers` namespace [\#5434](https://github.com/kokkos/kokkos/pull/5434) +- Add overloads of `hypot` math function that take 3 arguments [\#5341](https://github.com/kokkos/kokkos/pull/5341) +- Add `fma` fused multiply-add math function [\#5428](https://github.com/kokkos/kokkos/pull/5428) +- Views using `MemoryTraits::Atomic` don't need `volatile` overloads for the value type anymore. [\#5455](https://github.com/kokkos/kokkos/pull/5455) +- Added `is_team_handle` trait [\#5375](https://github.com/kokkos/kokkos/pull/5375) +- Refactor desul atomics to support compiling CUDA with NVC++ [\#5431](https://github.com/kokkos/kokkos/pull/5431) [\#5497](https://github.com/kokkos/kokkos/pull/5497) [\#5498](https://github.com/kokkos/kokkos/pull/5498) +- Support finding `libquadmath` with native compiler support [\#5286](https://github.com/kokkos/kokkos/pull/5286) +- Add architecture flags for MSVC [\#5673](https://github.com/kokkos/kokkos/pull/5673) +- SIMD backend for ARM NEON [\#5829](https://github.com/kokkos/kokkos/pull/5829) + +### Build System Changes +- Let CMake determine OpenMP flags. [\#4105](https://github.com/kokkos/kokkos/pull/4105) +- Update minimum compiler versions. 
[\#5323](https://github.com/kokkos/kokkos/pull/5323) +- Makefile and CMake support for C++23 [\#5283](https://github.com/kokkos/kokkos/pull/5283) +- Do not add `-cuda` to the link line with NVHPC compiler when the CUDA backend is not actually enabled [\#5485](https://github.com/kokkos/kokkos/pull/5485) +- Only add `-latomic` in generated GNU makefiles when OpenMPTarget backend is enabled [\#5501](https://github.com/kokkos/kokkos/pull/5501) [\#5537](https://github.com/kokkos/kokkos/pull/5537) (3.7 patch release candidate) +- `Kokkos_ENABLE_CUDA_LAMBDA` now `ON` by default with NVCC [\#5580](https://github.com/kokkos/kokkos/pull/5580) +- Fix enabling of relocatable device code when using CUDA as CMake language [\#5564](https://github.com/kokkos/kokkos/pull/5564) +- Fix cmake configuration with CUDA 12 [\#5691](https://github.com/kokkos/kokkos/pull/5691) + +### Incompatibilities (i.e. breaking changes) +- ***Require C++17*** [\#5277](https://github.com/kokkos/kokkos/pull/5277) +- Turn setting `Kokkos_CXX_STANDARD` into an error [\#5293](https://github.com/kokkos/kokkos/pull/5293) +- Remove all deprecations in Kokkos 3 [\#5297](https://github.com/kokkos/kokkos/pull/5297) +- Remove `KOKKOS_COMPILER_CUDA_VERSION` [\#5430](https://github.com/kokkos/kokkos/pull/5430) +- Drop `reciprocal_overflow_threshold` numeric trait [\#5326](https://github.com/kokkos/kokkos/pull/5326) +- Move `reduction_identity` out of `` into a new `` header [\#5450](https://github.com/kokkos/kokkos/pull/5450) +- Reduction and scan routines will report an error if the `join()` operator they would use takes `volatile`-qualified parameters [\#5409](https://github.com/kokkos/kokkos/pull/5409) +- `ENABLE_CUDA_UVM` is dropped in favor of using `SharedSpace` as `MemorySpace` explicitly [\#5608](https://github.com/kokkos/kokkos/pull/5608) +- Remove Kokkos_ENABLE_CUDA_LDG_INTRINSIC option [\#5623](https://github.com/kokkos/kokkos/pull/5623) +- Don't rely on synchronization behavior of default stream in CUDA 
and HIP - this potentially will break unintended implicit synchronization with other libraries such as MPI [\#5391](https://github.com/kokkos/kokkos/pull/5391) +- Make ExecutionSpace::concurrency() a non-static member function [\#5655](https://github.com/kokkos/kokkos/pull/5655) and related PRs + +### Deprecations +- Guard against non-public header inclusion [\#5178](https://github.com/kokkos/kokkos/pull/5178) +- Raise deprecation warnings if non empty WorkTag class is used [\#5230](https://github.com/kokkos/kokkos/pull/5230) +- Deprecate `parallel_*` overloads taking the label as trailing argument [\#5141](https://github.com/kokkos/kokkos/pull/5141) +- Deprecate nested types in functional [\#5185](https://github.com/kokkos/kokkos/pull/5185) +- Deprecate `InitArguments` struct and replace it with `InitializationSettings` [\#5135](https://github.com/kokkos/kokkos/pull/5135) +- Deprecate `finalize_all()` [\#5134](https://github.com/kokkos/kokkos/pull/5134) +- Deprecate command line arguments (other than `--help`) that are not prefixed with `kokkos-*` [\#5120](https://github.com/kokkos/kokkos/pull/5120) +- Deprecate `--[kokkos-]numa` cmdline arg and `KOKKOS_NUMA` env var [\#5117](https://github.com/kokkos/kokkos/pull/5117) +- Deprecate `--[kokkos-]threads` command line argument in favor of `--[kokkos-]num-threads` [\#5111](https://github.com/kokkos/kokkos/pull/5111) +- Deprecate `Kokkos::is_reducer_type` [\#4957](https://github.com/kokkos/kokkos/pull/4957) +- Deprecate `OffsetView` constructors taking `index_list_type` [\#4810](https://github.com/kokkos/kokkos/pull/4810) +- Deprecate overloads of `Kokkos::sort` taking a parameter `bool always_use_kokkos_sort` [\#5382](https://github.com/kokkos/kokkos/issues/5382) +- Deprecate `CudaUVMSpace::available()` which always returned `true` [\#5614](https://github.com/kokkos/kokkos/pull/5614) +- Deprecate `volatile`-qualified members from `Kokkos::pair` and `Kokkos::complex` [\#5412](https://github.com/kokkos/kokkos/pull/5412) 
+- Deprecate `KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_*` macros [\#5824](https://github.com/kokkos/kokkos/pull/5824) (oversight in 3.2) + +### Bug Fixes +- Avoid allocating memory for `UniqueToken` [\#5300](https://github.com/kokkos/kokkos/pull/5300) +- Fix `pragma ivdep` in `Kokkos_OpenMP_Parallel.hpp` [\#5356](https://github.com/kokkos/kokkos/pull/5356) +- Fix configuring with Threads support when rerunning CMake [\#5486](https://github.com/kokkos/kokkos/pull/5486) +- Fix View assignment between `LayoutLeft` and `LayoutRight` with static extents [\#5535](https://github.com/kokkos/kokkos/pull/5535) (3.7 patch release candidate) +- Add `fence()` calls to sorting routine overloads that don't take an execution space parameter [\#5389](https://github.com/kokkos/kokkos/pull/5389) +- `ClockTic` changed to 64 bit to fix overflow on Power [\#5577](https://github.com/kokkos/kokkos/pull/5577) (incl. in 3.7.01 patch release) +- Fix incorrect offset in CUDA and HIP `parallel_scan` for < 4 byte types [\#5555](https://github.com/kokkos/kokkos/pull/5555) (3.7 patch release candidate) +- Fix incorrect alignment behavior of scratch allocations in some corner cases (e.g. 
very small allocations) [\#5687](https://github.com/kokkos/kokkos/pull/5687) (3.7 patch release candidate) +- Add missing `ReductionIdentity` specialization [\#5798](https://github.com/kokkos/kokkos/pull/5798) +- Don't install standard algorithms headers multiple times [\#5670](https://github.com/kokkos/kokkos/pull/5670) +- Fix max scratch size calculation for level 0 scratch in CUDA and HIP [\#5718](https://github.com/kokkos/kokkos/pull/5718) + ## [3.7.01](https://github.com/kokkos/kokkos/tree/3.7.01) (2022-12-01) [Full Changelog](https://github.com/kokkos/kokkos/compare/3.7.00...3.7.01) From 952b841a391c115066c64befd8d1dba48225e18c Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Fri, 3 Mar 2023 19:02:46 +0100 Subject: [PATCH 296/496] Fix Kokkos_Threads_Parallel_MDRange.hpp --- core/src/Threads/Kokkos_Threads_Parallel_MDRange.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/core/src/Threads/Kokkos_Threads_Parallel_MDRange.hpp b/core/src/Threads/Kokkos_Threads_Parallel_MDRange.hpp index 29eceb36ea..3ba8d27f5c 100644 --- a/core/src/Threads/Kokkos_Threads_Parallel_MDRange.hpp +++ b/core/src/Threads/Kokkos_Threads_Parallel_MDRange.hpp @@ -209,14 +209,14 @@ class ParallelReduce + template ParallelReduce(const CombinedFunctorReducerType &arg_functor_reducer, const MDRangePolicy &arg_policy, - const HostViewType &arg_result_view) + const ViewType &arg_result_view) : m_iter(arg_policy, arg_functor_reducer.get_functor()), m_reducer(arg_functor_reducer.get_reducer()), m_result_ptr(arg_result_view.data()) { - static_assert(Kokkos::is_view::value, + static_assert(Kokkos::is_view::value, "Kokkos::Threads reduce result must be a View"); static_assert( From 51fbd42a2135124bb2060f083cfa7fe04a9115d2 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Fri, 3 Mar 2023 16:41:33 -0500 Subject: [PATCH 297/496] Drop unused ParallelX::WorkRange member types --- core/src/HPX/Kokkos_HPX.hpp | 20 ++++++++----------- core/src/OpenMP/Kokkos_OpenMP_Parallel.hpp | 20 
++++++++----------- .../Kokkos_OpenMPTarget_ParallelFor_Range.hpp | 7 +++---- ...kkos_OpenMPTarget_ParallelReduce_Range.hpp | 3 +-- ...Kokkos_OpenMPTarget_ParallelScan_Range.hpp | 7 +++---- core/src/SYCL/Kokkos_SYCL_Parallel_Scan.hpp | 1 - 6 files changed, 23 insertions(+), 35 deletions(-) diff --git a/core/src/HPX/Kokkos_HPX.hpp b/core/src/HPX/Kokkos_HPX.hpp index 53b4ac1da3..ee03fbfe58 100644 --- a/core/src/HPX/Kokkos_HPX.hpp +++ b/core/src/HPX/Kokkos_HPX.hpp @@ -908,10 +908,9 @@ template class ParallelFor, Kokkos::Experimental::HPX> { private: - using Policy = Kokkos::RangePolicy; - using WorkTag = typename Policy::work_tag; - using WorkRange = typename Policy::WorkRange; - using Member = typename Policy::member_type; + using Policy = Kokkos::RangePolicy; + using WorkTag = typename Policy::work_tag; + using Member = typename Policy::member_type; const FunctorType m_functor; const Policy m_policy; @@ -948,7 +947,6 @@ class ParallelFor, using MDRangePolicy = Kokkos::MDRangePolicy; using Policy = typename MDRangePolicy::impl_range_policy; using WorkTag = typename MDRangePolicy::work_tag; - using WorkRange = typename Policy::WorkRange; using Member = typename Policy::member_type; using iterate_type = typename Kokkos::Impl::HostIterateTile, using FunctorType = typename CombinedFunctorReducerType::functor_type; using ReducerType = typename CombinedFunctorReducerType::reducer_type; - using WorkTag = typename Policy::work_tag; - using WorkRange = typename Policy::WorkRange; - using Member = typename Policy::member_type; + using WorkTag = typename Policy::work_tag; + using Member = typename Policy::member_type; using value_type = typename ReducerType::value_type; using pointer_type = typename ReducerType::pointer_type; @@ -1101,10 +1098,9 @@ class ParallelReduce class ParallelFor, Kokkos::OpenMP> { private: - using Policy = Kokkos::RangePolicy; - using WorkTag = typename Policy::work_tag; - using WorkRange = typename Policy::WorkRange; - using Member = typename 
Policy::member_type; + using Policy = Kokkos::RangePolicy; + using WorkTag = typename Policy::work_tag; + using Member = typename Policy::member_type; OpenMPInternal* m_instance; const FunctorType m_functor; @@ -178,8 +177,7 @@ class ParallelFor, using Policy = typename MDRangePolicy::impl_range_policy; using WorkTag = typename MDRangePolicy::work_tag; - using WorkRange = typename Policy::WorkRange; - using Member = typename Policy::member_type; + using Member = typename Policy::member_type; using index_type = typename Policy::index_type; using iterate_type = typename Kokkos::Impl::HostIterateTile< @@ -300,9 +298,8 @@ class ParallelReduce, using FunctorType = typename CombinedFunctorReducerType::functor_type; using ReducerType = typename CombinedFunctorReducerType::reducer_type; - using WorkTag = typename Policy::work_tag; - using WorkRange = typename Policy::WorkRange; - using Member = typename Policy::member_type; + using WorkTag = typename Policy::work_tag; + using Member = typename Policy::member_type; using pointer_type = typename ReducerType::pointer_type; using reference_type = typename ReducerType::reference_type; @@ -463,9 +460,8 @@ class ParallelReduce class ParallelFor, Kokkos::Experimental::OpenMPTarget> { private: - using Policy = Kokkos::RangePolicy; - using WorkTag = typename Policy::work_tag; - using WorkRange = typename Policy::WorkRange; - using Member = typename Policy::member_type; + using Policy = Kokkos::RangePolicy; + using WorkTag = typename Policy::work_tag; + using Member = typename Policy::member_type; const FunctorType m_functor; const Policy m_policy; diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Range.hpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Range.hpp index 1a574f16be..9153402596 100644 --- a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Range.hpp +++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Range.hpp @@ -31,8 +31,7 @@ class ParallelReduce, ReducerType, private: 
using Policy = Kokkos::RangePolicy; - using WorkTag = typename Policy::work_tag; - using WorkRange = typename Policy::WorkRange; + using WorkTag = typename Policy::work_tag; using ReducerTypeFwd = std::conditional_t::value, diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Range.hpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Range.hpp index 61566b75c9..1900260e2a 100644 --- a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Range.hpp +++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Range.hpp @@ -30,10 +30,9 @@ class ParallelScan, protected: using Policy = Kokkos::RangePolicy; - using WorkTag = typename Policy::work_tag; - using WorkRange = typename Policy::WorkRange; - using Member = typename Policy::member_type; - using idx_type = typename Policy::index_type; + using WorkTag = typename Policy::work_tag; + using Member = typename Policy::member_type; + using idx_type = typename Policy::index_type; using Analysis = Impl::FunctorAnalysis; diff --git a/core/src/SYCL/Kokkos_SYCL_Parallel_Scan.hpp b/core/src/SYCL/Kokkos_SYCL_Parallel_Scan.hpp index bc62ecc452..4b7964729a 100644 --- a/core/src/SYCL/Kokkos_SYCL_Parallel_Scan.hpp +++ b/core/src/SYCL/Kokkos_SYCL_Parallel_Scan.hpp @@ -97,7 +97,6 @@ class ParallelScanSYCLBase { protected: using Member = typename Policy::member_type; using WorkTag = typename Policy::work_tag; - using WorkRange = typename Policy::WorkRange; using LaunchBounds = typename Policy::launch_bounds; public: From 90836d29d96dfb5be25aef90109e350dbb86b11b Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Mon, 6 Mar 2023 10:10:54 -0500 Subject: [PATCH 298/496] Convert SYCL ParallelReduce --- core/src/Kokkos_Parallel_Reduce.hpp | 6 + core/src/SYCL/Kokkos_SYCL_Parallel_Reduce.hpp | 221 +++++++----------- core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp | 141 +++++------ 3 files changed, 144 insertions(+), 224 deletions(-) diff --git a/core/src/Kokkos_Parallel_Reduce.hpp b/core/src/Kokkos_Parallel_Reduce.hpp index 
b5b25c3009..c31c3bde92 100644 --- a/core/src/Kokkos_Parallel_Reduce.hpp +++ b/core/src/Kokkos_Parallel_Reduce.hpp @@ -1448,6 +1448,12 @@ struct implements_new_reduce_interface : std::true_type {}; #endif +#ifdef KOKKOS_ENABLE_SYCL +template <> +struct implements_new_reduce_interface + : std::true_type {}; +#endif + template class ParallelReduceWrapper { diff --git a/core/src/SYCL/Kokkos_SYCL_Parallel_Reduce.hpp b/core/src/SYCL/Kokkos_SYCL_Parallel_Reduce.hpp index 77a3e71d06..1a3350cedc 100644 --- a/core/src/SYCL/Kokkos_SYCL_Parallel_Reduce.hpp +++ b/core/src/SYCL/Kokkos_SYCL_Parallel_Reduce.hpp @@ -179,50 +179,32 @@ std::enable_if_t> workgroup_reduction( } // namespace SYCLReduction -template -class ParallelReduce, ReducerType, +template +class ParallelReduce, Kokkos::Experimental::SYCL> { public: - using Policy = Kokkos::RangePolicy; + using Policy = Kokkos::RangePolicy; + using FunctorType = typename CombinedFunctorReducerType::functor_type; + using ReducerType = typename CombinedFunctorReducerType::reducer_type; private: - using ReducerConditional = - Kokkos::Impl::if_c::value, - FunctorType, ReducerType>; - using ReducerTypeFwd = typename ReducerConditional::type; - using Analysis = - FunctorAnalysis; - using execution_space = typename Analysis::execution_space; - using value_type = typename Analysis::value_type; - using pointer_type = typename Analysis::pointer_type; - using reference_type = typename Analysis::reference_type; + using value_type = typename ReducerType::value_type; + using pointer_type = typename ReducerType::pointer_type; + using reference_type = typename ReducerType::reference_type; using WorkTag = typename Policy::work_tag; public: // V - View - template - ParallelReduce(const FunctorType& f, const Policy& p, const V& v, - std::enable_if_t::value, void*> = nullptr) - : m_functor(f), + template + ParallelReduce(const CombinedFunctorReducerType& f, const Policy& p, + const View& v) + : m_functor_reducer(f), m_policy(p), m_result_ptr(v.data()), 
m_result_ptr_device_accessible( MemorySpaceAccess::accessible), - m_shared_memory_lock( - p.space().impl_internal_space_instance()->m_mutexScratchSpace) {} - - ParallelReduce(const FunctorType& f, const Policy& p, - const ReducerType& reducer) - : m_functor(f), - m_policy(p), - m_reducer(reducer), - m_result_ptr(reducer.view().data()), - m_result_ptr_device_accessible( - MemorySpaceAccess::accessible), + typename View::memory_space>::accessible), m_shared_memory_lock( p.space().impl_internal_space_instance()->m_mutexScratchSpace) {} @@ -242,7 +224,7 @@ class ParallelReduce, ReducerType, constexpr size_t values_per_thread = 2; std::size_t size = policy.end() - policy.begin(); const unsigned int value_count = - Analysis::value_count(ReducerConditional::select(m_functor, m_reducer)); + m_functor_reducer.get_reducer().value_count(); sycl::device_ptr results_ptr = nullptr; sycl::global_ptr device_accessible_result_ptr = m_result_ptr_device_accessible ? m_result_ptr : nullptr; @@ -261,22 +243,18 @@ class ParallelReduce, ReducerType, const auto begin = policy.begin(); cgh.depends_on(memcpy_events); cgh.single_task([=]() { - const auto& functor = functor_wrapper.get_functor(); - const auto& selected_reducer = ReducerConditional::select( - static_cast(functor), - static_cast(reducer_wrapper.get_functor())); - typename Analysis::Reducer final_reducer(selected_reducer); - reference_type update = final_reducer.init(results_ptr); + const FunctorType& functor = functor_wrapper.get_functor(); + const ReducerType& reducer = reducer_wrapper.get_functor(); + reference_type update = reducer.init(results_ptr); if (size == 1) { - if constexpr (std::is_void::value) + if constexpr (std::is_void_v) functor(begin, update); else functor(WorkTag(), begin, update); } - final_reducer.final(results_ptr); + reducer.final(results_ptr); if (device_accessible_result_ptr != nullptr) - final_reducer.copy(device_accessible_result_ptr.get(), - results_ptr.get()); + 
reducer.copy(device_accessible_result_ptr.get(), results_ptr.get()); }); }); q.ext_oneapi_submit_barrier( @@ -306,23 +284,19 @@ class ParallelReduce, ReducerType, const auto global_id = wgroup_size * item.get_group_linear_id() * values_per_thread + local_id; - const auto& functor = functor_wrapper.get_functor(); - const auto& selected_reducer = ReducerConditional::select( - static_cast(functor), - static_cast( - reducer_wrapper.get_functor())); - typename Analysis::Reducer final_reducer(selected_reducer); + const FunctorType& functor = functor_wrapper.get_functor(); + const ReducerType& reducer = reducer_wrapper.get_functor(); using index_type = typename Policy::index_type; const auto upper_bound = std::min( global_id + values_per_thread * wgroup_size, size); - if constexpr (Analysis::StaticValueSize == 0) { + if constexpr (ReducerType::static_value_size() == 0) { reference_type update = - final_reducer.init(&local_mem[local_id * value_count]); + reducer.init(&local_mem[local_id * value_count]); for (index_type id = global_id; id < upper_bound; id += wgroup_size) { - if constexpr (std::is_void::value) + if constexpr (std::is_void_v) functor(id + begin, update); else functor(WorkTag(), id + begin, update); @@ -331,8 +305,8 @@ class ParallelReduce, ReducerType, SYCLReduction::workgroup_reduction<>( item, local_mem.get_pointer(), results_ptr, - device_accessible_result_ptr, value_count, final_reducer, - false, std::min(size, wgroup_size)); + device_accessible_result_ptr, value_count, reducer, false, + std::min(size, wgroup_size)); if (local_id == 0) { sycl::atomic_ref, ReducerType, item.barrier(sycl::access::fence_space::local_space); if (num_teams_done[0] == n_wgroups) { if (local_id >= n_wgroups) - final_reducer.init(&local_mem[local_id * value_count]); + reducer.init(&local_mem[local_id * value_count]); else { - final_reducer.copy(&local_mem[local_id * value_count], - &results_ptr[local_id * value_count]); + reducer.copy(&local_mem[local_id * value_count], + 
&results_ptr[local_id * value_count]); for (unsigned int id = local_id + wgroup_size; id < n_wgroups; id += wgroup_size) { - final_reducer.join(&local_mem[local_id * value_count], - &results_ptr[id * value_count]); + reducer.join(&local_mem[local_id * value_count], + &results_ptr[id * value_count]); } } SYCLReduction::workgroup_reduction<>( item, local_mem.get_pointer(), results_ptr, - device_accessible_result_ptr, value_count, final_reducer, - true, std::min(n_wgroups, wgroup_size)); + device_accessible_result_ptr, value_count, reducer, true, + std::min(n_wgroups, wgroup_size)); } } else { value_type local_value; - reference_type update = final_reducer.init(&local_value); + reference_type update = reducer.init(&local_value); for (index_type id = global_id; id < upper_bound; id += wgroup_size) { - if constexpr (std::is_void::value) + if constexpr (std::is_void_v) functor(id + begin, update); else functor(WorkTag(), id + begin, update); @@ -373,7 +347,7 @@ class ParallelReduce, ReducerType, SYCLReduction::workgroup_reduction<>( item, local_mem.get_pointer(), local_value, results_ptr, - device_accessible_result_ptr, final_reducer, false, + device_accessible_result_ptr, reducer, false, std::min(size, wgroup_size)); if (local_id == 0) { @@ -386,18 +360,18 @@ class ParallelReduce, ReducerType, item.barrier(sycl::access::fence_space::local_space); if (num_teams_done[0] == n_wgroups) { if (local_id >= n_wgroups) - final_reducer.init(&local_value); + reducer.init(&local_value); else { local_value = results_ptr[local_id]; for (unsigned int id = local_id + wgroup_size; id < n_wgroups; id += wgroup_size) { - final_reducer.join(&local_value, &results_ptr[id]); + reducer.join(&local_value, &results_ptr[id]); } } SYCLReduction::workgroup_reduction<>( item, local_mem.get_pointer(), local_value, results_ptr, - device_accessible_result_ptr, final_reducer, true, + device_accessible_result_ptr, reducer, true, std::min(n_wgroups, wgroup_size)); } } @@ -486,9 +460,9 @@ class 
ParallelReduce, ReducerType, IndirectKernelMem& indirectReducerMem = instance.get_indirect_kernel_mem(); auto functor_wrapper = Experimental::Impl::make_sycl_function_wrapper( - m_functor, indirectKernelMem); + m_functor_reducer.get_functor(), indirectKernelMem); auto reducer_wrapper = Experimental::Impl::make_sycl_function_wrapper( - m_reducer, indirectReducerMem); + m_functor_reducer.get_reducer(), indirectReducerMem); sycl::event event = sycl_direct_launch( m_policy, functor_wrapper, reducer_wrapper, @@ -498,9 +472,8 @@ class ParallelReduce, ReducerType, } private: - const FunctorType m_functor; + const CombinedFunctorReducerType m_functor_reducer; const Policy m_policy; - const ReducerType m_reducer; const pointer_type m_result_ptr; const bool m_result_ptr_device_accessible; @@ -509,23 +482,19 @@ class ParallelReduce, ReducerType, std::scoped_lock m_shared_memory_lock; }; -template -class ParallelReduce, ReducerType, +template +class ParallelReduce, Kokkos::Experimental::SYCL> { public: - using Policy = Kokkos::MDRangePolicy; + using Policy = Kokkos::MDRangePolicy; + using FunctorType = typename CombinedFunctorReducerType::functor_type; + using ReducerType = typename CombinedFunctorReducerType::reducer_type; private: - using ReducerConditional = - Kokkos::Impl::if_c::value, - FunctorType, ReducerType>; - using ReducerTypeFwd = typename ReducerConditional::type; - using Analysis = - FunctorAnalysis; - using execution_space = typename Analysis::execution_space; - using value_type = typename Analysis::value_type; - using pointer_type = typename Analysis::pointer_type; - using reference_type = typename Analysis::reference_type; + using value_type = typename ReducerType::value_type; + using pointer_type = typename ReducerType::pointer_type; + using reference_type = typename ReducerType::reference_type; using WorkTag = typename Policy::work_tag; @@ -554,30 +523,16 @@ class ParallelReduce, ReducerType, public: // V - View - template - ParallelReduce(const FunctorType& 
f, const Policy& p, const V& v, - std::enable_if_t::value, void*> = nullptr) - : m_functor(f), + template + ParallelReduce(const CombinedFunctorReducerType& f, const Policy& p, + const View& v) + : m_functor_reducer(f), m_policy(p), m_space(p.space()), m_result_ptr(v.data()), m_result_ptr_device_accessible( MemorySpaceAccess::accessible), - m_shared_memory_lock( - m_space.impl_internal_space_instance()->m_mutexScratchSpace) {} - - ParallelReduce(const FunctorType& f, const Policy& p, - const ReducerType& reducer) - : m_functor(f), - m_policy(p), - m_space(p.space()), - m_reducer(reducer), - m_result_ptr(reducer.view().data()), - m_result_ptr_device_accessible( - MemorySpaceAccess::accessible), + typename View::memory_space>::accessible), m_shared_memory_lock( m_space.impl_internal_space_instance()->m_mutexScratchSpace) {} @@ -607,7 +562,7 @@ class ParallelReduce, ReducerType, const auto init_size = std::max((size + wgroup_size - 1) / wgroup_size, 1); const unsigned int value_count = - Analysis::value_count(ReducerConditional::select(m_functor, m_reducer)); + m_functor_reducer.get_reducer().value_count(); const auto results_ptr = static_cast>(instance.scratch_space( sizeof(value_type) * std::max(value_count, 1u) * init_size)); @@ -625,13 +580,10 @@ class ParallelReduce, ReducerType, auto parallel_reduce_event = q.submit([&](sycl::handler& cgh) { cgh.depends_on(memcpy_events); cgh.single_task([=]() { - const auto& functor = functor_wrapper.get_functor(); - const auto& selected_reducer = ReducerConditional::select( - static_cast(functor), - static_cast(reducer_wrapper.get_functor())); - typename Analysis::Reducer final_reducer(selected_reducer); + const FunctorType& functor = functor_wrapper.get_functor(); + const ReducerType& reducer = reducer_wrapper.get_functor(); - reference_type update = final_reducer.init(results_ptr); + reference_type update = reducer.init(results_ptr); if (size == 1) { Kokkos::Impl::Reduce::DeviceIterateTile< Policy::rank, BarePolicy, 
FunctorType, @@ -639,10 +591,9 @@ class ParallelReduce, ReducerType, policy, functor, update, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}) .exec_range(); } - final_reducer.final(results_ptr); + reducer.final(results_ptr); if (device_accessible_result_ptr) - final_reducer.copy(device_accessible_result_ptr.get(), - results_ptr.get()); + reducer.copy(device_accessible_result_ptr.get(), results_ptr.get()); }); }); q.ext_oneapi_submit_barrier( @@ -666,12 +617,9 @@ class ParallelReduce, ReducerType, cgh.depends_on(memcpy_events); cgh.parallel_for(range, [=](sycl::nd_item<1> item) { - const auto local_id = item.get_local_linear_id(); - const auto& functor = functor_wrapper.get_functor(); - const auto& selected_reducer = ReducerConditional::select( - static_cast(functor), - static_cast(reducer_wrapper.get_functor())); - typename Analysis::Reducer final_reducer(selected_reducer); + const auto local_id = item.get_local_linear_id(); + const FunctorType& functor = functor_wrapper.get_functor(); + const ReducerType& reducer = reducer_wrapper.get_functor(); // In the first iteration, we call functor to initialize the local // memory. 
Otherwise, the local memory is initialized with the @@ -690,9 +638,9 @@ class ParallelReduce, ReducerType, const index_type n_global_y = 1; const index_type n_global_z = 1; - if constexpr (Analysis::StaticValueSize == 0) { + if constexpr (ReducerType::static_value_size() == 0) { reference_type update = - final_reducer.init(&local_mem[local_id * value_count]); + reducer.init(&local_mem[local_id * value_count]); Kokkos::Impl::Reduce::DeviceIterateTile< Policy::rank, BarePolicy, FunctorType, @@ -705,7 +653,7 @@ class ParallelReduce, ReducerType, SYCLReduction::workgroup_reduction<>( item, local_mem.get_pointer(), results_ptr, - device_accessible_result_ptr, value_count, final_reducer, false, + device_accessible_result_ptr, value_count, reducer, false, std::min(size, wgroup_size)); if (local_id == 0) { @@ -718,25 +666,25 @@ class ParallelReduce, ReducerType, item.barrier(sycl::access::fence_space::local_space); if (num_teams_done[0] == n_wgroups) { if (local_id >= n_wgroups) - final_reducer.init(&local_mem[local_id * value_count]); + reducer.init(&local_mem[local_id * value_count]); else { - final_reducer.copy(&local_mem[local_id * value_count], - &results_ptr[local_id * value_count]); + reducer.copy(&local_mem[local_id * value_count], + &results_ptr[local_id * value_count]); for (unsigned int id = local_id + wgroup_size; id < n_wgroups; id += wgroup_size) { - final_reducer.join(&local_mem[local_id * value_count], - &results_ptr[id * value_count]); + reducer.join(&local_mem[local_id * value_count], + &results_ptr[id * value_count]); } } SYCLReduction::workgroup_reduction<>( item, local_mem.get_pointer(), results_ptr, - device_accessible_result_ptr, value_count, final_reducer, - true, std::min(n_wgroups, wgroup_size)); + device_accessible_result_ptr, value_count, reducer, true, + std::min(n_wgroups, wgroup_size)); } } else { value_type local_value; - reference_type update = final_reducer.init(&local_value); + reference_type update = reducer.init(&local_value); 
Kokkos::Impl::Reduce::DeviceIterateTile< Policy::rank, BarePolicy, FunctorType, @@ -748,7 +696,7 @@ class ParallelReduce, ReducerType, SYCLReduction::workgroup_reduction<>( item, local_mem.get_pointer(), local_value, results_ptr, - device_accessible_result_ptr, final_reducer, false, + device_accessible_result_ptr, reducer, false, std::min(size, wgroup_size)); if (local_id == 0) { @@ -761,18 +709,18 @@ class ParallelReduce, ReducerType, item.barrier(sycl::access::fence_space::local_space); if (num_teams_done[0] == n_wgroups) { if (local_id >= n_wgroups) - final_reducer.init(&local_value); + reducer.init(&local_value); else { local_value = results_ptr[local_id]; for (unsigned int id = local_id + wgroup_size; id < n_wgroups; id += wgroup_size) { - final_reducer.join(&local_value, &results_ptr[id]); + reducer.join(&local_value, &results_ptr[id]); } } SYCLReduction::workgroup_reduction<>( item, local_mem.get_pointer(), local_value, results_ptr, - device_accessible_result_ptr, final_reducer, true, + device_accessible_result_ptr, reducer, true, std::min(n_wgroups, wgroup_size)); } } @@ -810,9 +758,9 @@ class ParallelReduce, ReducerType, IndirectKernelMem& indirectReducerMem = instance.get_indirect_kernel_mem(); auto functor_wrapper = Experimental::Impl::make_sycl_function_wrapper( - m_functor, indirectKernelMem); + m_functor_reducer.get_functor(), indirectKernelMem); auto reducer_wrapper = Experimental::Impl::make_sycl_function_wrapper( - m_reducer, indirectReducerMem); + m_functor_reducer.get_reducer(), indirectReducerMem); sycl::event event = sycl_direct_launch( m_policy, functor_wrapper, reducer_wrapper, @@ -822,10 +770,9 @@ class ParallelReduce, ReducerType, } private: - const FunctorType m_functor; + const CombinedFunctorReducerType m_functor_reducer; const BarePolicy m_policy; const Kokkos::Experimental::SYCL& m_space; - const ReducerType m_reducer; const pointer_type m_result_ptr; const bool m_result_ptr_device_accessible; diff --git 
a/core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp b/core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp index 59e9a7d515..80f5db0558 100644 --- a/core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp +++ b/core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp @@ -515,35 +515,31 @@ class ParallelFor, //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- -template -class ParallelReduce, - ReducerType, Kokkos::Experimental::SYCL> { +template +class ParallelReduce, + Kokkos::Experimental::SYCL> { public: using Policy = TeamPolicyInternal; + using FunctorType = typename CombinedFunctorReducerType::functor_type; + using ReducerType = typename CombinedFunctorReducerType::reducer_type; private: - using ReducerConditional = - Kokkos::Impl::if_c::value, - FunctorType, ReducerType>; - using ReducerTypeFwd = typename ReducerConditional::type; - using Analysis = - FunctorAnalysis; using member_type = typename Policy::member_type; using WorkTag = typename Policy::work_tag; using launch_bounds = typename Policy::launch_bounds; - using pointer_type = typename Analysis::pointer_type; - using reference_type = typename Analysis::reference_type; - using value_type = typename Analysis::value_type; + using pointer_type = typename ReducerType::pointer_type; + using reference_type = typename ReducerType::reference_type; + using value_type = typename ReducerType::value_type; public: using functor_type = FunctorType; using size_type = Kokkos::Experimental::SYCL::size_type; private: - const FunctorType m_functor; + const CombinedFunctorReducerType m_functor_reducer; const Policy m_policy; - const ReducerType m_reducer; const pointer_type m_result_ptr; const bool m_result_ptr_device_accessible; size_type m_shmem_begin; @@ -570,7 +566,7 @@ class ParallelReduce, sycl::queue& q = space.sycl_queue(); const unsigned int value_count = - Analysis::value_count(ReducerConditional::select(m_functor, m_reducer)); + 
m_functor_reducer.get_reducer().value_count(); std::size_t size = std::size_t(m_league_size) * m_team_size * m_vector_size; value_type* results_ptr = nullptr; @@ -603,27 +599,22 @@ class ParallelReduce, cgh.parallel_for( sycl::nd_range<2>(sycl::range<2>(1, 1), sycl::range<2>(1, 1)), [=](sycl::nd_item<2> item) { - const auto& functor = functor_wrapper.get_functor(); - const auto& selected_reducer = ReducerConditional::select( - static_cast(functor), - static_cast( - reducer_wrapper.get_functor())); - typename Analysis::Reducer final_reducer(selected_reducer); - - reference_type update = final_reducer.init(results_ptr); + const FunctorType& functor = functor_wrapper.get_functor(); + const ReducerType& reducer = reducer_wrapper.get_functor(); + + reference_type update = reducer.init(results_ptr); if (size == 1) { const member_type team_member( team_scratch_memory_L0.get_pointer(), shmem_begin, scratch_size[0], global_scratch_ptr, scratch_size[1], item); - if constexpr (std::is_void::value) + if constexpr (std::is_void_v) functor(team_member, update); else functor(WorkTag(), team_member, update); } - final_reducer.final(results_ptr); + reducer.final(results_ptr); if (device_accessible_result_ptr) - final_reducer.copy(device_accessible_result_ptr, - &results_ptr[0]); + reducer.copy(device_accessible_result_ptr, &results_ptr[0]); }); }); q.ext_oneapi_submit_barrier( @@ -664,23 +655,19 @@ class ParallelReduce, auto& num_teams_done = reinterpret_cast( local_mem[wgroup_size * std::max(value_count, 1u)]); - const auto local_id = item.get_local_linear_id(); - const auto& functor = functor_wrapper.get_functor(); - const auto& selected_reducer = ReducerConditional::select( - static_cast(functor), - static_cast( - reducer_wrapper.get_functor())); - typename Analysis::Reducer final_reducer(selected_reducer); - - if constexpr (Analysis::StaticValueSize == 0) { + const auto local_id = item.get_local_linear_id(); + const FunctorType& functor = functor_wrapper.get_functor(); + const 
ReducerType& reducer = reducer_wrapper.get_functor(); + + if constexpr (ReducerType::static_value_size() == 0) { reference_type update = - final_reducer.init(&local_mem[local_id * value_count]); + reducer.init(&local_mem[local_id * value_count]); const member_type team_member( team_scratch_memory_L0.get_pointer(), shmem_begin, scratch_size[0], global_scratch_ptr + item.get_group(1) * scratch_size[1], scratch_size[1], item); - if constexpr (std::is_void::value) + if constexpr (std::is_void_v) functor(team_member, update); else functor(WorkTag(), team_member, update); @@ -688,8 +675,7 @@ class ParallelReduce, SYCLReduction::workgroup_reduction<>( item, local_mem.get_pointer(), results_ptr, - device_accessible_result_ptr, value_count, - selected_reducer, false, + device_accessible_result_ptr, value_count, reducer, false, std::min(size, item.get_local_range()[0] * item.get_local_range()[1])); @@ -704,40 +690,40 @@ class ParallelReduce, sycl::group_barrier(item.get_group()); if (num_teams_done == n_wgroups) { if (local_id >= n_wgroups) - final_reducer.init(&local_mem[local_id * value_count]); + reducer.init(&local_mem[local_id * value_count]); else { - final_reducer.copy(&local_mem[local_id * value_count], - &results_ptr[local_id * value_count]); + reducer.copy(&local_mem[local_id * value_count], + &results_ptr[local_id * value_count]); for (unsigned int id = local_id + wgroup_size; id < n_wgroups; id += wgroup_size) { - final_reducer.join(&local_mem[local_id * value_count], - &results_ptr[id * value_count]); + reducer.join(&local_mem[local_id * value_count], + &results_ptr[id * value_count]); } } SYCLReduction::workgroup_reduction<>( item, local_mem.get_pointer(), results_ptr, - device_accessible_result_ptr, value_count, - selected_reducer, true, + device_accessible_result_ptr, value_count, reducer, + true, std::min(n_wgroups, item.get_local_range()[0] * item.get_local_range()[1])); } } else { value_type local_value; - reference_type update = 
final_reducer.init(&local_value); + reference_type update = reducer.init(&local_value); const member_type team_member( team_scratch_memory_L0.get_pointer(), shmem_begin, scratch_size[0], global_scratch_ptr + item.get_group(1) * scratch_size[1], scratch_size[1], item); - if constexpr (std::is_void::value) + if constexpr (std::is_void_v) functor(team_member, update); else functor(WorkTag(), team_member, update); SYCLReduction::workgroup_reduction<>( item, local_mem.get_pointer(), local_value, results_ptr, - device_accessible_result_ptr, final_reducer, false, + device_accessible_result_ptr, reducer, false, std::min(size, item.get_local_range()[0] * item.get_local_range()[1])); @@ -752,18 +738,18 @@ class ParallelReduce, item.barrier(sycl::access::fence_space::local_space); if (num_teams_done == n_wgroups) { if (local_id >= n_wgroups) - final_reducer.init(&local_value); + reducer.init(&local_value); else { local_value = results_ptr[local_id]; for (unsigned int id = local_id + wgroup_size; id < n_wgroups; id += wgroup_size) { - final_reducer.join(&local_value, &results_ptr[id]); + reducer.join(&local_value, &results_ptr[id]); } } SYCLReduction::workgroup_reduction<>( item, local_mem.get_pointer(), local_value, results_ptr, - device_accessible_result_ptr, final_reducer, true, + device_accessible_result_ptr, reducer, true, std::min(n_wgroups, item.get_local_range()[0] * item.get_local_range()[1])); } @@ -842,9 +828,9 @@ class ParallelReduce, IndirectKernelMem& indirectReducerMem = instance.get_indirect_kernel_mem(); auto functor_wrapper = Experimental::Impl::make_sycl_function_wrapper( - m_functor, indirectKernelMem); + m_functor_reducer.get_functor(), indirectKernelMem); auto reducer_wrapper = Experimental::Impl::make_sycl_function_wrapper( - m_reducer, indirectReducerMem); + m_functor_reducer.get_reducer(), indirectReducerMem); sycl::event event = sycl_direct_launch( m_policy, functor_wrapper, reducer_wrapper, @@ -857,8 +843,9 @@ class ParallelReduce, void initialize() 
{ // FIXME_SYCL optimize if (m_team_size < 0) - m_team_size = - m_policy.team_size_recommended(m_functor, ParallelReduceTag{}); + m_team_size = m_policy.team_size_recommended( + m_functor_reducer.get_functor(), m_functor_reducer.get_reducer(), + ParallelReduceTag{}); // Must be a power of two greater than two, get the one not bigger than the // requested one. if ((m_team_size & m_team_size - 1) || m_team_size < 2) { @@ -867,10 +854,10 @@ class ParallelReduce, m_team_size = temp_team_size; } - m_shmem_begin = (sizeof(double) * (m_team_size + 2)); - m_shmem_size = - (m_policy.scratch_size(0, m_team_size) + - FunctorTeamShmemSize::value(m_functor, m_team_size)); + m_shmem_begin = (sizeof(double) * (m_team_size + 2)); + m_shmem_size = (m_policy.scratch_size(0, m_team_size) + + FunctorTeamShmemSize::value( + m_functor_reducer.get_functor(), m_team_size)); m_scratch_size[0] = m_shmem_size; m_scratch_size[1] = m_policy.scratch_size(1, m_team_size); @@ -891,20 +878,19 @@ class ParallelReduce, Kokkos::Impl::throw_runtime_exception(out.str()); } - if (m_team_size > m_policy.team_size_max(m_functor, ParallelReduceTag{})) + if (m_team_size > m_policy.team_size_max(m_functor_reducer.get_functor(), + m_functor_reducer.get_reducer(), + ParallelReduceTag{})) Kokkos::Impl::throw_runtime_exception( "Kokkos::Impl::ParallelFor requested too large team size."); } public: template - ParallelReduce( - FunctorType const& arg_functor, Policy const& arg_policy, - ViewType const& arg_result, - std::enable_if_t::value, void*> = nullptr) - : m_functor(arg_functor), + ParallelReduce(CombinedFunctorReducerType const& arg_functor_reducer, + Policy const& arg_policy, ViewType const& arg_result) + : m_functor_reducer(arg_functor_reducer), m_policy(arg_policy), - m_reducer(InvalidType()), m_result_ptr(arg_result.data()), m_result_ptr_device_accessible( MemorySpaceAccess, ->m_team_scratch_mutex) { initialize(); } - - ParallelReduce(FunctorType const& arg_functor, Policy const& arg_policy, - 
ReducerType const& reducer) - : m_functor(arg_functor), - m_policy(arg_policy), - m_reducer(reducer), - m_result_ptr(reducer.view().data()), - m_result_ptr_device_accessible( - MemorySpaceAccess::accessible), - m_league_size(arg_policy.league_size()), - m_team_size(arg_policy.team_size()), - m_vector_size(arg_policy.impl_vector_length()), - m_scratch_lock(arg_policy.space() - .impl_internal_space_instance() - ->m_team_scratch_mutex) { - initialize(); - } }; } // namespace Impl } // namespace Kokkos From 3c77f6fb751f718b11d37b8834d7a38182fbc229 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Mon, 6 Mar 2023 10:11:14 -0500 Subject: [PATCH 299/496] Also convert SYCL ParallelScan --- core/src/SYCL/Kokkos_SYCL_Parallel_Scan.hpp | 53 +++++++++++++-------- 1 file changed, 33 insertions(+), 20 deletions(-) diff --git a/core/src/SYCL/Kokkos_SYCL_Parallel_Scan.hpp b/core/src/SYCL/Kokkos_SYCL_Parallel_Scan.hpp index 4b7964729a..3bd25b1f23 100644 --- a/core/src/SYCL/Kokkos_SYCL_Parallel_Scan.hpp +++ b/core/src/SYCL/Kokkos_SYCL_Parallel_Scan.hpp @@ -110,7 +110,8 @@ class ParallelScanSYCLBase { using index_type = typename Policy::index_type; protected: - const FunctorType m_functor; + const CombinedFunctorReducer + m_functor_reducer; const Policy m_policy; pointer_type m_scratch_space = nullptr; const pointer_type m_result_ptr; @@ -143,8 +144,11 @@ class ParallelScanSYCLBase { cgh.parallel_for( sycl::nd_range<1>(n_wgroups * wgroup_size, wgroup_size), [=](sycl::nd_item<1> item) { - const FunctorType& functor = functor_wrapper.get_functor(); - typename Analysis::Reducer final_reducer(functor); + const CombinedFunctorReducer< + FunctorType, typename Analysis::Reducer>& functor_reducer = + functor_wrapper.get_functor(); + const typename Analysis::Reducer& reducer = + functor_reducer.get_reducer(); const auto local_id = item.get_local_linear_id(); const auto global_id = item.get_global_linear_id(); @@ -154,9 +158,9 @@ class ParallelScanSYCLBase { if (global_id < size) local_value = 
global_mem[global_id]; else - final_reducer.init(&local_value); + reducer.init(&local_value); - workgroup_scan<>(item, final_reducer, local_mem.get_pointer(), + workgroup_scan<>(item, reducer, local_mem.get_pointer(), local_value, wgroup_size); if (n_wgroups > 1 && local_id == wgroup_size - 1) @@ -175,12 +179,15 @@ class ParallelScanSYCLBase { cgh.parallel_for( sycl::nd_range<1>(n_wgroups * wgroup_size, wgroup_size), [=](sycl::nd_item<1> item) { - const auto global_id = item.get_global_linear_id(); - const FunctorType& functor = functor_wrapper.get_functor(); - typename Analysis::Reducer final_reducer(functor); + const auto global_id = item.get_global_linear_id(); + const CombinedFunctorReducer + functor_reducer = functor_wrapper.get_functor(); + const typename Analysis::Reducer& reducer = + functor_reducer.get_reducer(); if (global_id < size) - final_reducer.join(&global_mem[global_id], - &group_results[item.get_group_linear_id()]); + reducer.join(&global_mem[global_id], + &group_results[item.get_group_linear_id()]); }); }); q.ext_oneapi_submit_barrier( @@ -206,15 +213,18 @@ class ParallelScanSYCLBase { cgh.parallel_for(sycl::range<1>(len), [=](sycl::item<1> item) { const typename Policy::index_type id = static_cast(item.get_id()) + begin; - const FunctorType& functor = functor_wrapper.get_functor(); - typename Analysis::Reducer final_reducer(functor); + const CombinedFunctorReducer& + functor_reducer = functor_wrapper.get_functor(); + const typename Analysis::Reducer& reducer = + functor_reducer.get_reducer(); value_type update{}; - final_reducer.init(&update); + reducer.init(&update); + const FunctorType& functor = functor_reducer.get_functor(); if constexpr (std::is_void::value) - functor_wrapper.get_functor()(id, update, false); + functor(id, update, false); else - functor_wrapper.get_functor()(WorkTag(), id, update, false); + functor(WorkTag(), id, update, false); global_mem[id] = update; }); }); @@ -235,10 +245,13 @@ class ParallelScanSYCLBase { auto 
global_id = item.get_id(0); value_type update = global_mem[global_id]; + const CombinedFunctorReducer& + functor_reducer = functor_wrapper.get_functor(); + const FunctorType& functor = functor_reducer.get_functor(); if constexpr (std::is_void::value) - functor_wrapper.get_functor()(global_id, update, true); + functor(global_id, update, true); else - functor_wrapper.get_functor()(WorkTag(), global_id, update, true); + functor(WorkTag(), global_id, update, true); global_mem[global_id] = update; if (global_id == len - 1 && result_ptr_device_accessible) *result_ptr = update; @@ -283,7 +296,7 @@ class ParallelScanSYCLBase { indirectKernelMem = instance.get_indirect_kernel_mem(); auto functor_wrapper = Experimental::Impl::make_sycl_function_wrapper( - m_functor, indirectKernelMem); + m_functor_reducer, indirectKernelMem); sycl::event event = sycl_direct_launch(functor_wrapper, functor_wrapper.get_copy_event()); @@ -294,7 +307,7 @@ class ParallelScanSYCLBase { ParallelScanSYCLBase(const FunctorType& arg_functor, const Policy& arg_policy, pointer_type arg_result_ptr, bool arg_result_ptr_device_accessible) - : m_functor(arg_functor), + : m_functor_reducer(arg_functor, typename Analysis::Reducer{arg_functor}), m_policy(arg_policy), m_result_ptr(arg_result_ptr), m_result_ptr_device_accessible(arg_result_ptr_device_accessible), @@ -334,7 +347,7 @@ class ParallelScanWithTotal, Base::impl_execute([&]() { const long long nwork = Base::m_policy.end() - Base::m_policy.begin(); if (nwork > 0 && !Base::m_result_ptr_device_accessible) { - const int size = Base::Analysis::value_size(Base::m_functor); + const int size = Base::m_functor_reducer.get_reducer().value_size(); DeepCopy(m_exec, Base::m_result_ptr, Base::m_scratch_space + nwork - 1, From fb0b94cfa846aa397c8030ce3c4c952a82fe6032 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Mon, 6 Mar 2023 15:21:35 -0500 Subject: [PATCH 300/496] Fix OpenMPTarget::concurrency() --- core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp | 10 
++++++---- core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.hpp | 2 +- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp index 564f299ab5..5f56e23144 100644 --- a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp +++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp @@ -65,11 +65,7 @@ void OpenMPTargetInternal::fence(const std::string& name, [&]() {}); } } -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 -int OpenMPTargetInternal::concurrency() { -#else int OpenMPTargetInternal::concurrency() const { -#endif return 128000; // FIXME_OPENMPTARGET } const char* OpenMPTargetInternal::name() { return "OpenMPTarget"; } @@ -131,9 +127,15 @@ uint32_t OpenMPTarget::impl_instance_id() const noexcept { return m_space_instance->impl_get_instance_id(); } +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 int OpenMPTarget::concurrency() { return Impl::OpenMPTargetInternal::impl_singleton()->concurrency(); } +#else +int OpenMPTarget::concurrency() const { + return m_space_instance->concurrency(); +} +#endif void OpenMPTarget::fence(const std::string& name) { Impl::OpenMPTargetInternal::impl_singleton()->fence(name); diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.hpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.hpp index 9f4349c00e..bea3bb3b12 100644 --- a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.hpp +++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.hpp @@ -37,7 +37,7 @@ class OpenMPTargetInternal { openmp_fence_is_static is_static = openmp_fence_is_static::no); /** \brief Return the maximum amount of concurrency. */ - int concurrency(); + int concurrency() const; //! Print configuration information to the given output stream. 
void print_configuration(std::ostream& os, bool verbose) const; From 73de258fd4adb950e5843a613747cdaf050d7a8c Mon Sep 17 00:00:00 2001 From: Andrey Prokopenko Date: Mon, 6 Mar 2023 16:58:01 -0500 Subject: [PATCH 301/496] Add ScopedProfileRegion Co-authored-by: Damien L-G --- .../Kokkos_Profiling_ScopedProfileRegion.hpp | 48 ++++++++++++ core/unit_test/CMakeLists.txt | 1 + .../tools/TestScopedProfileRegion.cpp | 77 +++++++++++++++++++ 3 files changed, 126 insertions(+) create mode 100644 core/src/Kokkos_Profiling_ScopedProfileRegion.hpp create mode 100644 core/unit_test/tools/TestScopedProfileRegion.cpp diff --git a/core/src/Kokkos_Profiling_ScopedProfileRegion.hpp b/core/src/Kokkos_Profiling_ScopedProfileRegion.hpp new file mode 100644 index 0000000000..3540e9e23d --- /dev/null +++ b/core/src/Kokkos_Profiling_ScopedProfileRegion.hpp @@ -0,0 +1,48 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSP_SCOPED_PROFILE_REGION_HPP +#define KOKKOSP_SCOPED_PROFILE_REGION_HPP +#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE +#define KOKKOS_IMPL_PUBLIC_INCLUDE +#define KOKKOS_IMPL_PUBLIC_INCLUDE_PROFILING_SCOPEDPROFILEREGION +#endif + +#include +#include + +#include + +namespace Kokkos::Profiling { + +class ScopedProfileRegion { + public: + ScopedProfileRegion(ScopedProfileRegion const &) = delete; + ScopedProfileRegion &operator=(ScopedProfileRegion const &) = delete; + + explicit ScopedProfileRegion(std::string const &name) { + Kokkos::Profiling::pushRegion(name); + } + ~ScopedProfileRegion() { Kokkos::Profiling::popRegion(); } +}; + +} // namespace Kokkos::Profiling + +#ifdef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_CORE +#undef KOKKOS_IMPL_PUBLIC_INCLUDE +#undef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_PROFILING_SCOPEDPROFILEREGION +#endif +#endif diff --git a/core/unit_test/CMakeLists.txt b/core/unit_test/CMakeLists.txt index 543f596b91..8bf3595f19 100644 --- a/core/unit_test/CMakeLists.txt +++ b/core/unit_test/CMakeLists.txt @@ -922,6 +922,7 @@ KOKKOS_ADD_ADVANCED_TEST( UnitTest_PushFinalizeHook_terminate tools/TestEventCorrectness.cpp tools/TestWithoutInitializing.cpp tools/TestProfilingSection.cpp + tools/TestScopedProfileRegion.cpp ) # FIXME_OPENMPTARGET This test causes internal compiler errors as of 09/01/22 diff --git a/core/unit_test/tools/TestScopedProfileRegion.cpp b/core/unit_test/tools/TestScopedProfileRegion.cpp new file mode 100644 index 0000000000..5eaa052ddb --- /dev/null +++ b/core/unit_test/tools/TestScopedProfileRegion.cpp @@ -0,0 +1,77 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. 
+// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include + +#include + +#include +#include + +namespace { + +std::stack test_region_stack; + +// NOTE: cannot use lambdas because they can only be converted to function +// pointers if they don't capture anything +void test_push_region(char const *label) { test_region_stack.push(label); } + +void test_pop_region() { test_region_stack.pop(); } + +TEST(defaultdevicetype, scoped_profile_region) { + Kokkos::Tools::Experimental::set_push_region_callback(test_push_region); + Kokkos::Tools::Experimental::set_pop_region_callback(test_pop_region); + + ASSERT_TRUE(test_region_stack.empty()); + + // Unnamed guard! Profile region is popped at the end of the statement. + Kokkos::Profiling::ScopedProfileRegion("bug"); + + ASSERT_TRUE(test_region_stack.empty()); + + { + std::string outer_identifier = "outer"; + Kokkos::Profiling::ScopedProfileRegion guard_outer(outer_identifier); + + ASSERT_EQ(test_region_stack.size(), 1u); + ASSERT_EQ(test_region_stack.top(), outer_identifier); + + { + std::string inner_identifier = "inner"; + Kokkos::Profiling::ScopedProfileRegion guard_inner(inner_identifier); + ASSERT_EQ(test_region_stack.size(), 2u); + ASSERT_EQ(test_region_stack.top(), inner_identifier); + } + + ASSERT_EQ(test_region_stack.size(), 1u); + ASSERT_EQ(test_region_stack.top(), outer_identifier); + } + + ASSERT_TRUE(test_region_stack.empty()); + + // Unset callbacks + Kokkos::Tools::Experimental::set_push_region_callback(nullptr); + Kokkos::Tools::Experimental::set_pop_region_callback(nullptr); +} + +using Kokkos::Profiling::ScopedProfileRegion; +static_assert(!std::is_default_constructible::value); +static_assert(!std::is_copy_constructible::value); +static_assert(!std::is_move_constructible::value); +static_assert(!std::is_copy_assignable::value); 
+static_assert(!std::is_move_assignable::value); + +} // namespace From 62fa442c759dd4288e2021c446189bddf163b36f Mon Sep 17 00:00:00 2001 From: Andrey Prokopenko Date: Mon, 6 Mar 2023 18:11:13 -0500 Subject: [PATCH 302/496] Add [[nodiscard]] qualifiers Co-authored-by: Daniel Arndt --- core/src/Kokkos_Profiling_ScopedProfileRegion.hpp | 5 ++++- core/unit_test/tools/TestScopedProfileRegion.cpp | 5 ----- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/core/src/Kokkos_Profiling_ScopedProfileRegion.hpp b/core/src/Kokkos_Profiling_ScopedProfileRegion.hpp index 3540e9e23d..4daeeadddb 100644 --- a/core/src/Kokkos_Profiling_ScopedProfileRegion.hpp +++ b/core/src/Kokkos_Profiling_ScopedProfileRegion.hpp @@ -28,11 +28,14 @@ namespace Kokkos::Profiling { -class ScopedProfileRegion { +class [[nodiscard]] ScopedProfileRegion { public: ScopedProfileRegion(ScopedProfileRegion const &) = delete; ScopedProfileRegion &operator=(ScopedProfileRegion const &) = delete; +#if defined(__has_cpp_attribute) && __has_cpp_attribute(nodiscard) >= 201907 + [[nodiscard]] +#endif explicit ScopedProfileRegion(std::string const &name) { Kokkos::Profiling::pushRegion(name); } diff --git a/core/unit_test/tools/TestScopedProfileRegion.cpp b/core/unit_test/tools/TestScopedProfileRegion.cpp index 5eaa052ddb..8f703da626 100644 --- a/core/unit_test/tools/TestScopedProfileRegion.cpp +++ b/core/unit_test/tools/TestScopedProfileRegion.cpp @@ -37,11 +37,6 @@ TEST(defaultdevicetype, scoped_profile_region) { ASSERT_TRUE(test_region_stack.empty()); - // Unnamed guard! Profile region is popped at the end of the statement. 
- Kokkos::Profiling::ScopedProfileRegion("bug"); - - ASSERT_TRUE(test_region_stack.empty()); - { std::string outer_identifier = "outer"; Kokkos::Profiling::ScopedProfileRegion guard_outer(outer_identifier); From d7896e64fb3faaa7d8e4590b507c8f1c10d8e6ab Mon Sep 17 00:00:00 2001 From: tcclevenger Date: Mon, 16 Jan 2023 15:58:24 -0700 Subject: [PATCH 303/496] Add ParallelScanRangePolicy test Consolidate parallel_scan() tests using RangePolicy into one new file. Remove redundant tests. --- core/unit_test/CMakeLists.txt | 3 +- core/unit_test/Makefile | 20 +- .../unit_test/TestParallelScanRangePolicy.hpp | 253 ++++++++++++++++++ core/unit_test/TestRange.hpp | 84 +----- core/unit_test/TestRangePolicyRequire.hpp | 83 +----- core/unit_test/TestScan.hpp | 172 ------------ 6 files changed, 269 insertions(+), 346 deletions(-) create mode 100644 core/unit_test/TestParallelScanRangePolicy.hpp delete mode 100644 core/unit_test/TestScan.hpp diff --git a/core/unit_test/CMakeLists.txt b/core/unit_test/CMakeLists.txt index 543f596b91..6a5598dff4 100644 --- a/core/unit_test/CMakeLists.txt +++ b/core/unit_test/CMakeLists.txt @@ -173,6 +173,7 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;OpenMPTarget;OpenACC;HIP;SYCL) MinMaxClamp NumericTraits Other + ParallelScanRangePolicy QuadPrecisionMath RangePolicy RangePolicyConstructors @@ -185,7 +186,6 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;OpenMPTarget;OpenACC;HIP;SYCL) Reducers_e Reductions Reductions_DeviceView - Scan SharedAlloc ) set(file ${dir}/Test${Tag}_${Name}.cpp) @@ -441,7 +441,6 @@ IF(KOKKOS_ENABLE_OPENMPTARGET AND KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_Reducers_d.cpp ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_Reducers_e.cpp ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_Reductions_DeviceView.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_Scan.cpp ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_SubView_b.cpp 
${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_SubView_c01.cpp ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_SubView_c02.cpp diff --git a/core/unit_test/Makefile b/core/unit_test/Makefile index 05be225265..beafeccb78 100644 --- a/core/unit_test/Makefile +++ b/core/unit_test/Makefile @@ -52,8 +52,8 @@ ifneq ($(KOKKOS_INTERNAL_USE_RDYNAMIC), 1) KOKKOS_INTERNAL_USE_RDYNAMIC := $(call kokkos_has_string,$(CXXFLAGS),rdynamic) endif -ifeq ($(KOKKOS_INTERNAL_USE_RDYNAMIC),1) - ifneq ($(KOKKOS_INTERNAL_HAS_OPTIMIZATIONS),1) +ifeq ($(KOKKOS_INTERNAL_USE_RDYNAMIC),1) + ifneq ($(KOKKOS_INTERNAL_HAS_OPTIMIZATIONS),1) STACK_TRACE_TERMINATE_FILTER :=_dynamic else STACK_TRACE_TERMINATE_FILTER := @@ -62,7 +62,7 @@ else STACK_TRACE_TERMINATE_FILTER := endif -TESTS = AtomicOperations_int AtomicOperations_unsignedint AtomicOperations_longint AtomicOperations_unsignedlongint AtomicOperations_longlongint AtomicOperations_double AtomicOperations_float AtomicOperations_complexdouble AtomicOperations_complexfloat AtomicViews Atomics BlockSizeDeduction Concepts Complex Crs DeepCopyAlignment FunctorAnalysis Init LocalDeepCopy MDRange_a MDRange_b MDRange_c MDRange_d MDRange_e MDRange_f Other RangePolicy RangePolicyRequire Reductions Reducers_a Reducers_b Reducers_c Reducers_d Reducers_e Reductions_DeviceView Scan SharedAlloc TeamBasic TeamReductionScan TeamScratch TeamTeamSize TeamVectorRange UniqueToken ViewAPI_a ViewAPI_b ViewAPI_c ViewAPI_d ViewAPI_e ViewCopy_a ViewCopy_b ViewLayoutStrideAssignment ViewMapping_a ViewMapping_b ViewMapping_subview ViewOfClass WorkGraph View_64bit ViewResize +TESTS = AtomicOperations_int AtomicOperations_unsignedint AtomicOperations_longint AtomicOperations_unsignedlongint AtomicOperations_longlongint AtomicOperations_double AtomicOperations_float AtomicOperations_complexdouble AtomicOperations_complexfloat AtomicViews Atomics BlockSizeDeduction Concepts Complex Crs DeepCopyAlignment FunctorAnalysis Init LocalDeepCopy MDRange_a 
MDRange_b MDRange_c MDRange_d MDRange_e MDRange_f Other ParallelScanRangePolicy RangePolicy RangePolicyRequire Reductions Reducers_a Reducers_b Reducers_c Reducers_d Reducers_e Reductions_DeviceView SharedAlloc TeamBasic TeamReductionScan TeamScratch TeamTeamSize TeamVectorRange UniqueToken ViewAPI_a ViewAPI_b ViewAPI_c ViewAPI_d ViewAPI_e ViewCopy_a ViewCopy_b ViewLayoutStrideAssignment ViewMapping_a ViewMapping_b ViewMapping_subview ViewOfClass WorkGraph View_64bit ViewResize tmp := $(foreach device, $(KOKKOS_DEVICELIST), \ tmp2 := $(foreach test, $(TESTS), \ @@ -127,7 +127,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1) OBJ_CUDA += TestCuda_SubView_c07.o TestCuda_SubView_c08.o TestCuda_SubView_c09.o OBJ_CUDA += TestCuda_SubView_c10.o TestCuda_SubView_c11.o TestCuda_SubView_c12.o OBJ_CUDA += TestCuda_SubView_c13.o - OBJ_CUDA += TestCuda_Reductions.o TestCuda_Scan.o + OBJ_CUDA += TestCuda_Reductions.o TestCuda_ParallelScanRangePolicy.o OBJ_CUDA += TestCuda_Reductions_DeviceView.o OBJ_CUDA += TestCuda_Reducers_a.o TestCuda_Reducers_b.o TestCuda_Reducers_c.o TestCuda_Reducers_d.o TestCuda_Reducers_e.o OBJ_CUDA += TestCuda_Complex.o @@ -171,7 +171,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_THREADS), 1) OBJ_THREADS += TestThreads_SubView_c04.o TestThreads_SubView_c05.o TestThreads_SubView_c06.o OBJ_THREADS += TestThreads_SubView_c07.o TestThreads_SubView_c08.o TestThreads_SubView_c09.o OBJ_THREADS += TestThreads_SubView_c10.o TestThreads_SubView_c11.o TestThreads_SubView_c12.o - OBJ_THREADS += TestThreads_Reductions.o TestThreads_Scan.o + OBJ_THREADS += TestThreads_Reductions.o TestThreads_ParallelScanRangePolicy.o OBJ_THREADS += TestThreads_Reductions_DeviceView.o OBJ_THREADS += TestThreads_Reducers_a.o TestThreads_Reducers_b.o TestThreads_Reducers_c.o TestThreads_Reducers_d.o TestThreads_Reducers_e.o OBJ_THREADS += TestThreads_Complex.o @@ -207,7 +207,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1) OBJ_OPENMP += TestOpenMP_SubView_c07.o TestOpenMP_SubView_c08.o 
TestOpenMP_SubView_c09.o OBJ_OPENMP += TestOpenMP_SubView_c10.o TestOpenMP_SubView_c11.o TestOpenMP_SubView_c12.o OBJ_OPENMP += TestOpenMP_SubView_c13.o - OBJ_OPENMP += TestOpenMP_Reductions.o TestOpenMP_Scan.o + OBJ_OPENMP += TestOpenMP_Reductions.o TestOpenMP_ParallelScanRangePolicy.o OBJ_OPENMP += TestOpenMP_Reductions_DeviceView.o OBJ_OPENMP += TestOpenMP_Reducers_a.o TestOpenMP_Reducers_b.o TestOpenMP_Reducers_c.o TestOpenMP_Reducers_d.o TestOpenMP_Reducers_e.o OBJ_OPENMP += TestOpenMP_Complex.o @@ -251,11 +251,11 @@ ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1) #OBJ_OPENMPTARGET += TestOpenMPTarget_SubView_c10.o TestOpenMPTarget_SubView_c11.o TestOpenMPTarget_SubView_c12.o #OBJ_OPENMPTARGET += TestOpenMPTarget_Reductions.o # Need custom reductions OBJ_OPENMPTARGET += TestOpenMPTarget_Reducers_a.o TestOpenMPTarget_Reducers_b.o TestOpenMPTarget_Reducers_c.o TestOpenMPTarget_Reducers_d.o TestOpenMPTarget_Reducers_e.o - #OBJ_OPENMPTARGET += TestOpenMPTarget_Scan.o + OBJ_OPENMPTARGET += TestOpenMPTarget_ParallelScanRangePolicy.o OBJ_OPENMPTARGET += TestOpenMPTarget_Complex.o OBJ_OPENMPTARGET += TestOpenMPTarget_AtomicOperations_int.o TestOpenMPTarget_AtomicOperations_unsignedint.o TestOpenMPTarget_AtomicOperations_longint.o OBJ_OPENMPTARGET += TestOpenMPTarget_AtomicOperations_unsignedlongint.o TestOpenMPTarget_AtomicOperations_longlongint.o TestOpenMPTarget_AtomicOperations_double.o TestOpenMPTarget_AtomicOperations_float.o - #OBJ_OPENMPTARGET += TestOpenMPTarget_AtomicOperations_complexfloat.o + #OBJ_OPENMPTARGET += TestOpenMPTarget_AtomicOperations_complexfloat.o #OBJ_OPENMPTARGET += TestOpenMPTarget_AtomicOperations_complexdouble.o OBJ_OPENMPTARGET += TestOpenMPTarget_AtomicViews.o OBJ_OPENMPTARGET += TestOpenMPTarget_Atomics.o # Commented Out Arbitrary Type Atomics @@ -316,7 +316,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_HPX), 1) OBJ_HPX += TestHPX_SubView_c10.o TestHPX_SubView_c11.o TestHPX_SubView_c12.o OBJ_HPX += TestHPX_SubView_c13.o OBJ_HPX += 
TestHPX_Reductions.o - OBJ_HPX += TestHPX_Scan.o + OBJ_HPX += TestHPX_ParallelScanRangePolicy.o OBJ_HPX += TestHPX_Reducers_a.o TestHPX_Reducers_b.o TestHPX_Reducers_c.o TestHPX_Reducers_d.o TestHPX_Reducers_e.o OBJ_HPX += TestHPX_Complex.o OBJ_HPX += TestHPX_AtomicOperations_int.o TestHPX_AtomicOperations_unsignedint.o TestHPX_AtomicOperations_longint.o @@ -355,7 +355,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_SERIAL), 1) OBJ_SERIAL += TestSerial_SubView_c07.o TestSerial_SubView_c08.o TestSerial_SubView_c09.o OBJ_SERIAL += TestSerial_SubView_c10.o TestSerial_SubView_c11.o TestSerial_SubView_c12.o OBJ_SERIAL += TestSerial_SubView_c13.o - OBJ_SERIAL += TestSerial_Reductions.o TestSerial_Scan.o + OBJ_SERIAL += TestSerial_Reductions.o TestSerial_ParallelScanRangePolicy.o OBJ_SERIAL += TestSerial_Reductions_DeviceView.o OBJ_SERIAL += TestSerial_Reducers_a.o TestSerial_Reducers_b.o TestSerial_Reducers_c.o TestSerial_Reducers_d.o TestSerial_Reducers_e.o OBJ_SERIAL += TestSerial_Complex.o diff --git a/core/unit_test/TestParallelScanRangePolicy.hpp b/core/unit_test/TestParallelScanRangePolicy.hpp new file mode 100644 index 0000000000..6335b4a06f --- /dev/null +++ b/core/unit_test/TestParallelScanRangePolicy.hpp @@ -0,0 +1,253 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include + +#include +#include + +// This test checks parallel_scan() calls which use RangePolicy. 
+ +namespace { + +template +struct TestParallelScanRangePolicy { + // This typedef is needed for parallel_scan() where a + // work count is given (instead of a RangePolicy) so + // that the execution space can be deduced internally. + using execution_space = TEST_EXECSPACE; + + using ViewType = Kokkos::View; + + ViewType prefix_results; + ViewType postfix_results; + + // Operator defining work done in parallel_scan. + // Simple scan over [0,1,...,N-1]. + // Compute both prefix and postfix scans. + KOKKOS_INLINE_FUNCTION + void operator()(const size_t i, ValueType& update, bool final_pass) const { + if (final_pass) { + prefix_results(i) = update; + } + update += i; + if (final_pass) { + postfix_results(i) = update; + } + } + + KOKKOS_INLINE_FUNCTION + void init(ValueType& update) const { update = 0; } + + KOKKOS_INLINE_FUNCTION + void join(ValueType& update, const ValueType& input) const { + update += input; + } + + template + void test_scan(const size_t work_size) { + // Reset member data based on work_size + prefix_results = ViewType("prefix_results", work_size); + postfix_results = ViewType("postfix_results", work_size); + + // Lambda for checking errors from stored value at each index. + auto check_scan_results = [&]() { + auto const prefix_h = Kokkos::create_mirror_view_and_copy( + Kokkos::HostSpace(), prefix_results); + auto const postfix_h = Kokkos::create_mirror_view_and_copy( + Kokkos::HostSpace(), postfix_results); + + for (size_t i = 0; i < work_size; ++i) { + // Check prefix sum + ASSERT_EQ(ValueType((i * (i - 1)) / 2), prefix_h(i)); + + // Check postfix sum + ASSERT_EQ(ValueType(((i + 1) * i) / 2), postfix_h(i)); + } + + // Reset results + Kokkos::deep_copy(prefix_results, 0); + Kokkos::deep_copy(postfix_results, 0); + }; + + // If policy template args are not given, call parallel_scan() + // with work_size input, if args are given, call + // parallel_scan() with RangePolicy(0, work_size). 
+ // For each case, call parallel_scan() with all possible + // function signatures. + if (sizeof...(Args) == 0) { + // Input: label, work_count, functor + Kokkos::parallel_scan("TestWithStrArg1", work_size, *this); + check_scan_results(); + + // Input: work_count, functor + Kokkos::parallel_scan(work_size, *this); + check_scan_results(); + + // Input: label, work_count, functor + // Input/Output: return_value + { + ValueType return_val = 0; + Kokkos::parallel_scan("TestWithStrArg2", work_size, *this, return_val); + check_scan_results(); + ASSERT_EQ(ValueType(work_size * (work_size - 1) / 2), + return_val); // sum( 0 .. N-1 ) + } + + // Input: work_count, functor + // Input/Output: return_value + { + ValueType return_val = 0; + Kokkos::parallel_scan(work_size, *this, return_val); + check_scan_results(); + ASSERT_EQ(ValueType(work_size * (work_size - 1) / 2), + return_val); // sum( 0 .. N-1 ) + } + + // Input: work_count, functor + // Input/Output: return_view (host space) + { + Kokkos::View return_view("return_view"); + Kokkos::parallel_scan(work_size, *this, return_view); + check_scan_results(); + ASSERT_EQ(ValueType(work_size * (work_size - 1) / 2), + return_view()); // sum( 0 .. N-1 ) + } + } else { + // Construct RangePolicy for parallel_scan + // based on template Args and work_size. + Kokkos::RangePolicy policy(0, work_size); + + // Input: label, work_count, functor + Kokkos::parallel_scan("TestWithStrArg3", policy, *this); + check_scan_results(); + + // Input: work_count, functor + Kokkos::parallel_scan(policy, *this); + check_scan_results(); + + { + // Input: label, work_count, functor + // Input/Output: return_value + ValueType return_val = 0; + Kokkos::parallel_scan("TestWithStrArg4", policy, *this, return_val); + check_scan_results(); + ASSERT_EQ(ValueType(work_size * (work_size - 1) / 2), + return_val); // sum( 0 .. 
N-1 ) + } + + // Input: work_count, functor + // Input/Output: return_value + { + ValueType return_val = 0; + Kokkos::parallel_scan(policy, *this, return_val); + check_scan_results(); + ASSERT_EQ(ValueType(work_size * (work_size - 1) / 2), + return_val); // sum( 0 .. N-1 ) + } + + // Input: work_count, functor + // Input/Output: return_view (Device) + { + Kokkos::View return_view("return_view"); + Kokkos::parallel_scan(policy, *this, return_view); + check_scan_results(); + + ValueType total; + Kokkos::deep_copy(total, return_view); + ASSERT_EQ(ValueType(work_size * (work_size - 1) / 2), + total); // sum( 0 .. N-1 ) + } + + // Check Kokkos::Experimental::require() + // for one of the signatures. + { + using Property = + Kokkos::Experimental::WorkItemProperty::HintLightWeight_t; + const auto policy_with_require = + Kokkos::Experimental::require(policy, Property()); + + // Input: work_count, functor + // Input/Output: return_value + ValueType return_val = 0; + Kokkos::parallel_scan(policy_with_require, *this, return_val); + check_scan_results(); + ASSERT_EQ(ValueType(work_size * (work_size - 1) / 2), + return_val); // sum( 0 .. 
N-1 ) + } + } + } + + // Run test_scan() for a collection of work size + template + void test_scan(const std::vector work_sizes) { + for (size_t i = 0; i < work_sizes.size(); ++i) { + test_scan(work_sizes[i]); + } + } +}; // struct TestParallelScanRangePolicy + +TEST(TEST_CATEGORY, parallel_scan_range_policy) { + { + TestParallelScanRangePolicy f; + + std::vector work_sizes{5, 10}; + f.test_scan<>(work_sizes); + f.test_scan>(work_sizes); + f.test_scan>(work_sizes); + } + { + TestParallelScanRangePolicy f; + + std::vector work_sizes{50, 100}; + f.test_scan<>(work_sizes); + f.test_scan>(work_sizes); + f.test_scan>(work_sizes); + } + { + TestParallelScanRangePolicy f; + + std::vector work_sizes{0, 1, 2, 1000, 1001}; + f.test_scan<>(work_sizes); + f.test_scan>(work_sizes); + f.test_scan>(work_sizes); + } + { + TestParallelScanRangePolicy f; + + std::vector work_sizes{1000, 10000}; + f.test_scan<>(work_sizes); + f.test_scan>(work_sizes); + f.test_scan>(work_sizes); + } + { + TestParallelScanRangePolicy f; + + std::vector work_sizes{13, 34}; + f.test_scan<>(work_sizes); + f.test_scan>(work_sizes); + f.test_scan>(work_sizes); + } + { + TestParallelScanRangePolicy f; + + std::vector work_sizes{17, 59}; + f.test_scan<>(work_sizes); + f.test_scan>(work_sizes); + f.test_scan>(work_sizes); + } +} +} // namespace diff --git a/core/unit_test/TestRange.hpp b/core/unit_test/TestRange.hpp index 0c465bdc1c..1648cb6ab4 100644 --- a/core/unit_test/TestRange.hpp +++ b/core/unit_test/TestRange.hpp @@ -203,55 +203,6 @@ struct TestRange { update += 1 + m_flags(i - offset); } - //---------------------------------------- - - void test_scan() { - Kokkos::parallel_for(Kokkos::RangePolicy(0, N), - *this); - - auto check_scan_results = [&]() { - auto const host_mirror = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), result_view); - for (int i = 0; i < N; ++i) { - if (((i + 1) * i) / 2 != host_mirror(i)) { - std::cout << "Error at " << i << std::endl; - EXPECT_EQ(size_t(((i + 1) * i) 
/ 2), size_t(host_mirror(i))); - } - } - }; - - Kokkos::parallel_scan( - "TestKernelScan", - Kokkos::RangePolicy(0, N), *this); - - check_scan_results(); - - value_type total = 0; - Kokkos::parallel_scan( - "TestKernelScanWithTotal", - Kokkos::RangePolicy(0, N), *this, - total); - - check_scan_results(); - - ASSERT_EQ(size_t((N - 1) * (N) / 2), size_t(total)); // sum( 0 .. N-1 ) - } - - KOKKOS_INLINE_FUNCTION - void operator()(const OffsetTag &, const int i, value_type &update, - bool final) const { - update += m_flags(i); - - if (final) { - if (update != (i * (i + 1)) / 2) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "TestRange::test_scan error (%d,%d) : %d != %d\n", i, m_flags(i), - (i * (i + 1)) / 2, update); - } - result_view(i) = update; - } - } - void test_dynamic_policy() { #if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) auto const N_no_implicit_capture = N; @@ -404,49 +355,17 @@ TEST(TEST_CATEGORY, range_reduce) { } #ifndef KOKKOS_ENABLE_OPENMPTARGET -TEST(TEST_CATEGORY, range_scan) { - { - TestRange > f(0); - f.test_scan(); - } - { - TestRange > f(0); - f.test_scan(); - } +TEST(TEST_CATEGORY, range_dynamic_policy) { #if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP) && \ !defined(KOKKOS_ENABLE_SYCL) { TestRange > f(0); f.test_dynamic_policy(); } -#endif - - { - TestRange > f(2); - f.test_scan(); - } - { - TestRange > f(3); - f.test_scan(); - } -#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP) && \ - !defined(KOKKOS_ENABLE_SYCL) { TestRange > f(3); f.test_dynamic_policy(); } -#endif - - { - TestRange > f(1000); - f.test_scan(); - } - { - TestRange > f(1001); - f.test_scan(); - } -#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP) && \ - !defined(KOKKOS_ENABLE_SYCL) { TestRange > f(1001); f.test_dynamic_policy(); @@ -454,4 +373,5 @@ TEST(TEST_CATEGORY, range_scan) { #endif } #endif + } // namespace Test diff --git a/core/unit_test/TestRangePolicyRequire.hpp b/core/unit_test/TestRangePolicyRequire.hpp index 
5e578b2903..4057216288 100644 --- a/core/unit_test/TestRangePolicyRequire.hpp +++ b/core/unit_test/TestRangePolicyRequire.hpp @@ -214,43 +214,6 @@ struct TestRangeRequire { //---------------------------------------- - void test_scan() { - Kokkos::parallel_for( - Kokkos::Experimental::require( - Kokkos::RangePolicy(0, N), Property()), - *this); - - Kokkos::parallel_scan( - "TestKernelScan", - Kokkos::Experimental::require( - Kokkos::RangePolicy(0, N), - Property()), - *this); - - int total = 0; - Kokkos::parallel_scan( - "TestKernelScanWithTotal", - Kokkos::Experimental::require( - Kokkos::RangePolicy(0, N), - Property()), - *this, total); - ASSERT_EQ(size_t((N - 1) * (N) / 2), size_t(total)); // sum( 0 .. N-1 ) - } - - KOKKOS_INLINE_FUNCTION - void operator()(const OffsetTag &, const int i, value_type &update, - bool final) const { - update += m_flags(i); - - if (final) { - if (update != (i * (i + 1)) / 2) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "TestRangeRequire::test_scan error %d : %d != %d\n", i, - (i * (i + 1)) / 2, m_flags(i)); - } - } - } - void test_dynamic_policy() { #if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) auto const N_no_implicit_capture = N; @@ -423,63 +386,22 @@ TEST(TEST_CATEGORY, range_reduce_require) { } #ifndef KOKKOS_ENABLE_OPENMPTARGET -TEST(TEST_CATEGORY, range_scan_require) { - using Property = Kokkos::Experimental::WorkItemProperty::HintLightWeight_t; - { - TestRangeRequire, Property> - f(0); - f.test_scan(); - } - { - TestRangeRequire, - Property> - f(0); - f.test_scan(); - } +TEST(TEST_CATEGORY, range_dynamic_policy_require) { #if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP) && \ !defined(KOKKOS_ENABLE_SYCL) + using Property = Kokkos::Experimental::WorkItemProperty::HintLightWeight_t; { TestRangeRequire, Property> f(0); f.test_dynamic_policy(); } -#endif - - { - TestRangeRequire, Property> - f(2); - f.test_scan(); - } - { - TestRangeRequire, - Property> - f(3); - f.test_scan(); - } -#if !defined(KOKKOS_ENABLE_CUDA) && 
!defined(KOKKOS_ENABLE_HIP) && \ - !defined(KOKKOS_ENABLE_SYCL) { TestRangeRequire, Property> f(3); f.test_dynamic_policy(); } -#endif - - { - TestRangeRequire, Property> - f(1000); - f.test_scan(); - } - { - TestRangeRequire, - Property> - f(1001); - f.test_scan(); - } -#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP) && \ - !defined(KOKKOS_ENABLE_SYCL) { TestRangeRequire, Property> @@ -489,4 +411,5 @@ TEST(TEST_CATEGORY, range_scan_require) { #endif } #endif + } // namespace Test diff --git a/core/unit_test/TestScan.hpp b/core/unit_test/TestScan.hpp deleted file mode 100644 index 8c6a02f31f..0000000000 --- a/core/unit_test/TestScan.hpp +++ /dev/null @@ -1,172 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#include -#include - -namespace { - -template -struct TestScan { - using execution_space = Device; - using value_type = T; - - Kokkos::View > errors; - - KOKKOS_INLINE_FUNCTION - void operator()(const int iwork, value_type& update, - const bool final_pass) const { - const value_type n = iwork + 1; - const value_type imbalance = - ((ImbalanceSz <= n) && (value_type(0) == n % ImbalanceSz)) - ? ImbalanceSz - : value_type(0); - - // Insert an artificial load imbalance - - for (value_type i = 0; i < imbalance; ++i) { - ++update; - } - - update += n - imbalance; - - if (final_pass) { - const value_type answer = - n & 1 ? 
(n * ((n + 1) / 2)) : ((n / 2) * (n + 1)); - - if (answer != update) { - int fail = errors()++; - - if (fail < 20) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF("TestScan(%d,%ld) != %ld\n", iwork, - static_cast(update), - static_cast(answer)); - } - } - } - } - - KOKKOS_INLINE_FUNCTION - void init(value_type& update) const { update = 0; } - - KOKKOS_INLINE_FUNCTION - void join(value_type& update, const value_type& input) const { - update += input; - } - - TestScan(const size_t N) { - Kokkos::View errors_a("Errors"); - Kokkos::deep_copy(errors_a, 0); - errors = errors_a; - - { - Kokkos::parallel_scan(N, *this); - check_error(); - } - - { - Kokkos::deep_copy(errors_a, 0); - value_type total = 0; - Kokkos::parallel_scan(N, *this, total); - - // We can't return a value in a constructor so use a lambda as wrapper to - // ignore it. - [&] { ASSERT_EQ(size_t((N + 1) * N / 2), size_t(total)); }(); - check_error(); - } - - { - Kokkos::deep_copy(errors_a, 0); - Kokkos::View total_view("total"); - Kokkos::parallel_scan(N, *this, total_view); - Kokkos::fence(); - - // We can't return a value in a constructor so use a lambda as wrapper to - // ignore it. - [&] { ASSERT_EQ(size_t((N + 1) * N / 2), size_t(total_view())); }(); - check_error(); - } - - { - Kokkos::deep_copy(errors_a, 0); - Kokkos::View total_view( - "total"); - typename Device::execution_space exec; - Kokkos::parallel_scan( - Kokkos::RangePolicy(exec, 0, N), - *this, total_view); - value_type total; - Kokkos::deep_copy(exec, total, total_view); - exec.fence(); - - // We can't return a value in a constructor so use a lambda as wrapper to - // ignore it. 
- [&] { ASSERT_EQ(size_t((N + 1) * N / 2), size_t(total)); }(); - check_error(); - } - } - - TestScan(const size_t Start, const size_t N) { - using exec_policy = Kokkos::RangePolicy; - - Kokkos::View errors_a("Errors"); - Kokkos::deep_copy(errors_a, 0); - errors = errors_a; - - Kokkos::parallel_scan(exec_policy(Start, N), *this); - Kokkos::fence(); - - check_error(); - } - - void check_error() { - int total_errors; - Kokkos::deep_copy(total_errors, errors); - ASSERT_EQ(total_errors, 0); - } - - static void test_range(const size_t begin, const size_t end) { - for (auto i = begin; i < end; ++i) { - (void)TestScan(i); - } - } -}; -} // namespace - -TEST(TEST_CATEGORY, scan) { - constexpr auto imbalance_size = 1000; - TestScan::test_range(1, 1000); - TestScan(0); - TestScan(100000); - TestScan(10000000); -} - -TEST(TEST_CATEGORY, small_size_scan) { - constexpr auto imbalance_size = 10; // Pick to not overflow... - TestScan(0); - TestScan(5); - TestScan(10); - TestScan( - static_cast( - std::sqrt(std::numeric_limits::max()))); - constexpr auto short_imbalance_size = 100; // Pick to not overflow... 
- TestScan(0); - TestScan(5); - TestScan(100); - TestScan( - static_cast( - std::sqrt(std::numeric_limits::max()))); -} From 9b18550616f8f9e814ea9282c7a49025d7c274cb Mon Sep 17 00:00:00 2001 From: Andrey Prokopenko Date: Tue, 7 Mar 2023 13:42:49 -0500 Subject: [PATCH 304/496] Address review comments Co-authored-by: Daniel Arndt --- algorithms/src/Kokkos_Sort.hpp | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/algorithms/src/Kokkos_Sort.hpp b/algorithms/src/Kokkos_Sort.hpp index 53f230791f..49fc7d5225 100644 --- a/algorithms/src/Kokkos_Sort.hpp +++ b/algorithms/src/Kokkos_Sort.hpp @@ -457,23 +457,13 @@ class BinSort { auto bin_size = bin_count_const(i); if (bin_size <= 1) return; constexpr bool use_std_sort = -#ifdef KOKKOS_ENABLE_SERIAL - std::is_same_v || -#endif -#ifdef KOKKOS_ENABLE_OPENMP - std::is_same_v || -#endif - false; + std::is_same_v; int lower_bound = bin_offsets(i); int upper_bound = lower_bound + bin_size; if (use_std_sort && bin_size > 10) { if constexpr (use_std_sort) { - auto& bin_op_c = bin_op; - auto& keys_rnd_c = keys_rnd; std::sort(&sort_order(lower_bound), &sort_order(upper_bound), - [&bin_op_c, &keys_rnd_c](int p, int q) { - return bin_op_c(keys_rnd_c, p, q); - }); + [this](int p, int q) { return bin_op(keys_rnd, p, q); }); } } else { for (int k = lower_bound + 1; k < upper_bound; ++k) { From 2e667d8c28028a7d55c36af663cdc29dcdd384d8 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Tue, 7 Mar 2023 19:36:12 +0000 Subject: [PATCH 305/496] Fix partition_master test --- core/unit_test/openmp/TestOpenMP_PartitionMaster.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/core/unit_test/openmp/TestOpenMP_PartitionMaster.cpp b/core/unit_test/openmp/TestOpenMP_PartitionMaster.cpp index 6983fabb2a..92b8032bf0 100644 --- a/core/unit_test/openmp/TestOpenMP_PartitionMaster.cpp +++ b/core/unit_test/openmp/TestOpenMP_PartitionMaster.cpp @@ -29,7 +29,7 @@ TEST(openmp, partition_master) { int errors = 
0; auto master = [&errors, &mtx](int /*partition_id*/, int /*num_partitions*/) { - const int pool_size = Kokkos::OpenMP::impl_thread_pool_size(); + const int pool_size = Kokkos::OpenMP().impl_thread_pool_size(); { std::unique_lock lock(mtx); @@ -46,7 +46,7 @@ TEST(openmp, partition_master) { Kokkos::parallel_reduce( Kokkos::RangePolicy(0, 1000), [pool_size](const int, int& errs) { - if (Kokkos::OpenMP::impl_thread_pool_size() != pool_size) { + if (Kokkos::OpenMP().impl_thread_pool_size() != pool_size) { ++errs; } }, From 2b035de21dfcd292f9d6ef6ccdaa351ef538341e Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Tue, 7 Mar 2023 13:47:39 -0500 Subject: [PATCH 306/496] Use CombinedReducer in HostIterateTile --- core/src/HPX/Kokkos_HPX.hpp | 13 ++++----- core/src/OpenMP/Kokkos_OpenMP_Parallel.hpp | 28 +++++++++---------- .../Serial/Kokkos_Serial_Parallel_MDRange.hpp | 16 +++++------ .../Kokkos_Threads_Parallel_MDRange.hpp | 20 ++++++------- core/src/impl/KokkosExp_Host_IterateTile.hpp | 4 +-- 5 files changed, 37 insertions(+), 44 deletions(-) diff --git a/core/src/HPX/Kokkos_HPX.hpp b/core/src/HPX/Kokkos_HPX.hpp index ee03fbfe58..4781870a72 100644 --- a/core/src/HPX/Kokkos_HPX.hpp +++ b/core/src/HPX/Kokkos_HPX.hpp @@ -1105,19 +1105,17 @@ class ParallelReduce; + using iterate_type = typename Kokkos::Impl::HostIterateTile< + MDRangePolicy, CombinedFunctorReducerType, WorkTag, reference_type>; const iterate_type m_iter; const Policy m_policy; - const CombinedFunctorReducerType m_functor_reducer; const pointer_type m_result_ptr; const bool m_force_synchronous; public: void setup() const { - const ReducerType &reducer = m_functor_reducer.get_reducer(); + const ReducerType &reducer = m_iter.m_func.get_reducer(); const std::size_t value_size = reducer.value_size(); const int num_worker_threads = m_policy.space().concurrency(); @@ -1143,7 +1141,7 @@ class ParallelReduce(buffer.get(0)), @@ -1175,9 +1173,8 @@ class ParallelReduce inline ParallelReduce(const 
CombinedFunctorReducerType &arg_functor_reducer, MDRangePolicy arg_policy, const ViewType &arg_view) - : m_iter(arg_policy, arg_functor_reducer.get_functor()), + : m_iter(arg_policy, arg_functor_reducer), m_policy(Policy(0, arg_policy.m_num_tiles).set_chunk_size(1)), - m_functor_reducer(arg_functor_reducer), m_result_ptr(arg_view.data()), m_force_synchronous(!arg_view.impl_track().has_record()) {} diff --git a/core/src/OpenMP/Kokkos_OpenMP_Parallel.hpp b/core/src/OpenMP/Kokkos_OpenMP_Parallel.hpp index af1ba9543b..c3dfb69f59 100644 --- a/core/src/OpenMP/Kokkos_OpenMP_Parallel.hpp +++ b/core/src/OpenMP/Kokkos_OpenMP_Parallel.hpp @@ -467,13 +467,11 @@ class ParallelReduce; + using iterate_type = typename Kokkos::Impl::HostIterateTile< + MDRangePolicy, CombinedFunctorReducerType, WorkTag, reference_type>; OpenMPInternal* m_instance; const iterate_type m_iter; - const ReducerType m_reducer; const pointer_type m_result_ptr; inline void exec_range(const Member ibeg, const Member iend, @@ -485,7 +483,8 @@ class ParallelReduceacquire_lock(); @@ -504,11 +503,11 @@ class ParallelReduceget_thread_data(0)->pool_reduce_local()); - reference_type update = m_reducer.init(ptr); + reference_type update = reducer.init(ptr); ParallelReduce::exec_range(0, m_iter.m_rp.m_num_tiles, update); - m_reducer.final(ptr); + reducer.final(ptr); m_instance->release_lock(); @@ -533,7 +532,7 @@ class ParallelReduce(data.pool_reduce_local())); std::pair range(0, 0); @@ -554,15 +553,15 @@ class ParallelReduceget_thread_data(0)->pool_reduce_local()); for (int i = 1; i < pool_size; ++i) { - m_reducer.join(ptr, - reinterpret_cast( - m_instance->get_thread_data(i)->pool_reduce_local())); + reducer.join(ptr, + reinterpret_cast( + m_instance->get_thread_data(i)->pool_reduce_local())); } - m_reducer.final(ptr); + reducer.final(ptr); if (m_result_ptr) { - const int n = m_reducer.value_count(); + const int n = reducer.value_count(); for (int j = 0; j < n; ++j) { m_result_ptr[j] = ptr[j]; @@ -578,8 +577,7 @@ 
class ParallelReduce; + using iterate_type = typename Kokkos::Impl::HostIterateTile< + MDRangePolicy, CombinedFunctorReducerType, WorkTag, reference_type>; const iterate_type m_iter; - const ReducerType m_reducer; const pointer_type m_result_ptr; inline void exec(reference_type update) const { @@ -98,7 +96,8 @@ class ParallelReducem_thread_team_data.pool_reduce_local()); - reference_type update = m_reducer.init(ptr); + reference_type update = reducer.init(ptr); this->exec(update); - m_reducer.final(ptr); + reducer.final(ptr); } template ParallelReduce(const CombinedFunctorReducerType& arg_functor_reducer, const MDRangePolicy& arg_policy, const ViewType& arg_result_view) - : m_iter(arg_policy, arg_functor_reducer.get_functor()), - m_reducer(arg_functor_reducer.get_reducer()), + : m_iter(arg_policy, arg_functor_reducer), m_result_ptr(arg_result_view.data()) { static_assert(Kokkos::is_view::value, "Kokkos::Serial reduce result must be a View"); diff --git a/core/src/Threads/Kokkos_Threads_Parallel_MDRange.hpp b/core/src/Threads/Kokkos_Threads_Parallel_MDRange.hpp index 3ba8d27f5c..9d06249082 100644 --- a/core/src/Threads/Kokkos_Threads_Parallel_MDRange.hpp +++ b/core/src/Threads/Kokkos_Threads_Parallel_MDRange.hpp @@ -128,12 +128,10 @@ class ParallelReduce; + using iterate_type = typename Kokkos::Impl::HostIterateTile< + MDRangePolicy, CombinedFunctorReducerType, WorkTag, reference_type>; const iterate_type m_iter; - const ReducerType m_reducer; const pointer_type m_result_ptr; inline void exec_range(const Member &ibeg, const Member &iend, @@ -156,11 +154,12 @@ class ParallelReduce(exec.reduce_memory()))); + reducer.init(static_cast(exec.reduce_memory()))); - exec.fan_in_reduce(self.m_reducer); + exec.fan_in_reduce(reducer); } template @@ -178,6 +177,7 @@ class ParallelReduce(exec.reduce_memory())); while (work_index != -1) { @@ -192,7 +192,8 @@ class ParallelReduce::value, "Kokkos::Threads reduce result must be a View"); diff --git 
a/core/src/impl/KokkosExp_Host_IterateTile.hpp b/core/src/impl/KokkosExp_Host_IterateTile.hpp index 82604a24c2..a44ffefa6b 100644 --- a/core/src/impl/KokkosExp_Host_IterateTile.hpp +++ b/core/src/impl/KokkosExp_Host_IterateTile.hpp @@ -2093,8 +2093,8 @@ struct HostIterateTile::apply(val, m_func, full_tile, m_offset, m_rp.m_tile, - m_tiledims); + Tag>::apply(val, m_func.get_functor(), full_tile, m_offset, + m_rp.m_tile, m_tiledims); } #else From 4f18b1976412cff07d7836c35f2f57f0192dfde9 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Tue, 7 Mar 2023 21:45:57 -0500 Subject: [PATCH 307/496] Desul atomics: fix bug max uint64_t value --- tpls/desul/include/desul/atomics/Common.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tpls/desul/include/desul/atomics/Common.hpp b/tpls/desul/include/desul/atomics/Common.hpp index b8dfcb8acd..fb36ac3566 100644 --- a/tpls/desul/include/desul/atomics/Common.hpp +++ b/tpls/desul/include/desul/atomics/Common.hpp @@ -83,11 +83,11 @@ struct numeric_limits_max; template <> struct numeric_limits_max { - static constexpr uint32_t value = 0xffffffffu; + static constexpr uint32_t value = -1; }; template <> struct numeric_limits_max { - static constexpr uint64_t value = 0xfffffffflu; + static constexpr uint64_t value = -1; }; constexpr bool atomic_always_lock_free(std::size_t size) { From 1d26ca8d353e6e739ddc2c800235539b942c7b98 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Tue, 7 Mar 2023 22:05:01 -0500 Subject: [PATCH 308/496] Make CUDA bhalf conversion code more forward compatible Introduce macro for NVIDIA GPU Ampere+ architectute (CC >= 8.X) so new architectures don't need to be listed as they come. 
--- core/src/Cuda/Kokkos_Cuda_Half_Conversion.hpp | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/core/src/Cuda/Kokkos_Cuda_Half_Conversion.hpp b/core/src/Cuda/Kokkos_Cuda_Half_Conversion.hpp index eaa6263c40..df98934ab3 100644 --- a/core/src/Cuda/Kokkos_Cuda_Half_Conversion.hpp +++ b/core/src/Cuda/Kokkos_Cuda_Half_Conversion.hpp @@ -260,10 +260,16 @@ KOKKOS_INLINE_FUNCTION /************************** bhalf conversions *********************************/ // Go in this branch if CUDA version is >= 11.0.0 and less than 11.1.0 or if the -// architecture is not Ampere +// architecture is older than Ampere +#if !defined(KOKKOS_ARCH_KEPLER) && !defined(KOKKOS_ARCH_MAXWELL) && \ + !defined(KOKKOS_ARCH_PASCAL) && !defined(KOKKOS_ARCH_VOLTA) && \ + !defined(KOKKOS_ARCH_TURING75) +#define KOKKOS_IMPL_ARCH_NVIDIA_GPU_AMPERE_PLUS +#endif + #if CUDA_VERSION >= 11000 && \ (CUDA_VERSION < 11010 || \ - !(defined(KOKKOS_ARCH_AMPERE) || defined(KOKKOS_ARCH_HOPPER))) + !defined(KOKKOS_IMPL_ARCH_NVIDIA_GPU_AMPERE_PLUS)) KOKKOS_INLINE_FUNCTION bhalf_t cast_to_bhalf(bhalf_t val) { return val; } @@ -390,8 +396,7 @@ KOKKOS_INLINE_FUNCTION } #endif // CUDA_VERSION >= 11000 && CUDA_VERSION < 11010 -#if CUDA_VERSION >= 11010 && \ - ((defined(KOKKOS_ARCH_AMPERE) || defined(KOKKOS_ARCH_HOPPER))) +#if CUDA_VERSION >= 11010 && defined(KOKKOS_IMPL_ARCH_NVIDIA_GPU_AMPERE_PLUS) KOKKOS_INLINE_FUNCTION bhalf_t cast_to_bhalf(bhalf_t val) { return val; } KOKKOS_INLINE_FUNCTION @@ -473,6 +478,8 @@ KOKKOS_INLINE_FUNCTION return static_cast(cast_from_bhalf(val)); } #endif // CUDA_VERSION >= 11010 + +#undef KOKKOS_IMPL_ARCH_NVIDIA_GPU_AMPERE_PLUS } // namespace Experimental #if (CUDA_VERSION >= 11000) From 0e302f6d83d7b899d300b034ea466b166e6c540f Mon Sep 17 00:00:00 2001 From: Phil Miller Date: Wed, 8 Mar 2023 05:32:15 -0800 Subject: [PATCH 309/496] Drop Kokkos_ARCH_NATIVE=ON because it breaks with ccache Github CI runners seem to have various CPU capabilities, probably 
differing on AVX-512 support. This causes many jobs to fail with `-march=native` if they get a cached object that doesn't match the current runner. --- .github/workflows/continuous-integration-workflow.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/continuous-integration-workflow.yml b/.github/workflows/continuous-integration-workflow.yml index 9b62c42931..1f06dcd535 100644 --- a/.github/workflows/continuous-integration-workflow.yml +++ b/.github/workflows/continuous-integration-workflow.yml @@ -90,7 +90,6 @@ jobs: -DCMAKE_INSTALL_PREFIX=/usr \ ${{ matrix.clang-tidy }} \ -Ddesul_ROOT=/usr/desul-install/ \ - -DKokkos_ARCH_NATIVE=ON \ -DKokkos_ENABLE_DESUL_ATOMICS_EXTERNAL=ON \ -DKokkos_ENABLE_HWLOC=ON \ -DKokkos_ENABLE_${{ matrix.backend }}=ON \ From f46889d2f3c17e403c8753b51aabc09177feee82 Mon Sep 17 00:00:00 2001 From: Phil Miller Date: Wed, 8 Mar 2023 05:33:08 -0800 Subject: [PATCH 310/496] Drop native from HPX builds --- .github/workflows/continuous-integration-workflow-hpx.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/continuous-integration-workflow-hpx.yml b/.github/workflows/continuous-integration-workflow-hpx.yml index e4584aa492..0c7abd2fc1 100644 --- a/.github/workflows/continuous-integration-workflow-hpx.yml +++ b/.github/workflows/continuous-integration-workflow-hpx.yml @@ -69,7 +69,6 @@ jobs: -DCMAKE_CXX_COMPILER=clang++ \ -DCMAKE_CXX_FLAGS="-Werror" \ -DHPX_ROOT=$PWD/../../hpx/install \ - -DKokkos_ARCH_NATIVE=ON \ -DKokkos_ENABLE_COMPILER_WARNINGS=ON \ -DKokkos_ENABLE_DEPRECATED_CODE_4=OFF \ -DKokkos_ENABLE_EXAMPLES=ON \ From e0eacdd67b5b32977d55f4d947a3eaedd16d1ca6 Mon Sep 17 00:00:00 2001 From: Phil Miller Date: Wed, 8 Mar 2023 05:33:37 -0800 Subject: [PATCH 311/496] Drop native from macOS build --- .github/workflows/osx.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/osx.yml b/.github/workflows/osx.yml index 03fbcf37f6..0ff3266848 100644 --- a/.github/workflows/osx.yml +++ 
b/.github/workflows/osx.yml @@ -31,7 +31,6 @@ jobs: -DKokkos_ENABLE_${{ matrix.backend }}=On -DCMAKE_CXX_FLAGS="-Werror" -DCMAKE_CXX_STANDARD=17 - -DKokkos_ARCH_NATIVE=ON -DKokkos_ENABLE_COMPILER_WARNINGS=ON -DKokkos_ENABLE_DEPRECATED_CODE_4=OFF -DKokkos_ENABLE_TESTS=On From a7ac0453c350ce8e20cc45cc73c251915f4a76f2 Mon Sep 17 00:00:00 2001 From: Phil Miller Date: Wed, 8 Mar 2023 05:34:03 -0800 Subject: [PATCH 312/496] Drop native from performance benchmark build --- .github/workflows/performance-benchmark.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/performance-benchmark.yml b/.github/workflows/performance-benchmark.yml index 9db41c0dca..205239e043 100644 --- a/.github/workflows/performance-benchmark.yml +++ b/.github/workflows/performance-benchmark.yml @@ -29,7 +29,6 @@ jobs: - name: Configure Kokkos run: | cmake -B builddir \ - -DKokkos_ARCH_NATIVE=ON \ -DKokkos_ENABLE_HWLOC=ON \ -DKokkos_ENABLE_${{ matrix.backend }}=ON \ -DKokkos_ENABLE_BENCHMARKS=ON \ From f670cae472c7ae245524527db3a3f2722426847d Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Wed, 8 Mar 2023 08:33:26 -0500 Subject: [PATCH 313/496] Let KOKKOS_ARCH_NVIDIA_GPU provide the Compute Capability --- Makefile.kokkos | 2 -- cmake/KokkosCore_config.h.in | 1 - cmake/kokkos_arch.cmake | 1 - core/src/setup/Kokkos_Setup_Cuda.hpp | 36 ++++++++++++++++++++++++++++ 4 files changed, 36 insertions(+), 4 deletions(-) diff --git a/Makefile.kokkos b/Makefile.kokkos index 11358b32f4..a885a640ab 100644 --- a/Makefile.kokkos +++ b/Makefile.kokkos @@ -1055,8 +1055,6 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA_ARCH), 1) endif ifneq ($(KOKKOS_INTERNAL_USE_ARCH_NVIDIA), 0) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_NVIDIA_GPU") - KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG) ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC), 1) diff --git a/cmake/KokkosCore_config.h.in b/cmake/KokkosCore_config.h.in index 0b5f44b39d..c8257d8664 100644 --- a/cmake/KokkosCore_config.h.in +++ 
b/cmake/KokkosCore_config.h.in @@ -108,7 +108,6 @@ #cmakedefine KOKKOS_ARCH_AMPERE86 #cmakedefine KOKKOS_ARCH_HOPPER #cmakedefine KOKKOS_ARCH_HOPPER90 -#cmakedefine KOKKOS_ARCH_NVIDIA_GPU #cmakedefine KOKKOS_ARCH_AMD_ZEN #cmakedefine KOKKOS_ARCH_AMD_ZEN2 #cmakedefine KOKKOS_ARCH_AMD_ZEN3 diff --git a/cmake/kokkos_arch.cmake b/cmake/kokkos_arch.cmake index 4e69092428..2187f99352 100644 --- a/cmake/kokkos_arch.cmake +++ b/cmake/kokkos_arch.cmake @@ -566,7 +566,6 @@ FUNCTION(CHECK_CUDA_ARCH ARCH FLAG) IF(CUDA_ARCH_ALREADY_SPECIFIED) MESSAGE(FATAL_ERROR "Multiple GPU architectures given! Already have ${CUDA_ARCH_ALREADY_SPECIFIED}, but trying to add ${ARCH}. If you are re-running CMake, try clearing the cache and running again.") ENDIF() - SET(KOKKOS_ARCH_NVIDIA_GPU ON PARENT_SCOPE) SET(CUDA_ARCH_ALREADY_SPECIFIED ${ARCH} PARENT_SCOPE) IF (NOT KOKKOS_ENABLE_CUDA AND NOT KOKKOS_ENABLE_OPENMPTARGET AND NOT KOKKOS_ENABLE_SYCL AND NOT KOKKOS_ENABLE_OPENACC) MESSAGE(WARNING "Given CUDA arch ${ARCH}, but Kokkos_ENABLE_CUDA, Kokkos_ENABLE_OPENACC, and Kokkos_ENABLE_OPENMPTARGET are OFF. 
Option will be ignored.") diff --git a/core/src/setup/Kokkos_Setup_Cuda.hpp b/core/src/setup/Kokkos_Setup_Cuda.hpp index c57f690ae1..637855eb0d 100644 --- a/core/src/setup/Kokkos_Setup_Cuda.hpp +++ b/core/src/setup/Kokkos_Setup_Cuda.hpp @@ -69,4 +69,40 @@ #define KOKKOS_IMPL_HOST_FUNCTION __host__ #define KOKKOS_IMPL_DEVICE_FUNCTION __device__ +#if defined(KOKKOS_ARCH_KEPLER30) +#define KOKKOS_ARCH_NVIDIA_GPU 30 +#elif defined(KOKKOS_ARCH_KEPLER32) +#define KOKKOS_ARCH_NVIDIA_GPU 32 +#elif defined(KOKKOS_ARCH_KEPLER35) +#define KOKKOS_ARCH_NVIDIA_GPU 35 +#elif defined(KOKKOS_ARCH_KEPLER37) +#define KOKKOS_ARCH_NVIDIA_GPU 37 +#elif defined(KOKKOS_ARCH_MAXWELL50) +#define KOKKOS_ARCH_NVIDIA_GPU 50 +#elif defined(KOKKOS_ARCH_MAXWELL52) +#define KOKKOS_ARCH_NVIDIA_GPU 52 +#elif defined(KOKKOS_ARCH_MAXWELL53) +#define KOKKOS_ARCH_NVIDIA_GPU 53 +#elif defined(KOKKOS_ARCH_PASCAL60) +#define KOKKOS_ARCH_NVIDIA_GPU 60 +#elif defined(KOKKOS_ARCH_PASCAL61) +#define KOKKOS_ARCH_NVIDIA_GPU 61 +#elif defined(KOKKOS_ARCH_VOLTA70) +#define KOKKOS_ARCH_NVIDIA_GPU 70 +#elif defined(KOKKOS_ARCH_VOLTA72) +#define KOKKOS_ARCH_NVIDIA_GPU 72 +#elif defined(KOKKOS_ARCH_TURING75) +#define KOKKOS_ARCH_NVIDIA_GPU 75 +#elif defined(KOKKOS_ARCH_AMPERE80) +#define KOKKOS_ARCH_NVIDIA_GPU 80 +#elif defined(KOKKOS_ARCH_AMPERE86) +#define KOKKOS_ARCH_NVIDIA_GPU 86 +#elif defined(KOKKOS_ARCH_ADA89) +#define KOKKOS_ARCH_NVIDIA_GPU 89 +#elif defined(KOKKOS_ARCH_HOPPER90) +#define KOKKOS_ARCH_NVIDIA_GPU 90 +#else +#error NVIDIA GPU arch not recognized +#endif + #endif /* KOKKOS_CUDA_SETUP_HPP_ */ From 6e29e92369f0eb8be035e24449807f9b7cf25cb8 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Tue, 21 Feb 2023 15:10:30 -0500 Subject: [PATCH 314/496] Convert OpenMPTarget ParallelReduce --- core/src/Kokkos_Parallel_Reduce.hpp | 6 ++ ...kkos_OpenMPTarget_ParallelReduce_Range.hpp | 74 ++++++--------- ...okkos_OpenMPTarget_ParallelReduce_Team.hpp | 91 +++++++------------ 
.../Kokkos_OpenMPTarget_Parallel_MDRange.hpp | 63 +++++-------- 4 files changed, 92 insertions(+), 142 deletions(-) diff --git a/core/src/Kokkos_Parallel_Reduce.hpp b/core/src/Kokkos_Parallel_Reduce.hpp index 2a3c39cbba..7fe539c4c6 100644 --- a/core/src/Kokkos_Parallel_Reduce.hpp +++ b/core/src/Kokkos_Parallel_Reduce.hpp @@ -1437,6 +1437,12 @@ template <> struct implements_new_reduce_interface : std::true_type {}; #endif +#ifdef KOKKOS_ENABLE_OPENMPTARGET +template <> +struct implements_new_reduce_interface + : std::true_type {}; +#endif + #ifdef KOKKOS_ENABLE_CUDA template <> struct implements_new_reduce_interface : std::true_type {}; diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Range.hpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Range.hpp index 9153402596..e12240208e 100644 --- a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Range.hpp +++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Range.hpp @@ -25,36 +25,33 @@ namespace Kokkos { namespace Impl { -template -class ParallelReduce, ReducerType, +template +class ParallelReduce, Kokkos::Experimental::OpenMPTarget> { private: - using Policy = Kokkos::RangePolicy; + using Policy = Kokkos::RangePolicy; + using FunctorType = typename CombinedFunctorReducerType::functor_type; + using ReducerType = typename CombinedFunctorReducerType::reducer_type; using WorkTag = typename Policy::work_tag; - using ReducerTypeFwd = - std::conditional_t::value, - FunctorType, ReducerType>; - using Analysis = Impl::FunctorAnalysis; + using pointer_type = typename ReducerType::pointer_type; + using reference_type = typename ReducerType::reference_type; - using pointer_type = typename Analysis::pointer_type; - using reference_type = typename Analysis::reference_type; - - static constexpr int HasJoin = + static constexpr int FunctorHasJoin = Impl::FunctorAnalysis::has_join_member_function; - static constexpr int UseReducer = is_reducer::value; - static constexpr int IsArray = 
std::is_pointer::value; + static constexpr int UseReducer = + !std::is_same_v; + static constexpr int IsArray = std::is_pointer::value; using ParReduceSpecialize = - ParallelReduceSpecialize; + ParallelReduceSpecialize; - const FunctorType m_functor; + const CombinedFunctorReducerType m_functor_reducer; const Policy m_policy; - const ReducerType m_reducer; const pointer_type m_result_ptr; bool m_result_ptr_on_device; const int m_result_ptr_num_elems; @@ -62,68 +59,53 @@ class ParallelReduce, ReducerType, public: void execute() const { - if constexpr (HasJoin) { + const FunctorType& functor = m_functor_reducer.get_functor(); + if constexpr (FunctorHasJoin) { // Enter this loop if the Functor has a init-join. - ParReduceSpecialize::execute_init_join(m_functor, m_policy, m_result_ptr, + ParReduceSpecialize::execute_init_join(functor, m_policy, m_result_ptr, m_result_ptr_on_device); } else if constexpr (UseReducer) { // Enter this loop if the Functor is a reducer type. - ParReduceSpecialize::execute_reducer(m_functor, m_policy, m_result_ptr, + ParReduceSpecialize::execute_reducer(functor, m_policy, m_result_ptr, m_result_ptr_on_device); } else if constexpr (IsArray) { // Enter this loop if the reduction is on an array and the routine is // templated over the size of the array. 
if (m_result_ptr_num_elems <= 2) { ParReduceSpecialize::template execute_array( - m_functor, m_policy, m_result_ptr, m_result_ptr_on_device); + functor, m_policy, m_result_ptr, m_result_ptr_on_device); } else if (m_result_ptr_num_elems <= 4) { ParReduceSpecialize::template execute_array( - m_functor, m_policy, m_result_ptr, m_result_ptr_on_device); + functor, m_policy, m_result_ptr, m_result_ptr_on_device); } else if (m_result_ptr_num_elems <= 8) { ParReduceSpecialize::template execute_array( - m_functor, m_policy, m_result_ptr, m_result_ptr_on_device); + functor, m_policy, m_result_ptr, m_result_ptr_on_device); } else if (m_result_ptr_num_elems <= 16) { ParReduceSpecialize::template execute_array( - m_functor, m_policy, m_result_ptr, m_result_ptr_on_device); + functor, m_policy, m_result_ptr, m_result_ptr_on_device); } else if (m_result_ptr_num_elems <= 32) { ParReduceSpecialize::template execute_array( - m_functor, m_policy, m_result_ptr, m_result_ptr_on_device); + functor, m_policy, m_result_ptr, m_result_ptr_on_device); } else { Kokkos::abort("array reduction length must be <= 32"); } } else { // This loop handles the basic scalar reduction. 
ParReduceSpecialize::template execute_array( - m_functor, m_policy, m_result_ptr, m_result_ptr_on_device); + functor, m_policy, m_result_ptr, m_result_ptr_on_device); } } template - ParallelReduce(const FunctorType& arg_functor, const Policy& arg_policy, - const ViewType& arg_result_view, - std::enable_if_t::value && - !Kokkos::is_reducer::value, - void*> = nullptr) - : m_functor(arg_functor), + ParallelReduce(const CombinedFunctorReducerType& arg_functor_reducer, + const Policy& arg_policy, const ViewType& arg_result_view) + : m_functor_reducer(arg_functor_reducer), m_policy(arg_policy), - m_reducer(InvalidType()), m_result_ptr(arg_result_view.data()), m_result_ptr_on_device( MemorySpaceAccess::accessible), m_result_ptr_num_elems(arg_result_view.size()) {} - - ParallelReduce(const FunctorType& arg_functor, const Policy& arg_policy, - const ReducerType& reducer) - : m_functor(arg_functor), - m_policy(arg_policy), - m_reducer(reducer), - m_result_ptr(reducer.view().data()), - m_result_ptr_on_device( - MemorySpaceAccess::accessible), - m_result_ptr_num_elems(reducer.view().size()) {} }; } // namespace Impl diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Team.hpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Team.hpp index 39d452864a..417a53505f 100644 --- a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Team.hpp +++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Team.hpp @@ -432,113 +432,92 @@ parallel_reduce(const Impl::TeamVectorRangeBoundariesStruct< namespace Impl { -template -class ParallelReduce, - ReducerType, Kokkos::Experimental::OpenMPTarget> { +template +class ParallelReduce, + Kokkos::Experimental::OpenMPTarget> { private: using Policy = Kokkos::Impl::TeamPolicyInternal; + using FunctorType = typename CombinedFunctorReducerType::functor_type; + using ReducerType = typename CombinedFunctorReducerType::reducer_type; using WorkTag = typename Policy::work_tag; using Member = typename 
Policy::member_type; - using ReducerTypeFwd = - std::conditional_t::value, - FunctorType, ReducerType>; - using WorkTagFwd = - std::conditional_t::value, WorkTag, - void>; - using Analysis = Impl::FunctorAnalysis; - - using pointer_type = typename Analysis::pointer_type; - using reference_type = typename Analysis::reference_type; - using value_type = typename Analysis::value_type; + + using pointer_type = typename ReducerType::pointer_type; + using reference_type = typename ReducerType::reference_type; + using value_type = typename ReducerType::value_type; bool m_result_ptr_on_device; const int m_result_ptr_num_elems; - static constexpr int HasJoin = + static constexpr int FunctorHasJoin = Impl::FunctorAnalysis::has_join_member_function; - static constexpr int UseReducer = is_reducer::value; - static constexpr int IsArray = std::is_pointer::value; + static constexpr int UseReducer = + !std::is_same_v; + static constexpr int IsArray = std::is_pointer::value; using ParReduceSpecialize = - ParallelReduceSpecialize; + ParallelReduceSpecialize; - const FunctorType m_functor; + const CombinedFunctorReducerType m_functor_reducer; const Policy m_policy; - const ReducerType m_reducer; const pointer_type m_result_ptr; const size_t m_shmem_size; public: void execute() const { - if constexpr (HasJoin) { - ParReduceSpecialize::execute_init_join(m_functor, m_policy, m_result_ptr, + const FunctorType& functor = m_functor_reducer.get_functor(); + if constexpr (FunctorHasJoin) { + ParReduceSpecialize::execute_init_join(functor, m_policy, m_result_ptr, m_result_ptr_on_device); } else if constexpr (UseReducer) { - ParReduceSpecialize::execute_reducer(m_functor, m_policy, m_result_ptr, + ParReduceSpecialize::execute_reducer(functor, m_policy, m_result_ptr, m_result_ptr_on_device); } else if constexpr (IsArray) { if (m_result_ptr_num_elems <= 2) { ParReduceSpecialize::template execute_array<2>( - m_functor, m_policy, m_result_ptr, m_result_ptr_on_device); + functor, m_policy, 
m_result_ptr, m_result_ptr_on_device); } else if (m_result_ptr_num_elems <= 4) { ParReduceSpecialize::template execute_array<4>( - m_functor, m_policy, m_result_ptr, m_result_ptr_on_device); + functor, m_policy, m_result_ptr, m_result_ptr_on_device); } else if (m_result_ptr_num_elems <= 8) { ParReduceSpecialize::template execute_array<8>( - m_functor, m_policy, m_result_ptr, m_result_ptr_on_device); + functor, m_policy, m_result_ptr, m_result_ptr_on_device); } else if (m_result_ptr_num_elems <= 16) { ParReduceSpecialize::template execute_array<16>( - m_functor, m_policy, m_result_ptr, m_result_ptr_on_device); + functor, m_policy, m_result_ptr, m_result_ptr_on_device); } else if (m_result_ptr_num_elems <= 32) { ParReduceSpecialize::template execute_array<32>( - m_functor, m_policy, m_result_ptr, m_result_ptr_on_device); + functor, m_policy, m_result_ptr, m_result_ptr_on_device); } else { Kokkos::abort("array reduction length must be <= 32"); } } else { ParReduceSpecialize::template execute_array<1>( - m_functor, m_policy, m_result_ptr, m_result_ptr_on_device); + functor, m_policy, m_result_ptr, m_result_ptr_on_device); } } template - ParallelReduce(const FunctorType& arg_functor, const Policy& arg_policy, - const ViewType& arg_result, - std::enable_if_t::value && - !Kokkos::is_reducer::value, - void*> = nullptr) + ParallelReduce(const CombinedFunctorReducerType& arg_functor_reducer, + const Policy& arg_policy, const ViewType& arg_result) : m_result_ptr_on_device( MemorySpaceAccess::accessible), m_result_ptr_num_elems(arg_result.size()), - m_functor(arg_functor), + m_functor_reducer(arg_functor_reducer), m_policy(arg_policy), - m_reducer(InvalidType()), m_result_ptr(arg_result.data()), - m_shmem_size(arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + - FunctorTeamShmemSize::value( - arg_functor, arg_policy.team_size())) {} - - ParallelReduce(const FunctorType& arg_functor, const Policy& arg_policy, - const ReducerType& reducer) - : m_result_ptr_on_device( - 
MemorySpaceAccess::accessible), - m_result_ptr_num_elems(reducer.view().size()), - m_functor(arg_functor), - m_policy(arg_policy), - m_reducer(reducer), - m_result_ptr(reducer.view().data()), - m_shmem_size(arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + - FunctorTeamShmemSize::value( - arg_functor, arg_policy.team_size())) {} + m_shmem_size( + arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + + FunctorTeamShmemSize::value( + arg_functor_reducer.get_functor(), arg_policy.team_size())) {} }; } // namespace Impl diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_MDRange.hpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_MDRange.hpp index 251ca20b44..41e62ce6e6 100644 --- a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_MDRange.hpp +++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_MDRange.hpp @@ -411,32 +411,28 @@ class ParallelFor, namespace Kokkos { namespace Impl { -template -class ParallelReduce, ReducerType, +template +class ParallelReduce, Kokkos::Experimental::OpenMPTarget> { private: - using Policy = Kokkos::MDRangePolicy; + using Policy = Kokkos::MDRangePolicy; + using FunctorType = typename CombinedFunctorReducerType::functor_type; + using ReducerType = typename CombinedFunctorReducerType::reducer_type; using WorkTag = typename Policy::work_tag; using Member = typename Policy::member_type; using Index = typename Policy::index_type; - using ReducerConditional = - std::conditional::value, - FunctorType, ReducerType>; - using ReducerTypeFwd = typename ReducerConditional::type; - using Analysis = Impl::FunctorAnalysis; + using pointer_type = typename ReducerType::pointer_type; + using reference_type = typename ReducerType::reference_type; - using pointer_type = typename Analysis::pointer_type; - using reference_type = typename Analysis::reference_type; - - static constexpr bool UseReducer = is_reducer::value; + static constexpr bool UseReducer = + !std::is_same_v; const pointer_type m_result_ptr; - const 
FunctorType m_functor; + const CombinedFunctorReducerType m_functor_reducer; const Policy m_policy; - const ReducerType m_reducer; using ParReduceCopy = ParallelReduceCopy; @@ -444,36 +440,20 @@ class ParallelReduce, ReducerType, public: inline void execute() const { - execute_tile( - m_functor, m_policy, m_result_ptr); + execute_tile( + m_functor_reducer.get_functor(), m_policy, m_result_ptr); } template - inline ParallelReduce( - const FunctorType& arg_functor, Policy arg_policy, - const ViewType& arg_result_view, - std::enable_if_t::value && - !Kokkos::is_reducer::value, - void*> = NULL) + inline ParallelReduce(const CombinedFunctorReducerType& arg_functor_reducer, + Policy arg_policy, const ViewType& arg_result_view) : m_result_ptr(arg_result_view.data()), - m_functor(arg_functor), + m_functor_reducer(arg_functor_reducer), m_policy(arg_policy), - m_reducer(InvalidType()), m_result_ptr_on_device( MemorySpaceAccess::accessible) {} - inline ParallelReduce(const FunctorType& arg_functor, Policy arg_policy, - const ReducerType& reducer) - : m_result_ptr(reducer.view().data()), - m_functor(arg_functor), - m_policy(arg_policy), - m_reducer(reducer), - m_result_ptr_on_device( - MemorySpaceAccess::accessible) {} - template inline std::enable_if_t execute_tile(const FunctorType& functor, const Policy& policy, @@ -540,10 +520,13 @@ reduction(+:result) // FIXME_OPENMPTARGET: Unable to separate directives and their companion // loops which leads to code duplication for different reduction types. 
if constexpr (UseReducer) { -#pragma omp declare reduction( \ - custom:ValueType \ - : OpenMPTargetReducerWrapper ::join(omp_out, omp_in)) \ - initializer(OpenMPTargetReducerWrapper ::init(omp_priv)) +#pragma omp declare reduction( \ + custom:ValueType \ + : OpenMPTargetReducerWrapper ::join( \ + omp_out, omp_in)) \ + initializer( \ + OpenMPTargetReducerWrapper ::init( \ + omp_priv)) #pragma omp target teams distribute parallel for collapse(3) map(to \ : functor) \ From 42abe36647ed1e97af6d1828420f1181b560a4ff Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Sat, 4 Mar 2023 10:44:08 +0100 Subject: [PATCH 315/496] Convert OpenMPTarget ParallelScan --- ...Kokkos_OpenMPTarget_ParallelScan_Range.hpp | 43 +++++++++++-------- 1 file changed, 25 insertions(+), 18 deletions(-) diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Range.hpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Range.hpp index 1900260e2a..e9a52f8e21 100644 --- a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Range.hpp +++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Range.hpp @@ -41,7 +41,8 @@ class ParallelScan, using pointer_type = typename Analysis::pointer_type; using reference_type = typename Analysis::reference_type; - const FunctorType m_functor; + const CombinedFunctorReducer + m_functor_reducer; const Policy m_policy; value_type* m_result_ptr; @@ -75,10 +76,12 @@ class ParallelScan, idx_type nteams = n_chunks > 512 ? 
512 : n_chunks; idx_type team_size = 128; - FunctorType a_functor(m_functor); -#pragma omp target teams distribute map(to : a_functor) num_teams(nteams) + auto a_functor_reducer = m_functor_reducer; +#pragma omp target teams distribute map(to \ + : a_functor_reducer) num_teams(nteams) for (idx_type team_id = 0; team_id < n_chunks; ++team_id) { - typename Analysis::Reducer final_reducer(a_functor); + const typename Analysis::Reducer& reducer = + a_functor_reducer.get_reducer(); #pragma omp parallel num_threads(team_size) { const idx_type local_offset = team_id * chunk_size; @@ -87,16 +90,18 @@ class ParallelScan, for (idx_type i = 0; i < chunk_size; ++i) { const idx_type idx = local_offset + i; value_type val; - final_reducer.init(&val); - if (idx < N) call_with_tag(a_functor, idx, val, false); + reducer.init(&val); + if (idx < N) + call_with_tag(a_functor_reducer.get_functor(), idx, val, + false); element_values(team_id, i) = val; } #pragma omp barrier if (omp_get_thread_num() == 0) { value_type sum; - final_reducer.init(&sum); + reducer.init(&sum); for (idx_type i = 0; i < chunk_size; ++i) { - final_reducer.join(&sum, &element_values(team_id, i)); + reducer.join(&sum, &element_values(team_id, i)); element_values(team_id, i) = sum; } chunk_values(team_id) = sum; @@ -105,9 +110,9 @@ class ParallelScan, if (omp_get_thread_num() == 0) { if (Kokkos::atomic_fetch_add(&count(), 1) == n_chunks - 1) { value_type sum; - final_reducer.init(&sum); + reducer.init(&sum); for (idx_type i = 0; i < n_chunks; ++i) { - final_reducer.join(&sum, &chunk_values(i)); + reducer.join(&sum, &chunk_values(i)); chunk_values(i) = sum; } } @@ -115,11 +120,12 @@ class ParallelScan, } } -#pragma omp target teams distribute map(to \ - : a_functor) num_teams(nteams) \ +#pragma omp target teams distribute map(to \ + : a_functor_reducer) num_teams(nteams) \ thread_limit(team_size) for (idx_type team_id = 0; team_id < n_chunks; ++team_id) { - typename Analysis::Reducer final_reducer(a_functor); + 
const typename Analysis::Reducer& reducer = + a_functor_reducer.get_reducer(); #pragma omp parallel num_threads(team_size) { const idx_type local_offset = team_id * chunk_size; @@ -127,7 +133,7 @@ class ParallelScan, if (team_id > 0) offset_value = chunk_values(team_id - 1); else - final_reducer.init(&offset_value); + reducer.init(&offset_value); #pragma omp for for (idx_type i = 0; i < chunk_size; ++i) { @@ -145,12 +151,13 @@ class ParallelScan, } else local_offset_value += offset_value; #else - final_reducer.join(&local_offset_value, &offset_value); + reducer.join(&local_offset_value, &offset_value); #endif } else local_offset_value = offset_value; if (idx < N) - call_with_tag(a_functor, idx, local_offset_value, true); + call_with_tag(a_functor_reducer.get_functor(), idx, + local_offset_value, true); if (idx == N - 1 && m_result_ptr_device_accessible) *m_result_ptr = local_offset_value; } @@ -184,7 +191,7 @@ class ParallelScan, ParallelScan(const FunctorType& arg_functor, const Policy& arg_policy, pointer_type arg_result_ptr = nullptr, bool arg_result_ptr_device_accessible = false) - : m_functor(arg_functor), + : m_functor_reducer(arg_functor, typename Analysis::Reducer{arg_functor}), m_policy(arg_policy), m_result_ptr(arg_result_ptr), m_result_ptr_device_accessible(arg_result_ptr_device_accessible) {} @@ -227,7 +234,7 @@ class ParallelScanWithTotal, base_t::impl_execute(element_values, chunk_values, count); if (!base_t::m_result_ptr_device_accessible) { - const int size = base_t::Analysis::value_size(base_t::m_functor); + const int size = base_t::m_functor_reducer.get_reducer().value_size(); DeepCopy( base_t::m_result_ptr, chunk_values.data() + (n_chunks - 1), size); } From 85ab1bc8afdf7662d4228e5083a288a1a0340201 Mon Sep 17 00:00:00 2001 From: Michael Halkenhaeuser Date: Wed, 15 Feb 2023 16:19:58 +0100 Subject: [PATCH 316/496] Add support for AMDGPU target NAVI31 / RX 7900 XT(X): gfx1100 --- Makefile.kokkos | 5 +++++ cmake/KokkosCore_config.h.in | 1 + 
cmake/kokkos_arch.cmake | 6 +++--- core/src/impl/Kokkos_Core.cpp | 3 +++ generate_makefile.bash | 1 + 5 files changed, 13 insertions(+), 3 deletions(-) diff --git a/Makefile.kokkos b/Makefile.kokkos index a885a640ab..fd3a9c6ace 100644 --- a/Makefile.kokkos +++ b/Makefile.kokkos @@ -1092,6 +1092,11 @@ ifeq ($(KOKKOS_INTERNAL_USE_HIP), 1) tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_NAVI") KOKKOS_INTERNAL_HIP_ARCH_FLAG := --offload-arch=gfx1030 endif + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_NAVI1100), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_NAVI1100") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_NAVI") + KOKKOS_INTERNAL_HIP_ARCH_FLAG := --offload-arch=gfx1100 + endif KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/core/src/HIP/*.cpp) diff --git a/cmake/KokkosCore_config.h.in b/cmake/KokkosCore_config.h.in index c8257d8664..cb3076077c 100644 --- a/cmake/KokkosCore_config.h.in +++ b/cmake/KokkosCore_config.h.in @@ -117,3 +117,4 @@ #cmakedefine KOKKOS_ARCH_VEGA90A #cmakedefine KOKKOS_ARCH_NAVI #cmakedefine KOKKOS_ARCH_NAVI1030 +#cmakedefine KOKKOS_ARCH_NAVI1100 diff --git a/cmake/kokkos_arch.cmake b/cmake/kokkos_arch.cmake index 5cf7fbd373..a938284295 100644 --- a/cmake/kokkos_arch.cmake +++ b/cmake/kokkos_arch.cmake @@ -93,9 +93,9 @@ IF(Kokkos_ENABLE_HIP OR Kokkos_ENABLE_OPENMPTARGET) ENDIF() # AMD archs ordered in decreasing priority of autodetection -LIST(APPEND SUPPORTED_AMD_GPUS MI200 MI100 MI50/60 V620/W6800) -LIST(APPEND SUPPORTED_AMD_ARCHS VEGA90A VEGA908 VEGA906 NAVI1030) -LIST(APPEND CORRESPONDING_AMD_FLAGS gfx90a gfx908 gfx906 gfx1030) +LIST(APPEND SUPPORTED_AMD_GPUS MI200 MI100 MI50/60 RX7900XTX V620/W6800) +LIST(APPEND SUPPORTED_AMD_ARCHS VEGA90A VEGA908 VEGA906 NAVI1100 NAVI1030) +LIST(APPEND CORRESPONDING_AMD_FLAGS gfx90a gfx908 gfx906 gfx1100 gfx1030) #FIXME CAN BE REPLACED WITH LIST_ZIP IN CMAKE 3.17 FOREACH(ARCH IN LISTS SUPPORTED_AMD_ARCHS) diff --git a/core/src/impl/Kokkos_Core.cpp 
b/core/src/impl/Kokkos_Core.cpp index 26d45057b8..cb418df661 100644 --- a/core/src/impl/Kokkos_Core.cpp +++ b/core/src/impl/Kokkos_Core.cpp @@ -737,6 +737,9 @@ void pre_initialize_internal(const Kokkos::InitializationSettings& settings) { #elif defined(KOKKOS_ARCH_NAVI1030) declare_configuration_metadata("architecture", "GPU architecture", "NAVI1030"); +#elif defined(KOKKOS_ARCH_NAVI1100) + declare_configuration_metadata("architecture", "GPU architecture", + "NAVI1100"); #else declare_configuration_metadata("architecture", "GPU architecture", "none"); diff --git a/generate_makefile.bash b/generate_makefile.bash index 60791d1c01..018426c9b8 100755 --- a/generate_makefile.bash +++ b/generate_makefile.bash @@ -161,6 +161,7 @@ display_help_text() { echo " VEGA908 = AMD GPU MI100 GFX908" echo " VEGA90A = AMD GPU MI200 GFX90A" echo " NAVI1030 = AMD GPU V620/W6800 GFX1030" + echo " NAVI1100 = AMD GPU RX 7900 XT(X) GFX1100" echo " [ARM]" echo " ARMV80 = ARMv8.0 Compatible CPU" echo " ARMV81 = ARMv8.1 Compatible CPU" From 488ff103b218b7c43022e0051c02ff154ced63b9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cezary=20Skrzy=C5=84ski?= Date: Wed, 8 Mar 2023 21:28:28 +0100 Subject: [PATCH 317/496] Bring back git info to benchmarks output --- core/perf_test/Benchmark_Context.cpp | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/core/perf_test/Benchmark_Context.cpp b/core/perf_test/Benchmark_Context.cpp index a9652d1525..9aa63cc34f 100644 --- a/core/perf_test/Benchmark_Context.cpp +++ b/core/perf_test/Benchmark_Context.cpp @@ -57,9 +57,25 @@ void add_kokkos_configuration(bool verbose) { } } +void add_git_info() { + if (!Kokkos::Impl::GIT_BRANCH.empty()) { + benchmark::AddCustomContext("GIT_BRANCH", Kokkos::Impl::GIT_BRANCH); + benchmark::AddCustomContext("GIT_COMMIT_HASH", + Kokkos::Impl::GIT_COMMIT_HASH); + benchmark::AddCustomContext("GIT_CLEAN_STATUS", + Kokkos::Impl::GIT_CLEAN_STATUS); + benchmark::AddCustomContext("GIT_COMMIT_DESCRIPTION", + 
Kokkos::Impl::GIT_COMMIT_DESCRIPTION); + benchmark::AddCustomContext("GIT_COMMIT_DATE", + Kokkos::Impl::GIT_COMMIT_DATE); + } +} + void add_benchmark_context(bool verbose) { // Add Kokkos configuration to benchmark context data add_kokkos_configuration(verbose); + // Add git information to benchmark context data + add_git_info(); } } // namespace KokkosBenchmark From 979899369816db3e94766a94a9d1ede16288d553 Mon Sep 17 00:00:00 2001 From: Andrey Prokopenko Date: Wed, 8 Mar 2023 15:28:45 -0500 Subject: [PATCH 318/496] [ci skip] Add a comment --- algorithms/src/Kokkos_Sort.hpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/algorithms/src/Kokkos_Sort.hpp b/algorithms/src/Kokkos_Sort.hpp index 49fc7d5225..fcfe5d95c3 100644 --- a/algorithms/src/Kokkos_Sort.hpp +++ b/algorithms/src/Kokkos_Sort.hpp @@ -460,6 +460,8 @@ class BinSort { std::is_same_v; int lower_bound = bin_offsets(i); int upper_bound = lower_bound + bin_size; + // Switching to std::sort for more than 10 elements has been found + // reasonable experimentally. 
if (use_std_sort && bin_size > 10) { if constexpr (use_std_sort) { std::sort(&sort_order(lower_bound), &sort_order(upper_bound), From 72d39a7a5265c6ffc51f0dd9b113d3cf5023513b Mon Sep 17 00:00:00 2001 From: Andrey Prokopenko Date: Wed, 8 Mar 2023 15:05:55 -0500 Subject: [PATCH 319/496] Rename ScopedProfileRegion -> ScopedRegion --- ...n.hpp => Kokkos_Profiling_ScopedRegion.hpp} | 18 +++++++++--------- core/unit_test/CMakeLists.txt | 2 +- ...dProfileRegion.cpp => TestScopedRegion.cpp} | 18 +++++++++--------- 3 files changed, 19 insertions(+), 19 deletions(-) rename core/src/{Kokkos_Profiling_ScopedProfileRegion.hpp => Kokkos_Profiling_ScopedRegion.hpp} (67%) rename core/unit_test/tools/{TestScopedProfileRegion.cpp => TestScopedRegion.cpp} (75%) diff --git a/core/src/Kokkos_Profiling_ScopedProfileRegion.hpp b/core/src/Kokkos_Profiling_ScopedRegion.hpp similarity index 67% rename from core/src/Kokkos_Profiling_ScopedProfileRegion.hpp rename to core/src/Kokkos_Profiling_ScopedRegion.hpp index 4daeeadddb..f45dfa324e 100644 --- a/core/src/Kokkos_Profiling_ScopedProfileRegion.hpp +++ b/core/src/Kokkos_Profiling_ScopedRegion.hpp @@ -14,11 +14,11 @@ // //@HEADER -#ifndef KOKKOSP_SCOPED_PROFILE_REGION_HPP -#define KOKKOSP_SCOPED_PROFILE_REGION_HPP +#ifndef KOKKOSP_SCOPED_REGION_HPP +#define KOKKOSP_SCOPED_REGION_HPP #ifndef KOKKOS_IMPL_PUBLIC_INCLUDE #define KOKKOS_IMPL_PUBLIC_INCLUDE -#define KOKKOS_IMPL_PUBLIC_INCLUDE_PROFILING_SCOPEDPROFILEREGION +#define KOKKOS_IMPL_PUBLIC_INCLUDE_PROFILING_SCOPEDREGION #endif #include @@ -28,24 +28,24 @@ namespace Kokkos::Profiling { -class [[nodiscard]] ScopedProfileRegion { +class [[nodiscard]] ScopedRegion { public: - ScopedProfileRegion(ScopedProfileRegion const &) = delete; - ScopedProfileRegion &operator=(ScopedProfileRegion const &) = delete; + ScopedRegion(ScopedRegion const &) = delete; + ScopedRegion &operator=(ScopedRegion const &) = delete; #if defined(__has_cpp_attribute) && __has_cpp_attribute(nodiscard) >= 201907 
[[nodiscard]] #endif - explicit ScopedProfileRegion(std::string const &name) { + explicit ScopedRegion(std::string const &name) { Kokkos::Profiling::pushRegion(name); } - ~ScopedProfileRegion() { Kokkos::Profiling::popRegion(); } + ~ScopedRegion() { Kokkos::Profiling::popRegion(); } }; } // namespace Kokkos::Profiling #ifdef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_CORE #undef KOKKOS_IMPL_PUBLIC_INCLUDE -#undef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_PROFILING_SCOPEDPROFILEREGION +#undef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_PROFILING_SCOPEDREGION #endif #endif diff --git a/core/unit_test/CMakeLists.txt b/core/unit_test/CMakeLists.txt index 8bf3595f19..83048c9329 100644 --- a/core/unit_test/CMakeLists.txt +++ b/core/unit_test/CMakeLists.txt @@ -922,7 +922,7 @@ KOKKOS_ADD_ADVANCED_TEST( UnitTest_PushFinalizeHook_terminate tools/TestEventCorrectness.cpp tools/TestWithoutInitializing.cpp tools/TestProfilingSection.cpp - tools/TestScopedProfileRegion.cpp + tools/TestScopedRegion.cpp ) # FIXME_OPENMPTARGET This test causes internal compiler errors as of 09/01/22 diff --git a/core/unit_test/tools/TestScopedProfileRegion.cpp b/core/unit_test/tools/TestScopedRegion.cpp similarity index 75% rename from core/unit_test/tools/TestScopedProfileRegion.cpp rename to core/unit_test/tools/TestScopedRegion.cpp index 8f703da626..5306496d76 100644 --- a/core/unit_test/tools/TestScopedProfileRegion.cpp +++ b/core/unit_test/tools/TestScopedRegion.cpp @@ -14,7 +14,7 @@ // //@HEADER -#include +#include #include @@ -39,14 +39,14 @@ TEST(defaultdevicetype, scoped_profile_region) { { std::string outer_identifier = "outer"; - Kokkos::Profiling::ScopedProfileRegion guard_outer(outer_identifier); + Kokkos::Profiling::ScopedRegion guard_outer(outer_identifier); ASSERT_EQ(test_region_stack.size(), 1u); ASSERT_EQ(test_region_stack.top(), outer_identifier); { std::string inner_identifier = "inner"; - Kokkos::Profiling::ScopedProfileRegion guard_inner(inner_identifier); + Kokkos::Profiling::ScopedRegion 
guard_inner(inner_identifier); ASSERT_EQ(test_region_stack.size(), 2u); ASSERT_EQ(test_region_stack.top(), inner_identifier); } @@ -62,11 +62,11 @@ TEST(defaultdevicetype, scoped_profile_region) { Kokkos::Tools::Experimental::set_pop_region_callback(nullptr); } -using Kokkos::Profiling::ScopedProfileRegion; -static_assert(!std::is_default_constructible::value); -static_assert(!std::is_copy_constructible::value); -static_assert(!std::is_move_constructible::value); -static_assert(!std::is_copy_assignable::value); -static_assert(!std::is_move_assignable::value); +using Kokkos::Profiling::ScopedRegion; +static_assert(!std::is_default_constructible::value); +static_assert(!std::is_copy_constructible::value); +static_assert(!std::is_move_constructible::value); +static_assert(!std::is_copy_assignable::value); +static_assert(!std::is_move_assignable::value); } // namespace From b4de0ac9f581bdb606468f403f06b9e453743522 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Wed, 8 Mar 2023 16:37:19 -0500 Subject: [PATCH 320/496] Rename KOKKOS_{ -> IMPL_}ARCH_NVIDIA_GPU --- core/src/OpenACC/Kokkos_OpenACC_Traits.hpp | 2 +- .../Kokkos_OpenMPTarget_Instance.cpp | 2 +- core/src/SYCL/Kokkos_SYCL.cpp | 2 +- core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp | 4 +-- core/src/setup/Kokkos_Setup_Cuda.hpp | 32 +++++++++---------- 5 files changed, 21 insertions(+), 21 deletions(-) diff --git a/core/src/OpenACC/Kokkos_OpenACC_Traits.hpp b/core/src/OpenACC/Kokkos_OpenACC_Traits.hpp index 97d34d19a3..88140a7647 100644 --- a/core/src/OpenACC/Kokkos_OpenACC_Traits.hpp +++ b/core/src/OpenACC/Kokkos_OpenACC_Traits.hpp @@ -22,7 +22,7 @@ namespace Kokkos::Experimental::Impl { struct OpenACC_Traits { -#if defined(KOKKOS_ARCH_NVIDIA_GPU) +#if defined(KOKKOS_IMPL_ARCH_NVIDIA_GPU) static constexpr acc_device_t dev_type = acc_device_nvidia; static constexpr bool may_fallback_to_host = false; #else diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp 
b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp index abe1dad73d..ff004e3a94 100644 --- a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp +++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp @@ -93,7 +93,7 @@ void OpenMPTargetInternal::impl_initialize() { // FIXME_OPENMPTARGET: Only fix the number of teams for NVIDIA architectures // from Pascal and upwards. -#if defined(KOKKOS_ARCH_NVIDIA_GPU) && defined(KOKKOS_COMPILER_CLANG) && \ +#if defined(KOKKOS_IMPL_ARCH_NVIDIA_GPU) && defined(KOKKOS_COMPILER_CLANG) && \ (KOKKOS_COMPILER_CLANG >= 1300) omp_set_num_teams(512); #endif diff --git a/core/src/SYCL/Kokkos_SYCL.cpp b/core/src/SYCL/Kokkos_SYCL.cpp index 72facc856b..f8a1efda8e 100644 --- a/core/src/SYCL/Kokkos_SYCL.cpp +++ b/core/src/SYCL/Kokkos_SYCL.cpp @@ -128,7 +128,7 @@ void SYCL::impl_initialize(InitializationSettings const& settings) { // If the device id is not specified and there are no GPUs, sidestep Kokkos // device selection and use whatever is available (if no GPU architecture is // specified). -#if !defined(KOKKOS_ARCH_INTEL_GPU) && !defined(KOKKOS_ARCH_NVIDIA_GPU) +#if !defined(KOKKOS_ARCH_INTEL_GPU) && !defined(KOKKOS_IMPL_ARCH_NVIDIA_GPU) if (!settings.has_device_id() && gpu_devices.empty()) { Impl::SYCLInternal::singleton().initialize(sycl::device()); Impl::SYCLInternal::m_syclDev = 0; diff --git a/core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp b/core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp index be9a384c78..34fa997e23 100644 --- a/core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp +++ b/core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp @@ -304,7 +304,7 @@ class TeamPolicyInternal return std::min({ int(m_space.impl_internal_space_instance()->m_maxWorkgroupSize), // FIXME_SYCL Avoid requesting to many registers on NVIDIA GPUs. 
-#if defined(KOKKOS_ARCH_NVIDIA_GPU) +#if defined(KOKKOS_IMPL_ARCH_NVIDIA_GPU) 256, #endif max_threads_for_memory @@ -334,7 +334,7 @@ class TeamPolicyInternal return std::min({ int(m_space.impl_internal_space_instance()->m_maxWorkgroupSize), // FIXME_SYCL Avoid requesting to many registers on NVIDIA GPUs. -#if defined(KOKKOS_ARCH_NVIDIA_GPU) +#if defined(KOKKOS_IMPL_ARCH_NVIDIA_GPU) 256, #endif max_threads_for_memory diff --git a/core/src/setup/Kokkos_Setup_Cuda.hpp b/core/src/setup/Kokkos_Setup_Cuda.hpp index 637855eb0d..0763b5b31a 100644 --- a/core/src/setup/Kokkos_Setup_Cuda.hpp +++ b/core/src/setup/Kokkos_Setup_Cuda.hpp @@ -70,37 +70,37 @@ #define KOKKOS_IMPL_DEVICE_FUNCTION __device__ #if defined(KOKKOS_ARCH_KEPLER30) -#define KOKKOS_ARCH_NVIDIA_GPU 30 +#define KOKKOS_IMPL_ARCH_NVIDIA_GPU 30 #elif defined(KOKKOS_ARCH_KEPLER32) -#define KOKKOS_ARCH_NVIDIA_GPU 32 +#define KOKKOS_IMPL_ARCH_NVIDIA_GPU 32 #elif defined(KOKKOS_ARCH_KEPLER35) -#define KOKKOS_ARCH_NVIDIA_GPU 35 +#define KOKKOS_IMPL_ARCH_NVIDIA_GPU 35 #elif defined(KOKKOS_ARCH_KEPLER37) -#define KOKKOS_ARCH_NVIDIA_GPU 37 +#define KOKKOS_IMPL_ARCH_NVIDIA_GPU 37 #elif defined(KOKKOS_ARCH_MAXWELL50) -#define KOKKOS_ARCH_NVIDIA_GPU 50 +#define KOKKOS_IMPL_ARCH_NVIDIA_GPU 50 #elif defined(KOKKOS_ARCH_MAXWELL52) -#define KOKKOS_ARCH_NVIDIA_GPU 52 +#define KOKKOS_IMPL_ARCH_NVIDIA_GPU 52 #elif defined(KOKKOS_ARCH_MAXWELL53) -#define KOKKOS_ARCH_NVIDIA_GPU 53 +#define KOKKOS_IMPL_ARCH_NVIDIA_GPU 53 #elif defined(KOKKOS_ARCH_PASCAL60) -#define KOKKOS_ARCH_NVIDIA_GPU 60 +#define KOKKOS_IMPL_ARCH_NVIDIA_GPU 60 #elif defined(KOKKOS_ARCH_PASCAL61) -#define KOKKOS_ARCH_NVIDIA_GPU 61 +#define KOKKOS_IMPL_ARCH_NVIDIA_GPU 61 #elif defined(KOKKOS_ARCH_VOLTA70) -#define KOKKOS_ARCH_NVIDIA_GPU 70 +#define KOKKOS_IMPL_ARCH_NVIDIA_GPU 70 #elif defined(KOKKOS_ARCH_VOLTA72) -#define KOKKOS_ARCH_NVIDIA_GPU 72 +#define KOKKOS_IMPL_ARCH_NVIDIA_GPU 72 #elif defined(KOKKOS_ARCH_TURING75) -#define KOKKOS_ARCH_NVIDIA_GPU 75 +#define 
KOKKOS_IMPL_ARCH_NVIDIA_GPU 75 #elif defined(KOKKOS_ARCH_AMPERE80) -#define KOKKOS_ARCH_NVIDIA_GPU 80 +#define KOKKOS_IMPL_ARCH_NVIDIA_GPU 80 #elif defined(KOKKOS_ARCH_AMPERE86) -#define KOKKOS_ARCH_NVIDIA_GPU 86 +#define KOKKOS_IMPL_ARCH_NVIDIA_GPU 86 #elif defined(KOKKOS_ARCH_ADA89) -#define KOKKOS_ARCH_NVIDIA_GPU 89 +#define KOKKOS_IMPL_ARCH_NVIDIA_GPU 89 #elif defined(KOKKOS_ARCH_HOPPER90) -#define KOKKOS_ARCH_NVIDIA_GPU 90 +#define KOKKOS_IMPL_ARCH_NVIDIA_GPU 90 #else #error NVIDIA GPU arch not recognized #endif From b10f35e0c9c15820be3fc2651dc465e472bd07b5 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Wed, 8 Mar 2023 16:46:57 -0500 Subject: [PATCH 321/496] Improve macro name KOKKOS_IMPL_{ARCH_NVIDIA_GPU_AMPERE_PLUS -> NVIDIA_GPU_ARCH_SUPPORT_BHALF} --- core/src/Cuda/Kokkos_Cuda_Half_Conversion.hpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/core/src/Cuda/Kokkos_Cuda_Half_Conversion.hpp b/core/src/Cuda/Kokkos_Cuda_Half_Conversion.hpp index df98934ab3..59acaa6d76 100644 --- a/core/src/Cuda/Kokkos_Cuda_Half_Conversion.hpp +++ b/core/src/Cuda/Kokkos_Cuda_Half_Conversion.hpp @@ -264,12 +264,12 @@ KOKKOS_INLINE_FUNCTION #if !defined(KOKKOS_ARCH_KEPLER) && !defined(KOKKOS_ARCH_MAXWELL) && \ !defined(KOKKOS_ARCH_PASCAL) && !defined(KOKKOS_ARCH_VOLTA) && \ !defined(KOKKOS_ARCH_TURING75) -#define KOKKOS_IMPL_ARCH_NVIDIA_GPU_AMPERE_PLUS +#define KOKKOS_IMPL_NVIDIA_GPU_ARCH_SUPPORT_BHALF #endif #if CUDA_VERSION >= 11000 && \ (CUDA_VERSION < 11010 || \ - !defined(KOKKOS_IMPL_ARCH_NVIDIA_GPU_AMPERE_PLUS)) + !defined(KOKKOS_IMPL_NVIDIA_GPU_ARCH_SUPPORT_BHALF)) KOKKOS_INLINE_FUNCTION bhalf_t cast_to_bhalf(bhalf_t val) { return val; } @@ -396,7 +396,7 @@ KOKKOS_INLINE_FUNCTION } #endif // CUDA_VERSION >= 11000 && CUDA_VERSION < 11010 -#if CUDA_VERSION >= 11010 && defined(KOKKOS_IMPL_ARCH_NVIDIA_GPU_AMPERE_PLUS) +#if CUDA_VERSION >= 11010 && defined(KOKKOS_IMPL_NVIDIA_GPU_ARCH_SUPPORT_BHALF) KOKKOS_INLINE_FUNCTION bhalf_t cast_to_bhalf(bhalf_t 
val) { return val; } KOKKOS_INLINE_FUNCTION @@ -479,7 +479,7 @@ KOKKOS_INLINE_FUNCTION } #endif // CUDA_VERSION >= 11010 -#undef KOKKOS_IMPL_ARCH_NVIDIA_GPU_AMPERE_PLUS +#undef KOKKOS_IMPL_NVIDIA_GPU_ARCH_SUPPORT_BHALF } // namespace Experimental #if (CUDA_VERSION >= 11000) From b000df58cde3b02f6573b76d18535c268b57ccc3 Mon Sep 17 00:00:00 2001 From: Josip Basic Date: Fri, 10 Mar 2023 09:02:19 +0100 Subject: [PATCH 322/496] Allow that C++20 is passed to nvcc --- cmake/kokkos_test_cxx_std.cmake | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/cmake/kokkos_test_cxx_std.cmake b/cmake/kokkos_test_cxx_std.cmake index eda3124586..5f8e15cd67 100644 --- a/cmake/kokkos_test_cxx_std.cmake +++ b/cmake/kokkos_test_cxx_std.cmake @@ -29,7 +29,11 @@ FUNCTION(kokkos_set_cxx_standard_feature standard) ELSEIF(NOT KOKKOS_USE_CXX_EXTENSIONS AND ${STANDARD_NAME}) MESSAGE(STATUS "Using ${${STANDARD_NAME}} for C++${standard} standard as feature") IF (KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA AND (KOKKOS_CXX_HOST_COMPILER_ID STREQUAL GNU OR KOKKOS_CXX_HOST_COMPILER_ID STREQUAL Clang)) - SET(SUPPORTED_NVCC_FLAGS "-std=c++17") + IF(${KOKKOS_CXX_COMPILER_VERSION} VERSION_LESS 12.0.0) + SET(SUPPORTED_NVCC_FLAGS "-std=c++17") + ELSE() + SET(SUPPORTED_NVCC_FLAGS "-std=c++17" "-std=c++20") + ENDIF() IF (NOT ${${STANDARD_NAME}} IN_LIST SUPPORTED_NVCC_FLAGS) MESSAGE(FATAL_ERROR "CMake wants to use ${${STANDARD_NAME}} which is not supported by NVCC. 
Using a more recent host compiler or a more recent CMake version might help.") ENDIF() From 067f74aeb73d51bd68409f948d85fb0b211f5af1 Mon Sep 17 00:00:00 2001 From: Josip Basic Date: Fri, 10 Mar 2023 09:04:57 +0100 Subject: [PATCH 323/496] Allow c++20 in nvcc_wrapper for nvcc 12 and above --- bin/nvcc_wrapper | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/bin/nvcc_wrapper b/bin/nvcc_wrapper index 2204514d1b..0c55651460 100755 --- a/bin/nvcc_wrapper +++ b/bin/nvcc_wrapper @@ -338,6 +338,24 @@ do std_flag=$corrected_std_flag shared_args="$shared_args $std_flag" ;; + --std=c++20|-std=c++20) + if [ -n "$std_flag" ]; then + warn_std_flag + shared_args=${shared_args/ $std_flag/} + fi + # NVCC only has C++20 from version 12 on + cuda_main_version=$([[ $(${nvcc_compiler} --version) =~ V([0-9]+) ]] && echo ${BASH_REMATCH[1]}) + if [ ${cuda_main_version} -lt 12 ]; then + fallback_std_flag="-std=c++14" + # this is hopefully just occurring in a downstream project during CMake feature tests + # we really have no choice here but to accept the flag and change to an accepted C++ standard + echo "nvcc_wrapper does not accept standard flags $1 since partial standard flags and standards after C++14 are not supported. nvcc_wrapper will use $fallback_std_flag instead. It is undefined behavior to use this flag. This should only be occurring during CMake configuration." 
+ std_flag=$fallback_std_flag + else + std_flag=$1 + fi + shared_args="$shared_args $std_flag" + ;; --std=c++17|-std=c++17) if [ -n "$std_flag" ]; then warn_std_flag From 43b0245a2e80b779b49ed0893b22262a2bdbfbe3 Mon Sep 17 00:00:00 2001 From: Alex Dutka <97711898+dutkalex@users.noreply.github.com> Date: Fri, 10 Mar 2023 19:58:35 +0100 Subject: [PATCH 324/496] Print Kokkos version at configuration time (#5979) * Update CMakeLists.txt * Drop extra whitespaces and move print statement a few lines before --------- Co-authored-by: Damien L-G --- CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 39fa5bbe96..ac0dbe70dd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -141,6 +141,7 @@ set(Kokkos_VERSION_MAJOR 4) set(Kokkos_VERSION_MINOR 0) set(Kokkos_VERSION_PATCH 99) set(Kokkos_VERSION "${Kokkos_VERSION_MAJOR}.${Kokkos_VERSION_MINOR}.${Kokkos_VERSION_PATCH}") +message(STATUS "Kokkos version: ${Kokkos_VERSION}") math(EXPR KOKKOS_VERSION "${Kokkos_VERSION_MAJOR} * 10000 + ${Kokkos_VERSION_MINOR} * 100 + ${Kokkos_VERSION_PATCH}") # mathematical expressions below are not stricly necessary but they eliminate # the rather aggravating leading 0 in the releases patch version number, and, From ee757630162a84bfec25d74cf52163275612f820 Mon Sep 17 00:00:00 2001 From: Phil Miller Date: Fri, 10 Mar 2023 19:24:35 -0800 Subject: [PATCH 325/496] #5641: Fix HIP & CUDA MDRange reduce for sizeof(value_type) < sizeof(int) (#5745) * Fix comment typo noticed in early analysis * Add test case * #5641: HIP: Fix MDRange parallel_reduce over values smaller than int * #5641 Cuda: Fix MDRange parallel_reduce over values smaller than int * Try to appease icpc's idiocy * Skip the test for OpenMPTarget backend, since it's broken * Sample bound values to test, rather than sweeping * Shrink largest bound value to avoid timeout * Report skipped in disabled CUDA extended lambda case * Fix skipping condition --- 
.../src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp | 56 ++++++++++----- core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp | 6 +- core/src/HIP/Kokkos_HIP_Parallel_MDRange.hpp | 52 ++++++++++---- core/unit_test/CMakeLists.txt | 1 + core/unit_test/TestMDRangeReduce.hpp | 68 +++++++++++++++++++ 5 files changed, 148 insertions(+), 35 deletions(-) create mode 100644 core/unit_test/TestMDRangeReduce.hpp diff --git a/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp b/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp index efd5157ff0..affa9c18a8 100644 --- a/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp +++ b/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp @@ -212,6 +212,23 @@ class ParallelReduce 4 bytes in size, indexing into shared/global memory relies + // on the block and grid dimensions to ensure that we index at the correct + // offset rather than at every 4 byte word; such that, when the join is + // performed, we have the correct data that was copied over in chunks of 4 + // bytes. + static_assert(sizeof(size_type) == 4); + using word_size_type = std::conditional_t< + sizeof(value_type) < 4, + std::conditional_t, size_type>; + // Algorithmic constraints: blockSize is a power of two AND blockDim.y == // blockDim.z == 1 @@ -219,9 +236,9 @@ class ParallelReduce + const integral_nonzero_constant word_count(m_functor_reducer.get_reducer().value_size() / - sizeof(size_type)); + sizeof(word_size_type)); { reference_type value = m_functor_reducer.get_reducer().init(reinterpret_cast( - kokkos_impl_cuda_shared_memory() + + kokkos_impl_cuda_shared_memory() + threadIdx.y * word_count.value)); // Number of blocks is bounded so that the reduction can be limited to two // passes. Each thread block is given an approximately equal amount of // work to perform. Accumulate the values for this block. The accumulation - // ordering does not match the final pass, but is arithmatically + // ordering does not match the final pass, but is arithmetically // equivalent. 
this->exec_range(value); @@ -272,15 +290,16 @@ class ParallelReduce( m_functor_reducer.get_reducer(), blockIdx.x, gridDim.x, - kokkos_impl_cuda_shared_memory(), m_scratch_space, + kokkos_impl_cuda_shared_memory(), m_scratch_space, m_scratch_flags)) { // This is the final block with the final result at the final threads' // location - size_type* const shared = kokkos_impl_cuda_shared_memory() + - (blockDim.y - 1) * word_count.value; - size_type* const global = + word_size_type* const shared = + kokkos_impl_cuda_shared_memory() + + (blockDim.y - 1) * word_count.value; + word_size_type* const global = m_result_ptr_device_accessible - ? reinterpret_cast(m_result_ptr) + ? reinterpret_cast(m_result_ptr) : (m_unified_space ? m_unified_space : m_scratch_space); if (threadIdx.y == 0) { @@ -342,13 +361,16 @@ class ParallelReduce(cuda_internal_scratch_space( + m_policy.space(), + m_functor_reducer.get_reducer().value_size() * + block_size /* block_size == max block_count */)); m_scratch_flags = cuda_internal_scratch_flags(m_policy.space(), sizeof(size_type)); - m_unified_space = cuda_internal_scratch_unified( - m_policy.space(), m_functor_reducer.get_reducer().value_size()); + m_unified_space = + reinterpret_cast(cuda_internal_scratch_unified( + m_policy.space(), m_functor_reducer.get_reducer().value_size())); // REQUIRED ( 1 , N , 1 ) const dim3 block(1, block_size, 1); diff --git a/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp b/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp index 904d1d670e..740be29677 100644 --- a/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp +++ b/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp @@ -203,7 +203,7 @@ class ParallelReduce, // Number of blocks is bounded so that the reduction can be limited to two // passes. Each thread block is given an approximately equal amount of // work to perform. Accumulate the values for this block. 
The accumulation - // ordering does not match the final pass, but is arithmatically + // ordering does not match the final pass, but is arithmetically // equivalent. const WorkRange range(m_policy, blockIdx.x, gridDim.x); @@ -463,7 +463,7 @@ class ParallelScan, Kokkos::Cuda> { // Number of blocks is bounded so that the reduction can be limited to two // passes. Each thread block is given an approximately equal amount of work // to perform. Accumulate the values for this block. The accumulation - // ordering does not match the final pass, but is arithmatically equivalent. + // ordering does not match the final pass, but is arithmetically equivalent. const WorkRange range(m_policy, blockIdx.x, gridDim.x); @@ -780,7 +780,7 @@ class ParallelScanWithTotal, // Number of blocks is bounded so that the reduction can be limited to two // passes. Each thread block is given an approximately equal amount of work // to perform. Accumulate the values for this block. The accumulation - // ordering does not match the final pass, but is arithmatically equivalent. + // ordering does not match the final pass, but is arithmetically equivalent. const WorkRange range(m_policy, blockIdx.x, gridDim.x); diff --git a/core/src/HIP/Kokkos_HIP_Parallel_MDRange.hpp b/core/src/HIP/Kokkos_HIP_Parallel_MDRange.hpp index b4423a3aca..aed177b128 100644 --- a/core/src/HIP/Kokkos_HIP_Parallel_MDRange.hpp +++ b/core/src/HIP/Kokkos_HIP_Parallel_MDRange.hpp @@ -188,6 +188,23 @@ class ParallelReduce 4 bytes in size, indexing into shared/global memory relies + // on the block and grid dimensions to ensure that we index at the correct + // offset rather than at every 4 byte word; such that, when the join is + // performed, we have the correct data that was copied over in chunks of 4 + // bytes. 
+ static_assert(sizeof(size_type) == 4); + using word_size_type = std::conditional_t< + sizeof(value_type) < 4, + std::conditional_t, size_type>; + // Algorithmic constraints: blockSize is a power of two AND blockDim.y == // blockDim.z == 1 @@ -195,7 +212,7 @@ class ParallelReduce - word_count(reducer.value_size() / sizeof(size_type)); + const integral_nonzero_constant + word_count(reducer.value_size() / sizeof(word_size_type)); { reference_type value = reducer.init(reinterpret_cast( - kokkos_impl_hip_shared_memory() + + kokkos_impl_hip_shared_memory() + threadIdx.y * word_count.value)); // Number of blocks is bounded so that the reduction can be limited to two // passes. Each thread block is given an approximately equal amount of // work to perform. Accumulate the values for this block. The accumulation - // ordering does not match the final pass, but is arithmatically + // ordering does not match the final pass, but is arithmetically // equivalent. this->exec_range(value); @@ -232,15 +250,17 @@ class ParallelReduce( reducer, blockIdx.x, gridDim.x, - kokkos_impl_hip_shared_memory(), m_scratch_space, + kokkos_impl_hip_shared_memory(), m_scratch_space, m_scratch_flags)) { // This is the final block with the final result at the final threads' // location - size_type* const shared = kokkos_impl_hip_shared_memory() + - (blockDim.y - 1) * word_count.value; - size_type* const global = m_result_ptr_device_accessible - ? reinterpret_cast(m_result_ptr) - : m_scratch_space; + word_size_type* const shared = + kokkos_impl_hip_shared_memory() + + (blockDim.y - 1) * word_count.value; + word_size_type* const global = + m_result_ptr_device_accessible + ? 
reinterpret_cast(m_result_ptr) + : m_scratch_space; if (threadIdx.y == 0) { reducer.final(reinterpret_cast(shared)); @@ -294,9 +314,11 @@ class ParallelReduce(hip_internal_scratch_space( + m_policy.space(), + reducer.value_size() * + block_size /* block_size == max block_count */)); m_scratch_flags = hip_internal_scratch_flags(m_policy.space(), sizeof(size_type)); diff --git a/core/unit_test/CMakeLists.txt b/core/unit_test/CMakeLists.txt index 83048c9329..9a0ac17b97 100644 --- a/core/unit_test/CMakeLists.txt +++ b/core/unit_test/CMakeLists.txt @@ -169,6 +169,7 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;OpenMPTarget;OpenACC;HIP;SYCL) MDRange_f MDRange_g MDRangePolicyConstructors + MDRangeReduce MDSpan MinMaxClamp NumericTraits diff --git a/core/unit_test/TestMDRangeReduce.hpp b/core/unit_test/TestMDRangeReduce.hpp new file mode 100644 index 0000000000..007fa420c3 --- /dev/null +++ b/core/unit_test/TestMDRangeReduce.hpp @@ -0,0 +1,68 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include + +#include + +namespace { + +template +void MDRangeReduceTester([[maybe_unused]] int bound, int k) { + const auto policy_MD = Kokkos::MDRangePolicy, TEST_EXECSPACE>( + {0, 0}, {bound, 2}); + + // No explicit fence() calls needed because result is in HostSpace + { + T lor_MD = 0; + Kokkos::parallel_reduce( + policy_MD, + KOKKOS_LAMBDA(const int i, const int, T& res) { res = res || i == k; }, + Kokkos::LOr(lor_MD)); + EXPECT_EQ(lor_MD, 1); + } + { + // Stick just a few true values in the Logical-OR reduction space, + // to try to make sure every value is being captured + T land_MD = 0; + Kokkos::parallel_reduce( + policy_MD, KOKKOS_LAMBDA(const int, const int, T& res) { res = 1; }, + Kokkos::LAnd(land_MD)); + EXPECT_EQ(land_MD, 1); + } +} + +TEST(TEST_CATEGORY, mdrange_parallel_reduce_primitive_types) { +#if defined(KOKKOS_ENABLE_OPENMPTARGET) + GTEST_SKIP() << "FIXME OPENMPTARGET Tests of MDRange reduce over values " + "smaller than int would fail"; +#elif defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_CUDA_LAMBDA) + GTEST_SKIP() << "Skipped ENABLE_CUDA_LAMBDA"; +#else + for (int bound : {0, 1, 7, 32, 65, 7000}) { + for (int k = 0; k < bound; ++k) { + MDRangeReduceTester(bound, k); + MDRangeReduceTester(bound, k); + MDRangeReduceTester(bound, k); + MDRangeReduceTester(bound, k); + MDRangeReduceTester(bound, k); + MDRangeReduceTester(bound, k); + } + } +#endif +} + +} // namespace From bb8a96b2b0b731857f552c315494c8a7bbd0bfdd Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Mon, 13 Mar 2023 19:05:32 +0000 Subject: [PATCH 326/496] Fix sycl.large_team_scratch_size --- core/src/SYCL/Kokkos_SYCL_Team.hpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/core/src/SYCL/Kokkos_SYCL_Team.hpp b/core/src/SYCL/Kokkos_SYCL_Team.hpp index b01d06b928..674037ed95 100644 --- a/core/src/SYCL/Kokkos_SYCL_Team.hpp +++ b/core/src/SYCL/Kokkos_SYCL_Team.hpp @@ -337,10 
+337,11 @@ class SYCLTeamMember { // Private for the driver KOKKOS_INLINE_FUNCTION - SYCLTeamMember(sycl::local_ptr shared, const int shared_begin, - const int shared_size, + SYCLTeamMember(sycl::local_ptr shared, const std::size_t shared_begin, + const std::size_t shared_size, sycl::device_ptr scratch_level_1_ptr, - const int scratch_level_1_size, const sycl::nd_item<2> item) + const std::size_t scratch_level_1_size, + const sycl::nd_item<2> item) : m_team_reduce(shared), m_team_shared(static_cast>(shared) + shared_begin, shared_size, scratch_level_1_ptr, scratch_level_1_size), From 22cc433124426057a6ce85f63ec20169d84546e3 Mon Sep 17 00:00:00 2001 From: tcclevenger Date: Mon, 13 Mar 2023 16:36:40 -0600 Subject: [PATCH 327/496] Add to HIP tests in Makefile --- core/unit_test/Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/core/unit_test/Makefile b/core/unit_test/Makefile index beafeccb78..33a84b61f9 100644 --- a/core/unit_test/Makefile +++ b/core/unit_test/Makefile @@ -290,6 +290,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_HIP), 1) OBJ_HIP += TestHIP_MDRange_a.o TestHIP_MDRange_b.o TestHIP_MDRange_c.o TestHIP_MDRange_d.o TestHIP_MDRange_e.o OBJ_HIP += TestHIP_Spaces.o OBJ_HIP += TestHIP_Memory_Requirements.o + OBJ_HIP += TestHIP_ParallelScanRangePolicy.o OBJ_HIP += TestHIPHostPinned_ViewAPI_a.o TestHIPHostPinned_ViewAPI_b.o TestHIPHostPinned_ViewAPI_c.o TestHIPHostPinned_ViewAPI_d.o TestHIPHostPinned_ViewAPI_e.o OBJ_HIP += TestHIPHostPinned_ViewCopy_a.o TestHIPHostPinned_ViewCopy_b.o OBJ_HIP += TestHIPHostPinned_ViewMapping_a.o TestHIPHostPinned_ViewMapping_b.o TestHIPHostPinned_ViewMapping_subview.o From 42991f1047a7a6832d4c1b6aebc899bf4313d427 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Tue, 14 Mar 2023 08:02:25 -0700 Subject: [PATCH 328/496] Bit manipulation: implement `byteswap` (#5967) * Implement byteswap * Fix warnings * Fix warning for real this time * Alternate implementation for byteswap_fallback (more readable?) 
Co-authored-by: Daniel Arndt * Disable one test case for CUDA_VERSION < 11.3.0 * Try something else * Workaround for the nvcc bug comparing int64_t * Changed use of literal suffixes to use ?INT*_C macros for fixed size types * Workaround for nvcc compiler bug * Refactor so the workaround to avoid need for bogus NVCC 11.2 warning workaround Co-Authored-By: Nicolas Morales --------- Co-authored-by: Daniel Arndt Co-authored-by: Christian Trott Co-authored-by: Nevin Liber Co-authored-by: Nicolas Morales --- core/src/Kokkos_BitManipulation.hpp | 76 +++++++++++++++++++ core/unit_test/TestBitManipulation.cpp | 54 +++++++++++++ .../unit_test/TestBitManipulationBuiltins.hpp | 71 +++++++++++++++++ 3 files changed, 201 insertions(+) diff --git a/core/src/Kokkos_BitManipulation.hpp b/core/src/Kokkos_BitManipulation.hpp index b54be3b301..caf2e93d51 100644 --- a/core/src/Kokkos_BitManipulation.hpp +++ b/core/src/Kokkos_BitManipulation.hpp @@ -19,9 +19,41 @@ #include #include +#include // CHAR_BIT +#include namespace Kokkos::Impl { +template +KOKKOS_FUNCTION constexpr T byteswap_fallback(T x) { + if constexpr (sizeof(T) > 1) { + using U = std::make_unsigned_t; + + size_t shift = CHAR_BIT * (sizeof(T) - 1); + + U lo_mask = static_cast(~0); + U hi_mask = lo_mask << shift; + + U val = x; + + for (size_t i = 0; i < sizeof(T) / 2; ++i) { + U lo_val = val & lo_mask; + U hi_val = val & hi_mask; + + val = (val & ~lo_mask) | (hi_val >> shift); + val = (val & ~hi_mask) | (lo_val << shift); + + lo_mask <<= CHAR_BIT; + hi_mask >>= CHAR_BIT; + + shift -= 2 * CHAR_BIT; + } + return val; + } + // sizeof(T) == 1 + return x; +} + template KOKKOS_FUNCTION constexpr int countl_zero_fallback(T x) { // From Hacker's Delight (2nd edition) section 5-3 @@ -66,6 +98,14 @@ inline constexpr bool is_standard_unsigned_integer_type_v = namespace Kokkos { +// +template +KOKKOS_FUNCTION constexpr std::enable_if_t, T> byteswap( + T value) noexcept { + return Impl::byteswap_fallback(value); +} +// + // template 
KOKKOS_FUNCTION constexpr std::enable_if_t< @@ -187,6 +227,35 @@ namespace Kokkos::Impl { #define KOKKOS_IMPL_USE_GCC_BUILT_IN_FUNCTIONS #endif +template +KOKKOS_IMPL_DEVICE_FUNCTION T byteswap_builtin_device(T x) noexcept { + return byteswap_fallback(x); +} + +template +KOKKOS_IMPL_HOST_FUNCTION T byteswap_builtin_host(T x) noexcept { +#ifdef KOKKOS_IMPL_USE_GCC_BUILT_IN_FUNCTIONS + if constexpr (sizeof(T) == 1) { + return x; + } else if constexpr (sizeof(T) == 2) { + return __builtin_bswap16(x); + } else if constexpr (sizeof(T) == 4) { + return __builtin_bswap32(x); + } else if constexpr (sizeof(T) == 8) { + return __builtin_bswap64(x); + } else if constexpr (sizeof(T) == 16) { +#if __has_builtin(__builtin_bswap128) + return __builtin_bswap128(x); +#else + return (__builtin_bswap64(x >> 64) | + (static_cast(__builtin_bswap64(x)) << 64)); +#endif + } +#endif + + return byteswap_fallback(x); +} + template KOKKOS_IMPL_DEVICE_FUNCTION std::enable_if_t, int> @@ -308,6 +377,13 @@ KOKKOS_IMPL_HOST_FUNCTION namespace Kokkos::Experimental { +template +KOKKOS_FUNCTION std::enable_if_t, T> byteswap_builtin( + T x) noexcept { + KOKKOS_IF_ON_DEVICE((return ::Kokkos::Impl::byteswap_builtin_device(x);)) + KOKKOS_IF_ON_HOST((return ::Kokkos::Impl::byteswap_builtin_host(x);)) +} + template KOKKOS_FUNCTION std::enable_if_t< ::Kokkos::Impl::is_standard_unsigned_integer_type_v, int> diff --git a/core/unit_test/TestBitManipulation.cpp b/core/unit_test/TestBitManipulation.cpp index 13987ecf66..407596c2bb 100644 --- a/core/unit_test/TestBitManipulation.cpp +++ b/core/unit_test/TestBitManipulation.cpp @@ -429,4 +429,58 @@ constexpr auto test_bit_width(UInt x) -> decltype(Kokkos::bit_width(x)) { TEST_BIT_MANIPULATION(bit_width); // +// +template +constexpr auto test_byteswap(T x) -> decltype(Kokkos::byteswap(x)) { + using Kokkos::byteswap; + + static_assert(noexcept(byteswap(x))); + static_assert(std::is_same_v); + + return true; +} + +constexpr X test_byteswap(...) 
{ return {}; } + +static_assert(test_byteswap((void*)0).did_not_match()); // NOLINT +static_assert(test_byteswap((float)0).did_not_match()); +constexpr char c2[2] = {}; +static_assert(test_byteswap(c2).did_not_match()); +static_assert(test_byteswap((char)0)); +static_assert(test_byteswap((short)0)); +static_assert(test_byteswap((int)0)); +static_assert(test_byteswap((long)0)); +static_assert(test_byteswap((long long)0)); +static_assert(test_byteswap((unsigned char)0)); +static_assert(test_byteswap((unsigned short)0)); +static_assert(test_byteswap((unsigned int)0)); +static_assert(test_byteswap((unsigned long)0)); +static_assert(test_byteswap((unsigned long long)0)); + +constexpr bool test_byteswap2() { + using Kokkos::byteswap; + + static_assert(byteswap(INT8_C(0x12)) == INT8_C(0x12)); + static_assert(byteswap(INT16_C(0x1234)) == INT16_C(0x3412)); + static_assert(byteswap(INT32_C(0x12345678)) == INT32_C(0x78563412)); + + // These static_casts are a workaround for an nvcc 11.2 compiler bug + static_assert( + static_cast(byteswap(INT64_C(0x123456789abcdef0))) == + static_cast(INT64_C(0xf0debc9a78563412))); + + static_assert(byteswap(UINT8_C(0x21)) == UINT8_C(0x21)); + static_assert(byteswap(UINT16_C(0x4321)) == UINT16_C(0x2143)); + static_assert(byteswap(UINT32_C(0x87654321)) == + UINT32_C(0x21436587)); + static_assert(byteswap(UINT64_C(0xfedcba9876543210)) == + UINT64_C(0x1032547698badcfe)); + static_assert(byteswap(UINT32_C(0xdeadbeef)) == + UINT32_C(0xefbeadde)); + + return true; +} +static_assert(test_byteswap2()); +// + #undef TEST_BIT_MANIPULATION diff --git a/core/unit_test/TestBitManipulationBuiltins.hpp b/core/unit_test/TestBitManipulationBuiltins.hpp index 9ab4e6e15f..bc2122e0ab 100644 --- a/core/unit_test/TestBitManipulationBuiltins.hpp +++ b/core/unit_test/TestBitManipulationBuiltins.hpp @@ -28,6 +28,11 @@ DEFINE_TYPE_NAME(unsigned short) DEFINE_TYPE_NAME(unsigned int) DEFINE_TYPE_NAME(unsigned long) DEFINE_TYPE_NAME(unsigned long long) 
+DEFINE_TYPE_NAME(char) +DEFINE_TYPE_NAME(short) +DEFINE_TYPE_NAME(int) +DEFINE_TYPE_NAME(long) +DEFINE_TYPE_NAME(long long) #undef DEFINE_TYPE_NAME // clang-format on @@ -598,3 +603,69 @@ TEST(TEST_CATEGORY, bit_manip_rotr) { } #undef TEST_BIT_ROTATE_FUNCTION + +template +struct TestByteswapFunction { + TestByteswapFunction() { run(); } + void run() const { + int errors = 0; + Kokkos::parallel_reduce(Kokkos::RangePolicy(0, 1), *this, errors); + ASSERT_EQ(errors, 0) << "Failed check no error for byteswap(" + << type_helper::name() << ")"; + } + KOKKOS_FUNCTION void operator()(int, int& e) const { + T value; + T expected; + switch (sizeof(T)) { + case 1: + value = static_cast(0x12); + expected = static_cast(0x12); + break; + case 2: + value = static_cast(0x1234); + expected = static_cast(0x3412); + break; + case 4: + value = static_cast(0x60AF8503); + expected = static_cast(0x0385AF60); + break; + case 8: + value = static_cast(0xABCDFE9477936406); + expected = static_cast(0x0664937794FECDAB); + break; + default: Kokkos::abort("logic error"); + } + using Kokkos::Experimental::byteswap_builtin; + if (byteswap_builtin(value) != expected) { + ++e; + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "value at %llx which is %llx was expected to be %llx\n", + (unsigned long long)value, + (unsigned long long)byteswap_builtin(value), + (unsigned long long)expected); + } + } +}; + +template +void test_bit_manip_byteswap() { + using Kokkos::rotr; + using Kokkos::Experimental::byteswap_builtin; + static_assert(noexcept(byteswap_builtin(Integral()))); + static_assert( + std::is_same_v); + TestByteswapFunction(); +} + +TEST(TEST_CATEGORY, bit_manip_byeswap) { + test_bit_manip_byteswap(); + test_bit_manip_byteswap(); + test_bit_manip_byteswap(); + test_bit_manip_byteswap(); + test_bit_manip_byteswap(); + test_bit_manip_byteswap(); + test_bit_manip_byteswap(); + test_bit_manip_byteswap(); + test_bit_manip_byteswap(); + test_bit_manip_byteswap(); +} From 74e2fe90cf136029ac11bd68e7acb164e3b951cf Mon 
Sep 17 00:00:00 2001 From: Evan Harvey Date: Tue, 21 Feb 2023 14:32:57 -0700 Subject: [PATCH 329/496] UnorderedMap: Ensure size() working in case of copies --- containers/src/Kokkos_UnorderedMap.hpp | 12 ++++++------ containers/unit_tests/TestUnorderedMap.hpp | 5 +++++ 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/containers/src/Kokkos_UnorderedMap.hpp b/containers/src/Kokkos_UnorderedMap.hpp index 058b6626c4..3b9503538a 100644 --- a/containers/src/Kokkos_UnorderedMap.hpp +++ b/containers/src/Kokkos_UnorderedMap.hpp @@ -275,7 +275,7 @@ class UnorderedMap { : m_bounded_insert(true), m_hasher(hasher), m_equal_to(equal_to), - m_size(), + m_size("m_size"), m_available_indexes(calculate_capacity(capacity_hint)), m_hash_lists(view_alloc(WithoutInitializing, "UnorderedMap hash list"), Impl::find_hash_size(capacity())), @@ -315,7 +315,7 @@ class UnorderedMap { Kokkos::deep_copy(m_keys, tmp); } Kokkos::deep_copy(m_scalars, 0); - m_size = 0; + m_size() = 0; } KOKKOS_INLINE_FUNCTION constexpr bool is_allocated() const { @@ -369,10 +369,10 @@ class UnorderedMap { size_type size() const { if (capacity() == 0u) return 0u; if (modified()) { - m_size = m_available_indexes.count(); + m_size() = m_available_indexes.count(); reset_flag(modified_idx); } - return m_size; + return m_size(); } /// \brief The current number of failed insert() calls. 
@@ -725,7 +725,7 @@ class UnorderedMap { tmp.m_bounded_insert = src.m_bounded_insert; tmp.m_hasher = src.m_hasher; tmp.m_equal_to = src.m_equal_to; - tmp.m_size = src.size(); + tmp.m_size = src.m_size; tmp.m_available_indexes = bitset_type(src.capacity()); tmp.m_hash_lists = size_type_view( view_alloc(WithoutInitializing, "UnorderedMap hash list"), @@ -818,7 +818,7 @@ class UnorderedMap { bool m_bounded_insert; hasher_type m_hasher; equal_to_type m_equal_to; - mutable size_type m_size; + mutable Kokkos::View m_size; bitset_type m_available_indexes; size_type_view m_hash_lists; size_type_view m_next_index; diff --git a/containers/unit_tests/TestUnorderedMap.hpp b/containers/unit_tests/TestUnorderedMap.hpp index 989779b53a..d077cfefa6 100644 --- a/containers/unit_tests/TestUnorderedMap.hpp +++ b/containers/unit_tests/TestUnorderedMap.hpp @@ -54,7 +54,12 @@ struct TestInsert { } } while (rehash_on_fail && failed_count > 0u); + // Trigger the m_size mutable bug. + typename map_type::HostMirror map_h; execution_space().fence(); + Kokkos::deep_copy(map_h, map); + execution_space().fence(); + ASSERT_EQ(map_h.size(), map.size()); } KOKKOS_INLINE_FUNCTION From 6a8e923e5c6fbe301b062818dd9516bed4db79e5 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Tue, 14 Mar 2023 18:23:58 -0400 Subject: [PATCH 330/496] Use (non-mutable) std::shared_ptr instead --- containers/src/Kokkos_UnorderedMap.hpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/containers/src/Kokkos_UnorderedMap.hpp b/containers/src/Kokkos_UnorderedMap.hpp index 3b9503538a..7a89b189e8 100644 --- a/containers/src/Kokkos_UnorderedMap.hpp +++ b/containers/src/Kokkos_UnorderedMap.hpp @@ -275,7 +275,7 @@ class UnorderedMap { : m_bounded_insert(true), m_hasher(hasher), m_equal_to(equal_to), - m_size("m_size"), + m_size(std::make_shared()), m_available_indexes(calculate_capacity(capacity_hint)), m_hash_lists(view_alloc(WithoutInitializing, "UnorderedMap hash list"), 
Impl::find_hash_size(capacity())), @@ -315,7 +315,7 @@ class UnorderedMap { Kokkos::deep_copy(m_keys, tmp); } Kokkos::deep_copy(m_scalars, 0); - m_size() = 0; + *m_size = 0; } KOKKOS_INLINE_FUNCTION constexpr bool is_allocated() const { @@ -369,10 +369,10 @@ class UnorderedMap { size_type size() const { if (capacity() == 0u) return 0u; if (modified()) { - m_size() = m_available_indexes.count(); + *m_size = m_available_indexes.count(); reset_flag(modified_idx); } - return m_size(); + return *m_size; } /// \brief The current number of failed insert() calls. @@ -818,7 +818,7 @@ class UnorderedMap { bool m_bounded_insert; hasher_type m_hasher; equal_to_type m_equal_to; - mutable Kokkos::View m_size; + std::shared_ptr m_size; bitset_type m_available_indexes; size_type_view m_hash_lists; size_type_view m_next_index; From 3cb200c11c3db010887f99d5354ac44c67d77e71 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Wed, 15 Mar 2023 08:05:45 -0400 Subject: [PATCH 331/496] Add another test case --- containers/unit_tests/TestUnorderedMap.hpp | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/containers/unit_tests/TestUnorderedMap.hpp b/containers/unit_tests/TestUnorderedMap.hpp index d077cfefa6..bcaf6d77c4 100644 --- a/containers/unit_tests/TestUnorderedMap.hpp +++ b/containers/unit_tests/TestUnorderedMap.hpp @@ -333,6 +333,25 @@ TEST(TEST_CATEGORY, UnorderedMap_clear_zero_size) { ASSERT_EQ(0u, m.size()); } +TEST(TEST_CATEGORY, UnorderedMap_consistent_size) { + using Map = + Kokkos::UnorderedMap; + + Map m(11); + m.insert(7); + ; + ASSERT_EQ(1u, m.size()); + + { + auto m2 = m; + m2.insert(2); + // This line triggers modified flags to be cleared in both m and m2 + [[maybe_unused]] auto sz = m2.size(); + } + + ASSERT_EQ(2u, m.size()); +} + } // namespace Test #endif // KOKKOS_TEST_UNORDERED_MAP_HPP From bb5ef8fdc3a4b8d53e4b09370a53b907bf2bcd3e Mon Sep 17 00:00:00 2001 From: "romin.tomasetti" Date: Wed, 15 Mar 2023 15:06:47 +0100 Subject: [PATCH 332/496] 
graph(hip): enable test --- core/unit_test/CMakeLists.txt | 6 ++++++ .../category_files/TestHIP_Category.hpp | 1 + core/unit_test/hip/TestHIP_Graph.cpp | 18 ++++++++++++++++++ 3 files changed, 25 insertions(+) create mode 100644 core/unit_test/hip/TestHIP_Graph.cpp diff --git a/core/unit_test/CMakeLists.txt b/core/unit_test/CMakeLists.txt index 9ec5a5bb58..ef928003bc 100644 --- a/core/unit_test/CMakeLists.txt +++ b/core/unit_test/CMakeLists.txt @@ -729,6 +729,12 @@ if(Kokkos_ENABLE_HIP) UnitTestMain.cpp hip/TestHIP_InterOp_Streams.cpp ) + KOKKOS_ADD_EXECUTABLE_AND_TEST( + UnitTest_HIPGraph + SOURCES + UnitTestMainInit.cpp + hip/TestHIP_Graph.cpp + ) endif() if(Kokkos_ENABLE_SYCL) diff --git a/core/unit_test/category_files/TestHIP_Category.hpp b/core/unit_test/category_files/TestHIP_Category.hpp index adaed3281a..6086f7e05b 100644 --- a/core/unit_test/category_files/TestHIP_Category.hpp +++ b/core/unit_test/category_files/TestHIP_Category.hpp @@ -23,5 +23,6 @@ #define TEST_CATEGORY_NUMBER 6 #define TEST_CATEGORY_DEATH hip_DeathTest #define TEST_EXECSPACE Kokkos::HIP +#define TEST_CATEGORY_FIXTURE(name) hip_##name #endif diff --git a/core/unit_test/hip/TestHIP_Graph.cpp b/core/unit_test/hip/TestHIP_Graph.cpp new file mode 100644 index 0000000000..405cb76c64 --- /dev/null +++ b/core/unit_test/hip/TestHIP_Graph.cpp @@ -0,0 +1,18 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include +#include From be14872fe3bcf0ceb26c7486d1afccb03748b240 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Thu, 29 Sep 2022 14:28:49 +0000 Subject: [PATCH 333/496] Remove workaround for submit_barrier not being enqueued properly --- core/src/SYCL/Kokkos_SYCL_Space.cpp | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/core/src/SYCL/Kokkos_SYCL_Space.cpp b/core/src/SYCL/Kokkos_SYCL_Space.cpp index 50ee3a3e11..62af720b7c 100644 --- a/core/src/SYCL/Kokkos_SYCL_Space.cpp +++ b/core/src/SYCL/Kokkos_SYCL_Space.cpp @@ -39,12 +39,8 @@ void DeepCopySYCL(void* dst, const void* src, size_t n) { void DeepCopyAsyncSYCL(const Kokkos::Experimental::SYCL& instance, void* dst, const void* src, size_t n) { - // FIXME_SYCL memcpy doesn't respect submit_barrier which means that we need - // to actually fence the execution space to make sure the memcpy is properly - // enqueued when using out-of-order queues. sycl::queue& q = *instance.impl_internal_space_instance()->m_queue; - q.wait_and_throw(); - auto event = q.memcpy(dst, src, n); + auto event = q.memcpy(dst, src, n); q.ext_oneapi_submit_barrier(std::vector{event}); } From 4fde4b03512f90d6d0f82a1bf79426c5364714f0 Mon Sep 17 00:00:00 2001 From: Junchao Zhang Date: Thu, 16 Mar 2023 16:47:58 -0500 Subject: [PATCH 334/496] Support --compiler-options in nvcc_wrapper According to 'nvcc --help', it is just an alias to -Xcompiler. nvcc --help ... --compiler-options ,... (-Xcompiler) Specify options directly to the compiler/preprocessor. 
--- bin/nvcc_wrapper | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/nvcc_wrapper b/bin/nvcc_wrapper index 0c55651460..1397148141 100755 --- a/bin/nvcc_wrapper +++ b/bin/nvcc_wrapper @@ -407,7 +407,7 @@ do -Woverloaded-virtual) ;; #strip -Xcompiler because we add it - -Xcompiler) + -Xcompiler|--compiler-options) if [[ $2 != "-o" ]]; then if [ $first_xcompiler_arg -eq 1 ]; then xcompiler_args="$2" From d5244e1b32b5bc11c79c679f653da4f10fe823ed Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Fri, 17 Mar 2023 13:17:55 -0400 Subject: [PATCH 335/496] Cleanup OpenMPTaget ParallelReduce --- ...kkos_OpenMPTarget_ParallelReduce_Range.hpp | 8 +-- ...okkos_OpenMPTarget_ParallelReduce_Team.hpp | 8 +-- ...Kokkos_OpenMPTarget_ParallelScan_Range.hpp | 2 +- .../Kokkos_OpenMPTarget_Parallel_Common.hpp | 66 +++++-------------- 4 files changed, 26 insertions(+), 58 deletions(-) diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Range.hpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Range.hpp index e12240208e..dbdb2826c9 100644 --- a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Range.hpp +++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Range.hpp @@ -38,12 +38,12 @@ class ParallelReduce, using pointer_type = typename ReducerType::pointer_type; using reference_type = typename ReducerType::reference_type; - static constexpr int FunctorHasJoin = + static constexpr bool FunctorHasJoin = Impl::FunctorAnalysis::has_join_member_function; - static constexpr int UseReducer = + FunctorType>::Reducer::has_join_member_function(); + static constexpr bool UseReducer = !std::is_same_v; - static constexpr int IsArray = std::is_pointer::value; + static constexpr bool IsArray = std::is_pointer_v; using ParReduceSpecialize = ParallelReduceSpecialize::has_join_member_function; - static constexpr int UseReducer = + FunctorType>::Reducer::has_join_member_function(); + static constexpr bool UseReducer = !std::is_same_v; - 
static constexpr int IsArray = std::is_pointer::value; + static constexpr bool IsArray = std::is_pointer_v; using ParReduceSpecialize = ParallelReduceSpecialize, local_offset_value = element_values(team_id, i - 1); // FIXME_OPENMPTARGET We seem to access memory illegaly on AMD GPUs #ifdef KOKKOS_ARCH_VEGA - if constexpr (Analysis::has_join_member_function) { + if constexpr (Analysis::Reducer::has_join_member_function()) { if constexpr (std::is_void_v) a_functor.join(local_offset_value, offset_value); else diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_Common.hpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_Common.hpp index 49e7b33264..2ce25f9ffd 100644 --- a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_Common.hpp +++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_Common.hpp @@ -51,8 +51,8 @@ struct ParallelReduceSpecialize { PointerType /*result_ptr*/) { constexpr int FunctorHasJoin = Impl::FunctorAnalysis::has_join_member_function; - constexpr int UseReducerType = is_reducer::value; + FunctorType>::Reducer::has_join_member_function(); + constexpr int UseReducerType = is_reducer_v; std::stringstream error_message; error_message << "Error: Invalid Specialization " << FunctorHasJoin << ' ' @@ -198,7 +198,6 @@ struct ParallelReduceSpecialize, using FunctorAnalysis = Impl::FunctorAnalysis; - constexpr int HasInit = FunctorAnalysis::has_init_member_function; // Initialize the result pointer. @@ -220,27 +219,16 @@ struct ParallelReduceSpecialize, ValueType* scratch_ptr = static_cast(OpenMPTargetExec::get_scratch_ptr()); -#pragma omp target map(to : f) is_device_ptr(scratch_ptr) - { - typename FunctorAnalysis::Reducer final_reducer(f); - // Enter this loop if the functor has an `init` - if constexpr (HasInit) { - // The `init` routine needs to be called on the device since it might - // need device members. 
- final_reducer.init(scratch_ptr); - final_reducer.final(scratch_ptr); - } else { - for (int i = 0; i < value_count; ++i) { - static_cast(scratch_ptr)[i] = ValueType(); - } + typename FunctorAnalysis::Reducer final_reducer(f); + if (end <= begin) { +#pragma omp target map(to : final_reducer) is_device_ptr(scratch_ptr) + { + // If there is no work to be done, copy back the initialized values and + // exit. + final_reducer.init(scratch_ptr); final_reducer.final(scratch_ptr); } - } - - if (end <= begin) { - // If there is no work to be done, copy back the initialized values and - // exit. if (!ptr_on_device) KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy( ptr, scratch_ptr, value_count * sizeof(ValueType), 0, 0, @@ -255,9 +243,8 @@ struct ParallelReduceSpecialize, #pragma omp target teams num_teams(max_teams) thread_limit(max_team_threads) \ map(to \ - : f) is_device_ptr(scratch_ptr) + : final_reducer) is_device_ptr(scratch_ptr) { - typename FunctorAnalysis::Reducer final_reducer(f); #pragma omp parallel { const int team_num = omp_get_team_num(); @@ -304,7 +291,6 @@ struct ParallelReduceSpecialize, is_device_ptr(scratch_ptr) for (int i = 0; i < max_teams - tree_neighbor_offset; i += 2 * tree_neighbor_offset) { - typename FunctorAnalysis::Reducer final_reducer(f); ValueType* team_scratch = scratch_ptr; const int team_offset = max_team_threads * value_count; final_reducer.join( @@ -538,7 +524,6 @@ struct ParallelReduceSpecialize, using FunctorAnalysis = Impl::FunctorAnalysis; - constexpr int HasInit = FunctorAnalysis::has_init_member_function; const int league_size = p.league_size(); const int team_size = p.team_size(); @@ -568,32 +553,17 @@ struct ParallelReduceSpecialize, OpenMPTargetExec::resize_scratch(1, 0, value_count * sizeof(ValueType), league_size); void* scratch_ptr = OpenMPTargetExec::get_scratch_ptr(); + typename FunctorAnalysis::Reducer final_reducer(f); - // Enter this loop if the functor has an `init` - if constexpr (HasInit) { - // The `init` routine needs 
to be called on the device since it might need - // device members. -#pragma omp target map(to : f) is_device_ptr(scratch_ptr) + if (end <= begin) { +// If there is no work to be done, copy back the initialized values and +// exit. +#pragma omp target map(to : final_reducer) is_device_ptr(scratch_ptr) { - typename FunctorAnalysis::Reducer final_reducer(f); final_reducer.init(scratch_ptr); final_reducer.final(scratch_ptr); } - } else { -#pragma omp target map(to : f) is_device_ptr(scratch_ptr) - { - for (int i = 0; i < value_count; ++i) { - static_cast(scratch_ptr)[i] = ValueType(); - } - - typename FunctorAnalysis::Reducer final_reducer(f); - final_reducer.final(static_cast(scratch_ptr)); - } - } - if (end <= begin) { - // If there is no work to be done, copy back the initialized values and - // exit. if (!ptr_on_device) KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy( ptr, scratch_ptr, value_count * sizeof(ValueType), 0, 0, @@ -616,7 +586,6 @@ struct ParallelReduceSpecialize, const int num_teams = omp_get_num_teams(); ValueType* team_scratch = static_cast(scratch_ptr) + team_num * team_size * value_count; - typename FunctorAnalysis::Reducer final_reducer(f); ReferenceType result = final_reducer.init(&team_scratch[0]); for (int league_id = team_num; league_id < league_size; @@ -635,14 +604,13 @@ struct ParallelReduceSpecialize, int tree_neighbor_offset = 1; do { -#pragma omp target teams distribute parallel for simd map(to \ - : f) \ +#pragma omp target teams distribute parallel for simd map(to \ + : final_reducer) \ is_device_ptr(scratch_ptr) for (int i = 0; i < nteams - tree_neighbor_offset; i += 2 * tree_neighbor_offset) { ValueType* team_scratch = static_cast(scratch_ptr); const int team_offset = team_size * value_count; - typename FunctorAnalysis::Reducer final_reducer(f); final_reducer.join( &team_scratch[i * team_offset], &team_scratch[(i + tree_neighbor_offset) * team_offset]); From 48640d72660751354bbe8cba593a8d23e00e8713 Mon Sep 17 00:00:00 2001 From: Daniel 
Arndt Date: Fri, 17 Mar 2023 14:45:29 -0400 Subject: [PATCH 336/496] Fix compiling OpenMPTarget for AMD GPUs --- .../OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Range.hpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Range.hpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Range.hpp index ee8ed3fddb..dd5aa0878e 100644 --- a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Range.hpp +++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Range.hpp @@ -145,9 +145,11 @@ class ParallelScan, #ifdef KOKKOS_ARCH_VEGA if constexpr (Analysis::Reducer::has_join_member_function()) { if constexpr (std::is_void_v) - a_functor.join(local_offset_value, offset_value); + a_functor_reducer.get_functor().join(local_offset_value, + offset_value); else - a_functor.join(WorkTag{}, local_offset_value, offset_value); + a_functor_reducer.get_functor().join( + WorkTag{}, local_offset_value, offset_value); } else local_offset_value += offset_value; #else From 762e3ce32fa4ebb993fb704a43ec7dcbb3c3c43b Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Fri, 17 Mar 2023 17:08:15 -0600 Subject: [PATCH 337/496] Desul atomics: Fix NVCC warning integer conversion resulted in a change of sign --- tpls/desul/include/desul/atomics/Fetch_Op_CUDA.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tpls/desul/include/desul/atomics/Fetch_Op_CUDA.hpp b/tpls/desul/include/desul/atomics/Fetch_Op_CUDA.hpp index 5c662bfc58..69ed8bcb9f 100644 --- a/tpls/desul/include/desul/atomics/Fetch_Op_CUDA.hpp +++ b/tpls/desul/include/desul/atomics/Fetch_Op_CUDA.hpp @@ -63,7 +63,7 @@ inline __device__ unsigned long long device_atomic_fetch_inc(unsigned long long* inline __device__ int device_atomic_fetch_dec( int* ptr, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicSub(ptr, 1 ); } inline __device__ unsigned int device_atomic_fetch_dec( unsigned int* ptr, MemoryOrderRelaxed, MemoryScopeDevice) { 
return atomicSub(ptr, 1u ); } -inline __device__ unsigned long long device_atomic_fetch_dec(unsigned long long* ptr, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicAdd(ptr, -1 ); } +inline __device__ unsigned long long device_atomic_fetch_dec(unsigned long long* ptr, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicAdd(ptr, -1ull);} inline __device__ unsigned int device_atomic_fetch_inc_mod( unsigned int* ptr, unsigned int val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicInc(ptr, val); } inline __device__ unsigned int device_atomic_fetch_dec_mod( unsigned int* ptr, unsigned int val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicDec(ptr, val); } From 106a4a3633937f854e5304df55ebc950a0c57617 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Fri, 17 Mar 2023 17:53:09 -0600 Subject: [PATCH 338/496] Fixup NVIDIA GPU arch must be defined potentially for other backends as well --- .../Kokkos_Cuda_NvidiaGpuArchitectures.hpp | 58 +++++++++++++++++++ core/src/Kokkos_Macros.hpp | 1 + core/src/setup/Kokkos_Setup_Cuda.hpp | 36 ------------ 3 files changed, 59 insertions(+), 36 deletions(-) create mode 100644 core/src/Cuda/Kokkos_Cuda_NvidiaGpuArchitectures.hpp diff --git a/core/src/Cuda/Kokkos_Cuda_NvidiaGpuArchitectures.hpp b/core/src/Cuda/Kokkos_Cuda_NvidiaGpuArchitectures.hpp new file mode 100644 index 0000000000..956b6dffea --- /dev/null +++ b/core/src/Cuda/Kokkos_Cuda_NvidiaGpuArchitectures.hpp @@ -0,0 +1,58 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_CUDA_NVIDIA_GPU_ARCHITECTURES_HPP +#define KOKKOS_CUDA_NVIDIA_GPU_ARCHITECTURES_HPP + +#if defined(KOKKOS_ARCH_KEPLER30) +#define KOKKOS_IMPL_ARCH_NVIDIA_GPU 30 +#elif defined(KOKKOS_ARCH_KEPLER32) +#define KOKKOS_IMPL_ARCH_NVIDIA_GPU 32 +#elif defined(KOKKOS_ARCH_KEPLER35) +#define KOKKOS_IMPL_ARCH_NVIDIA_GPU 35 +#elif defined(KOKKOS_ARCH_KEPLER37) +#define KOKKOS_IMPL_ARCH_NVIDIA_GPU 37 +#elif defined(KOKKOS_ARCH_MAXWELL50) +#define KOKKOS_IMPL_ARCH_NVIDIA_GPU 50 +#elif defined(KOKKOS_ARCH_MAXWELL52) +#define KOKKOS_IMPL_ARCH_NVIDIA_GPU 52 +#elif defined(KOKKOS_ARCH_MAXWELL53) +#define KOKKOS_IMPL_ARCH_NVIDIA_GPU 53 +#elif defined(KOKKOS_ARCH_PASCAL60) +#define KOKKOS_IMPL_ARCH_NVIDIA_GPU 60 +#elif defined(KOKKOS_ARCH_PASCAL61) +#define KOKKOS_IMPL_ARCH_NVIDIA_GPU 61 +#elif defined(KOKKOS_ARCH_VOLTA70) +#define KOKKOS_IMPL_ARCH_NVIDIA_GPU 70 +#elif defined(KOKKOS_ARCH_VOLTA72) +#define KOKKOS_IMPL_ARCH_NVIDIA_GPU 72 +#elif defined(KOKKOS_ARCH_TURING75) +#define KOKKOS_IMPL_ARCH_NVIDIA_GPU 75 +#elif defined(KOKKOS_ARCH_AMPERE80) +#define KOKKOS_IMPL_ARCH_NVIDIA_GPU 80 +#elif defined(KOKKOS_ARCH_AMPERE86) +#define KOKKOS_IMPL_ARCH_NVIDIA_GPU 86 +#elif defined(KOKKOS_ARCH_ADA89) +#define KOKKOS_IMPL_ARCH_NVIDIA_GPU 89 +#elif defined(KOKKOS_ARCH_HOPPER90) +#define KOKKOS_IMPL_ARCH_NVIDIA_GPU 90 +#elif defined(KOKKOS_ENABLE_CUDA) +// do not raise an error on other backends that may run on NVIDIA GPUs such as +// OpenACC, OpenMPTarget, or SYCL +#error NVIDIA GPU arch not recognized +#endif + +#endif diff --git a/core/src/Kokkos_Macros.hpp b/core/src/Kokkos_Macros.hpp index a884c037b3..c672de24b0 100644 --- a/core/src/Kokkos_Macros.hpp +++ b/core/src/Kokkos_Macros.hpp @@ -55,6 +55,7 @@ #ifndef KOKKOS_DONT_INCLUDE_CORE_CONFIG_H #include +#include #endif //---------------------------------------------------------------------------- diff --git a/core/src/setup/Kokkos_Setup_Cuda.hpp 
b/core/src/setup/Kokkos_Setup_Cuda.hpp index 0763b5b31a..c57f690ae1 100644 --- a/core/src/setup/Kokkos_Setup_Cuda.hpp +++ b/core/src/setup/Kokkos_Setup_Cuda.hpp @@ -69,40 +69,4 @@ #define KOKKOS_IMPL_HOST_FUNCTION __host__ #define KOKKOS_IMPL_DEVICE_FUNCTION __device__ -#if defined(KOKKOS_ARCH_KEPLER30) -#define KOKKOS_IMPL_ARCH_NVIDIA_GPU 30 -#elif defined(KOKKOS_ARCH_KEPLER32) -#define KOKKOS_IMPL_ARCH_NVIDIA_GPU 32 -#elif defined(KOKKOS_ARCH_KEPLER35) -#define KOKKOS_IMPL_ARCH_NVIDIA_GPU 35 -#elif defined(KOKKOS_ARCH_KEPLER37) -#define KOKKOS_IMPL_ARCH_NVIDIA_GPU 37 -#elif defined(KOKKOS_ARCH_MAXWELL50) -#define KOKKOS_IMPL_ARCH_NVIDIA_GPU 50 -#elif defined(KOKKOS_ARCH_MAXWELL52) -#define KOKKOS_IMPL_ARCH_NVIDIA_GPU 52 -#elif defined(KOKKOS_ARCH_MAXWELL53) -#define KOKKOS_IMPL_ARCH_NVIDIA_GPU 53 -#elif defined(KOKKOS_ARCH_PASCAL60) -#define KOKKOS_IMPL_ARCH_NVIDIA_GPU 60 -#elif defined(KOKKOS_ARCH_PASCAL61) -#define KOKKOS_IMPL_ARCH_NVIDIA_GPU 61 -#elif defined(KOKKOS_ARCH_VOLTA70) -#define KOKKOS_IMPL_ARCH_NVIDIA_GPU 70 -#elif defined(KOKKOS_ARCH_VOLTA72) -#define KOKKOS_IMPL_ARCH_NVIDIA_GPU 72 -#elif defined(KOKKOS_ARCH_TURING75) -#define KOKKOS_IMPL_ARCH_NVIDIA_GPU 75 -#elif defined(KOKKOS_ARCH_AMPERE80) -#define KOKKOS_IMPL_ARCH_NVIDIA_GPU 80 -#elif defined(KOKKOS_ARCH_AMPERE86) -#define KOKKOS_IMPL_ARCH_NVIDIA_GPU 86 -#elif defined(KOKKOS_ARCH_ADA89) -#define KOKKOS_IMPL_ARCH_NVIDIA_GPU 89 -#elif defined(KOKKOS_ARCH_HOPPER90) -#define KOKKOS_IMPL_ARCH_NVIDIA_GPU 90 -#else -#error NVIDIA GPU arch not recognized -#endif - #endif /* KOKKOS_CUDA_SETUP_HPP_ */ From 19a43a64763afc45baeb64bbc3b81d50dc892e61 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Mon, 20 Mar 2023 14:38:50 -0600 Subject: [PATCH 339/496] Fix warning with NVC++ ``` "/include/desul/atomics/Common.hpp", line 86: warning: integer conversion resulted in a change of sign [integer_sign_change] static constexpr uint32_t value = -1; ^ ``` --- tpls/desul/include/desul/atomics/Common.hpp | 4 ++-- 1 
file changed, 2 insertions(+), 2 deletions(-) diff --git a/tpls/desul/include/desul/atomics/Common.hpp b/tpls/desul/include/desul/atomics/Common.hpp index fb36ac3566..b7353e7dba 100644 --- a/tpls/desul/include/desul/atomics/Common.hpp +++ b/tpls/desul/include/desul/atomics/Common.hpp @@ -83,11 +83,11 @@ struct numeric_limits_max; template <> struct numeric_limits_max { - static constexpr uint32_t value = -1; + static constexpr auto value = static_cast(-1); }; template <> struct numeric_limits_max { - static constexpr uint64_t value = -1; + static constexpr auto value = static_cast(-1); }; constexpr bool atomic_always_lock_free(std::size_t size) { From 33a5d6065965b51451222cb106e1411b6ba1d2b3 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Tue, 21 Mar 2023 18:11:25 +0000 Subject: [PATCH 340/496] Fix team_scratch_1_queues for SYCL+Cuda --- core/src/SYCL/Kokkos_SYCL_Instance.cpp | 56 +++++++++++++------ core/src/SYCL/Kokkos_SYCL_Instance.hpp | 13 ++++- core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp | 18 ++++-- .../sycl/TestSYCL_TeamScratchStreams.cpp | 3 - 4 files changed, 61 insertions(+), 29 deletions(-) diff --git a/core/src/SYCL/Kokkos_SYCL_Instance.cpp b/core/src/SYCL/Kokkos_SYCL_Instance.cpp index 6a0e3b4934..c361d5727f 100644 --- a/core/src/SYCL/Kokkos_SYCL_Instance.cpp +++ b/core/src/SYCL/Kokkos_SYCL_Instance.cpp @@ -134,28 +134,45 @@ void SYCLInternal::initialize(const sycl::queue& q) { desul::Impl::init_lock_arrays_sycl(*m_queue); } #endif +} + +int SYCLInternal::acquire_team_scratch_space() { + int current_team_scratch = desul::atomic_fetch_inc_mod( + &m_current_team_scratch, m_n_team_scratch - 1, + desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); - m_team_scratch_current_size = 0; - m_team_scratch_ptr = nullptr; + m_team_scratch_pool[current_team_scratch].wait_and_throw(); + + return current_team_scratch; } sycl::device_ptr SYCLInternal::resize_team_scratch_space( - std::int64_t bytes, bool force_shrink) { - if (m_team_scratch_current_size == 0) { 
- m_team_scratch_current_size = bytes; - m_team_scratch_ptr = + int scratch_pool_id, std::int64_t bytes, bool force_shrink) { + // Multiple ParallelFor/Reduce Teams can call this function at the same time + // and invalidate the m_team_scratch_ptr. We use a pool to avoid any race + // condition. + if (m_team_scratch_current_size[scratch_pool_id] == 0) { + m_team_scratch_current_size[scratch_pool_id] = bytes; + m_team_scratch_ptr[scratch_pool_id] = Kokkos::kokkos_malloc( "Kokkos::Experimental::SYCLDeviceUSMSpace::TeamScratchMemory", - m_team_scratch_current_size); + m_team_scratch_current_size[scratch_pool_id]); } - if ((bytes > m_team_scratch_current_size) || - ((bytes < m_team_scratch_current_size) && (force_shrink))) { - m_team_scratch_current_size = bytes; - m_team_scratch_ptr = + if ((bytes > m_team_scratch_current_size[scratch_pool_id]) || + ((bytes < m_team_scratch_current_size[scratch_pool_id]) && + (force_shrink))) { + m_team_scratch_current_size[scratch_pool_id] = bytes; + m_team_scratch_ptr[scratch_pool_id] = Kokkos::kokkos_realloc( - m_team_scratch_ptr, m_team_scratch_current_size); + m_team_scratch_ptr[scratch_pool_id], + m_team_scratch_current_size[scratch_pool_id]); } - return m_team_scratch_ptr; + return m_team_scratch_ptr[scratch_pool_id]; +} + +void SYCLInternal::register_team_scratch_event(int scratch_pool_id, + sycl::event event) { + m_team_scratch_pool[scratch_pool_id] = event; } uint32_t SYCLInternal::impl_get_instance_id() const { return m_instance_id; } @@ -187,11 +204,14 @@ void SYCLInternal::finalize() { m_scratchFlagsCount = 0; m_scratchFlags = nullptr; - if (m_team_scratch_current_size > 0) - Kokkos::kokkos_free( - m_team_scratch_ptr); - m_team_scratch_current_size = 0; - m_team_scratch_ptr = nullptr; + for (int i = 0; i < m_n_team_scratch; ++i) { + if (m_team_scratch_current_size[i] > 0) { + Kokkos::kokkos_free( + m_team_scratch_ptr[i]); + m_team_scratch_current_size[i] = 0; + m_team_scratch_ptr[i] = nullptr; + } + } for (auto& usm_mem : 
m_indirectKernelMem) usm_mem.reset(); // guard erasing from all_queues diff --git a/core/src/SYCL/Kokkos_SYCL_Instance.hpp b/core/src/SYCL/Kokkos_SYCL_Instance.hpp index 58775647f0..328e8cbd8a 100644 --- a/core/src/SYCL/Kokkos_SYCL_Instance.hpp +++ b/core/src/SYCL/Kokkos_SYCL_Instance.hpp @@ -45,8 +45,11 @@ class SYCLInternal { sycl::device_ptr scratch_space(const std::size_t size); sycl::device_ptr scratch_flags(const std::size_t size); - sycl::device_ptr resize_team_scratch_space(std::int64_t bytes, + int acquire_team_scratch_space(); + sycl::device_ptr resize_team_scratch_space(int scratch_pool_id, + std::int64_t bytes, bool force_shrink = false); + void register_team_scratch_event(int scratch_pool_id, sycl::event event); uint32_t impl_get_instance_id() const; static int m_syclDev; @@ -62,8 +65,12 @@ class SYCLInternal { // mutex to access shared memory mutable std::mutex m_mutexScratchSpace; - int64_t m_team_scratch_current_size = 0; - sycl::device_ptr m_team_scratch_ptr = nullptr; + // Team Scratch Level 1 Space + static constexpr int m_n_team_scratch = 10; + mutable int64_t m_team_scratch_current_size[m_n_team_scratch] = {}; + mutable sycl::device_ptr m_team_scratch_ptr[m_n_team_scratch] = {}; + mutable int m_current_team_scratch = 0; + mutable sycl::event m_team_scratch_pool[m_n_team_scratch] = {}; mutable std::mutex m_team_scratch_mutex; uint32_t m_instance_id = Kokkos::Tools::Experimental::Impl::idForInstance< diff --git a/core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp b/core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp index 80f5db0558..a0368102e0 100644 --- a/core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp +++ b/core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp @@ -386,6 +386,7 @@ class ParallelFor, // Only let one ParallelFor/Reduce modify the team scratch memory. The // constructor acquires the mutex which is released in the destructor. 
std::scoped_lock m_scratch_lock; + int m_scratch_pool_id = -1; template sycl::event sycl_direct_launch(const Policy& policy, @@ -451,10 +452,9 @@ class ParallelFor, inline void execute() const { if (m_league_size == 0) return; + auto& space = *m_policy.space().impl_internal_space_instance(); Kokkos::Experimental::Impl::SYCLInternal::IndirectKernelMem& - indirectKernelMem = m_policy.space() - .impl_internal_space_instance() - ->get_indirect_kernel_mem(); + indirectKernelMem = space.get_indirect_kernel_mem(); auto functor_wrapper = Experimental::Impl::make_sycl_function_wrapper( m_functor, indirectKernelMem); @@ -462,6 +462,7 @@ class ParallelFor, sycl::event event = sycl_direct_launch(m_policy, functor_wrapper, functor_wrapper.get_copy_event()); functor_wrapper.register_event(event); + space.register_team_scratch_event(m_scratch_pool_id, event); } ParallelFor(FunctorType const& arg_functor, Policy const& arg_policy) @@ -487,9 +488,11 @@ class ParallelFor, // Functor's reduce memory, team scan memory, and team shared memory depend // upon team size. 
- auto& space = *m_policy.space().impl_internal_space_instance(); + auto& space = *m_policy.space().impl_internal_space_instance(); + m_scratch_pool_id = space.acquire_team_scratch_space(); m_global_scratch_ptr = static_cast>(space.resize_team_scratch_space( + m_scratch_pool_id, static_cast(m_scratch_size[1]) * m_league_size)); if (static_cast(space.m_maxShmemPerBlock) < @@ -552,6 +555,7 @@ class ParallelReduce m_scratch_lock; + int m_scratch_pool_id = -1; template @@ -837,6 +841,8 @@ class ParallelReduce>(space.resize_team_scratch_space( + m_scratch_pool_id, static_cast(m_scratch_size[1]) * m_league_size)); if (static_cast(space.m_maxShmemPerBlock) < diff --git a/core/unit_test/sycl/TestSYCL_TeamScratchStreams.cpp b/core/unit_test/sycl/TestSYCL_TeamScratchStreams.cpp index 8f2d6c68bd..d7ed93d49e 100644 --- a/core/unit_test/sycl/TestSYCL_TeamScratchStreams.cpp +++ b/core/unit_test/sycl/TestSYCL_TeamScratchStreams.cpp @@ -110,9 +110,6 @@ void sycl_queue_scratch_test( } // namespace Impl TEST(sycl, team_scratch_1_queues) { -#if defined(KOKKOS_ENABLE_SYCL) && !defined(KOKKOS_ARCH_INTEL_GPU) - GTEST_SKIP() << "skipping for SYCL+Cuda"; -#endif int N = 1000000; int T = 10; int M_base = 150; From c5d2c3dbf48f611d4d3457fc14bd901034d8d5ae Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Wed, 22 Mar 2023 11:21:34 -0400 Subject: [PATCH 341/496] m_team_scratch_pool -> m_team_scratch_event --- core/src/SYCL/Kokkos_SYCL_Instance.cpp | 4 ++-- core/src/SYCL/Kokkos_SYCL_Instance.hpp | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/core/src/SYCL/Kokkos_SYCL_Instance.cpp b/core/src/SYCL/Kokkos_SYCL_Instance.cpp index c361d5727f..f60b198578 100644 --- a/core/src/SYCL/Kokkos_SYCL_Instance.cpp +++ b/core/src/SYCL/Kokkos_SYCL_Instance.cpp @@ -141,7 +141,7 @@ int SYCLInternal::acquire_team_scratch_space() { &m_current_team_scratch, m_n_team_scratch - 1, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); - 
m_team_scratch_pool[current_team_scratch].wait_and_throw(); + m_team_scratch_event[current_team_scratch].wait_and_throw(); return current_team_scratch; } @@ -172,7 +172,7 @@ sycl::device_ptr SYCLInternal::resize_team_scratch_space( void SYCLInternal::register_team_scratch_event(int scratch_pool_id, sycl::event event) { - m_team_scratch_pool[scratch_pool_id] = event; + m_team_scratch_event[scratch_pool_id] = event; } uint32_t SYCLInternal::impl_get_instance_id() const { return m_instance_id; } diff --git a/core/src/SYCL/Kokkos_SYCL_Instance.hpp b/core/src/SYCL/Kokkos_SYCL_Instance.hpp index 328e8cbd8a..d669151e83 100644 --- a/core/src/SYCL/Kokkos_SYCL_Instance.hpp +++ b/core/src/SYCL/Kokkos_SYCL_Instance.hpp @@ -70,7 +70,7 @@ class SYCLInternal { mutable int64_t m_team_scratch_current_size[m_n_team_scratch] = {}; mutable sycl::device_ptr m_team_scratch_ptr[m_n_team_scratch] = {}; mutable int m_current_team_scratch = 0; - mutable sycl::event m_team_scratch_pool[m_n_team_scratch] = {}; + mutable sycl::event m_team_scratch_event[m_n_team_scratch] = {}; mutable std::mutex m_team_scratch_mutex; uint32_t m_instance_id = Kokkos::Tools::Experimental::Impl::idForInstance< From a798ac7bbd8c780ceb334d9f5ef3c91fa0e55230 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Wed, 22 Mar 2023 11:34:09 -0400 Subject: [PATCH 342/496] Explain acquire_team_scratch_space --- core/src/SYCL/Kokkos_SYCL_Instance.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/core/src/SYCL/Kokkos_SYCL_Instance.cpp b/core/src/SYCL/Kokkos_SYCL_Instance.cpp index f60b198578..24f38d7c06 100644 --- a/core/src/SYCL/Kokkos_SYCL_Instance.cpp +++ b/core/src/SYCL/Kokkos_SYCL_Instance.cpp @@ -137,6 +137,9 @@ void SYCLInternal::initialize(const sycl::queue& q) { } int SYCLInternal::acquire_team_scratch_space() { + // Grab the next scratch memory allocation. We must make sure that the last + // kernel using the allocation has completed, so we wait for the event that + // was registered with that kernel. 
int current_team_scratch = desul::atomic_fetch_inc_mod( &m_current_team_scratch, m_n_team_scratch - 1, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); From f0119709fb915bc1ff9f9e7973066d0b5af84316 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Wed, 22 Mar 2023 16:22:31 -0600 Subject: [PATCH 343/496] Move Cuda/Kokkos_Cuda_NvidiaGpuArchitectures.hpp -> impl/Kokkos_NvidiaGpuArchitectures.hpp --- core/src/Kokkos_Macros.hpp | 2 +- .../Kokkos_NvidiaGpuArchitectures.hpp} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename core/src/{Cuda/Kokkos_Cuda_NvidiaGpuArchitectures.hpp => impl/Kokkos_NvidiaGpuArchitectures.hpp} (100%) diff --git a/core/src/Kokkos_Macros.hpp b/core/src/Kokkos_Macros.hpp index c672de24b0..00f4686ad8 100644 --- a/core/src/Kokkos_Macros.hpp +++ b/core/src/Kokkos_Macros.hpp @@ -55,7 +55,7 @@ #ifndef KOKKOS_DONT_INCLUDE_CORE_CONFIG_H #include -#include +#include #endif //---------------------------------------------------------------------------- diff --git a/core/src/Cuda/Kokkos_Cuda_NvidiaGpuArchitectures.hpp b/core/src/impl/Kokkos_NvidiaGpuArchitectures.hpp similarity index 100% rename from core/src/Cuda/Kokkos_Cuda_NvidiaGpuArchitectures.hpp rename to core/src/impl/Kokkos_NvidiaGpuArchitectures.hpp From b097f74ce2b2f490a67f09d454c1e18764c7b896 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Wed, 22 Mar 2023 16:38:54 -0600 Subject: [PATCH 344/496] Drive-by fix typos "fix {to -> too} many" Co-authored-by: Phil Miller --- core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp b/core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp index 34fa997e23..549aac66f0 100644 --- a/core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp +++ b/core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp @@ -303,7 +303,7 @@ class TeamPolicyInternal (sizeof(double) + m_thread_scratch_size[0]); return std::min({ int(m_space.impl_internal_space_instance()->m_maxWorkgroupSize), - // 
FIXME_SYCL Avoid requesting to many registers on NVIDIA GPUs. + // FIXME_SYCL Avoid requesting too many registers on NVIDIA GPUs. #if defined(KOKKOS_IMPL_ARCH_NVIDIA_GPU) 256, #endif @@ -333,7 +333,7 @@ class TeamPolicyInternal m_thread_scratch_size[0]); return std::min({ int(m_space.impl_internal_space_instance()->m_maxWorkgroupSize), - // FIXME_SYCL Avoid requesting to many registers on NVIDIA GPUs. + // FIXME_SYCL Avoid requesting too many registers on NVIDIA GPUs. #if defined(KOKKOS_IMPL_ARCH_NVIDIA_GPU) 256, #endif From b0cc5a07bbeb886648b78eb131b53bd4812e1804 Mon Sep 17 00:00:00 2001 From: Dong Hun Lee <59181952+ldh4@users.noreply.github.com> Date: Thu, 23 Mar 2023 16:32:08 -0600 Subject: [PATCH 345/496] simd: Fixed an incorrectly returning size for uint64_t in avx2 (#6004) * Fixed a size error in simd uint64_t avx2 * Converted unit test to compile time checks * Removed an unused variable --- simd/src/Kokkos_SIMD_AVX2.hpp | 2 +- simd/unit_tests/TestSIMD.cpp | 29 +++++++++++++++++++++++++++++ 2 files changed, 30 insertions(+), 1 deletion(-) diff --git a/simd/src/Kokkos_SIMD_AVX2.hpp b/simd/src/Kokkos_SIMD_AVX2.hpp index 1732c33ca5..86b944efa5 100644 --- a/simd/src/Kokkos_SIMD_AVX2.hpp +++ b/simd/src/Kokkos_SIMD_AVX2.hpp @@ -804,7 +804,7 @@ class simd> { KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd& operator=(simd const&) = default; KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd& operator=(simd&&) = default; KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION static constexpr std::size_t size() { - return 8; + return 4; } template , bool> = false> diff --git a/simd/unit_tests/TestSIMD.cpp b/simd/unit_tests/TestSIMD.cpp index 7a4ecf19ed..92c77033b9 100644 --- a/simd/unit_tests/TestSIMD.cpp +++ b/simd/unit_tests/TestSIMD.cpp @@ -486,3 +486,32 @@ TEST(simd, device) { Kokkos::parallel_for(Kokkos::RangePolicy>(0, 1), simd_device_functor()); } + +TEST(simd, test_size) { +#if defined(KOKKOS_ARCH_AVX512XEON) + constexpr auto width = 8; + using Abi = 
Kokkos::Experimental::simd_abi::avx512_fixed_size; + static_assert(width == + Kokkos::Experimental::simd::size()); + +#elif defined(KOKKOS_ARCH_AVX2) + constexpr auto width = 4; + using Abi = Kokkos::Experimental::simd_abi::avx2_fixed_size; + +#elif defined(__ARM_NEON) + constexpr auto width = 2; + using Abi = Kokkos::Experimental::simd_abi::neon_fixed_size; + +#else + constexpr auto width = 1; + using Abi = Kokkos::Experimental::simd_abi::scalar; + static_assert(width == + Kokkos::Experimental::simd::size()); +#endif + + static_assert(width == Kokkos::Experimental::simd::size()); + static_assert(width == Kokkos::Experimental::simd::size()); + static_assert(width == + Kokkos::Experimental::simd::size()); + static_assert(width == Kokkos::Experimental::simd::size()); +} From a6f27bf738df97b3274679ad8918a1c10249c849 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Fri, 24 Mar 2023 13:17:55 +0000 Subject: [PATCH 346/496] Pass local_accessor directly instead --- core/src/SYCL/Kokkos_SYCL_Parallel_Reduce.hpp | 41 +++++++++---------- core/src/SYCL/Kokkos_SYCL_Parallel_Scan.hpp | 8 ++-- core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp | 8 ++-- 3 files changed, 27 insertions(+), 30 deletions(-) diff --git a/core/src/SYCL/Kokkos_SYCL_Parallel_Reduce.hpp b/core/src/SYCL/Kokkos_SYCL_Parallel_Reduce.hpp index 1a3350cedc..4bdedc64e1 100644 --- a/core/src/SYCL/Kokkos_SYCL_Parallel_Reduce.hpp +++ b/core/src/SYCL/Kokkos_SYCL_Parallel_Reduce.hpp @@ -37,7 +37,7 @@ inline constexpr bool use_shuffle_based_algorithm = namespace SYCLReduction { template std::enable_if_t> workgroup_reduction( - sycl::nd_item& item, sycl::local_ptr local_mem, + sycl::nd_item& item, sycl::local_accessor local_mem, sycl::device_ptr results_ptr, sycl::global_ptr device_accessible_result_ptr, const unsigned int value_count, const ReducerType& final_reducer, @@ -109,7 +109,7 @@ std::enable_if_t> workgroup_reduction( template std::enable_if_t> workgroup_reduction( - sycl::nd_item& item, sycl::local_ptr local_mem, + 
sycl::nd_item& item, sycl::local_accessor local_mem, ValueType local_value, sycl::device_ptr results_ptr, sycl::global_ptr device_accessible_result_ptr, const ReducerType& final_reducer, bool final, unsigned int max_size) { @@ -271,8 +271,8 @@ class ParallelReduce, instance.scratch_flags(sizeof(unsigned int))); auto reduction_lambda_factory = - [&](sycl::local_accessor local_mem, - sycl::local_accessor num_teams_done, + [&](sycl::local_accessor local_mem, + sycl::local_accessor num_teams_done, sycl::device_ptr results_ptr) { const auto begin = policy.begin(); @@ -304,9 +304,8 @@ class ParallelReduce, item.barrier(sycl::access::fence_space::local_space); SYCLReduction::workgroup_reduction<>( - item, local_mem.get_pointer(), results_ptr, - device_accessible_result_ptr, value_count, reducer, false, - std::min(size, wgroup_size)); + item, local_mem, results_ptr, device_accessible_result_ptr, + value_count, reducer, false, std::min(size, wgroup_size)); if (local_id == 0) { sycl::atomic_ref, } SYCLReduction::workgroup_reduction<>( - item, local_mem.get_pointer(), results_ptr, + item, local_mem, results_ptr, device_accessible_result_ptr, value_count, reducer, true, std::min(n_wgroups, wgroup_size)); } @@ -346,7 +345,7 @@ class ParallelReduce, } SYCLReduction::workgroup_reduction<>( - item, local_mem.get_pointer(), local_value, results_ptr, + item, local_mem, local_value, results_ptr, device_accessible_result_ptr, reducer, false, std::min(size, wgroup_size)); @@ -370,7 +369,7 @@ class ParallelReduce, } SYCLReduction::workgroup_reduction<>( - item, local_mem.get_pointer(), local_value, results_ptr, + item, local_mem, local_value, results_ptr, device_accessible_result_ptr, reducer, true, std::min(n_wgroups, wgroup_size)); } @@ -380,7 +379,7 @@ class ParallelReduce, }; auto parallel_reduce_event = q.submit([&](sycl::handler& cgh) { - sycl::local_accessor num_teams_done(1, cgh); + sycl::local_accessor num_teams_done(1, cgh); auto dummy_reduction_lambda = 
reduction_lambda_factory({1, cgh}, num_teams_done, nullptr); @@ -421,7 +420,7 @@ class ParallelReduce, wgroup_size - 1) / wgroup_size; - sycl::local_accessor local_mem( + sycl::local_accessor local_mem( sycl::range<1>(wgroup_size) * std::max(value_count, 1u), cgh); cgh.depends_on(memcpy_events); @@ -608,9 +607,9 @@ class ParallelReduce 1) { auto n_wgroups = (size + wgroup_size - 1) / wgroup_size; auto parallel_reduce_event = q.submit([&](sycl::handler& cgh) { - sycl::local_accessor local_mem( + sycl::local_accessor local_mem( sycl::range<1>(wgroup_size) * std::max(value_count, 1u), cgh); - sycl::local_accessor num_teams_done(1, cgh); + sycl::local_accessor num_teams_done(1, cgh); const BarePolicy bare_policy = m_policy; @@ -652,9 +651,8 @@ class ParallelReduce( - item, local_mem.get_pointer(), results_ptr, - device_accessible_result_ptr, value_count, reducer, false, - std::min(size, wgroup_size)); + item, local_mem, results_ptr, device_accessible_result_ptr, + value_count, reducer, false, std::min(size, wgroup_size)); if (local_id == 0) { sycl::atomic_ref( - item, local_mem.get_pointer(), results_ptr, - device_accessible_result_ptr, value_count, reducer, true, - std::min(n_wgroups, wgroup_size)); + item, local_mem, results_ptr, device_accessible_result_ptr, + value_count, reducer, true, std::min(n_wgroups, wgroup_size)); } } else { value_type local_value; @@ -695,7 +692,7 @@ class ParallelReduce( - item, local_mem.get_pointer(), local_value, results_ptr, + item, local_mem, local_value, results_ptr, device_accessible_result_ptr, reducer, false, std::min(size, wgroup_size)); @@ -719,7 +716,7 @@ class ParallelReduce( - item, local_mem.get_pointer(), local_value, results_ptr, + item, local_mem, local_value, results_ptr, device_accessible_result_ptr, reducer, true, std::min(n_wgroups, wgroup_size)); } diff --git a/core/src/SYCL/Kokkos_SYCL_Parallel_Scan.hpp b/core/src/SYCL/Kokkos_SYCL_Parallel_Scan.hpp index 3bd25b1f23..5176c0f14e 100644 --- 
a/core/src/SYCL/Kokkos_SYCL_Parallel_Scan.hpp +++ b/core/src/SYCL/Kokkos_SYCL_Parallel_Scan.hpp @@ -31,7 +31,7 @@ namespace Impl { // total sum. template void workgroup_scan(sycl::nd_item item, const FunctorType& final_reducer, - sycl::local_ptr local_mem, + sycl::local_accessor local_mem, ValueType& local_value, unsigned int global_range) { // subgroup scans auto sg = item.get_sub_group(); @@ -136,7 +136,7 @@ class ParallelScanSYCLBase { q.get_device() .template get_info() .front(); - sycl::local_accessor local_mem( + sycl::local_accessor local_mem( sycl::range<1>((wgroup_size + min_subgroup_size - 1) / min_subgroup_size), cgh); @@ -160,8 +160,8 @@ class ParallelScanSYCLBase { else reducer.init(&local_value); - workgroup_scan<>(item, reducer, local_mem.get_pointer(), - local_value, wgroup_size); + workgroup_scan<>(item, reducer, local_mem, local_value, + wgroup_size); if (n_wgroups > 1 && local_id == wgroup_size - 1) group_results[item.get_group_linear_id()] = diff --git a/core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp b/core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp index 62a41fe91f..c1a3133428 100644 --- a/core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp +++ b/core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp @@ -668,7 +668,7 @@ class ParallelReduce( - item, local_mem.get_pointer(), results_ptr, + item, local_mem, results_ptr, device_accessible_result_ptr, value_count, reducer, false, std::min(size, item.get_local_range()[0] * @@ -696,7 +696,7 @@ class ParallelReduce( - item, local_mem.get_pointer(), results_ptr, + item, local_mem, results_ptr, device_accessible_result_ptr, value_count, reducer, true, std::min(n_wgroups, item.get_local_range()[0] * @@ -716,7 +716,7 @@ class ParallelReduce( - item, local_mem.get_pointer(), local_value, results_ptr, + item, local_mem, local_value, results_ptr, device_accessible_result_ptr, reducer, false, std::min(size, item.get_local_range()[0] * @@ -742,7 +742,7 @@ class ParallelReduce( - item, local_mem.get_pointer(), local_value, results_ptr, 
+ item, local_mem, local_value, results_ptr, device_accessible_result_ptr, reducer, true, std::min(n_wgroups, item.get_local_range()[0] * item.get_local_range()[1])); From 904fb32ecd3a36c52419e46fe459c7f56abe57c1 Mon Sep 17 00:00:00 2001 From: Bruno Turcksin Date: Fri, 24 Mar 2023 16:23:04 -0400 Subject: [PATCH 347/496] Fix warning in some user code when using std::memcpy --- core/src/HIP/Kokkos_HIP_KernelLaunch.hpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp b/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp index d387474fe8..0bc3529530 100644 --- a/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp +++ b/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp @@ -414,7 +414,8 @@ struct HIPParallelLaunchKernelInvokerconstantMemHostStaging; - std::memcpy(staging, &driver, sizeof(DriverType)); + std::memcpy(static_cast(staging), + static_cast(&driver), sizeof(DriverType)); // Copy functor asynchronously from there to constant memory on the device KOKKOS_IMPL_HIP_SAFE_CALL(hipMemcpyToSymbolAsync( From 33b905be8471ad1f18f09ac7c15c790a3b2a4b0d Mon Sep 17 00:00:00 2001 From: Andrey Prokopenko Date: Fri, 24 Mar 2023 16:48:29 -0400 Subject: [PATCH 348/496] CMake: update package compatibility mode when building within Trilinos --- cmake/kokkos_install.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/kokkos_install.cmake b/cmake/kokkos_install.cmake index c65c2af52b..fb658239d8 100644 --- a/cmake/kokkos_install.cmake +++ b/cmake/kokkos_install.cmake @@ -38,7 +38,7 @@ ELSE() WRITE_BASIC_PACKAGE_VERSION_FILE("${CMAKE_CURRENT_BINARY_DIR}/KokkosConfigVersion.cmake" VERSION "${Kokkos_VERSION}" - COMPATIBILITY SameMajorVersion) + COMPATIBILITY AnyNewerVersion) install(FILES ${CMAKE_CURRENT_BINARY_DIR}/KokkosConfigVersion.cmake DESTINATION "${${PROJECT_NAME}_INSTALL_LIB_DIR}/cmake/${PACKAGE_NAME}") From a7a2d715cd9dff1cefa66b9f2fc65dafb5df0719 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Mon, 27 Mar 2023 12:37:23 
-0400 Subject: [PATCH 349/496] SYCL: Make is_device_copyable future-proof (#6009) * SYCL: Make is_device_copyable future-proof * Try Damien's suggestion Co-authored-by: Damien L-G --------- Co-authored-by: Damien L-G --- core/src/SYCL/Kokkos_SYCL_Instance.hpp | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/core/src/SYCL/Kokkos_SYCL_Instance.hpp b/core/src/SYCL/Kokkos_SYCL_Instance.hpp index 58775647f0..53fdc301f4 100644 --- a/core/src/SYCL/Kokkos_SYCL_Instance.hpp +++ b/core/src/SYCL/Kokkos_SYCL_Instance.hpp @@ -323,10 +323,29 @@ struct sycl::is_device_copyable< Kokkos::Experimental::Impl::SYCLFunctionWrapper> : std::true_type {}; +// FIXME_SYCL Remove when this specialization when specializations for +// sycl::device_copyable also apply to const-qualified types. +template +struct NonTriviallyCopyableAndDeviceCopyable { + NonTriviallyCopyableAndDeviceCopyable( + const NonTriviallyCopyableAndDeviceCopyable&) {} +}; + +template +struct is_device_copyable> + : std::true_type {}; + +static_assert( + !std::is_trivially_copyable_v< + NonTriviallyCopyableAndDeviceCopyable> && + is_device_copyable_v>); + template struct sycl::is_device_copyable< const Kokkos::Experimental::Impl::SYCLFunctionWrapper> + false>, + std::enable_if_t>>> : std::true_type {}; #endif #endif From 7c7ae9abfe3daba52774b6799e2ac2640c931288 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Mon, 27 Mar 2023 15:47:48 -0400 Subject: [PATCH 350/496] desul: Move lock_array_copied from global scope (#5999) * desul: Move lock_array_copied fromg global scope * lock_array_copied->once --- tpls/desul/include/desul/atomics/Lock_Array_CUDA.hpp | 12 ++++-------- tpls/desul/include/desul/atomics/Lock_Array_HIP.hpp | 12 ++++-------- 2 files changed, 8 insertions(+), 16 deletions(-) diff --git a/tpls/desul/include/desul/atomics/Lock_Array_CUDA.hpp b/tpls/desul/include/desul/atomics/Lock_Array_CUDA.hpp index 4ff7196eed..b4dc4dae74 100644 --- 
a/tpls/desul/include/desul/atomics/Lock_Array_CUDA.hpp +++ b/tpls/desul/include/desul/atomics/Lock_Array_CUDA.hpp @@ -108,11 +108,6 @@ __device__ inline void unlock_address_cuda(void* ptr, desul::MemoryScopeNode) { atomicExch(&desul::Impl::CUDA_SPACE_ATOMIC_LOCKS_NODE[offset], 0); } -// Make lock_array_copied an explicit translation unit scope thingy -namespace { -static int lock_array_copied = 0; -} // namespace - #ifdef __CUDACC_RDC__ inline #else @@ -120,15 +115,16 @@ inline static #endif void copy_cuda_lock_arrays_to_device() { - if (lock_array_copied == 0) { + static bool once = []() { cudaMemcpyToSymbol(CUDA_SPACE_ATOMIC_LOCKS_DEVICE, &CUDA_SPACE_ATOMIC_LOCKS_DEVICE_h, sizeof(int32_t*)); cudaMemcpyToSymbol(CUDA_SPACE_ATOMIC_LOCKS_NODE, &CUDA_SPACE_ATOMIC_LOCKS_NODE_h, sizeof(int32_t*)); - } - lock_array_copied = 1; + return true; + }(); + (void)once; } } // namespace Impl diff --git a/tpls/desul/include/desul/atomics/Lock_Array_HIP.hpp b/tpls/desul/include/desul/atomics/Lock_Array_HIP.hpp index 53144fbc4c..b80e2d4599 100644 --- a/tpls/desul/include/desul/atomics/Lock_Array_HIP.hpp +++ b/tpls/desul/include/desul/atomics/Lock_Array_HIP.hpp @@ -115,11 +115,6 @@ __device__ inline void unlock_address_hip(void* ptr, desul::MemoryScopeNode) { atomicExch(&desul::Impl::HIP_SPACE_ATOMIC_LOCKS_NODE[offset], 0); } -// Make lock_array_copied an explicit translation unit scope thing -namespace { -static int lock_array_copied = 0; -} // namespace - #ifdef __CLANG_RDC__ inline #else @@ -127,15 +122,16 @@ inline static #endif void copy_hip_lock_arrays_to_device() { - if (lock_array_copied == 0) { + static bool once = []() { (void)hipMemcpyToSymbol(HIP_SYMBOL(HIP_SPACE_ATOMIC_LOCKS_DEVICE), &HIP_SPACE_ATOMIC_LOCKS_DEVICE_h, sizeof(int32_t*)); (void)hipMemcpyToSymbol(HIP_SYMBOL(HIP_SPACE_ATOMIC_LOCKS_NODE), &HIP_SPACE_ATOMIC_LOCKS_NODE_h, sizeof(int32_t*)); - } - lock_array_copied = 1; + return true; + }(); + (void)once; } } // namespace Impl From 
6f16f417ad49b74fc8bd1f72de542241d1a5a474 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Tue, 28 Mar 2023 11:24:24 -0400 Subject: [PATCH 351/496] Fix namespace for is_device_copyable --- core/src/SYCL/Kokkos_SYCL_Instance.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/core/src/SYCL/Kokkos_SYCL_Instance.hpp b/core/src/SYCL/Kokkos_SYCL_Instance.hpp index 65244e42de..51a617054d 100644 --- a/core/src/SYCL/Kokkos_SYCL_Instance.hpp +++ b/core/src/SYCL/Kokkos_SYCL_Instance.hpp @@ -339,13 +339,13 @@ struct NonTriviallyCopyableAndDeviceCopyable { }; template -struct is_device_copyable> +struct sycl::is_device_copyable> : std::true_type {}; static_assert( !std::is_trivially_copyable_v< NonTriviallyCopyableAndDeviceCopyable> && - is_device_copyable_v>); + sycl::is_device_copyable_v>); template struct sycl::is_device_copyable< From 89bdbaad39a3adc66eba3823b873463d815668df Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Wed, 29 Mar 2023 16:31:26 -0400 Subject: [PATCH 352/496] Fixup 4.0 change log (#6015) * Fixup 4.0 changelog: `KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_*` macros oversight was in release 3.6 (not 3.2) * Fixup 4.0 changelog: drop deprecated entries duplicated from 3.7 changelog * Mention removal of deprecated code 3 in change log * List removing deprecated code 3 under incompatibilities --- CHANGELOG.md | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f3f0086267..03cf75ff88 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -72,23 +72,12 @@ - Remove Kokkos_ENABLE_CUDA_LDG_INTRINSIC option [\#5623](https://github.com/kokkos/kokkos/pull/5623) - Don't rely on synchronization behavior of default stream in CUDA and HIP - this potentially will break unintended implicit synchronization with other libraries such as MPI [\#5391](https://github.com/kokkos/kokkos/pull/5391) - Make ExecutionSpace::concurrency() a non-static member function 
[\#5655](https://github.com/kokkos/kokkos/pull/5655) and related PRs +- Remove code guarded by `KOKKOS_ENABLE_DEPRECATED_CODE_3` ### Deprecations -- Guard against non-public header inclusion [\#5178](https://github.com/kokkos/kokkos/pull/5178) -- Raise deprecation warnings if non empty WorkTag class is used [\#5230](https://github.com/kokkos/kokkos/pull/5230) -- Deprecate `parallel_*` overloads taking the label as trailing argument [\#5141](https://github.com/kokkos/kokkos/pull/5141) -- Deprecate nested types in functional [\#5185](https://github.com/kokkos/kokkos/pull/5185) -- Deprecate `InitArguments` struct and replace it with `InitializationSettings` [\#5135](https://github.com/kokkos/kokkos/pull/5135) -- Deprecate `finalize_all()` [\#5134](https://github.com/kokkos/kokkos/pull/5134) -- Deprecate command line arguments (other than `--help`) that are not prefixed with `kokkos-*` [\#5120](https://github.com/kokkos/kokkos/pull/5120) -- Deprecate `--[kokkos-]numa` cmdline arg and `KOKKOS_NUMA` env var [\#5117](https://github.com/kokkos/kokkos/pull/5117) -- Deprecate `--[kokkos-]threads` command line argument in favor of `--[kokkos-]num-threads` [\#5111](https://github.com/kokkos/kokkos/pull/5111) -- Deprecate `Kokkos::is_reducer_type` [\#4957](https://github.com/kokkos/kokkos/pull/4957) -- Deprecate `OffsetView` constructors taking `index_list_type` [\#4810](https://github.com/kokkos/kokkos/pull/4810) -- Deprecate overloads of `Kokkos::sort` taking a parameter `bool always_use_kokkos_sort` [\#5382](https://github.com/kokkos/kokkos/issues/5382) - Deprecate `CudaUVMSpace::available()` which always returned `true` [\#5614](https://github.com/kokkos/kokkos/pull/5614) - Deprecate `volatile`-qualified members from `Kokkos::pair` and `Kokkos::complex` [\#5412](https://github.com/kokkos/kokkos/pull/5412) -- Deprecate `KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_*` macros [\#5824](https://github.com/kokkos/kokkos/pull/5824) (oversight in 3.2) +- Deprecate 
`KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_*` macros [\#5824](https://github.com/kokkos/kokkos/pull/5824) (oversight in 3.6) ### Bug Fixes - Avoid allocating memory for `UniqueToken` [\#5300](https://github.com/kokkos/kokkos/pull/5300) From 0476985291eac25685885a5e4c68610dc5a09af5 Mon Sep 17 00:00:00 2001 From: Evan Harvey <57234914+e10harvey@users.noreply.github.com> Date: Thu, 30 Mar 2023 07:00:27 -0600 Subject: [PATCH 353/496] Add half_t and bhalf_t limits (#5778) * Add half_t and bhalf_t limits * Try using constexpr * Revert "Try using constexpr" This reverts commit 1b399bdd43da2b648d5f7cedbade0dfff51f8e43. * Fix norm_min_helper value type * Add bias to epsilon when bhalf_t is float * Remove bias. Prevent compiler from optimizing out cast. * Fix typo * Attempt to fix CI Werror * core/unit_test: Add inline comment * Add half_t docs --- core/src/Kokkos_Half.hpp | 317 ++++++++++++++++++++++++++- core/unit_test/TestHalfOperators.hpp | 31 ++- 2 files changed, 337 insertions(+), 11 deletions(-) diff --git a/core/src/Kokkos_Half.hpp b/core/src/Kokkos_Half.hpp index 9231fac5ff..82dd55549d 100644 --- a/core/src/Kokkos_Half.hpp +++ b/core/src/Kokkos_Half.hpp @@ -21,8 +21,10 @@ #define KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_HALF #endif -#include #include +#include + +#include #include // istream & ostream for extraction and insertion ops #include @@ -1005,6 +1007,319 @@ cast_from_bhalf(bhalf_t val) { #else #define KOKKOS_BHALF_T_IS_FLOAT false #endif // KOKKOS_IMPL_BHALF_TYPE_DEFINED + ////////////// BEGIN HALF_T (binary16) limits ////////////// + // clang-format off +// '\brief:' below are from the libc definitions for float and double: +// https://www.gnu.org/software/libc/manual/html_node/Floating-Point-Parameters.html +// +// The arithmetic encoding and equations below are derived from: +// Ref1: https://en.wikipedia.org/wiki/Single-precision_floating-point_format +// Ref2: https://en.wikipedia.org/wiki/Exponent_bias +// Ref3; 
https://docs.oracle.com/cd/E19957-01/806-3568/ncg_goldberg.html +// +// Some background on the magic numbers 2**10=1024 and 2**15=32768 used below: +// +// IMPORTANT: For IEEE754 encodings, see Ref1. +// +// For binary16, we have B = 2 and p = 16 with 2**16 possible significands. +// The binary16 format is: [s e e e e e f f f f f f f f f f] +// bit index: 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 +// s: signed bit (1 bit) +// e: exponent bits (5 bits) +// f: fractional bits (10 bits) +// +// E_bias = 2**(n_exponent_bits - 1) - 1 = 2**(5 - 1) - 1 = 15 +// E_subnormal = 00000 (base2) +// E_infinity = 11111 (base2) +// E_min = 1 - E_bias = 1 - 15 +// E_max = 2**5 - 1 - E_bias = 2**5 - 1 - 15 = 16 +// +// 2**10=1024 is the smallest denominator that is representable in binary16: +// [s e e e e e f f f f f f f f f f] +// [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1] +// which is: 1 / 2**-10 +// +// +// 2**15 is the largest exponent factor representable in binary16, for example the +// largest integer value representable in binary16 is: +// [s e e e e e f f f f f f f f f f] +// [0 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1] +// which is: 2**(2**4 + 2**3 + 2**2 + 2**1 - 15) * (1 + 2**-10 + 2**-9 + 2**-8 + 2**-7 + 2**-6 + 2**-5 + 2**-4 + 2**-3 + 2**-2 + 2**-1)) = +// 2**15 * (1 + 0.9990234375) = +// 65504.0 +// + +/// \brief: Infinity. +/// +/// base2 encoding: bits [10,14] set +/// #define KOKKOS_IMPL_HALF_T_HUGE_VALH 0x7c00 +/// Binary16 encoding: +/// [s e e e e e f f f f f f f f f f] +/// [0 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0] +/// bit index: 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 +#if defined(KOKKOS_HALF_T_IS_FLOAT) && !KOKKOS_HALF_T_IS_FLOAT + +/// \brief: Minimum normalized number +/// +/// Stdc defines this as the smallest number (representable in binary16). 
+/// +/// Binary16 encoding: +/// [s e e e e e f f f f f f f f f f] +/// [1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1] +/// bit index: 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 +/// +/// and in base10: -1 * 2**(2**4 + 2**3 + 2**2 + 2**1 - 15) * (1 + 2**-10 + 2**-9 + 2**-8 + 2**-7 + 2**-6 + 2**-5 + 2**-4 + 2**-3 + 2**-2 + 2**-1) +/// = -2**15 * (1 + (2**10 - 1) / 2**10) +template <> +struct Kokkos::Experimental::Impl::finite_min_helper< + Kokkos::Experimental::half_t> { + static constexpr float value = -65504.0F; +}; + +/// \brief: Maximum normalized number +/// +/// Stdc defines this as the maximum number (representable in binary16). +/// +/// Binary16 encoding: +/// [s e e e e e f f f f f f f f f f] +/// [0 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1] +/// bit index: 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 +/// +/// and in base10: 1 * 2**(2**4 + 2**3 + 2**2 + 2**1 - 15) * (1 + 2**-10 + 2**-9 + 2**-8 + 2**-7 + 2**-6 + 2**-5 + 2**-4 + 2**-3 + 2**-2 + 2**-1) +/// = 2**15 * (1 + (2**10 - 1) / 2**10) +template <> +struct Kokkos::Experimental::Impl::finite_max_helper< + Kokkos::Experimental::half_t> { + static constexpr float value = 65504.0F; +}; + +/// \brief: This is the difference between 1 and the smallest floating point +/// number of type binary16 that is greater than 1 +/// +/// Smallest number in binary16 that is greater than 1 encoding: +/// [s e e e e e f f f f f f f f f f] +/// [0 0 1 1 1 1 0 0 0 0 0 0 0 0 0 1] +/// bit index: 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 +/// +/// and in base10: 1 * 2**(2**3 + 2**2 + 2**1 + 2**0 - 15) * (1 + 2**-10) +/// = 2**0 * (1 + 2**-10) +/// = 1.0009765625 +/// +/// Lastly, 1 - 1.0009765625 = 0.0009765625. +template <> +struct Kokkos::Experimental::Impl::epsilon_helper< + Kokkos::Experimental::half_t> { + static constexpr float value = 0.0009765625F; +}; + +/// @brief: The largest possible rounding error in ULPs +/// +/// This simply uses the maximum rounding error. 
+/// +/// Reference: https://docs.oracle.com/cd/E19957-01/806-3568/ncg_goldberg.html#689 +template <> +struct Kokkos::Experimental::Impl::round_error_helper< + Kokkos::Experimental::half_t> { + static constexpr float value = 0.5F; +}; + +/// \brief: Minimum normalized positive half precision number +/// +/// Stdc defines this as the minimum normalized positive floating +/// point number that is representable in type binary16 +/// +/// Smallest number in binary16 that is greater than 1 encoding: +/// [s e e e e e f f f f f f f f f f] +/// [0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0] +/// bit index: 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 +/// +/// and in base10: 1 * 2**(2**0 - 15) * (1) +/// = 2**-14 +template <> +struct Kokkos::Experimental::Impl::norm_min_helper< + Kokkos::Experimental::half_t> { + static constexpr float value = 0.00006103515625F; +}; + +/// \brief: Quiet not a half precision number +/// +/// IEEE 754 defines this as all exponent bits high. +/// +/// Quiet NaN in binary16: +/// [s e e e e e f f f f f f f f f f] +/// [1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0] +/// bit index: 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 +template <> +struct Kokkos::Experimental::Impl::quiet_NaN_helper< + Kokkos::Experimental::half_t> { + static constexpr float value = 0xfc000; +}; + +/// \brief: Signaling not a half precision number +/// +/// IEEE 754 defines this as all exponent bits and the first fraction bit high. +/// +/// Quiet NaN in binary16: +/// [s e e e e e f f f f f f f f f f] +/// [1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0] +/// bit index: 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 +template <> +struct Kokkos::Experimental::Impl::signaling_NaN_helper< + Kokkos::Experimental::half_t> { + static constexpr float value = 0xfe000; +}; + +/// \brief: Number of digits in the matissa that can be represented +/// without losing precision. +/// +/// Stdc defines this as the number of base-RADIX digits in the floating point mantissa for the binary16 data type. 
+/// +/// In binary16, we have 10 fractional bits plus the implicit leading 1. +template <> +struct Kokkos::Experimental::Impl::digits_helper { + static constexpr int value = 11; +}; + +/// \brief: "The number of base-10 digits that can be represented by the type T without change" +/// Reference: https://en.cppreference.com/w/cpp/types/numeric_limits/digits10. +/// +/// "For base-radix types, it is the value of digits() (digits - 1 for floating-point types) multiplied by log10(radix) and rounded down." +/// Reference: https://en.cppreference.com/w/cpp/types/numeric_limits/digits10. +/// +/// This is: floor(11 - 1 * log10(2)) +template <> +struct Kokkos::Experimental::Impl::digits10_helper< + Kokkos::Experimental::half_t> { + static constexpr int value = 3; +}; + +/// \brief: Value of the base of the exponent representation. +/// +/// Stdc defined this as the value of the base, or radix, of the exponent representation. +template <> +struct Kokkos::Experimental::Impl::radix_helper { + static constexpr int value = 2; +}; + +/// \brief: This is the smallest possible exponent value +/// +/// Stdc defines this as the smallest possible exponent value for type binary16. +/// More precisely, it is the minimum negative integer such that the value min_exponent_helper +/// raised to this power minus 1 can be represented as a normalized floating point number of type float. 
+/// +/// In binary16: +/// [s e e e e e f f f f f f f f f f] +/// [0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0] +/// bit index: 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 +/// +/// and in base10: 1 * 2**(2**0 - 15) * (1 + 0) +/// = 2**-14 +/// +/// with a bias of one from (C11 5.2.4.2.2), gives -13; +template <> +struct Kokkos::Experimental::Impl::min_exponent_helper< + Kokkos::Experimental::half_t> { + static constexpr int value = -13; +}; + +/// \brief: This is the largest possible exponent value +/// +/// In binary16: +/// [s e e e e e f f f f f f f f f f] +/// [0 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0] +/// bit index: 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 +/// +/// and in base10: 1 * 2**(2**4 + 2**3 + 2**2 + 2**1 - 15) * (1 + 0) +/// = 2**(30 - 15) +/// = 2**15 +/// +/// with a bias of one from (C11 5.2.4.2.2), gives 16; +template <> +struct Kokkos::Experimental::Impl::max_exponent_helper< + Kokkos::Experimental::half_t> { + static constexpr int value = 16; +}; +#endif +////////////// END HALF_T (binary16) limits ////////////// + +////////////// BEGIN BHALF_T (bfloat16) limits ////////////// +#if defined(KOKKOS_BHALF_T_IS_FLOAT) && !KOKKOS_BHALF_T_IS_FLOAT +// Minimum normalized number +template <> +struct Kokkos::Experimental::Impl::finite_min_helper< + Kokkos::Experimental::bhalf_t> { + static constexpr float value = -3.38953139e38; +}; +// Maximum normalized number +template <> +struct Kokkos::Experimental::Impl::finite_max_helper< + Kokkos::Experimental::bhalf_t> { + static constexpr float value = 3.38953139e38; +}; +// 1/2^7 +template <> +struct Kokkos::Experimental::Impl::epsilon_helper< + Kokkos::Experimental::bhalf_t> { + static constexpr float value = 0.0078125F; +}; +template <> +struct Kokkos::Experimental::Impl::round_error_helper< + Kokkos::Experimental::bhalf_t> { + static constexpr float value = 0.5F; +}; +// Minimum normalized positive bhalf number +template <> +struct Kokkos::Experimental::Impl::norm_min_helper< + Kokkos::Experimental::bhalf_t> { + static constexpr 
float value = 1.1754494351e-38; +}; +// Quiet not a bhalf number +template <> +struct Kokkos::Experimental::Impl::quiet_NaN_helper< + Kokkos::Experimental::bhalf_t> { + static constexpr float value = 0x7fc000; +}; +// Signaling not a bhalf number +template <> +struct Kokkos::Experimental::Impl::signaling_NaN_helper< + Kokkos::Experimental::bhalf_t> { + static constexpr float value = 0x7fe000; +}; +// Number of digits in the matissa that can be represented +// without losing precision. +template <> +struct Kokkos::Experimental::Impl::digits_helper< + Kokkos::Experimental::bhalf_t> { + static constexpr int value = 2; +}; +// 7 - 1 * log10(2) +template <> +struct Kokkos::Experimental::Impl::digits10_helper< + Kokkos::Experimental::bhalf_t> { + static constexpr int value = 1; +}; +// Value of the base of the exponent representation. +template <> +struct Kokkos::Experimental::Impl::radix_helper { + static constexpr int value = 2; +}; +// This is the smallest possible exponent value +// with a bias of one (C11 5.2.4.2.2). +template <> +struct Kokkos::Experimental::Impl::min_exponent_helper< + Kokkos::Experimental::bhalf_t> { + static constexpr int value = -125; +}; +// This is the largest possible exponent value +// with a bias of one (C11 5.2.4.2.2). 
+template <> +struct Kokkos::Experimental::Impl::max_exponent_helper< + Kokkos::Experimental::bhalf_t> { + static constexpr int value = 128; +}; +#endif +////////////// END BHALF_T (bfloat16) limits ////////////// + #ifdef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_HALF #undef KOKKOS_IMPL_PUBLIC_INCLUDE #undef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_HALF diff --git a/core/unit_test/TestHalfOperators.hpp b/core/unit_test/TestHalfOperators.hpp index 6a2bc359e5..29844a3c6a 100644 --- a/core/unit_test/TestHalfOperators.hpp +++ b/core/unit_test/TestHalfOperators.hpp @@ -17,8 +17,6 @@ #ifndef TESTHALFOPERATOR_HPP_ #define TESTHALFOPERATOR_HPP_ namespace Test { -#define FP16_EPSILON 0.0009765625F // 1/2^10 -#define BF16_EPSILON 0.0078125F // 1/2^7 using namespace Kokkos::Experimental; using ExecutionSpace = TEST_EXECSPACE; using ScalarType = double; @@ -26,9 +24,19 @@ using ViewType = Kokkos::View; using ViewTypeHost = Kokkos::View; KOKKOS_FUNCTION const half_t& accept_ref(const half_t& a) { return a; } +KOKKOS_FUNCTION +double accept_ref_expected(const half_t& a) { + double tmp = static_cast(a); + return tmp; +} #if !KOKKOS_BHALF_T_IS_FLOAT KOKKOS_FUNCTION const bhalf_t& accept_ref(const bhalf_t& a) { return a; } +KOKKOS_FUNCTION +double accept_ref_expected(const bhalf_t& a) { + double tmp = static_cast(a); + return tmp; +} #endif // !KOKKOS_BHALF_T_IS_FLOAT enum OP_TESTS { @@ -886,8 +894,16 @@ struct Functor_TestHalfOperators { // actual_lhs(TW) = h_lhs <=> h_rhs; // Need C++20? // expected_lhs(TW) = d_lhs <=> d_rhs; // Need C++20? - actual_lhs(PASS_BY_REF) = static_cast(accept_ref(h_lhs)); - expected_lhs(PASS_BY_REF) = d_lhs; + actual_lhs(PASS_BY_REF) = static_cast(accept_ref(h_lhs)); + + // Use accept_ref and accept_ref_expected to ensure the compiler + // does not optimize out the casts half_type -> double -> half_type. + // Note that these casts are accompanied by rounding. 
For the bhalf_t + // epsilon, these rounding policies used for casting is enough to cause + // the unit tests to fail. + // In short, one cannot simply assign static_cast(h_lhs) to + // expected_lhs(PASS_BY_REF). + expected_lhs(PASS_BY_REF) = accept_ref_expected(h_lhs); half_tmp = static_cast(h_lhs); tmp_ptr = &(tmp_lhs = half_tmp); @@ -910,12 +926,7 @@ struct Functor_TestHalfOperators { template void __test_half_operators(half_type h_lhs, half_type h_rhs) { - double epsilon = FLT_EPSILON; - - if (std::is_same::value) - epsilon = FP16_EPSILON; - if (std::is_same::value) - epsilon = BF16_EPSILON; + double epsilon = Kokkos::Experimental::epsilon::value; Functor_TestHalfOperators f_device(h_lhs, h_rhs); Functor_TestHalfOperators f_host(h_lhs, h_rhs); From fdb089b34a3c9c087447a52709a436859d117b1f Mon Sep 17 00:00:00 2001 From: Evan Harvey <57234914+e10harvey@users.noreply.github.com> Date: Thu, 30 Mar 2023 07:04:37 -0600 Subject: [PATCH 354/496] Add UnorderedMapInsertOps for coo2crs (#5877) * Add UnorderedMapInsertOps for coo2crs * Apply suggestions from code review Co-authored-by: Daniel Arndt * Implement PR feedback Fix OpenMP test failures Fix CI build error Increase size of expected_values * Create host mirrors for checking expected values Ensure m_insert_op gets copied too * Address PR feedback. Depends on #5899. 
* Fixups from rebase and clang 15 * cleanup * Ensure if evaluates at runtime with constexpr * Do not allow insert ops on sets * Silence intel error 869 * Reduce unit test runtime * Update containers/src/Kokkos_UnorderedMap.hpp Co-authored-by: Daniel Arndt * Fix typo --------- Co-authored-by: Daniel Arndt --- containers/src/Kokkos_UnorderedMap.hpp | 60 ++++++-- containers/unit_tests/TestUnorderedMap.hpp | 157 ++++++++++++++++++--- 2 files changed, 192 insertions(+), 25 deletions(-) diff --git a/containers/src/Kokkos_UnorderedMap.hpp b/containers/src/Kokkos_UnorderedMap.hpp index 7a89b189e8..5ed51c2948 100644 --- a/containers/src/Kokkos_UnorderedMap.hpp +++ b/containers/src/Kokkos_UnorderedMap.hpp @@ -119,6 +119,36 @@ class UnorderedMapInsertResult { uint32_t m_status; }; +/// \class UnorderedMapInsertOpTypes +/// +/// \brief Operations applied to the values array upon subsequent insertions. +/// +/// The default behavior when a k,v pair already exists in the UnorderedMap is +/// to perform no operation. Alternatively, the caller may select to +/// instantiate the UnorderedMap with the AtomicAdd insert operator such that +/// duplicate keys accumulate values into the given values array entry. +/// \tparam ValueTypeView The UnorderedMap value array type. +/// \tparam ValuesIdxType The index type for lookups in the value array. +/// +/// Supported operations: +/// NoOp: the first key inserted stores the associated value. +/// AtomicAdd: duplicate key insertions sum values together. +template +struct UnorderedMapInsertOpTypes { + using value_type = typename ValueTypeView::non_const_value_type; + struct NoOp { + KOKKOS_FUNCTION + void op(ValueTypeView, ValuesIdxType, const value_type) const {} + }; + struct AtomicAdd { + KOKKOS_FUNCTION + void op(ValueTypeView values, ValuesIdxType values_idx, + const value_type v) const { + Kokkos::atomic_add(values.data() + values_idx, v); + } + }; +}; + /// \class UnorderedMap /// \brief Thread-safe, performance-portable lookup table. 
/// @@ -186,7 +216,6 @@ class UnorderedMap { public: //! \name Public types and constants //@{ - // key_types using declared_key_type = Key; using key_type = std::remove_const_t; @@ -232,7 +261,6 @@ class UnorderedMap { UnorderedMap; using histogram_type = Impl::UnorderedMapHistogram; - //@} private: @@ -263,13 +291,17 @@ class UnorderedMap { public: //! \name Public member functions //@{ + using default_op_type = + typename UnorderedMapInsertOpTypes::NoOp; /// \brief Constructor /// /// \param capacity_hint [in] Initial guess of how many unique keys will be - /// inserted into the map \param hash [in] Hasher function for \c Key - /// instances. The - /// default value usually suffices. + /// inserted into the map. + /// \param hash [in] Hasher function for \c Key instances. The + /// default value usually suffices. + /// \param equal_to [in] The operator used for determining if two + /// keys are equal. UnorderedMap(size_type capacity_hint = 0, hasher_type hasher = hasher_type(), equal_to_type equal_to = equal_to_type()) : m_bounded_insert(true), @@ -442,9 +474,18 @@ class UnorderedMap { /// \param v [in] The corresponding value to attempt to insert. If /// using this class as a set (with Value = void), then you need not /// provide this value. - KOKKOS_INLINE_FUNCTION - insert_result insert(key_type const &k, - impl_value_type const &v = impl_value_type()) const { + /// \param insert_op [in] The operator used for combining values if a + /// key already exists. See + /// Kokkos::UnorderedMapInsertOpTypes for more ops. 
+ template + KOKKOS_INLINE_FUNCTION insert_result + insert(key_type const &k, impl_value_type const &v = impl_value_type(), + [[maybe_unused]] InsertOpType arg_insert_op = InsertOpType()) const { + if constexpr (is_set) { + static_assert(std::is_same_v, + "Insert Operations are not supported on sets."); + } + insert_result result; if (!is_insertable_map || capacity() == 0u || @@ -532,6 +573,9 @@ class UnorderedMap { } result.set_existing(curr, free_existing); + if constexpr (!is_set) { + arg_insert_op.op(m_values, curr, v); + } not_done = false; } //------------------------------------------------------------ diff --git a/containers/unit_tests/TestUnorderedMap.hpp b/containers/unit_tests/TestUnorderedMap.hpp index bcaf6d77c4..977d3dd945 100644 --- a/containers/unit_tests/TestUnorderedMap.hpp +++ b/containers/unit_tests/TestUnorderedMap.hpp @@ -25,18 +25,31 @@ namespace Test { namespace Impl { -template +template struct TestInsert { using map_type = MapType; using execution_space = typename map_type::execution_space; using value_type = uint32_t; + struct ExpectedValues { + unsigned map_idx; + typename map_type::value_type v; + }; + using expected_values_type = Kokkos::View; + expected_values_type expected_values; + map_type map; uint32_t inserts; uint32_t collisions; + InsertOp insert_op; TestInsert(map_type arg_map, uint32_t arg_inserts, uint32_t arg_collisions) - : map(arg_map), inserts(arg_inserts), collisions(arg_collisions) {} + : map(arg_map), inserts(arg_inserts), collisions(arg_collisions) { + auto len = map.capacity() > arg_inserts ? 
map.capacity() : arg_inserts; + expected_values = expected_values_type("ExpectedValues", len); + } void testit(bool rehash_on_fail = true) { execution_space().fence(); @@ -60,6 +73,18 @@ struct TestInsert { Kokkos::deep_copy(map_h, map); execution_space().fence(); ASSERT_EQ(map_h.size(), map.size()); + + if (!rehash_on_fail && CheckValues) { + typename expected_values_type::HostMirror expected_values_h = + create_mirror_view(expected_values); + Kokkos::deep_copy(expected_values_h, expected_values); + for (unsigned i = 0; i < map_h.size(); i++) { + auto map_idx = expected_values_h(i).map_idx; + if (map_idx != static_cast(~0)) { + ASSERT_EQ(expected_values_h(map_idx).v, map_h.value_at(map_idx)); + } + } + } } KOKKOS_INLINE_FUNCTION @@ -70,10 +95,47 @@ struct TestInsert { failed_count += count; } + template + KOKKOS_FORCEINLINE_FUNCTION bool is_op_noop() const { + using vt = typename map_type::value_type; + using Device = typename map_type::device_type; + using UmapOpTypeArg1 = Kokkos::View< + std::remove_const_t, int, vt>> *, + Device>; + return std::is_base_of_v< + InsertOp, typename Kokkos::UnorderedMapInsertOpTypes::NoOp>; + } + + template + KOKKOS_FORCEINLINE_FUNCTION bool is_op_atomic_add() const { + using vt = typename map_type::value_type; + using Device = typename map_type::device_type; + using UmapOpTypeArg1 = Kokkos::View< + std::remove_const_t, int, vt>> *, + Device>; + return std::is_base_of_v::AtomicAdd>; + } + KOKKOS_INLINE_FUNCTION void operator()(uint32_t i, value_type &failed_count) const { const uint32_t key = Near ? 
i / collisions : i % (inserts / collisions); - if (map.insert(key, i).failed()) ++failed_count; + auto ret = map.insert(key, i, insert_op); + if (ret.failed()) { + ++failed_count; + expected_values(i).map_idx = static_cast(~0); + } else if (CheckValues) { + auto map_idx = map.find(key); + expected_values(map_idx).map_idx = map_idx; + auto ptr = expected_values.data(); + if (is_op_atomic_add()) { + Kokkos::atomic_add(&((ptr + map_idx)[0].v), i); + } else if (ret.success() && is_op_noop()) { + Kokkos::atomic_store(&((ptr + map_idx)[0].v), i); + } + } } }; @@ -154,26 +216,26 @@ struct TestFind { // MSVC reports a syntax error for this test. // WORKAROUND MSVC #ifndef _WIN32 -template +template void test_insert(uint32_t num_nodes, uint32_t num_inserts, uint32_t num_duplicates, bool near) { - using map_type = Kokkos::UnorderedMap; - using const_map_type = - Kokkos::UnorderedMap; - const uint32_t expected_inserts = (num_inserts + num_duplicates - 1u) / num_duplicates; + typename map_type::size_type arg_capacity_hint = 0; + typename map_type::hasher_type arg_hasher; + typename map_type::equal_to_type arg_equal_to; - map_type map; + map_type map(arg_capacity_hint, arg_hasher, arg_equal_to); map.rehash(num_nodes, false); if (near) { - Impl::TestInsert test_insert(map, num_inserts, - num_duplicates); + Impl::TestInsert test_insert( + map, num_inserts, num_duplicates); test_insert.testit(); } else { - Impl::TestInsert test_insert(map, num_inserts, - num_duplicates); + Impl::TestInsert test_insert( + map, num_inserts, num_duplicates); test_insert.testit(); } @@ -191,8 +253,7 @@ void test_insert(uint32_t num_nodes, uint32_t num_inserts, { uint32_t find_errors = 0; - Impl::TestFind test_find(map, num_inserts, - num_duplicates); + Impl::TestFind test_find(map, num_inserts, num_duplicates); test_find.testit(find_errors); EXPECT_EQ(0u, find_errors); } @@ -204,6 +265,64 @@ void test_insert(uint32_t num_nodes, uint32_t num_inserts, map.end_erase(); EXPECT_EQ(0u, map.size()); } + + // 
Check the values from the insert operation + { + Impl::TestInsert test_insert( + map, num_inserts, num_duplicates); + test_insert.testit(false); + } +} + +template +void test_inserts(uint32_t num_nodes, uint32_t num_inserts, + uint32_t num_duplicates, bool near) { + using key_type = uint32_t; + using value_type = uint32_t; + using value_view_type = Kokkos::View; + using size_type = uint32_t; + using hasher_type = typename Kokkos::pod_hash; + using equal_to_type = typename Kokkos::pod_equal_to; + + using map_op_type = + Kokkos::UnorderedMapInsertOpTypes; + using noop_type = typename map_op_type::NoOp; + + using map_type = Kokkos::UnorderedMap; + using const_map_type = + Kokkos::UnorderedMap; + + test_insert( + num_nodes, num_inserts, num_duplicates, near); +} + +template +void test_all_insert_ops(uint32_t num_nodes, uint32_t num_inserts, + uint32_t num_duplicates, bool near) { + using key_type = uint32_t; + using value_type = uint32_t; + using value_view_type = Kokkos::View; + using size_type = uint32_t; + using hasher_type = typename Kokkos::pod_hash; + using equal_to_type = typename Kokkos::pod_equal_to; + + using map_op_type = + Kokkos::UnorderedMapInsertOpTypes; + using noop_type = typename map_op_type::NoOp; + using atomic_add_type = typename map_op_type::AtomicAdd; + + using map_type = Kokkos::UnorderedMap; + using const_map_type = + Kokkos::UnorderedMap; + + test_insert( + num_nodes, num_inserts, num_duplicates, near); + test_insert( + num_nodes, num_inserts, num_duplicates, near); } #endif @@ -279,8 +398,12 @@ TEST(TEST_CATEGORY, UnorderedMap_insert) { } #endif for (int i = 0; i < 500; ++i) { - test_insert(100000, 90000, 100, true); - test_insert(100000, 90000, 100, false); + test_inserts(100000, 90000, 100, true); + test_inserts(100000, 90000, 100, false); + } + for (int i = 0; i < 5; ++i) { + test_all_insert_ops(1000, 900, 10, true); + test_all_insert_ops(1000, 900, 10, false); } } #endif From 9b06259fa22edf9cb874c78a960cef9e8dce7fef Mon Sep 17 00:00:00 
2001 From: Nicolas Morales Date: Fri, 31 Mar 2023 18:15:43 -0700 Subject: [PATCH 355/496] #6027: replace remaining instances of ALL_t with Kokkos::ALL_t (#6028) * #6027: replace remaining instances of ALL_t with Kokkos::ALL_t * #6027: fix formatting --- core/src/impl/Kokkos_ViewMapping.hpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/core/src/impl/Kokkos_ViewMapping.hpp b/core/src/impl/Kokkos_ViewMapping.hpp index fb590820d7..4df9fadc83 100644 --- a/core/src/impl/Kokkos_ViewMapping.hpp +++ b/core/src/impl/Kokkos_ViewMapping.hpp @@ -565,7 +565,8 @@ struct SubviewExtents { // std::pair range template void error(char* buf, int buf_len, unsigned domain_rank, unsigned range_rank, - const ViewDimension& dim, ALL_t, Args... args) const { + const ViewDimension& dim, Kokkos::ALL_t, + Args... args) const { const int n = std::min(buf_len, snprintf(buf, buf_len, " Kokkos::ALL %c", int(sizeof...(Args) ? ',' : ')'))); @@ -3773,8 +3774,8 @@ struct SubViewDataTypeImpl< /* for ALL slice, subview has the same dimension */ template struct SubViewDataTypeImpl, ALL_t, - Args...> + Kokkos::Experimental::Extents, + Kokkos::ALL_t, Args...> : SubViewDataTypeImpl::type, Kokkos::Experimental::Extents, Args...> {}; From ef1ea9343c9b65158f58fb987dc5e51c74e08312 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Fri, 31 Mar 2023 21:21:49 -0400 Subject: [PATCH 356/496] Add -Wdeprecated-copy warning and fix OMPT scan bug related to assignment operators (#6026) * Add -Wdeprecated-copy warning * Drop -Wdeprecated-copy flag --- .../impl/Kokkos_PartitionCopy.hpp | 19 ---------- ...Kokkos_ValueWrapperForNoNeutralElement.hpp | 6 --- containers/src/Kokkos_ScatterView.hpp | 9 +++-- core/src/Kokkos_Half.hpp | 10 +++++ core/src/Kokkos_Parallel_Reduce.hpp | 37 ------------------- ...Kokkos_OpenMPTarget_ParallelScan_Range.hpp | 2 +- core/unit_test/TestTeamBasic.hpp | 3 ++ core/unit_test/TestTeamTeamSize.hpp | 4 -- .../incremental/Test14_MDRangeReduce.hpp | 7 ++++ 9 files changed, 
27 insertions(+), 70 deletions(-) diff --git a/algorithms/src/std_algorithms/impl/Kokkos_PartitionCopy.hpp b/algorithms/src/std_algorithms/impl/Kokkos_PartitionCopy.hpp index 5457ae2508..54f7c5b612 100644 --- a/algorithms/src/std_algorithms/impl/Kokkos_PartitionCopy.hpp +++ b/algorithms/src/std_algorithms/impl/Kokkos_PartitionCopy.hpp @@ -31,25 +31,6 @@ template struct StdPartitionCopyScalar { ValueType true_count_; ValueType false_count_; - - // Here we implement the copy assignment operators explicitly for consistency - // with how the Scalar structs are implemented inside - // Kokkos_Parallel_Reduce.hpp. - KOKKOS_FUNCTION - void operator=(const StdPartitionCopyScalar& other) { - true_count_ = other.true_count_; - false_count_ = other.false_count_; - } - - // this is needed for - // OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp:699:21: error: no viable - // overloaded '=' m_returnvalue = 0; - // - KOKKOS_FUNCTION - void operator=(const ValueType value) { - true_count_ = value; - false_count_ = value; - } }; template struct ValueWrapperForNoNeutralElement { Scalar val; bool is_initial = true; - - KOKKOS_FUNCTION - void operator=(const ValueWrapperForNoNeutralElement& rhs) { - val = rhs.val; - is_initial = rhs.is_initial; - } }; } // namespace Impl diff --git a/containers/src/Kokkos_ScatterView.hpp b/containers/src/Kokkos_ScatterView.hpp index 527ab36aae..f2ef3fa162 100644 --- a/containers/src/Kokkos_ScatterView.hpp +++ b/containers/src/Kokkos_ScatterView.hpp @@ -721,10 +721,11 @@ class ScatterView - KOKKOS_FUNCTION void operator=( + KOKKOS_FUNCTION ScatterView& operator=( const ScatterView& other_view) { internal_view = other_view.internal_view; + return *this; } template @@ -942,11 +943,12 @@ class ScatterView - KOKKOS_FUNCTION void operator=( + KOKKOS_FUNCTION ScatterView& operator=( const ScatterView& other_view) { unique_token = other_view.unique_token; internal_view = other_view.internal_view; + return *this; } template @@ -1278,11 +1280,12 @@ class 
ScatterView - KOKKOS_FUNCTION void operator=( + KOKKOS_FUNCTION ScatterView& operator=( const ScatterView& other_view) { unique_token = other_view.unique_token; internal_view = other_view.internal_view; + return *this; } template diff --git a/core/src/Kokkos_Half.hpp b/core/src/Kokkos_Half.hpp index 82dd55549d..48d1e647e6 100644 --- a/core/src/Kokkos_Half.hpp +++ b/core/src/Kokkos_Half.hpp @@ -228,9 +228,19 @@ class alignas(FloatType) floating_point_wrapper { #if defined(_WIN32) && defined(KOKKOS_ENABLE_CUDA) KOKKOS_FUNCTION floating_point_wrapper(const floating_point_wrapper& rhs) : val(rhs.val) {} + + KOKKOS_FUNCTION + floating_point_wrapper& operator=(const floating_point_wrapper& rhs) { + val = rhs.val; + return *this; + } #else KOKKOS_DEFAULTED_FUNCTION floating_point_wrapper(const floating_point_wrapper&) noexcept = default; + + KOKKOS_DEFAULTED_FUNCTION + floating_point_wrapper& operator=(const floating_point_wrapper&) noexcept = + default; #endif KOKKOS_INLINE_FUNCTION diff --git a/core/src/Kokkos_Parallel_Reduce.hpp b/core/src/Kokkos_Parallel_Reduce.hpp index 7fe539c4c6..fa4b401ce0 100644 --- a/core/src/Kokkos_Parallel_Reduce.hpp +++ b/core/src/Kokkos_Parallel_Reduce.hpp @@ -407,12 +407,6 @@ template struct ValLocScalar { Scalar val; Index loc; - - KOKKOS_INLINE_FUNCTION - void operator=(const ValLocScalar& rhs) { - val = rhs.val; - loc = rhs.loc; - } }; template @@ -530,12 +524,6 @@ MaxLoc(View, Properties...> const&) template struct MinMaxScalar { Scalar min_val, max_val; - - KOKKOS_INLINE_FUNCTION - void operator=(const MinMaxScalar& rhs) { - min_val = rhs.min_val; - max_val = rhs.max_val; - } }; template @@ -600,14 +588,6 @@ template struct MinMaxLocScalar { Scalar min_val, max_val; Index min_loc, max_loc; - - KOKKOS_INLINE_FUNCTION - void operator=(const MinMaxLocScalar& rhs) { - min_val = rhs.min_val; - min_loc = rhs.min_loc; - max_val = rhs.max_val; - max_loc = rhs.max_loc; - } }; template @@ -1106,9 +1086,6 @@ MinMaxFirstLastLocCustomComparator( 
template struct FirstLocScalar { Index min_loc_true; - - KOKKOS_INLINE_FUNCTION - void operator=(const FirstLocScalar& rhs) { min_loc_true = rhs.min_loc_true; } }; template @@ -1170,9 +1147,6 @@ FirstLoc(View, Properties...> const&) template struct LastLocScalar { Index max_loc_true; - - KOKKOS_INLINE_FUNCTION - void operator=(const LastLocScalar& rhs) { max_loc_true = rhs.max_loc_true; } }; template @@ -1231,12 +1205,6 @@ LastLoc(View, Properties...> const&) template struct StdIsPartScalar { Index max_loc_true, min_loc_false; - - KOKKOS_INLINE_FUNCTION - void operator=(const StdIsPartScalar& rhs) { - min_loc_false = rhs.min_loc_false; - max_loc_true = rhs.max_loc_true; - } }; // @@ -1304,11 +1272,6 @@ StdIsPartitioned(View, Properties...> const&) template struct StdPartPointScalar { Index min_loc_false; - - KOKKOS_INLINE_FUNCTION - void operator=(const StdPartPointScalar& rhs) { - min_loc_false = rhs.min_loc_false; - } }; // diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Range.hpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Range.hpp index dd5aa0878e..f95a4610d9 100644 --- a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Range.hpp +++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Range.hpp @@ -241,7 +241,7 @@ class ParallelScanWithTotal, base_t::m_result_ptr, chunk_values.data() + (n_chunks - 1), size); } } else if (!base_t::m_result_ptr_device_accessible) { - *base_t::m_result_ptr = 0; + base_t::m_functor_reducer.get_reducer().init(base_t::m_result_ptr); } } diff --git a/core/unit_test/TestTeamBasic.hpp b/core/unit_test/TestTeamBasic.hpp index 5ee8629656..70aa34d22a 100644 --- a/core/unit_test/TestTeamBasic.hpp +++ b/core/unit_test/TestTeamBasic.hpp @@ -247,6 +247,9 @@ struct long_wrapper { KOKKOS_FUNCTION long_wrapper(long val) : value(val) {} + KOKKOS_FUNCTION + long_wrapper(const long_wrapper& val) : value(val.value) {} + KOKKOS_FUNCTION friend void operator+=(long_wrapper& lhs, const long_wrapper& rhs) { 
lhs.value += rhs.value; diff --git a/core/unit_test/TestTeamTeamSize.hpp b/core/unit_test/TestTeamTeamSize.hpp index 7a6a4dd581..b4304fc2eb 100644 --- a/core/unit_test/TestTeamTeamSize.hpp +++ b/core/unit_test/TestTeamTeamSize.hpp @@ -31,10 +31,6 @@ class MyArray { void operator+=(const MyArray& src) { for (int i = 0; i < N; i++) values[i] += src.values[i]; } - KOKKOS_INLINE_FUNCTION - void operator=(const MyArray& src) { - for (int i = 0; i < N; i++) values[i] = src.values[i]; - } }; template diff --git a/core/unit_test/incremental/Test14_MDRangeReduce.hpp b/core/unit_test/incremental/Test14_MDRangeReduce.hpp index deffe88313..f79d4f8ea3 100644 --- a/core/unit_test/incremental/Test14_MDRangeReduce.hpp +++ b/core/unit_test/incremental/Test14_MDRangeReduce.hpp @@ -38,6 +38,13 @@ struct MyComplex { KOKKOS_INLINE_FUNCTION MyComplex(const MyComplex& src) : _re(src._re), _im(src._im) {} + KOKKOS_INLINE_FUNCTION + MyComplex& operator=(const MyComplex& src) { + _re = src._re; + _im = src._im; + return *this; + } + KOKKOS_INLINE_FUNCTION void operator+=(const MyComplex& src) { _re += src._re; From 5e574380cbca98ec45d92f62fe861d38aab0d825 Mon Sep 17 00:00:00 2001 From: Nick Curtis Date: Mon, 3 Apr 2023 10:06:48 -0400 Subject: [PATCH 357/496] Relax scratch space limits for HIP reductions (#6029) * relax scratch space limits for reductions Change-Id: I66c73cb3e53b99d8d0b59f423ae77263ca7e176f * apply clang-format Change-Id: I1400b8e9bc8c0a5af237283e7fc4db641d05b3f8 --------- Co-authored-by: Nicholas Curtis --- core/src/HIP/Kokkos_HIP_Parallel_Range.hpp | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/core/src/HIP/Kokkos_HIP_Parallel_Range.hpp b/core/src/HIP/Kokkos_HIP_Parallel_Range.hpp index 757250ad4d..442e794aa9 100644 --- a/core/src/HIP/Kokkos_HIP_Parallel_Range.hpp +++ b/core/src/HIP/Kokkos_HIP_Parallel_Range.hpp @@ -294,18 +294,18 @@ class ParallelReduce, "valid execution configuration.")); } - m_scratch_space = 
::Kokkos::Impl::hip_internal_scratch_space( - m_policy.space(), reducer.value_size() * - block_size /* block_size == max block_count */); - m_scratch_flags = ::Kokkos::Impl::hip_internal_scratch_flags( - m_policy.space(), sizeof(size_type)); - // REQUIRED ( 1 , N , 1 ) dim3 block(1, block_size, 1); + // use a slightly less constrained, but still well bounded limit for + // scratch + uint32_t nblocks = static_cast((nwork + block.y - 1) / block.y); + nblocks = std::min(nblocks, 4096u); + m_scratch_space = ::Kokkos::Impl::hip_internal_scratch_space( + m_policy.space(), reducer.value_size() * nblocks); + m_scratch_flags = ::Kokkos::Impl::hip_internal_scratch_flags( + m_policy.space(), sizeof(size_type)); // Required grid.x <= block.y - dim3 grid(std::min(block.y, static_cast((nwork + block.y - 1) / - block.y)), - 1, 1); + dim3 grid(nblocks, 1, 1); if (nwork == 0) { block = dim3(1, 1, 1); From 0ce389590d68bb801939d64229fbe8a0720c7ad1 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Mon, 27 Mar 2023 21:32:35 -0400 Subject: [PATCH 358/496] Fix -Wmissing-field-initializers warning --- core/src/impl/Kokkos_Profiling.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/core/src/impl/Kokkos_Profiling.cpp b/core/src/impl/Kokkos_Profiling.cpp index e3cfcb6a29..46f699ac8c 100644 --- a/core/src/impl/Kokkos_Profiling.cpp +++ b/core/src/impl/Kokkos_Profiling.cpp @@ -177,7 +177,8 @@ Kokkos::Tools::Impl::InitializationStatus parse_environment_variables( args = env_tools_args; } return { - Kokkos::Tools::Impl::InitializationStatus::InitializationResult::success}; + Kokkos::Tools::Impl::InitializationStatus::InitializationResult::success, + ""}; } InitializationStatus initialize_tools_subsystem( const Kokkos::Tools::InitArguments& args) { @@ -192,13 +193,13 @@ InitializationStatus initialize_tools_subsystem( if (!Kokkos::Tools::printHelp(final_args)) { std::cerr << "Tool has not provided a help message" << std::endl; } - return 
{InitializationStatus::InitializationResult::help_request}; + return {InitializationStatus::InitializationResult::help_request, ""}; } Kokkos::Tools::parseArgs(final_args); #else (void)args; #endif - return {InitializationStatus::InitializationResult::success}; + return {InitializationStatus::InitializationResult::success, ""}; } } // namespace Impl From be65fe4298756153373dc1138ea22b43721f4cb5 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Tue, 4 Apr 2023 15:54:10 -0400 Subject: [PATCH 359/496] Fix enum warnings --- containers/src/Kokkos_OffsetView.hpp | 6 +++--- core/src/Kokkos_View.hpp | 4 ++-- core/src/impl/Kokkos_ViewArray.hpp | 2 +- core/src/impl/Kokkos_ViewMapping.hpp | 32 +++++++++++++++------------- core/unit_test/TestTaskScheduler.hpp | 6 +++--- 5 files changed, 26 insertions(+), 24 deletions(-) diff --git a/containers/src/Kokkos_OffsetView.hpp b/containers/src/Kokkos_OffsetView.hpp index 9716e4e0f4..22b65f3f9f 100644 --- a/containers/src/Kokkos_OffsetView.hpp +++ b/containers/src/Kokkos_OffsetView.hpp @@ -938,10 +938,10 @@ class OffsetView : public ViewTraits { ")" "\n"; - // If there are no errors so far, then rank == Rank + // If there are no errors so far, then arg_rank == Rank // Otherwise, check as much as possible - size_t rank = begins.size() < ends.size() ? begins.size() : ends.size(); - for (size_t i = 0; i != rank; ++i) { + size_t arg_rank = begins.size() < ends.size() ? 
begins.size() : ends.size(); + for (size_t i = 0; i != arg_rank; ++i) { subtraction_failure sf = check_subtraction(at(ends, i), at(begins, i)); if (sf != subtraction_failure::none) { message += diff --git a/core/src/Kokkos_View.hpp b/core/src/Kokkos_View.hpp index 1e399f9c59..ba1cd3583f 100644 --- a/core/src/Kokkos_View.hpp +++ b/core/src/Kokkos_View.hpp @@ -316,8 +316,8 @@ struct ViewTraits { typename prop::specialize, typename data_analysis::specialize>; /* mapping specialization tag */ - enum { rank = dimension::rank }; - enum { rank_dynamic = dimension::rank_dynamic }; + static constexpr unsigned rank = dimension::rank; + static constexpr unsigned rank_dynamic = dimension::rank_dynamic; //------------------------------------ // Execution space, memory space, memory access traits, and host mirror space. diff --git a/core/src/impl/Kokkos_ViewArray.hpp b/core/src/impl/Kokkos_ViewArray.hpp index c76bde4993..725ba5de09 100644 --- a/core/src/impl/Kokkos_ViewArray.hpp +++ b/core/src/impl/Kokkos_ViewArray.hpp @@ -123,7 +123,7 @@ class ViewMapping> { //---------------------------------------- // Domain dimensions - enum { Rank = Traits::dimension::rank }; + static constexpr unsigned Rank = Traits::dimension::rank; template KOKKOS_INLINE_FUNCTION constexpr size_t extent(const iType &r) const { diff --git a/core/src/impl/Kokkos_ViewMapping.hpp b/core/src/impl/Kokkos_ViewMapping.hpp index 4df9fadc83..232a05bbc4 100644 --- a/core/src/impl/Kokkos_ViewMapping.hpp +++ b/core/src/impl/Kokkos_ViewMapping.hpp @@ -159,8 +159,8 @@ struct KOKKOS_IMPL_ENFORCE_EMPTY_BASE_OPTIMIZATION ViewDimension using D6::N6; using D7::N7; - enum : unsigned { rank = sizeof...(Vals) }; - enum : unsigned { rank_dynamic = Impl::rank_dynamic::value }; + static constexpr unsigned rank = sizeof...(Vals); + static constexpr unsigned rank_dynamic = Impl::rank_dynamic::value; ViewDimension() = default; ViewDimension(const ViewDimension&) = default; @@ -2189,7 +2189,8 @@ struct ViewStride; template <> struct 
ViewStride<0> { - enum { S0 = 0, S1 = 0, S2 = 0, S3 = 0, S4 = 0, S5 = 0, S6 = 0, S7 = 0 }; + static constexpr size_t S0 = 0, S1 = 0, S2 = 0, S3 = 0, S4 = 0, S5 = 0, + S6 = 0, S7 = 0; ViewStride() = default; ViewStride(const ViewStride&) = default; @@ -2203,7 +2204,8 @@ struct ViewStride<0> { template <> struct ViewStride<1> { size_t S0; - enum { S1 = 0, S2 = 0, S3 = 0, S4 = 0, S5 = 0, S6 = 0, S7 = 0 }; + static constexpr size_t S1 = 0, S2 = 0, S3 = 0, S4 = 0, S5 = 0, S6 = 0, + S7 = 0; ViewStride() = default; ViewStride(const ViewStride&) = default; @@ -2218,7 +2220,7 @@ struct ViewStride<1> { template <> struct ViewStride<2> { size_t S0, S1; - enum { S2 = 0, S3 = 0, S4 = 0, S5 = 0, S6 = 0, S7 = 0 }; + static constexpr size_t S2 = 0, S3 = 0, S4 = 0, S5 = 0, S6 = 0, S7 = 0; ViewStride() = default; ViewStride(const ViewStride&) = default; @@ -2233,7 +2235,7 @@ struct ViewStride<2> { template <> struct ViewStride<3> { size_t S0, S1, S2; - enum { S3 = 0, S4 = 0, S5 = 0, S6 = 0, S7 = 0 }; + static constexpr size_t S3 = 0, S4 = 0, S5 = 0, S6 = 0, S7 = 0; ViewStride() = default; ViewStride(const ViewStride&) = default; @@ -2248,7 +2250,7 @@ struct ViewStride<3> { template <> struct ViewStride<4> { size_t S0, S1, S2, S3; - enum { S4 = 0, S5 = 0, S6 = 0, S7 = 0 }; + static constexpr size_t S4 = 0, S5 = 0, S6 = 0, S7 = 0; ViewStride() = default; ViewStride(const ViewStride&) = default; @@ -2263,7 +2265,7 @@ struct ViewStride<4> { template <> struct ViewStride<5> { size_t S0, S1, S2, S3, S4; - enum { S5 = 0, S6 = 0, S7 = 0 }; + static constexpr size_t S5 = 0, S6 = 0, S7 = 0; ViewStride() = default; ViewStride(const ViewStride&) = default; @@ -2278,7 +2280,7 @@ struct ViewStride<5> { template <> struct ViewStride<6> { size_t S0, S1, S2, S3, S4, S5; - enum { S6 = 0, S7 = 0 }; + static constexpr size_t S6 = 0, S7 = 0; ViewStride() = default; ViewStride(const ViewStride&) = default; @@ -2293,7 +2295,7 @@ struct ViewStride<6> { template <> struct ViewStride<7> { size_t S0, S1, S2, 
S3, S4, S5, S6; - enum { S7 = 0 }; + static constexpr size_t S7 = 0; ViewStride() = default; ViewStride(const ViewStride&) = default; @@ -3146,7 +3148,7 @@ class ViewMapping< //---------------------------------------- // Domain dimensions - enum { Rank = Traits::dimension::rank }; + static constexpr unsigned Rank = Traits::dimension::rank; template KOKKOS_INLINE_FUNCTION constexpr size_t extent(const iType& r) const { @@ -3662,7 +3664,7 @@ class ViewMapping< size_t exp_stride = 1; if (std::is_same::value) { - for (int i = 0; i < src.Rank; i++) { + for (unsigned int i = 0; i < src.Rank; i++) { if (i > 0) exp_stride *= src.extent(i - 1); if (strides[i] != exp_stride) { assignable = false; @@ -3671,9 +3673,9 @@ class ViewMapping< } } else if (std::is_same::value) { - for (int i = src.Rank - 1; i >= 0; i--) { - if (i < src.Rank - 1) exp_stride *= src.extent(i + 1); - if (strides[i] != exp_stride) { + for (unsigned int i = 0; i < src.Rank; i++) { + if (i > 0) exp_stride *= src.extent(src.Rank - i); + if (strides[src.Rank - 1 - i] != exp_stride) { assignable = false; break; } diff --git a/core/unit_test/TestTaskScheduler.hpp b/core/unit_test/TestTaskScheduler.hpp index 5a0394f6c1..1d72662ad2 100644 --- a/core/unit_test/TestTaskScheduler.hpp +++ b/core/unit_test/TestTaskScheduler.hpp @@ -170,9 +170,9 @@ struct TestTaskDependence { KOKKOS_INLINE_FUNCTION void operator()(typename sched_type::member_type& member) { - auto& sched = member.scheduler(); - enum { CHUNK = 8 }; - const int n = CHUNK < m_count ? CHUNK : m_count; + auto& sched = member.scheduler(); + static constexpr int CHUNK = 8; + const int n = CHUNK < m_count ? 
CHUNK : m_count; if (1 < m_count) { const int increment = (m_count + n - 1) / n; From 9b644e07a2fec6bf13e24face244454b737fb110 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Tue, 4 Apr 2023 15:54:35 -0400 Subject: [PATCH 360/496] Fix OMPT size compare warnings --- cmake/kokkos_arch.cmake | 5 ----- core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp | 2 +- core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp | 2 +- 3 files changed, 2 insertions(+), 7 deletions(-) diff --git a/cmake/kokkos_arch.cmake b/cmake/kokkos_arch.cmake index 3e1e448c76..7c7f9ae98e 100644 --- a/cmake/kokkos_arch.cmake +++ b/cmake/kokkos_arch.cmake @@ -129,11 +129,6 @@ IF(KOKKOS_ENABLE_COMPILER_WARNINGS) LIST(REMOVE_ITEM COMMON_WARNINGS "-pedantic") ENDIF() - # OpenMPTarget compilers give erroneous warnings about sign comparison in loops - IF(KOKKOS_ENABLE_OPENMPTARGET) - LIST(REMOVE_ITEM COMMON_WARNINGS "-Wsign-compare") - ENDIF() - # NVHPC compiler does not support -Wtype-limits. IF(KOKKOS_ENABLE_OPENACC) IF(KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp index 40da73ebc6..45b8f42f17 100644 --- a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp +++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp @@ -72,7 +72,7 @@ void OpenMPTargetExec::verify_initialized(const char* const label) { void* OpenMPTargetExec::m_scratch_ptr = nullptr; int64_t OpenMPTargetExec::m_scratch_size = 0; int* OpenMPTargetExec::m_lock_array = nullptr; -int64_t OpenMPTargetExec::m_lock_size = 0; +uint64_t OpenMPTargetExec::m_lock_size = 0; uint32_t* OpenMPTargetExec::m_uniquetoken_ptr = nullptr; void OpenMPTargetExec::clear_scratch() { diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp index 5e898727f1..50167e297b 100644 --- a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp +++ 
b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp @@ -748,7 +748,7 @@ class OpenMPTargetExec { static void* m_scratch_ptr; static int64_t m_scratch_size; static int* m_lock_array; - static int64_t m_lock_size; + static uint64_t m_lock_size; static uint32_t* m_uniquetoken_ptr; }; From b57c17bb288a39e203f67cc27af568c77157fe07 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Tue, 4 Apr 2023 15:54:49 -0400 Subject: [PATCH 361/496] Add -Wextra --- cmake/kokkos_arch.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/kokkos_arch.cmake b/cmake/kokkos_arch.cmake index 7c7f9ae98e..2ed5d1c610 100644 --- a/cmake/kokkos_arch.cmake +++ b/cmake/kokkos_arch.cmake @@ -120,7 +120,7 @@ KOKKOS_ARCH_OPTION(INTEL_PVC GPU "Intel GPU Ponte Vecchio" IF(KOKKOS_ENABLE_COMPILER_WARNINGS) SET(COMMON_WARNINGS - "-Wall" "-Wunused-parameter" "-Wshadow" "-pedantic" + "-Wall" "-Wextra" "-Wunused-parameter" "-Wshadow" "-pedantic" "-Wsign-compare" "-Wtype-limits" "-Wuninitialized") # NOTE KOKKOS_ prefixed variable (all uppercase) is not set yet because TPLs are processed after ARCH From c9a9ee0d521e79760cd081c30f96ca64c7679efd Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Wed, 5 Apr 2023 14:34:08 -0400 Subject: [PATCH 362/496] Cherry-pick TriBITS update from Trilinos --- CMakeLists.txt | 11 +++++++-- cmake/kokkos_functions.cmake | 9 ++++++++ cmake/kokkos_tribits.cmake | 43 ++++++++++++++---------------------- 3 files changed, 35 insertions(+), 28 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index ac0dbe70dd..c82910708a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -5,13 +5,16 @@ if( "${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_BINARY_DIR}" ) message( FATAL_ERROR "FATAL: In-source builds are not allowed. You should create a separate directory for build files and delete CMakeCache.txt." 
) endif() +if (COMMAND TRIBITS_PACKAGE_DECL) + TRIBITS_PACKAGE_DECL(Kokkos) +endif() + # We want to determine if options are given with the wrong case # In order to detect which arguments are given to compare against # the list of valid arguments, at the beginning here we need to # form a list of all the given variables. If it begins with any # case of KoKkOS, we add it to the list. - GET_CMAKE_PROPERTY(_variableNames VARIABLES) SET(KOKKOS_GIVEN_VARIABLES) FOREACH (var ${_variableNames}) @@ -123,6 +126,8 @@ IF(NOT KOKKOS_HAS_TRILINOS) FORCE) ENDIF() ENDIF() +ELSE() + SET(KOKKOS_COMPILE_LANGUAGE CXX) ENDIF() IF (NOT CMAKE_SIZEOF_VOID_P) @@ -289,7 +294,9 @@ IF (KOKKOS_HAS_TRILINOS) $<$:${KOKKOS_ALL_COMPILE_OPTIONS}>) ENDIF() -KOKKOS_PACKAGE_DECL() +if (NOT COMMAND TRIBITS_PACKAGE_DECL) + KOKKOS_PACKAGE_DECL() +endif() #------------------------------------------------------------------------------ diff --git a/cmake/kokkos_functions.cmake b/cmake/kokkos_functions.cmake index 4c51bdeabf..55b1ebbf81 100644 --- a/cmake/kokkos_functions.cmake +++ b/cmake/kokkos_functions.cmake @@ -5,6 +5,9 @@ # Validate options are given with correct case and define an internal # upper-case version for use within +set(Kokkos_OPTIONS_NOT_TO_EXPORT + Kokkos_ENABLE_TESTS Kokkos_ENABLE_EXAMPLES) + # # # @FUNCTION: kokkos_deprecated_list @@ -57,6 +60,12 @@ FUNCTION(kokkos_option CAMEL_SUFFIX DEFAULT TYPE DOCSTRING) # Make sure this appears in the cache with the appropriate DOCSTRING SET(${CAMEL_NAME} ${DEFAULT} CACHE ${TYPE} ${DOCSTRING}) + IF (KOKKOS_HAS_TRILINOS) + IF (NOT CAMEL_NAME IN_LIST Kokkos_OPTIONS_NOT_TO_EXPORT) + TRIBITS_PKG_EXPORT_CACHE_VAR(${CAMEL_NAME}) + ENDIF() + ENDIF() + #I don't love doing it this way because it's N^2 in number options, but c'est la vie FOREACH(opt ${KOKKOS_GIVEN_VARIABLES}) STRING(TOUPPER ${opt} OPT_UC) diff --git a/cmake/kokkos_tribits.cmake b/cmake/kokkos_tribits.cmake index 0557db2098..0f39551423 100644 --- a/cmake/kokkos_tribits.cmake +++ 
b/cmake/kokkos_tribits.cmake @@ -353,6 +353,7 @@ MACRO(KOKKOS_INSTALL_ADDITIONAL_FILES) DESTINATION ${KOKKOS_HEADER_DIR}) ENDMACRO() + FUNCTION(KOKKOS_SET_LIBRARY_PROPERTIES LIBRARY_NAME) CMAKE_PARSE_ARGUMENTS(PARSE "PLAIN_STYLE" @@ -441,6 +442,7 @@ FUNCTION(KOKKOS_SET_LIBRARY_PROPERTIES LIBRARY_NAME) ENDIF() ENDFUNCTION() + FUNCTION(KOKKOS_INTERNAL_ADD_LIBRARY LIBRARY_NAME) CMAKE_PARSE_ARGUMENTS(PARSE "STATIC;SHARED" @@ -503,19 +505,11 @@ FUNCTION(KOKKOS_ADD_LIBRARY LIBRARY_NAME) # preserving the directory structure, e.g. impl # If headers got installed in both locations, it breaks some # downstream packages - TRIBITS_ADD_LIBRARY(${LIBRARY_NAME} ${PARSE_UNPARSED_ARGUMENTS}) - #Stolen from Tribits - it can add prefixes - SET(TRIBITS_LIBRARY_NAME_PREFIX "${${PROJECT_NAME}_LIBRARY_NAME_PREFIX}") - SET(TRIBITS_LIBRARY_NAME ${TRIBITS_LIBRARY_NAME_PREFIX}${LIBRARY_NAME}) - #Tribits has way too much techinical debt and baggage to even - #allow PUBLIC target_compile_options to be used. It forces C++ flags on projects - #as a giant blob of space-separated strings. We end up with duplicated - #flags between the flags implicitly forced on Kokkos-dependent and those Kokkos - #has in its public INTERFACE_COMPILE_OPTIONS. 
- #These do NOT get de-deduplicated because Tribits - #creates flags as a giant monolithic space-separated string - #Do not set any transitive properties and keep everything working as before - #KOKKOS_SET_LIBRARY_PROPERTIES(${TRIBITS_LIBRARY_NAME} PLAIN_STYLE) + TRIBITS_ADD_LIBRARY(${LIBRARY_NAME} ${PARSE_UNPARSED_ARGUMENTS} + ADDED_LIB_TARGET_NAME_OUT ${LIBRARY_NAME}_TARGET_NAME ) + IF (PARSE_ADD_BUILD_OPTIONS) + KOKKOS_SET_LIBRARY_PROPERTIES(${${LIBRARY_NAME}_TARGET_NAME}) + ENDIF() ELSE() # Forward the headers, we want to know about all headers # to make sure they appear correctly in IDEs @@ -527,15 +521,17 @@ FUNCTION(KOKKOS_ADD_LIBRARY LIBRARY_NAME) ENDIF() ENDFUNCTION() + FUNCTION(KOKKOS_ADD_INTERFACE_LIBRARY NAME) -IF (KOKKOS_HAS_TRILINOS) - TRIBITS_ADD_LIBRARY(${NAME} ${ARGN}) -ELSE() - ADD_LIBRARY(${NAME} INTERFACE) - KOKKOS_INTERNAL_ADD_LIBRARY_INSTALL(${NAME}) -ENDIF() + IF (KOKKOS_HAS_TRILINOS) + TRIBITS_ADD_LIBRARY(${NAME} ${ARGN}) + ELSE() + ADD_LIBRARY(${NAME} INTERFACE) + KOKKOS_INTERNAL_ADD_LIBRARY_INSTALL(${NAME}) + ENDIF() ENDFUNCTION() + FUNCTION(KOKKOS_LIB_INCLUDE_DIRECTORIES TARGET) IF(KOKKOS_HAS_TRILINOS) #ignore the target, tribits doesn't do anything directly with targets @@ -549,13 +545,8 @@ FUNCTION(KOKKOS_LIB_INCLUDE_DIRECTORIES TARGET) ENDFUNCTION() FUNCTION(KOKKOS_LIB_COMPILE_OPTIONS TARGET) - IF(KOKKOS_HAS_TRILINOS) - #don't trust tribits to do this correctly - KOKKOS_TARGET_COMPILE_OPTIONS(${TARGET} ${ARGN}) - ELSE() - KOKKOS_LIB_TYPE(${TARGET} INCTYPE) - KOKKOS_TARGET_COMPILE_OPTIONS(${${PROJECT_NAME}_LIBRARY_NAME_PREFIX}${TARGET} ${INCTYPE} ${ARGN}) - ENDIF() + KOKKOS_LIB_TYPE(${TARGET} INCTYPE) + KOKKOS_TARGET_COMPILE_OPTIONS(${${PROJECT_NAME}_LIBRARY_NAME_PREFIX}${TARGET} ${INCTYPE} ${ARGN}) ENDFUNCTION() MACRO(KOKKOS_ADD_TEST_DIRECTORIES) From 39c35a85c51821d45d9e342f12d2d99d26268dbb Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Thu, 6 Apr 2023 08:34:45 -0400 Subject: [PATCH 363/496] KOKKOS_COMPILER_PGI -> 
KOKKOS_COMPILER_NVHPC --- core/src/Kokkos_Macros.hpp | 6 +++--- core/src/impl/Kokkos_Core.cpp | 8 ++++---- core/src/impl/Kokkos_TaskNode.hpp | 6 +++--- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/core/src/Kokkos_Macros.hpp b/core/src/Kokkos_Macros.hpp index 00f4686ad8..2c02ef8fd6 100644 --- a/core/src/Kokkos_Macros.hpp +++ b/core/src/Kokkos_Macros.hpp @@ -283,7 +283,7 @@ //---------------------------------------------------------------------------- -#if defined(KOKKOS_COMPILER_PGI) +#if defined(KOKKOS_COMPILER_NVHPC) #define KOKKOS_ENABLE_PRAGMA_UNROLL 1 #define KOKKOS_ENABLE_PRAGMA_IVDEP 1 //#define KOKKOS_ENABLE_PRAGMA_LOOPCOUNT 1 @@ -587,8 +587,8 @@ static constexpr bool kokkos_omp_on_host() { return false; } #define KOKKOS_ATTRIBUTE_NODISCARD [[nodiscard]] -#if (defined(KOKKOS_COMPILER_GNU) || defined(KOKKOS_COMPILER_CLANG) || \ - defined(KOKKOS_COMPILER_INTEL) || defined(KOKKOS_COMPILER_PGI)) && \ +#if (defined(KOKKOS_COMPILER_GNU) || defined(KOKKOS_COMPILER_CLANG) || \ + defined(KOKKOS_COMPILER_INTEL) || defined(KOKKOS_COMPILER_NVHPC)) && \ !defined(_WIN32) && !defined(__ANDROID__) #if __has_include() #define KOKKOS_IMPL_ENABLE_STACKTRACE diff --git a/core/src/impl/Kokkos_Core.cpp b/core/src/impl/Kokkos_Core.cpp index ce36fabe9b..36d8640dbc 100644 --- a/core/src/impl/Kokkos_Core.cpp +++ b/core/src/impl/Kokkos_Core.cpp @@ -505,9 +505,9 @@ void pre_initialize_internal(const Kokkos::InitializationSettings& settings) { std::to_string(KOKKOS_COMPILER_NVCC)); declare_configuration_metadata("tools_only", "compiler_family", "nvcc"); #endif -#ifdef KOKKOS_COMPILER_PGI - declare_configuration_metadata("compiler_version", "KOKKOS_COMPILER_PGI", - std::to_string(KOKKOS_COMPILER_PGI)); +#ifdef KOKKOS_COMPILER_NVHPC + declare_configuration_metadata("compiler_version", "KOKKOS_COMPILER_NVHPC", + std::to_string(KOKKOS_COMPILER_NVHPC)); declare_configuration_metadata("tools_only", "compiler_family", "pgi"); #endif #ifdef KOKKOS_COMPILER_MSVC @@ -1256,7 
+1256,7 @@ bool Kokkos::tune_internals() noexcept { return g_tune_internals; } namespace Kokkos { -#ifdef KOKKOS_COMPILER_PGI +#ifdef KOKKOS_COMPILER_NVHPC namespace Impl { // Bizzarely, an extra jump instruction forces the PGI compiler to not have a // bug related to (probably?) empty base optimization and/or aggregate diff --git a/core/src/impl/Kokkos_TaskNode.hpp b/core/src/impl/Kokkos_TaskNode.hpp index 81c874b5d9..9694d72f3c 100644 --- a/core/src/impl/Kokkos_TaskNode.hpp +++ b/core/src/impl/Kokkos_TaskNode.hpp @@ -42,7 +42,7 @@ namespace Kokkos { namespace Impl { -#ifdef KOKKOS_COMPILER_PGI +#ifdef KOKKOS_COMPILER_NVHPC // Bizzarely, an extra jump instruction forces the PGI compiler to not have a // bug related to (probably?) empty base optimization and/or aggregate // construction. This must be defined out-of-line to generate a jump @@ -101,7 +101,7 @@ class ReferenceCountedBase { public: KOKKOS_INLINE_FUNCTION -#ifndef KOKKOS_COMPILER_PGI +#ifndef KOKKOS_COMPILER_NVHPC constexpr #endif explicit ReferenceCountedBase( @@ -109,7 +109,7 @@ class ReferenceCountedBase { : m_ref_count(initial_reference_count) { // This can't be here because it breaks constexpr // KOKKOS_EXPECTS(initial_reference_count > 0); -#ifdef KOKKOS_COMPILER_PGI +#ifdef KOKKOS_COMPILER_NVHPC Impl::_kokkos_pgi_compiler_bug_workaround(); #endif } From 29826df6458707967ed8d34e66873f1847c8889e Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Thu, 6 Apr 2023 08:48:09 -0400 Subject: [PATCH 364/496] Try removing _kokkos_pgi_compiler_bug_workaround --- core/src/impl/Kokkos_Core.cpp | 12 ------------ core/src/impl/Kokkos_TaskNode.hpp | 18 ++---------------- 2 files changed, 2 insertions(+), 28 deletions(-) diff --git a/core/src/impl/Kokkos_Core.cpp b/core/src/impl/Kokkos_Core.cpp index 36d8640dbc..9b685ba5be 100644 --- a/core/src/impl/Kokkos_Core.cpp +++ b/core/src/impl/Kokkos_Core.cpp @@ -1253,15 +1253,3 @@ void Kokkos::print_configuration(std::ostream& os, bool verbose) { bool 
Kokkos::show_warnings() noexcept { return g_show_warnings; } bool Kokkos::tune_internals() noexcept { return g_tune_internals; } - -namespace Kokkos { - -#ifdef KOKKOS_COMPILER_NVHPC -namespace Impl { -// Bizzarely, an extra jump instruction forces the PGI compiler to not have a -// bug related to (probably?) empty base optimization and/or aggregate -// construction. -void _kokkos_pgi_compiler_bug_workaround() {} -} // end namespace Impl -#endif -} // namespace Kokkos diff --git a/core/src/impl/Kokkos_TaskNode.hpp b/core/src/impl/Kokkos_TaskNode.hpp index 9694d72f3c..a81f298bbf 100644 --- a/core/src/impl/Kokkos_TaskNode.hpp +++ b/core/src/impl/Kokkos_TaskNode.hpp @@ -42,14 +42,6 @@ namespace Kokkos { namespace Impl { -#ifdef KOKKOS_COMPILER_NVHPC -// Bizzarely, an extra jump instruction forces the PGI compiler to not have a -// bug related to (probably?) empty base optimization and/or aggregate -// construction. This must be defined out-of-line to generate a jump -// jump instruction -void _kokkos_pgi_compiler_bug_workaround(); -#endif - enum TaskType : int16_t { TaskTeam = 0, TaskSingle = 1, @@ -101,17 +93,11 @@ class ReferenceCountedBase { public: KOKKOS_INLINE_FUNCTION -#ifndef KOKKOS_COMPILER_NVHPC - constexpr -#endif - explicit ReferenceCountedBase( - reference_count_size_type initial_reference_count) + constexpr explicit ReferenceCountedBase( + reference_count_size_type initial_reference_count) : m_ref_count(initial_reference_count) { // This can't be here because it breaks constexpr // KOKKOS_EXPECTS(initial_reference_count > 0); -#ifdef KOKKOS_COMPILER_NVHPC - Impl::_kokkos_pgi_compiler_bug_workaround(); -#endif } /** Decrement the reference count, From 0126dcb56a0b2504845aa4ff4e741a7e35b50509 Mon Sep 17 00:00:00 2001 From: Dong Hun Lee Date: Fri, 7 Apr 2023 09:07:43 -0600 Subject: [PATCH 365/496] Remove unused constructors for ThreadVectorRangeBoundairesStruct that are not taking in TeamMemberType as an argument. 
--- core/src/Cuda/Kokkos_Cuda_Team.hpp | 8 -------- core/src/HIP/Kokkos_HIP_Team.hpp | 8 -------- core/src/Kokkos_ExecPolicy.hpp | 9 --------- 3 files changed, 25 deletions(-) diff --git a/core/src/Cuda/Kokkos_Cuda_Team.hpp b/core/src/Cuda/Kokkos_Cuda_Team.hpp index f726cdc3fb..af47dfff92 100644 --- a/core/src/Cuda/Kokkos_Cuda_Team.hpp +++ b/core/src/Cuda/Kokkos_Cuda_Team.hpp @@ -376,18 +376,10 @@ struct ThreadVectorRangeBoundariesStruct { ThreadVectorRangeBoundariesStruct(const CudaTeamMember, index_type count) : start(static_cast(0)), end(count) {} - KOKKOS_INLINE_FUNCTION - ThreadVectorRangeBoundariesStruct(index_type count) - : start(static_cast(0)), end(count) {} - KOKKOS_INLINE_FUNCTION ThreadVectorRangeBoundariesStruct(const CudaTeamMember, index_type arg_begin, index_type arg_end) : start(arg_begin), end(arg_end) {} - - KOKKOS_INLINE_FUNCTION - ThreadVectorRangeBoundariesStruct(index_type arg_begin, index_type arg_end) - : start(arg_begin), end(arg_end) {} }; } // namespace Impl diff --git a/core/src/HIP/Kokkos_HIP_Team.hpp b/core/src/HIP/Kokkos_HIP_Team.hpp index 197c8c1882..584d728d94 100644 --- a/core/src/HIP/Kokkos_HIP_Team.hpp +++ b/core/src/HIP/Kokkos_HIP_Team.hpp @@ -367,18 +367,10 @@ struct ThreadVectorRangeBoundariesStruct { ThreadVectorRangeBoundariesStruct(const HIPTeamMember, index_type count) : start(static_cast(0)), end(count) {} - KOKKOS_INLINE_FUNCTION - ThreadVectorRangeBoundariesStruct(index_type count) - : start(static_cast(0)), end(count) {} - KOKKOS_INLINE_FUNCTION ThreadVectorRangeBoundariesStruct(const HIPTeamMember, index_type arg_begin, index_type arg_end) : start(arg_begin), end(arg_end) {} - - KOKKOS_INLINE_FUNCTION - ThreadVectorRangeBoundariesStruct(index_type arg_begin, index_type arg_end) - : start(arg_begin), end(arg_end) {} }; } // namespace Impl diff --git a/core/src/Kokkos_ExecPolicy.hpp b/core/src/Kokkos_ExecPolicy.hpp index b141d7c692..611fd6512b 100644 --- a/core/src/Kokkos_ExecPolicy.hpp +++ 
b/core/src/Kokkos_ExecPolicy.hpp @@ -738,20 +738,11 @@ struct ThreadVectorRangeBoundariesStruct { const index_type& count) noexcept : start(static_cast(0)), end(count) {} - KOKKOS_INLINE_FUNCTION - constexpr ThreadVectorRangeBoundariesStruct(const index_type& count) noexcept - : start(static_cast(0)), end(count) {} - KOKKOS_INLINE_FUNCTION constexpr ThreadVectorRangeBoundariesStruct( const TeamMemberType, const index_type& arg_begin, const index_type& arg_end) noexcept : start(static_cast(arg_begin)), end(arg_end) {} - - KOKKOS_INLINE_FUNCTION - constexpr ThreadVectorRangeBoundariesStruct( - const index_type& arg_begin, const index_type& arg_end) noexcept - : start(static_cast(arg_begin)), end(arg_end) {} }; template From 140cbd72c877249e1e03e2ea6e04204569d1694c Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Sat, 8 Apr 2023 14:22:06 -0400 Subject: [PATCH 366/496] Define at most one KOKKOS_COMPILER* macro --- core/src/Kokkos_Macros.hpp | 45 +++++++++---------- ...pilerMacros.hpp => TestCompilerMacros.cpp} | 19 +++++--- 2 files changed, 34 insertions(+), 30 deletions(-) rename core/unit_test/{TestCompilerMacros.hpp => TestCompilerMacros.cpp} (75%) diff --git a/core/src/Kokkos_Macros.hpp b/core/src/Kokkos_Macros.hpp index 2c02ef8fd6..5a44391982 100644 --- a/core/src/Kokkos_Macros.hpp +++ b/core/src/Kokkos_Macros.hpp @@ -123,46 +123,41 @@ #if defined(__INTEL_COMPILER) #define KOKKOS_COMPILER_INTEL __INTEL_COMPILER + #elif defined(__INTEL_LLVM_COMPILER) #define KOKKOS_COMPILER_INTEL __INTEL_LLVM_COMPILER -#elif defined(__ICC) -// Old define -#define KOKKOS_COMPILER_INTEL __ICC -#elif defined(__ECC) -// Very old define -#define KOKKOS_COMPILER_INTEL __ECC -#endif +#elif defined(_CRAYC) // CRAY compiler for host code -#if defined(_CRAYC) #define KOKKOS_COMPILER_CRAYC _CRAYC -#endif -#if defined(__APPLE_CC__) +#elif defined(__APPLE_CC__) #define KOKKOS_COMPILER_APPLECC __APPLE_CC__ -#endif -#if defined(__clang__) && !defined(KOKKOS_COMPILER_INTEL) +#elif 
defined(__NVCOMPILER) +#define KOKKOS_COMPILER_NVHPC \ + __NVCOMPILER_MAJOR__ * 10000 + __NVCOMPILER_MINOR__ * 100 + \ + __NVCOMPILER_PATCHLEVEL__ + +#elif defined(__clang__) +// Check this after the Clang-based proprietary compilers which will also define +// __clang__ #define KOKKOS_COMPILER_CLANG \ __clang_major__ * 100 + __clang_minor__ * 10 + __clang_patchlevel__ -#endif -#if !defined(__clang__) && !defined(KOKKOS_COMPILER_INTEL) && defined(__GNUC__) +#elif defined(__GNUC__) +// Check this here because many compilers (at least Clang variants and Intel +// classic) define `__GNUC__` for compatibility #define KOKKOS_COMPILER_GNU \ __GNUC__ * 100 + __GNUC_MINOR__ * 10 + __GNUC_PATCHLEVEL__ -#if (530 > KOKKOS_COMPILER_GNU) -#error "Compiling with GCC version earlier than 5.3.0 is not supported." -#endif -#endif - -#if defined(__NVCOMPILER) -#define KOKKOS_COMPILER_NVHPC \ - __NVCOMPILER_MAJOR__ * 10000 + __NVCOMPILER_MINOR__ * 100 + \ - __NVCOMPILER_PATCHLEVEL__ +#if (820 > KOKKOS_COMPILER_GNU) +#error "Compiling with GCC version earlier than 8.2.0 is not supported." #endif -#if defined(_MSC_VER) && !defined(KOKKOS_COMPILER_INTEL) +#elif defined(_MSC_VER) +// Check this after Intel and Clang because those define _MSC_VER for +// compatibility #define KOKKOS_COMPILER_MSVC _MSC_VER #endif @@ -207,7 +202,7 @@ #endif #endif -#if (1900 > KOKKOS_COMPILER_INTEL) +#if defined(KOKKOS_COMPILER_INTEL) && (1900 > KOKKOS_COMPILER_INTEL) #error "Compiling with Intel version earlier than 19.0.5 is not supported." #endif diff --git a/core/unit_test/TestCompilerMacros.hpp b/core/unit_test/TestCompilerMacros.cpp similarity index 75% rename from core/unit_test/TestCompilerMacros.hpp rename to core/unit_test/TestCompilerMacros.cpp index 9d22c4b0a7..e719ad83fb 100644 --- a/core/unit_test/TestCompilerMacros.hpp +++ b/core/unit_test/TestCompilerMacros.cpp @@ -14,8 +14,19 @@ // //@HEADER +#include #include +#if 1 != ((defined(KOKKOS_COMPILER_INTEL) ? 
1 : 0) + \ + (defined(KOKKOS_COMPILER_CRAYC) ? 1 : 0) + \ + (defined(KOKKOS_COMPILER_APPLECC) ? 1 : 0) + \ + (defined(KOKKOS_COMPILER_CLANG) ? 1 : 0) + \ + (defined(KOKKOS_COMPILER_GNU) ? 1 : 0) + \ + (defined(KOKKOS_COMPILER_NVHPC) ? 1 : 0) + \ + (defined(KOKKOS_COMPILER_MSVC) ? 1 : 0)) +#error "Only one host compiler macro can be defined" +#endif + #if defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_CUDA_LAMBDA) #if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) #error "Macro bug: KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA shouldn't be defined" @@ -26,8 +37,6 @@ #endif #endif -#define KOKKOS_PRAGMA_UNROLL(a) - namespace TestCompilerMacros { template @@ -51,7 +60,7 @@ struct AddFunctor { #pragma vector always #endif #ifdef KOKKOS_ENABLE_PRAGMA_LOOPCOUNT -#pragma loop count(128) +#pragma loop_count(128) #endif for (int j = 0; j < length; j++) { a(i, j) += b(i, j); @@ -75,7 +84,7 @@ bool Test() { } // namespace TestCompilerMacros namespace Test { -TEST(TEST_CATEGORY, compiler_macros) { - ASSERT_TRUE((TestCompilerMacros::Test())); +TEST(defaultdevicetype, compiler_macros) { + ASSERT_TRUE((TestCompilerMacros::Test())); } } // namespace Test From d80f5805451601dc3354c1975de1dca2f23d7556 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Sat, 8 Apr 2023 14:24:18 -0400 Subject: [PATCH 367/496] Define KOKKOS_COMPILER_INTEL_LLVM --- .../continuous-integration-workflow.yml | 4 ++-- core/src/Kokkos_BitManipulation.hpp | 3 ++- core/src/Kokkos_Macros.hpp | 17 ++++++++--------- core/src/Kokkos_View.hpp | 4 ++-- core/src/impl/Kokkos_BitOps.hpp | 18 +++++++++--------- core/src/impl/Kokkos_Core.cpp | 6 ++++++ core/src/impl/Kokkos_HostSpace.cpp | 3 ++- core/unit_test/CMakeLists.txt | 1 + core/unit_test/TestCompilerMacros.cpp | 13 +++++++------ core/unit_test/TestInit.hpp | 2 -- core/unit_test/TestNumericTraits.hpp | 3 ++- .../TestViewMemoryAccessViolation.hpp | 2 +- core/unit_test/tools/TestLogicalSpaces.hpp | 2 +- 13 files changed, 43 insertions(+), 35 deletions(-) diff --git 
a/.github/workflows/continuous-integration-workflow.yml b/.github/workflows/continuous-integration-workflow.yml index 1f06dcd535..8581a76427 100644 --- a/.github/workflows/continuous-integration-workflow.yml +++ b/.github/workflows/continuous-integration-workflow.yml @@ -29,12 +29,12 @@ jobs: backend: 'OPENMP' - distro: 'fedora:intel' cxx: 'icpx' - cxx_extra_flags: '-fp-model=precise' + cxx_extra_flags: '-fp-model=precise -Wno-pass-failed' cmake_build_type: 'Release' backend: 'OPENMP' - distro: 'fedora:intel' cxx: 'icpx' - cxx_extra_flags: '-fp-model=precise' + cxx_extra_flags: '-fp-model=precise -Wno-pass-failed' cmake_build_type: 'Debug' backend: 'OPENMP' - distro: 'ubuntu:latest' diff --git a/core/src/Kokkos_BitManipulation.hpp b/core/src/Kokkos_BitManipulation.hpp index caf2e93d51..50cd92c986 100644 --- a/core/src/Kokkos_BitManipulation.hpp +++ b/core/src/Kokkos_BitManipulation.hpp @@ -223,7 +223,8 @@ rotr(T x, int s) noexcept { namespace Kokkos::Impl { -#if defined(KOKKOS_COMPILER_CLANG) || defined(KOKKOS_COMPILER_GCC) +#if defined(KOKKOS_COMPILER_CLANG) || defined(KOKKOS_COMPILER_INTEL_LLVM) || \ + defined(KOKKOS_COMPILER_GCC) #define KOKKOS_IMPL_USE_GCC_BUILT_IN_FUNCTIONS #endif diff --git a/core/src/Kokkos_Macros.hpp b/core/src/Kokkos_Macros.hpp index 5a44391982..8cc4a6efa3 100644 --- a/core/src/Kokkos_Macros.hpp +++ b/core/src/Kokkos_Macros.hpp @@ -66,6 +66,7 @@ * KOKKOS_COMPILER_NVCC * KOKKOS_COMPILER_GNU * KOKKOS_COMPILER_INTEL + * KOKKOS_COMPILER_INTEL_LLVM * KOKKOS_COMPILER_CRAYC * KOKKOS_COMPILER_APPLECC * KOKKOS_COMPILER_CLANG @@ -125,7 +126,7 @@ #define KOKKOS_COMPILER_INTEL __INTEL_COMPILER #elif defined(__INTEL_LLVM_COMPILER) -#define KOKKOS_COMPILER_INTEL __INTEL_LLVM_COMPILER +#define KOKKOS_COMPILER_INTEL_LLVM __INTEL_LLVM_COMPILER #elif defined(_CRAYC) // CRAY compiler for host code @@ -171,16 +172,13 @@ //---------------------------------------------------------------------------- // Intel compiler macros -#if defined(KOKKOS_COMPILER_INTEL) 
-// FIXME_ICPX -#if !defined(__INTEL_LLVM_COMPILER) +#if defined(KOKKOS_COMPILER_INTEL) || defined(KOKKOS_COMPILER_INTEL_LLVM) +#if defined(KOKKOS_COMPILER_INTEL_LLVM) && \ + KOKKOS_COMPILER_INTEL_LLVM >= 20230100 #define KOKKOS_ENABLE_PRAGMA_UNROLL 1 #define KOKKOS_ENABLE_PRAGMA_LOOPCOUNT 1 #define KOKKOS_ENABLE_PRAGMA_VECTOR 1 -#endif -// FIXME_SYCL -#if !defined(KOKKOS_ENABLE_SYCL) #define KOKKOS_ENABLE_PRAGMA_IVDEP 1 #endif @@ -582,8 +580,9 @@ static constexpr bool kokkos_omp_on_host() { return false; } #define KOKKOS_ATTRIBUTE_NODISCARD [[nodiscard]] -#if (defined(KOKKOS_COMPILER_GNU) || defined(KOKKOS_COMPILER_CLANG) || \ - defined(KOKKOS_COMPILER_INTEL) || defined(KOKKOS_COMPILER_NVHPC)) && \ +#if (defined(KOKKOS_COMPILER_GNU) || defined(KOKKOS_COMPILER_CLANG) || \ + defined(KOKKOS_COMPILER_INTEL) || defined(KOKKOS_COMPILER_INTEL_LLVM) || \ + defined(KOKKOS_COMPILER_NVHPC)) && \ !defined(_WIN32) && !defined(__ANDROID__) #if __has_include() #define KOKKOS_IMPL_ENABLE_STACKTRACE diff --git a/core/src/Kokkos_View.hpp b/core/src/Kokkos_View.hpp index 1e399f9c59..0464f1a1a1 100644 --- a/core/src/Kokkos_View.hpp +++ b/core/src/Kokkos_View.hpp @@ -497,13 +497,13 @@ namespace Kokkos { // FIXME_OPENMPTARGET - The `declare target` is needed for the Intel GPUs with // the OpenMPTarget backend -#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_COMPILER_INTEL) +#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_COMPILER_INTEL_LLVM) #pragma omp declare target #endif inline constexpr Kokkos::ALL_t ALL{}; -#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_COMPILER_INTEL) +#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_COMPILER_INTEL_LLVM) #pragma omp end declare target #endif diff --git a/core/src/impl/Kokkos_BitOps.hpp b/core/src/impl/Kokkos_BitOps.hpp index 16a28f2419..0d83127df9 100644 --- a/core/src/impl/Kokkos_BitOps.hpp +++ b/core/src/impl/Kokkos_BitOps.hpp @@ -21,7 +21,7 @@ #include #include -#ifdef KOKKOS_COMPILER_INTEL +#if 
defined(KOKKOS_COMPILER_INTEL) || defined(KOKKOS_COMPILER_INTEL_LLVM) #include #endif @@ -45,7 +45,7 @@ inline int int_log2_device(unsigned i) { #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) constexpr int shift = sizeof(unsigned) * CHAR_BIT - 1; return shift - __clz(i); -#elif defined(KOKKOS_COMPILER_INTEL) +#elif defined(KOKKOS_COMPILER_INTEL) || defined(KOKKOS_COMPILER_INTEL_LLVM) return _bit_scan_reverse(i); #else return int_log2_fallback(i); @@ -55,7 +55,7 @@ inline int int_log2_device(unsigned i) { KOKKOS_IMPL_HOST_FUNCTION inline int int_log2_host(unsigned i) { // duplicating shift to avoid unused warning in else branch -#if defined(KOKKOS_COMPILER_INTEL) +#if defined(KOKKOS_COMPILER_INTEL) || defined(KOKKOS_COMPILER_INTEL_LLVM) constexpr int shift = sizeof(unsigned) * CHAR_BIT - 1; (void)shift; return _bit_scan_reverse(i); @@ -104,7 +104,7 @@ inline int bit_first_zero_device(unsigned i) noexcept { constexpr unsigned full = ~0u; #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) return full != i ? __ffs(~i) - 1 : -1; -#elif defined(KOKKOS_COMPILER_INTEL) +#elif defined(KOKKOS_COMPILER_INTEL) || defined(KOKKOS_COMPILER_INTEL_LLVM) return full != i ? _bit_scan_forward(~i) : -1; #else (void)full; @@ -115,7 +115,7 @@ inline int bit_first_zero_device(unsigned i) noexcept { KOKKOS_IMPL_HOST_FUNCTION inline int bit_first_zero_host(unsigned i) noexcept { constexpr unsigned full = ~0u; -#if defined(KOKKOS_COMPILER_INTEL) +#if defined(KOKKOS_COMPILER_INTEL) || defined(KOKKOS_COMPILER_INTEL_LLVM) return full != i ? _bit_scan_forward(~i) : -1; #elif defined(KOKKOS_COMPILER_CRAYC) return full != i ? 
_popcnt(i ^ (i + 1)) - 1 : -1; @@ -153,7 +153,7 @@ int bit_scan_forward_fallback(unsigned i) { KOKKOS_IMPL_DEVICE_FUNCTION inline int bit_scan_forward_device(unsigned i) { #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) return __ffs(i) - 1; -#elif defined(KOKKOS_COMPILER_INTEL) +#elif defined(KOKKOS_COMPILER_INTEL) || defined(KOKKOS_COMPILER_INTEL_LLVM) return _bit_scan_forward(i); #else return bit_scan_forward_fallback(i); @@ -161,7 +161,7 @@ KOKKOS_IMPL_DEVICE_FUNCTION inline int bit_scan_forward_device(unsigned i) { } KOKKOS_IMPL_HOST_FUNCTION inline int bit_scan_forward_host(unsigned i) { -#if defined(KOKKOS_COMPILER_INTEL) +#if defined(KOKKOS_COMPILER_INTEL) || defined(KOKKOS_COMPILER_INTEL_LLVM) return _bit_scan_forward(i); #elif defined(KOKKOS_COMPILER_CRAYC) return i ? _popcnt(~i & (i - 1)) : -1; @@ -200,7 +200,7 @@ int bit_count_fallback(unsigned i) { KOKKOS_IMPL_DEVICE_FUNCTION inline int bit_count_device(unsigned i) { #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) return __popc(i); -#elif defined(KOKKOS_COMPILER_INTEL) +#elif defined(KOKKOS_COMPILER_INTEL) || defined(KOKKOS_COMPILER_INTEL_LLVM) return _popcnt32(i); #else return bit_count_fallback(i); @@ -208,7 +208,7 @@ KOKKOS_IMPL_DEVICE_FUNCTION inline int bit_count_device(unsigned i) { } KOKKOS_IMPL_HOST_FUNCTION inline int bit_count_host(unsigned i) { -#if defined(KOKKOS_COMPILER_INTEL) +#if defined(KOKKOS_COMPILER_INTEL) || defined(KOKKOS_COMPILER_INTEL_LLVM) return _popcnt32(i); #elif defined(KOKKOS_COMPILER_CRAYC) return _popcnt(i); diff --git a/core/src/impl/Kokkos_Core.cpp b/core/src/impl/Kokkos_Core.cpp index 9b685ba5be..deacab6ac1 100644 --- a/core/src/impl/Kokkos_Core.cpp +++ b/core/src/impl/Kokkos_Core.cpp @@ -500,6 +500,12 @@ void pre_initialize_internal(const Kokkos::InitializationSettings& settings) { std::to_string(KOKKOS_COMPILER_INTEL)); declare_configuration_metadata("tools_only", "compiler_family", "intel"); #endif +#ifdef KOKKOS_COMPILER_INTEL_LLVM + 
declare_configuration_metadata("compiler_version", + "KOKKOS_COMPILER_INTEL_LLVM", + std::to_string(KOKKOS_COMPILER_INTEL_LLVM)); + declare_configuration_metadata("tools_only", "compiler_family", "intel_llvm"); +#endif #ifdef KOKKOS_COMPILER_NVCC declare_configuration_metadata("compiler_version", "KOKKOS_COMPILER_NVCC", std::to_string(KOKKOS_COMPILER_NVCC)); diff --git a/core/src/impl/Kokkos_HostSpace.cpp b/core/src/impl/Kokkos_HostSpace.cpp index 857340ae09..88dfaadb08 100644 --- a/core/src/impl/Kokkos_HostSpace.cpp +++ b/core/src/impl/Kokkos_HostSpace.cpp @@ -26,7 +26,8 @@ /*--------------------------------------------------------------------------*/ -#if defined(KOKKOS_COMPILER_INTEL) && !defined(KOKKOS_ENABLE_CUDA) +#if (defined(KOKKOS_COMPILER_INTEL) || defined(KOKKOS_COMPILER_INTEL_LLVM)) && \ + !defined(KOKKOS_ENABLE_CUDA) // Intel specialized allocator does not interoperate with CUDA memory allocation diff --git a/core/unit_test/CMakeLists.txt b/core/unit_test/CMakeLists.txt index ef928003bc..9fbe7e92e0 100644 --- a/core/unit_test/CMakeLists.txt +++ b/core/unit_test/CMakeLists.txt @@ -821,6 +821,7 @@ SET(DEFAULT_DEVICE_SOURCES TestParseCmdLineArgsAndEnvVars.cpp TestSharedSpace.cpp TestSharedHostPinnedSpace.cpp + TestCompilerMacros.cpp default/TestDefaultDeviceType.cpp default/TestDefaultDeviceType_a1.cpp default/TestDefaultDeviceType_b1.cpp diff --git a/core/unit_test/TestCompilerMacros.cpp b/core/unit_test/TestCompilerMacros.cpp index e719ad83fb..b77368037e 100644 --- a/core/unit_test/TestCompilerMacros.cpp +++ b/core/unit_test/TestCompilerMacros.cpp @@ -17,12 +17,13 @@ #include #include -#if 1 != ((defined(KOKKOS_COMPILER_INTEL) ? 1 : 0) + \ - (defined(KOKKOS_COMPILER_CRAYC) ? 1 : 0) + \ - (defined(KOKKOS_COMPILER_APPLECC) ? 1 : 0) + \ - (defined(KOKKOS_COMPILER_CLANG) ? 1 : 0) + \ - (defined(KOKKOS_COMPILER_GNU) ? 1 : 0) + \ - (defined(KOKKOS_COMPILER_NVHPC) ? 1 : 0) + \ +#if 1 != ((defined(KOKKOS_COMPILER_INTEL) ? 
1 : 0) + \ + (defined(KOKKOS_COMPILER_INTEL_LLVM) ? 1 : 0) + \ + (defined(KOKKOS_COMPILER_CRAYC) ? 1 : 0) + \ + (defined(KOKKOS_COMPILER_APPLECC) ? 1 : 0) + \ + (defined(KOKKOS_COMPILER_CLANG) ? 1 : 0) + \ + (defined(KOKKOS_COMPILER_GNU) ? 1 : 0) + \ + (defined(KOKKOS_COMPILER_NVHPC) ? 1 : 0) + \ (defined(KOKKOS_COMPILER_MSVC) ? 1 : 0)) #error "Only one host compiler macro can be defined" #endif diff --git a/core/unit_test/TestInit.hpp b/core/unit_test/TestInit.hpp index 7be9ef5f12..9a8dba8dc8 100644 --- a/core/unit_test/TestInit.hpp +++ b/core/unit_test/TestInit.hpp @@ -40,5 +40,3 @@ TEST(TEST_CATEGORY, dispatch) { test_dispatch(); } #endif } // namespace Test - -#include diff --git a/core/unit_test/TestNumericTraits.hpp b/core/unit_test/TestNumericTraits.hpp index 735022a107..94e67f73da 100644 --- a/core/unit_test/TestNumericTraits.hpp +++ b/core/unit_test/TestNumericTraits.hpp @@ -484,7 +484,8 @@ CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_FUNCTION(double, denorm_min); // FIXME_OPENMPTARGET - The static_assert causes issues on Intel GPUs with the // OpenMPTarget backend. 
-#if !(defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_COMPILER_INTEL)) +#if !(defined(KOKKOS_ENABLE_OPENMPTARGET) && \ + defined(KOKKOS_COMPILER_INTEL_LLVM)) CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_FUNCTION(long double, denorm_min); #endif diff --git a/core/unit_test/TestViewMemoryAccessViolation.hpp b/core/unit_test/TestViewMemoryAccessViolation.hpp index cdd90426eb..d8febd2ba1 100644 --- a/core/unit_test/TestViewMemoryAccessViolation.hpp +++ b/core/unit_test/TestViewMemoryAccessViolation.hpp @@ -143,7 +143,7 @@ void test_view_memory_access_violations_from_device() { } // FIXME_SYCL -#if !(defined(KOKKOS_COMPILER_INTEL) && defined(KOKKOS_ENABLE_SYCL)) +#if !(defined(KOKKOS_COMPILER_INTEL_LLVM) && defined(KOKKOS_ENABLE_SYCL)) TEST(TEST_CATEGORY_DEATH, view_memory_access_violations_from_host) { ::testing::FLAGS_gtest_death_test_style = "threadsafe"; diff --git a/core/unit_test/tools/TestLogicalSpaces.hpp b/core/unit_test/tools/TestLogicalSpaces.hpp index 1b20bd5933..4e56f8996a 100644 --- a/core/unit_test/tools/TestLogicalSpaces.hpp +++ b/core/unit_test/tools/TestLogicalSpaces.hpp @@ -165,7 +165,7 @@ TEST(defaultdevicetype, access_allowed) { test_allowed_access(); } // FIXME_SYCL -#if !(defined(KOKKOS_COMPILER_INTEL) && defined(KOKKOS_ENABLE_SYCL)) +#if !(defined(KOKKOS_COMPILER_INTEL_LLVM) && defined(KOKKOS_ENABLE_SYCL)) TEST(defaultdevicetype_DeathTest, access_forbidden) { ::testing::FLAGS_gtest_death_test_style = "threadsafe"; ASSERT_DEATH( From 48b34def1bf1cd48cfa1d2f155a766e04d36729f Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Tue, 11 Apr 2023 16:18:01 -0400 Subject: [PATCH 368/496] Desul atomics: let relocatable device code mode be part of the configuration (#5991) * Desul atomics: Make compilation fail if SEPARABLE_COMPILATION configuration is not compatible with the relocatable device code mode * Desul atomics: Prefer #ifdef DESUL_ATOMICS_ENABLE_{CUDA,HIP}_SEPARABLE_DEVICE_CODE macro guards * Desul atomics: add 
DESUL_ATOMICS_ENABLE_{CUDA,HIP}_SEPARABLE_COMPILATION macros * Configure DESUL_ATOMICS_ENABLE_{CUDA,HIP}_SEPARABLE_COMPILATION in bundled version of the desul atomics * Merge latest version * Fixup Clang+CUDA defines __CLANG_RDC__ instead of __CUDACC_RDC__ --- Makefile.kokkos | 10 ++++ core/src/CMakeLists.txt | 6 ++ tpls/desul/Config.hpp.cmake.in | 2 + .../include/desul/atomics/Lock_Array_CUDA.hpp | 8 +-- .../include/desul/atomics/Lock_Array_HIP.hpp | 8 +-- .../Lock_Based_Fetch_Op_Unimplemented.hpp | 55 ------------------- tpls/desul/include/desul/atomics/Macros.hpp | 28 ++++++++++ tpls/desul/src/Lock_Array_CUDA.cpp | 4 +- tpls/desul/src/Lock_Array_HIP.cpp | 4 +- 9 files changed, 58 insertions(+), 67 deletions(-) delete mode 100644 tpls/desul/include/desul/atomics/Lock_Based_Fetch_Op_Unimplemented.hpp diff --git a/Makefile.kokkos b/Makefile.kokkos index e7e83ab828..11607544b7 100644 --- a/Makefile.kokkos +++ b/Makefile.kokkos @@ -1399,12 +1399,22 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1) else tmp := $(call desul_append_header,"/* $H""undef DESUL_ATOMICS_ENABLE_CUDA */") endif +ifeq ($(KOKKOS_INTERNAL_CUDA_USE_RELOC), 1) + tmp := $(call desul_append_header,"$H""define DESUL_ATOMICS_ENABLE_CUDA_SEPARABLE_COMPILATION") +else + tmp := $(call desul_append_header,"/* $H""undef DESUL_ATOMICS_ENABLE_CUDA_SEPARABLE_COMPILATION */") +endif ifeq ($(KOKKOS_INTERNAL_USE_HIP), 1) tmp := $(call desul_append_header,"$H""define DESUL_ATOMICS_ENABLE_HIP") else tmp := $(call desul_append_header,"/* $H""undef DESUL_ATOMICS_ENABLE_HIP */") endif +ifeq ($(KOKKOS_INTERNAL_HIP_USE_RELOC), 1) + tmp := $(call desul_append_header,"$H""define DESUL_ATOMICS_ENABLE_HIP_SEPARABLE_COMPILATION") +else + tmp := $(call desul_append_header,"/* $H""undef DESUL_ATOMICS_ENABLE_HIP_SEPARABLE_COMPILATION */") +endif ifeq ($(KOKKOS_INTERNAL_USE_SYCL), 1) tmp := $(call desul_append_header,"$H""define DESUL_ATOMICS_ENABLE_SYCL") diff --git a/core/src/CMakeLists.txt b/core/src/CMakeLists.txt index 
09e91929d5..012af0a7d0 100644 --- a/core/src/CMakeLists.txt +++ b/core/src/CMakeLists.txt @@ -7,9 +7,15 @@ IF (NOT desul_FOUND) IF(KOKKOS_ENABLE_CUDA) SET(DESUL_ATOMICS_ENABLE_CUDA ON) ENDIF() + IF(KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE) + SET(DESUL_ATOMICS_ENABLE_CUDA_SEPARABLE_COMPILATION ON) + ENDIF() IF(KOKKOS_ENABLE_HIP) SET(DESUL_ATOMICS_ENABLE_HIP ON) ENDIF() + IF(KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE) + SET(DESUL_ATOMICS_ENABLE_HIP_SEPARABLE_COMPILATION ON) + ENDIF() IF(KOKKOS_ENABLE_SYCL) SET(DESUL_ATOMICS_ENABLE_SYCL ON) ENDIF() diff --git a/tpls/desul/Config.hpp.cmake.in b/tpls/desul/Config.hpp.cmake.in index 40ab5c1c6c..a7bc738191 100644 --- a/tpls/desul/Config.hpp.cmake.in +++ b/tpls/desul/Config.hpp.cmake.in @@ -10,7 +10,9 @@ SPDX-License-Identifier: (BSD-3-Clause) #define DESUL_ATOMICS_CONFIG_HPP_ #cmakedefine DESUL_ATOMICS_ENABLE_CUDA +#cmakedefine DESUL_ATOMICS_ENABLE_CUDA_SEPARABLE_COMPILATION #cmakedefine DESUL_ATOMICS_ENABLE_HIP +#cmakedefine DESUL_ATOMICS_ENABLE_HIP_SEPARABLE_COMPILATION #cmakedefine DESUL_ATOMICS_ENABLE_SYCL #cmakedefine DESUL_ATOMICS_ENABLE_OPENMP diff --git a/tpls/desul/include/desul/atomics/Lock_Array_CUDA.hpp b/tpls/desul/include/desul/atomics/Lock_Array_CUDA.hpp index b4dc4dae74..ebfb8172e5 100644 --- a/tpls/desul/include/desul/atomics/Lock_Array_CUDA.hpp +++ b/tpls/desul/include/desul/atomics/Lock_Array_CUDA.hpp @@ -59,12 +59,12 @@ void finalize_lock_arrays_cuda(); /// variable based on the Host global variable prior to running any kernels /// that will use it. /// That is the purpose of the ensure_cuda_lock_arrays_on_device function. 
-#ifdef __CUDACC_RDC__ +#ifdef DESUL_ATOMICS_ENABLE_CUDA_SEPARABLE_COMPILATION extern #endif __device__ __constant__ int32_t* CUDA_SPACE_ATOMIC_LOCKS_DEVICE; -#ifdef __CUDACC_RDC__ +#ifdef DESUL_ATOMICS_ENABLE_CUDA_SEPARABLE_COMPILATION extern #endif __device__ __constant__ int32_t* CUDA_SPACE_ATOMIC_LOCKS_NODE; @@ -108,7 +108,7 @@ __device__ inline void unlock_address_cuda(void* ptr, desul::MemoryScopeNode) { atomicExch(&desul::Impl::CUDA_SPACE_ATOMIC_LOCKS_NODE[offset], 0); } -#ifdef __CUDACC_RDC__ +#ifdef DESUL_ATOMICS_ENABLE_CUDA_SEPARABLE_COMPILATION inline #else inline static @@ -132,7 +132,7 @@ inline static namespace desul { -#if defined(__CUDACC_RDC__) +#ifdef DESUL_ATOMICS_ENABLE_CUDA_SEPARABLE_COMPILATION inline void ensure_cuda_lock_arrays_on_device() {} #else static inline void ensure_cuda_lock_arrays_on_device() { diff --git a/tpls/desul/include/desul/atomics/Lock_Array_HIP.hpp b/tpls/desul/include/desul/atomics/Lock_Array_HIP.hpp index b80e2d4599..beca3e9e40 100644 --- a/tpls/desul/include/desul/atomics/Lock_Array_HIP.hpp +++ b/tpls/desul/include/desul/atomics/Lock_Array_HIP.hpp @@ -63,12 +63,12 @@ void finalize_lock_arrays_hip(); * will use it. That is the purpose of the * ensure_hip_lock_arrays_on_device function. 
*/ -#ifdef __CLANG_RDC__ +#ifdef DESUL_ATOMICS_ENABLE_HIP_SEPARABLE_COMPILATION extern #endif __device__ __constant__ int32_t* HIP_SPACE_ATOMIC_LOCKS_DEVICE; -#ifdef __CLANG_RDC__ +#ifdef DESUL_ATOMICS_ENABLE_HIP_SEPARABLE_COMPILATION extern #endif __device__ __constant__ int32_t* HIP_SPACE_ATOMIC_LOCKS_NODE; @@ -115,7 +115,7 @@ __device__ inline void unlock_address_hip(void* ptr, desul::MemoryScopeNode) { atomicExch(&desul::Impl::HIP_SPACE_ATOMIC_LOCKS_NODE[offset], 0); } -#ifdef __CLANG_RDC__ +#ifdef DESUL_ATOMICS_ENABLE_HIP_SEPARABLE_COMPILATION inline #else inline static @@ -135,7 +135,7 @@ inline static } } // namespace Impl -#if defined(__CLANG_RDC__) +#ifdef DESUL_ATOMICS_ENABLE_HIP_SEPARABLE_COMPILATION inline void ensure_hip_lock_arrays_on_device() {} #else static inline void ensure_hip_lock_arrays_on_device() { diff --git a/tpls/desul/include/desul/atomics/Lock_Based_Fetch_Op_Unimplemented.hpp b/tpls/desul/include/desul/atomics/Lock_Based_Fetch_Op_Unimplemented.hpp deleted file mode 100644 index b9f9fac535..0000000000 --- a/tpls/desul/include/desul/atomics/Lock_Based_Fetch_Op_Unimplemented.hpp +++ /dev/null @@ -1,55 +0,0 @@ -/* -Copyright (c) 2019, Lawrence Livermore National Security, LLC -and DESUL project contributors. See the COPYRIGHT file for details. 
-Source: https://github.com/desul/desul - -SPDX-License-Identifier: (BSD-3-Clause) -*/ - -#ifndef DESUL_ATOMICS_LOCK_BASED_FETCH_OP_UNIMPLEMENTED_HPP_ -#define DESUL_ATOMICS_LOCK_BASED_FETCH_OP_UNIMPLEMENTED_HPP_ - -#include -#include -#include - -namespace desul { -namespace Impl { - -template = 0> -DESUL_INLINE_FUNCTION T -device_atomic_fetch_oper(const Oper& /*op*/, - T* const /*dest*/, - dont_deduce_this_parameter_t val, - MemoryOrder /*order*/, - MemoryScope /*scope*/) { - assert(false); - return val; // FIXME not implemented -} - -template = 0> -DESUL_INLINE_FUNCTION T -device_atomic_oper_fetch(const Oper& /*op*/, - T* const /*dest*/, - dont_deduce_this_parameter_t val, - MemoryOrder /*order*/, - MemoryScope /*scope*/) { - assert(false); - return val; // FIXME not implemented -} -} // namespace Impl -} // namespace desul - -#endif diff --git a/tpls/desul/include/desul/atomics/Macros.hpp b/tpls/desul/include/desul/atomics/Macros.hpp index 5b4df2661e..3a14b93d32 100644 --- a/tpls/desul/include/desul/atomics/Macros.hpp +++ b/tpls/desul/include/desul/atomics/Macros.hpp @@ -11,6 +11,34 @@ SPDX-License-Identifier: (BSD-3-Clause) #include +// Intercept incompatible relocatable device code mode which leads to ODR violations +#ifdef DESUL_ATOMICS_ENABLE_CUDA +#if (defined(__clang__) && defined(__CUDA__) && defined(__CLANG_RDC__)) || \ + defined(__CUDACC_RDC__) +#define DESUL_IMPL_CUDA_RDC +#endif + +#if (defined(DESUL_ATOMICS_ENABLE_CUDA_SEPARABLE_COMPILATION) && \ + !defined(DESUL_IMPL_CUDA_RDC)) || \ + (!defined(DESUL_ATOMICS_ENABLE_CUDA_SEPARABLE_COMPILATION) && \ + defined(DESUL_IMPL_CUDA_RDC)) +#error Relocatable device code mode incompatible with desul atomics configuration +#endif + +#ifdef DESUL_IMPL_CUDA_RDC +#undef DESUL_IMPL_CUDA_RDC +#endif +#endif + +#ifdef DESUL_ATOMICS_ENABLE_HIP +#if (defined(DESUL_ATOMICS_ENABLE_HIP_SEPARABLE_COMPILATION) && \ + !defined(__CLANG_RDC__)) || \ + (!defined(DESUL_ATOMICS_ENABLE_HIP_SEPARABLE_COMPILATION) && \ + 
defined(__CLANG_RDC__)) +#error Relocatable device code mode incompatible with desul atomics configuration +#endif +#endif + // Macros #if defined(DESUL_ATOMICS_ENABLE_CUDA) && defined(__CUDACC__) diff --git a/tpls/desul/src/Lock_Array_CUDA.cpp b/tpls/desul/src/Lock_Array_CUDA.cpp index d8ab895b2b..155f33653e 100644 --- a/tpls/desul/src/Lock_Array_CUDA.cpp +++ b/tpls/desul/src/Lock_Array_CUDA.cpp @@ -11,7 +11,7 @@ SPDX-License-Identifier: (BSD-3-Clause) #include #include -#ifdef __CUDACC_RDC__ +#ifdef DESUL_ATOMICS_ENABLE_CUDA_SEPARABLE_COMPILATION namespace desul { namespace Impl { __device__ __constant__ int32_t* CUDA_SPACE_ATOMIC_LOCKS_DEVICE = nullptr; @@ -83,7 +83,7 @@ void finalize_lock_arrays_cuda() { cudaFreeHost(CUDA_SPACE_ATOMIC_LOCKS_NODE_h); CUDA_SPACE_ATOMIC_LOCKS_DEVICE_h = nullptr; CUDA_SPACE_ATOMIC_LOCKS_NODE_h = nullptr; -#ifdef __CUDACC_RDC__ +#ifdef DESUL_ATOMICS_ENABLE_CUDA_SEPARABLE_COMPILATION copy_cuda_lock_arrays_to_device(); #endif } diff --git a/tpls/desul/src/Lock_Array_HIP.cpp b/tpls/desul/src/Lock_Array_HIP.cpp index 0611c85aeb..465b2eb25a 100644 --- a/tpls/desul/src/Lock_Array_HIP.cpp +++ b/tpls/desul/src/Lock_Array_HIP.cpp @@ -11,7 +11,7 @@ SPDX-License-Identifier: (BSD-3-Clause) #include #include -#ifdef __CLANG_RDC__ +#ifdef DESUL_ATOMICS_ENABLE_HIP_SEPARABLE_COMPILATION namespace desul { namespace Impl { __device__ __constant__ int32_t* HIP_SPACE_ATOMIC_LOCKS_DEVICE = nullptr; @@ -87,7 +87,7 @@ void finalize_lock_arrays_hip() { check_error_and_throw_hip(error_free2, "finalize_lock_arrays_hip: free host locks"); HIP_SPACE_ATOMIC_LOCKS_DEVICE_h = nullptr; HIP_SPACE_ATOMIC_LOCKS_NODE_h = nullptr; -#ifdef __CLANG_RDC__ +#ifdef DESUL_ATOMICS_ENABLE_HIP_SEPARABLE_COMPILATION copy_hip_lock_arrays_to_device(); #endif } From bb7ae99eca0d558f460038b7dd9df9aba3592d64 Mon Sep 17 00:00:00 2001 From: Amy Powell Date: Tue, 11 Apr 2023 15:29:12 -0600 Subject: [PATCH 369/496] CHANGELOG.md: add threads sort --- CHANGELOG.md | 1 + 1 file changed, 1 
insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 03cf75ff88..94c129e17d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,7 @@ - `parallel_scan` with `View` as result type. [\#5146](https://github.com/kokkos/kokkos/pull/5146) - Introduced `SharedSpace`, an alias for a `MemorySpace` that is accessible by every `ExecutionSpace`. The memory is moved and then accessed locally. [\#5289](https://github.com/kokkos/kokkos/pull/5289) - Introduced `SharedHostPinnedSpace`, an alias for a `MemorySpace` that is accessible by every `ExecutionSpace`. The memory is pinned to the host and accessed via zero-copy access. [\#5405](https://github.com/kokkos/kokkos/pull/5405) +- Add team- and thread-level `sort`, `sort_by_key` algorithms. [\#5317](https://github.com/kokkos/kokkos/pull/5317) - Groundwork for `MDSpan` integration. [\#4973](https://github.com/kokkos/kokkos/pull/4973) and [\#5304](https://github.com/kokkos/kokkos/pull/5304) - Introduced MD version of hierarchical parallelism: `TeamThreadMDRange`, `ThreadVectorMDRange` and `TeamVectorMDRange`. [\#5238](https://github.com/kokkos/kokkos/pull/5238) From 83873a62bb4d6ce47ec39e7775300b7ecdddfe73 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Wed, 12 Apr 2023 08:11:07 -0400 Subject: [PATCH 370/496] Remove Kokkos Keyword Listing section from BUILD.md and refer to the wiki instead --- BUILD.md | 238 +------------------------------------------------------ 1 file changed, 1 insertion(+), 237 deletions(-) diff --git a/BUILD.md b/BUILD.md index cfeed1044d..5d5cd59878 100644 --- a/BUILD.md +++ b/BUILD.md @@ -111,243 +111,7 @@ For dev-build details, consult the kokkos-spack repository [README](https://gith # Kokkos Keyword Listing -## Device Backends -Device backends can be enabled by specifying `-DKokkos_ENABLE_X`. 
- -* Kokkos_ENABLE_CUDA - * Whether to build CUDA backend - * BOOL Default: OFF -* Kokkos_ENABLE_HPX - * Whether to build HPX backend (experimental) - * BOOL Default: OFF -* Kokkos_ENABLE_OPENMP - * Whether to build OpenMP backend - * BOOL Default: OFF -* Kokkos_ENABLE_THREADS - * Whether to build C++ thread backend - * BOOL Default: OFF -* Kokkos_ENABLE_SERIAL - * Whether to build serial backend - * BOOL Default: ON -* Kokkos_ENABLE_HIP (Experimental) - * Whether to build HIP backend - * BOOL Default: OFF -* Kokkos_ENABLE_OPENMPTARGET (Experimental) - * Whether to build the OpenMP target backend - * BOOL Default: OFF - -## Enable Options -Options can be enabled by specifying `-DKokkos_ENABLE_X`. - -* Kokkos_ENABLE_AGGRESSIVE_VECTORIZATION - * Whether to aggressively vectorize loops - * BOOL Default: OFF -* Kokkos_ENABLE_COMPILER_WARNINGS - * Whether to print all compiler warnings - * BOOL Default: OFF -* Kokkos_ENABLE_CUDA_CONSTEXPR - * Whether to activate experimental relaxed constexpr functions - * BOOL Default: OFF -* Kokkos_ENABLE_CUDA_LAMBDA - * Whether to activate experimental lambda features - * BOOL Default: OFF -* Kokkos_ENABLE_CUDA_LDG_INTRINSIC - * Deprecated since 4.0, LDG intrinsics are always enabled. 
- * Whether to use CUDA LDG intrinsics - * BOOL Default: OFF -* Kokkos_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE - * Whether to enable relocatable device code (RDC) for CUDA - * BOOL Default: OFF -* Kokkos_ENABLE_CUDA_UVM - * Deprecated since 4.0 - * Whether to use unified memory (UM) by default for CUDA - * BOOL Default: OFF -* Kokkos_ENABLE_DEBUG - * Whether to activate extra debug features - may increase compile times - * BOOL Default: OFF -* Kokkos_ENABLE_DEBUG_BOUNDS_CHECK - * Whether to use bounds checking - will increase runtime - * BOOL Default: OFF -* Kokkos_ENABLE_DEBUG_DUALVIEW_MODIFY_CHECK - * Debug check on dual views - * BOOL Default: OFF -* Kokkos_ENABLE_EXAMPLES - * Whether to enable building examples - * BOOL Default: OFF -* Kokkos_ENABLE_IMPL_HPX_ASYNC_DISPATCH - * Whether HPX supports asynchronous dispatch - * BOOL Default: ON -* Kokkos_ENABLE_IMPL_CUDA_MALLOC_ASYNC - * Whether to enable CudaMallocAsync (requires CUDA Toolkit 11.2). This is an experimental performance feature and currently has issue when using with UCX. See https://github.com/kokkos/kokkos/issues/4228 for more details. 
- * BOOL Default: OFF -* Kokkos_ENABLE_LARGE_MEM_TESTS - * Whether to perform extra large memory tests - * BOOL_Default: OFF -* Kokkos_ENABLE_PROFILING_LOAD_PRINT - * Whether to print information about which profiling tools gotloaded - * BOOL Default: OFF -* Kokkos_ENABLE_TESTS - * Whether to enable test suite - * BOOL Default: OFF - - -## Third-party Libraries (TPLs) -The following options control enabling TPLs: -* Kokkos_ENABLE_HPX - * Whether to enable the HPX library - * BOOL Default: OFF -* Kokkos_ENABLE_HWLOC - * Whether to enable the HWLOC library - * BOOL Default: Off -* Kokkos_ENABLE_LIBNUMA - * Whether to enable the LIBNUMA library - * BOOL Default: Off -* Kokkos_ENABLE_MEMKIND - * Whether to enable the MEMKIND library - * BOOL Default: Off -* Kokkos_ENABLE_LIBDL - * Whether to enable the LIBDL library - * BOOL Default: On -* Kokkos_ENABLE_LIBRT - * Whether to enable the LIBRT library - * BOOL Default: Off - -The following options control finding and configuring non-CMake TPLs: -* Kokkos_CUDA_DIR or CUDA_ROOT - * Location of CUDA install prefix for libraries - * PATH Default: -* Kokkos_HWLOC_DIR or HWLOC_ROOT - * Location of HWLOC install prefix - * PATH Default: -* Kokkos_LIBNUMA_DIR or LIBNUMA_ROOT - * Location of LIBNUMA install prefix - * PATH Default: -* Kokkos_MEMKIND_DIR or MEMKIND_ROOT - * Location of MEMKIND install prefix - * PATH Default: -* Kokkos_LIBDL_DIR or LIBDL_ROOT - * Location of LIBDL install prefix - * PATH Default: -* Kokkos_LIBRT_DIR or LIBRT_ROOT - * Location of LIBRT install prefix - * PATH Default: - -The following options control `find_package` paths for CMake-based TPLs: -* HPX_DIR or HPX_ROOT - * Location of HPX prefix (ROOT) or CMake config file (DIR) - * PATH Default: - -## Architecture Keywords -Architecture-specific optimizations can be enabled by specifying `-DKokkos_ARCH_X`. 
- -* Kokkos_ARCH_NATIVE - * Whether to optimize for the the local CPU architecture - * BOOL Default: OFF -* Kokkos_ARCH_AMDAVX - * Whether to optimize for the AMDAVX architecture - * BOOL Default: OFF -* Kokkos_ARCH_ARMV80 - * Whether to optimize for the ARMV80 architecture - * BOOL Default: OFF -* Kokkos_ARCH_ARMV81 - * Whether to optimize for the ARMV81 architecture - * BOOL Default: OFF -* Kokkos_ARCH_ARMV8_THUNDERX - * Whether to optimize for the ARMV8_THUNDERX architecture - * BOOL Default: OFF -* Kokkos_ARCH_ARMV8_TX2 - * Whether to optimize for the ARMV8_TX2 architecture - * BOOL Default: OFF -* Kokkos_ARCH_BDW - * Whether to optimize for the BDW architecture - * BOOL Default: OFF -* Kokkos_ARCH_BGQ - * Whether to optimize for the BGQ architecture - * BOOL Default: OFF -* Kokkos_ARCH_ZEN - * Whether to optimize for the Zen architecture - * BOOL Default: OFF -* Kokkos_ARCH_ZEN2 - * Whether to optimize for the Zen2 architecture - * BOOL Default: OFF -* Kokkos_ARCH_ZEN3 - * Whether to optimize for the Zen3 architecture - * BOOL Default: OFF -* Kokkos_ARCH_HSW - * Whether to optimize for the HSW architecture - * BOOL Default: OFF -* Kokkos_ARCH_KEPLER30 - * Whether to optimize for the KEPLER30 architecture - * BOOL Default: OFF -* Kokkos_ARCH_KEPLER32 - * Whether to optimize for the KEPLER32 architecture - * BOOL Default: OFF -* Kokkos_ARCH_KEPLER35 - * Whether to optimize for the KEPLER35 architecture - * BOOL Default: OFF -* Kokkos_ARCH_KEPLER37 - * Whether to optimize for the KEPLER37 architecture - * BOOL Default: OFF -* Kokkos_ARCH_KNC - * Whether to optimize for the KNC architecture - * BOOL Default: OFF -* Kokkos_ARCH_KNL - * Whether to optimize for the KNL architecture - * BOOL Default: OFF -* Kokkos_ARCH_MAXWELL50 - * Whether to optimize for the MAXWELL50 architecture - * BOOL Default: OFF -* Kokkos_ARCH_MAXWELL52 - * Whether to optimize for the MAXWELL52 architecture - * BOOL Default: OFF -* Kokkos_ARCH_MAXWELL53 - * Whether to optimize for the 
MAXWELL53 architecture - * BOOL Default: OFF -* Kokkos_ARCH_PASCAL60 - * Whether to optimize for the PASCAL60 architecture - * BOOL Default: OFF -* Kokkos_ARCH_PASCAL61 - * Whether to optimize for the PASCAL61 architecture - * BOOL Default: OFF -* Kokkos_ARCH_POWER7 - * Whether to optimize for the POWER7 architecture - * BOOL Default: OFF -* Kokkos_ARCH_POWER8 - * Whether to optimize for the POWER8 architecture - * BOOL Default: OFF -* Kokkos_ARCH_POWER9 - * Whether to optimize for the POWER9 architecture - * BOOL Default: OFF -* Kokkos_ARCH_ICL - * Whether to optimize for the ICL architecture - * BOOL Default: OFF -* Kokkos_ARCH_ICX - * Whether to optimize for the ICX architecture - * BOOL Default: OFF -* Kokkos_ARCH_SKL - * Whether to optimize for the SKL architecture - * BOOL Default: OFF -* Kokkos_ARCH_SKX - * Whether to optimize for the SKX architecture - * BOOL Default: OFF -* Kokkos_ARCH_SNB - * Whether to optimize for the SNB architecture - * BOOL Default: OFF -* Kokkos_ARCH_SPR - * Whether to optimize for the SPR architecture - * BOOL Default: OFF -* Kokkos_ARCH_TURING75 - * Whether to optimize for the TURING75 architecture - * BOOL Default: OFF -* Kokkos_ARCH_VOLTA70 - * Whether to optimize for the VOLTA70 architecture - * BOOL Default: OFF -* Kokkos_ARCH_VOLTA72 - * Whether to optimize for the VOLTA72 architecture - * BOOL Default: OFF -* Kokkos_ARCH_WSM - * Whether to optimize for the WSM architecture - * BOOL Default: OFF +Please refer to our [wiki](https://kokkos.github.io/kokkos-core-wiki/keywords.html#cmake-keywords). 
##### [LICENSE](https://github.com/kokkos/kokkos/blob/devel/LICENSE) From d0f5777061c4ed24fde83b6bad52a4d134724f85 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Wed, 12 Apr 2023 08:11:46 -0400 Subject: [PATCH 371/496] Remove (outdated) license information [ci skip] --- BUILD.md | 7 ------- 1 file changed, 7 deletions(-) diff --git a/BUILD.md b/BUILD.md index 5d5cd59878..f80320e78b 100644 --- a/BUILD.md +++ b/BUILD.md @@ -112,10 +112,3 @@ For dev-build details, consult the kokkos-spack repository [README](https://gith # Kokkos Keyword Listing Please refer to our [wiki](https://kokkos.github.io/kokkos-core-wiki/keywords.html#cmake-keywords). - -##### [LICENSE](https://github.com/kokkos/kokkos/blob/devel/LICENSE) - -[![License](https://img.shields.io/badge/License-BSD%203--Clause-blue.svg)](https://opensource.org/licenses/BSD-3-Clause) - -Under the terms of Contract DE-NA0003525 with NTESS, -the U.S. Government retains certain rights in this software. From 3b1afb530e647527c1d9c3228e00565606ac9050 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Wed, 12 Apr 2023 15:54:49 -0400 Subject: [PATCH 372/496] Remove libnuma (#6048) --- cmake/Modules/FindTPLLIBNUMA.cmake | 1 - cmake/kokkos_tpls.cmake | 2 -- 2 files changed, 3 deletions(-) delete mode 100644 cmake/Modules/FindTPLLIBNUMA.cmake diff --git a/cmake/Modules/FindTPLLIBNUMA.cmake b/cmake/Modules/FindTPLLIBNUMA.cmake deleted file mode 100644 index 811db5851b..0000000000 --- a/cmake/Modules/FindTPLLIBNUMA.cmake +++ /dev/null @@ -1 +0,0 @@ -KOKKOS_FIND_IMPORTED(LIBNUMA HEADER numa.h LIBRARY numa) diff --git a/cmake/kokkos_tpls.cmake b/cmake/kokkos_tpls.cmake index c768bfe8de..9975b174c2 100644 --- a/cmake/kokkos_tpls.cmake +++ b/cmake/kokkos_tpls.cmake @@ -32,7 +32,6 @@ FUNCTION(KOKKOS_TPL_OPTION PKG DEFAULT) ENDFUNCTION() KOKKOS_TPL_OPTION(HWLOC Off) -KOKKOS_TPL_OPTION(LIBNUMA Off) KOKKOS_TPL_OPTION(MEMKIND Off) IF(KOKKOS_ENABLE_MEMKIND) SET(KOKKOS_ENABLE_HBWSPACE ON) @@ -77,7 +76,6 @@ IF (NOT 
KOKKOS_ENABLE_COMPILE_AS_CMAKE_LANGUAGE) KOKKOS_IMPORT_TPL(CUDA INTERFACE) ENDIF() KOKKOS_IMPORT_TPL(HWLOC) -KOKKOS_IMPORT_TPL(LIBNUMA) KOKKOS_IMPORT_TPL(LIBRT) KOKKOS_IMPORT_TPL(LIBDL) KOKKOS_IMPORT_TPL(MEMKIND) From d26f88ce421612b2eaf9dd0f3dbe15f2d450ecbf Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Wed, 12 Apr 2023 15:42:17 -0400 Subject: [PATCH 373/496] Don't create a shared state for size() in UnorderedMap's deep_copy --- containers/src/Kokkos_UnorderedMap.hpp | 2 +- containers/unit_tests/TestUnorderedMap.hpp | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/containers/src/Kokkos_UnorderedMap.hpp b/containers/src/Kokkos_UnorderedMap.hpp index 5ed51c2948..377168fa55 100644 --- a/containers/src/Kokkos_UnorderedMap.hpp +++ b/containers/src/Kokkos_UnorderedMap.hpp @@ -769,7 +769,7 @@ class UnorderedMap { tmp.m_bounded_insert = src.m_bounded_insert; tmp.m_hasher = src.m_hasher; tmp.m_equal_to = src.m_equal_to; - tmp.m_size = src.m_size; + *tmp.m_size = *src.m_size; tmp.m_available_indexes = bitset_type(src.capacity()); tmp.m_hash_lists = size_type_view( view_alloc(WithoutInitializing, "UnorderedMap hash list"), diff --git a/containers/unit_tests/TestUnorderedMap.hpp b/containers/unit_tests/TestUnorderedMap.hpp index 977d3dd945..c2d941b5bf 100644 --- a/containers/unit_tests/TestUnorderedMap.hpp +++ b/containers/unit_tests/TestUnorderedMap.hpp @@ -85,6 +85,11 @@ struct TestInsert { } } } + + const unsigned int old_size = map_h.size(); + map_h.clear(); + ASSERT_EQ(map.size(), old_size); + ASSERT_EQ(map_h.size(), 0u); } KOKKOS_INLINE_FUNCTION From 0d96f88daf7b6b748c819cdd6d063d2174acbfed Mon Sep 17 00:00:00 2001 From: Rahulkumar Gayatri Date: Thu, 13 Apr 2023 13:26:22 -0700 Subject: [PATCH 374/496] OpenMPTarget: Changes to Makefile.kokkos (#6053) * OpenMPTarget: Changes to Makefile.kokkos. * OpenMPTarget: Re-enable options accidentally deleted. * OpenMPTarget: Replace accidentally deleted lines for HIP. 
--------- Co-authored-by: Rahulkumar Gayatri --- Makefile.kokkos | 237 +++++++++++++++++++++++------------------------- 1 file changed, 115 insertions(+), 122 deletions(-) diff --git a/Makefile.kokkos b/Makefile.kokkos index 11607544b7..42b802659e 100644 --- a/Makefile.kokkos +++ b/Makefile.kokkos @@ -266,13 +266,9 @@ else KOKKOS_INTERNAL_OPENMP_FLAG := -fopenmp endif endif -ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1) - #KOKKOS_INTERNAL_OPENMPTARGET_FLAG := -DKOKKOS_BUG_WORKAROUND_IBM_CLANG_OMP45_VIEW_INIT -fopenmp-implicit-declare-target -fopenmp-targets=nvptx64-nvidia-cuda -fopenmp -fopenmp=libomp - KOKKOS_INTERNAL_OPENMPTARGET_FLAG := -DKOKKOS_WORKAROUND_OPENMPTARGET_CLANG -fopenmp -fopenmp=libomp -Wno-openmp-mapping - KOKKOS_INTERNAL_OPENMPTARGET_LIB := -lomptarget -else ifeq ($(KOKKOS_INTERNAL_COMPILER_INTEL_CLANG), 1) +ifeq ($(KOKKOS_INTERNAL_COMPILER_INTEL_CLANG), 1) KOKKOS_INTERNAL_OPENMPTARGET_FLAG := -fiopenmp -Wno-openmp-mapping -else +else ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 0) #Assume GCC KOKKOS_INTERNAL_OPENMPTARGET_FLAG := -fopenmp -foffload=nvptx-none endif @@ -975,136 +971,133 @@ endif ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1) ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1) - KOKKOS_INTERNAL_CUDA_ARCH_FLAG=-fopenmp-targets=nvptx64 -Xopenmp-target -march + KOKKOS_INTERNAL_CUDA_ARCH_FLAG=-fopenmp --offload-arch endif - KOKKOS_INTERNAL_USE_CUDA_ARCH = 1 endif -ifeq ($(KOKKOS_INTERNAL_USE_CUDA_ARCH), 1) - ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER30), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER30") - KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_30 - endif - ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER32), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER32") - KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_32 - endif - 
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER35), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER35") - KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_35 - endif - ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER37), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER37") - KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_37 - endif - ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MAXWELL50), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_MAXWELL") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_MAXWELL50") - KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_50 - endif - ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MAXWELL52), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_MAXWELL") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_MAXWELL52") - KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_52 - endif - ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MAXWELL53), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_MAXWELL") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_MAXWELL53") - KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_53 - endif - ifeq ($(KOKKOS_INTERNAL_USE_ARCH_PASCAL60), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_PASCAL") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_PASCAL60") - KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_60 - endif - ifeq ($(KOKKOS_INTERNAL_USE_ARCH_PASCAL61), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_PASCAL") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_PASCAL61") - KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_61 - endif - ifeq ($(KOKKOS_INTERNAL_USE_ARCH_VOLTA70), 1) - tmp := $(call 
kokkos_append_header,"$H""define KOKKOS_ARCH_VOLTA") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VOLTA70") - KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_70 - endif - ifeq ($(KOKKOS_INTERNAL_USE_ARCH_VOLTA72), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VOLTA") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VOLTA72") - KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_72 - endif - ifeq ($(KOKKOS_INTERNAL_USE_ARCH_TURING75), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_TURING75") - KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_75 - endif - ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMPERE80), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMPERE") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMPERE80") - KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_80 - endif - ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMPERE86), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMPERE") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMPERE86") - KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_86 - endif - ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ADA89), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_ADA89") - KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_89 - endif - ifeq ($(KOKKOS_INTERNAL_USE_ARCH_HOPPER90), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_HOPPER") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_HOPPER90") - KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_90 - endif +# Lets start with adding architecture defines +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER30), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER30") + KOKKOS_INTERNAL_CUDA_ARCH_FLAG := 
$(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_30 +endif +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER32), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER32") + KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_32 +endif +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER35), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER35") + KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_35 +endif +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER37), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER37") + KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_37 +endif +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MAXWELL50), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_MAXWELL") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_MAXWELL50") + KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_50 +endif +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MAXWELL52), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_MAXWELL") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_MAXWELL52") + KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_52 +endif +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MAXWELL53), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_MAXWELL") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_MAXWELL53") + KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_53 +endif +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_PASCAL60), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_PASCAL") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_PASCAL60") + KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_60 +endif +ifeq 
($(KOKKOS_INTERNAL_USE_ARCH_PASCAL61), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_PASCAL") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_PASCAL61") + KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_61 +endif +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_VOLTA70), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VOLTA") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VOLTA70") + KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_70 +endif +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_VOLTA72), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VOLTA") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VOLTA72") + KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_72 +endif +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_TURING75), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_TURING75") + KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_75 +endif +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMPERE80), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMPERE") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMPERE80") + KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_80 +endif +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMPERE86), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMPERE") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMPERE86") + KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_86 +endif +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ADA89), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_ADA89") + KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_89 +endif +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_HOPPER90), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_HOPPER") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_HOPPER90") + KOKKOS_INTERNAL_CUDA_ARCH_FLAG := 
$(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_90 +endif - ifneq ($(KOKKOS_INTERNAL_USE_ARCH_NVIDIA), 0) - KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG) +ifneq ($(KOKKOS_INTERNAL_USE_ARCH_NVIDIA), 0) + KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG) - ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC), 1) + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC), 1) + KOKKOS_LDFLAGS += $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG) + endif + ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1) + ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1) KOKKOS_LDFLAGS += $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG) endif - ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1) - ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1) - KOKKOS_LDFLAGS += $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG) - endif - endif endif endif # Figure out the architecture flag for ROCm. -ifeq ($(KOKKOS_INTERNAL_USE_HIP), 1) - # Lets start with adding architecture defines - ifeq ($(KOKKOS_INTERNAL_USE_ARCH_VEGA906), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VEGA906") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VEGA") - KOKKOS_INTERNAL_HIP_ARCH_FLAG := --offload-arch=gfx906 - endif - ifeq ($(KOKKOS_INTERNAL_USE_ARCH_VEGA908), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VEGA908") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VEGA") - KOKKOS_INTERNAL_HIP_ARCH_FLAG := --offload-arch=gfx908 - endif - ifeq ($(KOKKOS_INTERNAL_USE_ARCH_VEGA90A), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VEGA90A") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VEGA") - KOKKOS_INTERNAL_HIP_ARCH_FLAG := --offload-arch=gfx90a - endif - ifeq ($(KOKKOS_INTERNAL_USE_ARCH_NAVI1030), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_NAVI1030") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_NAVI") - KOKKOS_INTERNAL_HIP_ARCH_FLAG := --offload-arch=gfx1030 - endif - ifeq ($(KOKKOS_INTERNAL_USE_ARCH_NAVI1100), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_NAVI1100") - 
tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_NAVI") - KOKKOS_INTERNAL_HIP_ARCH_FLAG := --offload-arch=gfx1100 - endif +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_VEGA906), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VEGA906") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VEGA") + KOKKOS_INTERNAL_HIP_ARCH_FLAG := --offload-arch=gfx906 +endif +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_VEGA908), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VEGA908") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VEGA") + KOKKOS_INTERNAL_HIP_ARCH_FLAG := --offload-arch=gfx908 +endif +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_VEGA90A), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VEGA90A") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VEGA") + KOKKOS_INTERNAL_HIP_ARCH_FLAG := --offload-arch=gfx90a +endif +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_NAVI1030), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_NAVI1030") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_NAVI") + KOKKOS_INTERNAL_HIP_ARCH_FLAG := --offload-arch=gfx1030 +endif +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_NAVI1100), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_NAVI1100") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_NAVI") + KOKKOS_INTERNAL_HIP_ARCH_FLAG := --offload-arch=gfx1100 +endif +ifeq ($(KOKKOS_INTERNAL_USE_HIP), 1) KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/core/src/HIP/*.cpp) KOKKOS_SRC += $(KOKKOS_PATH)/tpls/desul/src/Lock_Array_HIP.cpp KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/HIP/*.hpp) From 079268c9d511cf0012efb31329ae79f0f93bfbbd Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Thu, 13 Apr 2023 21:17:46 +0000 Subject: [PATCH 375/496] Partially reverse #5504 --- core/src/SYCL/Kokkos_SYCL_Space.cpp | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/core/src/SYCL/Kokkos_SYCL_Space.cpp b/core/src/SYCL/Kokkos_SYCL_Space.cpp index 
62af720b7c..dab4f4b3bf 100644 --- a/core/src/SYCL/Kokkos_SYCL_Space.cpp +++ b/core/src/SYCL/Kokkos_SYCL_Space.cpp @@ -40,7 +40,13 @@ void DeepCopySYCL(void* dst, const void* src, size_t n) { void DeepCopyAsyncSYCL(const Kokkos::Experimental::SYCL& instance, void* dst, const void* src, size_t n) { sycl::queue& q = *instance.impl_internal_space_instance()->m_queue; - auto event = q.memcpy(dst, src, n); + // FIXME_SYCL memcpy doesn't respect submit_barrier which means that we need + // to actually fence the execution space to make sure the memcpy is properly + // enqueued when using out-of-order queues. +#ifndef KOKKOS_ARCH_INTEL_GPU + q.wait_and_throw(); +#endif + auto event = q.memcpy(dst, src, n); q.ext_oneapi_submit_barrier(std::vector{event}); } From 4feae9ea03f7400d2cbf34d46e7499efe439cf23 Mon Sep 17 00:00:00 2001 From: Bruno Turcksin Date: Fri, 14 Apr 2023 10:34:27 -0400 Subject: [PATCH 376/496] Reduce size of ScatterView test when using OpenMP --- containers/unit_tests/TestScatterView.hpp | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/containers/unit_tests/TestScatterView.hpp b/containers/unit_tests/TestScatterView.hpp index 8dc3c423a7..2238a43c8a 100644 --- a/containers/unit_tests/TestScatterView.hpp +++ b/containers/unit_tests/TestScatterView.hpp @@ -773,9 +773,18 @@ TEST(TEST_CATEGORY, scatterview) { int big_n = 100 * 1000; #else -#ifdef KOKKOS_ENABLE_SERIAL +#if defined(KOKKOS_ENABLE_SERIAL) || defined(KOKKOS_ENABLE_OPENMP) +#if defined(KOKKOS_ENABLE_SERIAL) bool is_serial = std::is_same::value; - int big_n = is_serial ? 100 * 1000 : 10000 * 1000; +#else + bool is_serial = false; +#endif +#if defined(KOKKOS_ENABLE_OPENMP) + bool is_openmp = std::is_same::value; +#else + bool is_openmp = false; +#endif + int big_n = is_serial || is_openmp ? 
100 * 1000 : 10000 * 1000; #else int big_n = 10000 * 1000; #endif From 5680563230b27f08613bd4dbf8e66da6103d11b0 Mon Sep 17 00:00:00 2001 From: Stan Gerald Moore Date: Tue, 18 Apr 2023 11:07:41 -0600 Subject: [PATCH 377/496] Fix bug in Makefile.kokkos Co-authored-by: Christian Trott --- Makefile.kokkos | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.kokkos b/Makefile.kokkos index 42b802659e..1234f4cc9e 100644 --- a/Makefile.kokkos +++ b/Makefile.kokkos @@ -1426,7 +1426,7 @@ tmp := $(call desul_append_header, "$H""endif") DESUL_INTERNAL_LS_CONFIG := $(shell ls $(DESUL_CONFIG_HEADER) 2>&1) ifeq ($(DESUL_INTERNAL_LS_CONFIG), $(DESUL_CONFIG_HEADER)) - KOKKOS_INTERNAL_NEW_CONFIG := $(strip $(shell diff $(DESUL_CONFIG_HEADER) $(DESUL_INTERNAL_CONFIG_TMP) | grep -c define)) + DESUL_INTERNAL_NEW_CONFIG := $(strip $(shell diff $(DESUL_CONFIG_HEADER) $(DESUL_INTERNAL_CONFIG_TMP) | grep -c define)) else DESUL_INTERNAL_NEW_CONFIG := 1 endif From db802ac0eccc3978f3667e05f66e808b4da99c66 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Tue, 18 Apr 2023 15:43:39 -0400 Subject: [PATCH 378/496] Fix join for ValueWrapperForNoNeutralElement --- .../impl/Kokkos_ExclusiveScan.hpp | 2 + .../impl/Kokkos_InclusiveScan.hpp | 2 + .../impl/Kokkos_TransformExclusiveScan.hpp | 2 + .../impl/Kokkos_TransformInclusiveScan.hpp | 4 ++ .../TestStdAlgorithmsExclusiveScan.cpp | 40 +++++++++++ .../TestStdAlgorithmsInclusiveScan.cpp | 39 +++++++++++ ...estStdAlgorithmsTransformExclusiveScan.cpp | 53 +++++++++++++++ ...estStdAlgorithmsTransformInclusiveScan.cpp | 67 +++++++++++++++++++ 8 files changed, 209 insertions(+) diff --git a/algorithms/src/std_algorithms/impl/Kokkos_ExclusiveScan.hpp b/algorithms/src/std_algorithms/impl/Kokkos_ExclusiveScan.hpp index f2bfa23ccd..71f13e490a 100644 --- a/algorithms/src/std_algorithms/impl/Kokkos_ExclusiveScan.hpp +++ b/algorithms/src/std_algorithms/impl/Kokkos_ExclusiveScan.hpp @@ -96,6 +96,8 @@ struct ExclusiveScanDefaultFunctor { 
KOKKOS_FUNCTION void join(value_type& update, const value_type& input) const { + if (input.is_initial) return; + if (update.is_initial) { update.val = input.val; update.is_initial = false; diff --git a/algorithms/src/std_algorithms/impl/Kokkos_InclusiveScan.hpp b/algorithms/src/std_algorithms/impl/Kokkos_InclusiveScan.hpp index 55e1a78695..ecd6ff39cd 100644 --- a/algorithms/src/std_algorithms/impl/Kokkos_InclusiveScan.hpp +++ b/algorithms/src/std_algorithms/impl/Kokkos_InclusiveScan.hpp @@ -90,6 +90,8 @@ struct InclusiveScanDefaultFunctor { KOKKOS_FUNCTION void join(value_type& update, const value_type& input) const { + if (input.is_initial) return; + if (update.is_initial) { update.val = input.val; } else { diff --git a/algorithms/src/std_algorithms/impl/Kokkos_TransformExclusiveScan.hpp b/algorithms/src/std_algorithms/impl/Kokkos_TransformExclusiveScan.hpp index 773e8c2f88..3bb337de36 100644 --- a/algorithms/src/std_algorithms/impl/Kokkos_TransformExclusiveScan.hpp +++ b/algorithms/src/std_algorithms/impl/Kokkos_TransformExclusiveScan.hpp @@ -76,6 +76,8 @@ struct TransformExclusiveScanFunctor { KOKKOS_FUNCTION void join(value_type& update, const value_type& input) const { + if (input.is_initial) return; + if (update.is_initial) { update.val = input.val; } else { diff --git a/algorithms/src/std_algorithms/impl/Kokkos_TransformInclusiveScan.hpp b/algorithms/src/std_algorithms/impl/Kokkos_TransformInclusiveScan.hpp index 9dde2b0fb1..05f8589086 100644 --- a/algorithms/src/std_algorithms/impl/Kokkos_TransformInclusiveScan.hpp +++ b/algorithms/src/std_algorithms/impl/Kokkos_TransformInclusiveScan.hpp @@ -67,6 +67,8 @@ struct TransformInclusiveScanNoInitValueFunctor { KOKKOS_FUNCTION void join(value_type& update, const value_type& input) const { + if (input.is_initial) return; + if (update.is_initial) { update.val = input.val; } else { @@ -118,6 +120,8 @@ struct TransformInclusiveScanWithInitValueFunctor { KOKKOS_FUNCTION void join(value_type& update, const value_type& 
input) const { + if (input.is_initial) return; + if (update.is_initial) { update.val = input.val; } else { diff --git a/algorithms/unit_tests/TestStdAlgorithmsExclusiveScan.cpp b/algorithms/unit_tests/TestStdAlgorithmsExclusiveScan.cpp index 4969541a02..8cd6097ee6 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsExclusiveScan.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsExclusiveScan.cpp @@ -344,6 +344,46 @@ TEST(std_algorithms_numeric_ops_test, exclusive_scan) { run_exclusive_scan_all_scenarios(); } +TEST(std_algorithms_numeric_ops_test, exclusive_scan_functor) { + int dummy = 0; + using view_type = Kokkos::View; + view_type dummy_view("dummy_view", 0); + using functor_type = Kokkos::Experimental::Impl::ExclusiveScanDefaultFunctor< + exespace, int, int, view_type, view_type>; + functor_type functor(dummy, dummy_view, dummy_view); + using value_type = functor_type::value_type; + + value_type value1; + functor.init(value1); + EXPECT_EQ(value1.val, 0); + EXPECT_EQ(value1.is_initial, true); + + value_type value2; + value2.val = 1; + value2.is_initial = false; + functor.join(value1, value2); + EXPECT_EQ(value1.val, 1); + EXPECT_EQ(value1.is_initial, false); + + functor.init(value1); + functor.join(value2, value1); + EXPECT_EQ(value2.val, 1); + EXPECT_EQ(value2.is_initial, false); + + functor.init(value2); + functor.join(value2, value1); + EXPECT_EQ(value2.val, 0); + EXPECT_EQ(value2.is_initial, true); + + value1.val = 1; + value1.is_initial = false; + value2.val = 2; + value2.is_initial = false; + functor.join(value2, value1); + EXPECT_EQ(value2.val, 3); + EXPECT_EQ(value2.is_initial, false); +} + } // namespace EScan } // namespace stdalgos } // namespace Test diff --git a/algorithms/unit_tests/TestStdAlgorithmsInclusiveScan.cpp b/algorithms/unit_tests/TestStdAlgorithmsInclusiveScan.cpp index 510f1d195a..7ddc142ad5 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsInclusiveScan.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsInclusiveScan.cpp @@ -353,6 
+353,45 @@ TEST(std_algorithms_numeric_ops_test, inclusive_scan) { run_inclusive_scan_all_scenarios(); } +TEST(std_algorithms_numeric_ops_test, inclusive_scan_functor) { + using view_type = Kokkos::View; + view_type dummy_view("dummy_view", 0); + using functor_type = Kokkos::Experimental::Impl::InclusiveScanDefaultFunctor< + exespace, int, int, view_type, view_type>; + functor_type functor(dummy_view, dummy_view); + using value_type = functor_type::value_type; + + value_type value1; + functor.init(value1); + EXPECT_EQ(value1.val, 0); + EXPECT_EQ(value1.is_initial, true); + + value_type value2; + value2.val = 1; + value2.is_initial = false; + functor.join(value1, value2); + EXPECT_EQ(value1.val, 1); + EXPECT_EQ(value1.is_initial, false); + + functor.init(value1); + functor.join(value2, value1); + EXPECT_EQ(value2.val, 1); + EXPECT_EQ(value2.is_initial, false); + + functor.init(value2); + functor.join(value2, value1); + EXPECT_EQ(value2.val, 0); + EXPECT_EQ(value2.is_initial, true); + + value1.val = 1; + value1.is_initial = false; + value2.val = 2; + value2.is_initial = false; + functor.join(value2, value1); + EXPECT_EQ(value2.val, 3); + EXPECT_EQ(value2.is_initial, false); +} + } // namespace IncScan } // namespace stdalgos } // namespace Test diff --git a/algorithms/unit_tests/TestStdAlgorithmsTransformExclusiveScan.cpp b/algorithms/unit_tests/TestStdAlgorithmsTransformExclusiveScan.cpp index 70c04dbafa..fcbccc221c 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsTransformExclusiveScan.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsTransformExclusiveScan.cpp @@ -279,6 +279,59 @@ TEST(std_algorithms_numeric_ops_test, transform_exclusive_scan) { } #endif +template +struct MultiplyFunctor { + KOKKOS_INLINE_FUNCTION + ValueType operator()(const ValueType& a, const ValueType& b) const { + return (a * b); + } +}; + +TEST(std_algorithms_numeric_ops_test, transform_exclusive_scan_functor) { + int dummy = 0; + using view_type = Kokkos::View; + view_type 
dummy_view("dummy_view", 0); + using unary_op_type = + Kokkos::Experimental::Impl::StdNumericScanIdentityReferenceUnaryFunctor< + int>; + using functor_type = + Kokkos::Experimental::Impl::TransformExclusiveScanFunctor< + exespace, int, int, view_type, view_type, MultiplyFunctor, + unary_op_type>; + functor_type functor(dummy, dummy_view, dummy_view, {}, {}); + using value_type = functor_type::value_type; + + value_type value1; + functor.init(value1); + EXPECT_EQ(value1.val, 0); + EXPECT_EQ(value1.is_initial, true); + + value_type value2; + value2.val = 1; + value2.is_initial = false; + functor.join(value1, value2); + EXPECT_EQ(value1.val, 1); + EXPECT_EQ(value1.is_initial, false); + + functor.init(value1); + functor.join(value2, value1); + EXPECT_EQ(value2.val, 1); + EXPECT_EQ(value2.is_initial, false); + + functor.init(value2); + functor.join(value2, value1); + EXPECT_EQ(value2.val, 0); + EXPECT_EQ(value2.is_initial, true); + + value1.val = 3; + value1.is_initial = false; + value2.val = 2; + value2.is_initial = false; + functor.join(value2, value1); + EXPECT_EQ(value2.val, 6); + EXPECT_EQ(value2.is_initial, false); +} + } // namespace TransformEScan } // namespace stdalgos } // namespace Test diff --git a/algorithms/unit_tests/TestStdAlgorithmsTransformInclusiveScan.cpp b/algorithms/unit_tests/TestStdAlgorithmsTransformInclusiveScan.cpp index 80ff813251..095e490aa5 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsTransformInclusiveScan.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsTransformInclusiveScan.cpp @@ -306,6 +306,73 @@ TEST(std_algorithms_numeric_ops_test, transform_inclusive_scan) { } #endif +template +struct MultiplyFunctor { + KOKKOS_INLINE_FUNCTION + ValueType operator()(const ValueType& a, const ValueType& b) const { + return (a * b); + } +}; + +TEST(std_algorithms_numeric_ops_test, transform_inclusive_scan_functor) { + using value_type = KE::Impl::ValueWrapperForNoNeutralElement; + + auto test_lambda = [&](auto& functor) { + value_type 
value1; + functor.init(value1); + EXPECT_EQ(value1.val, 0); + EXPECT_EQ(value1.is_initial, true); + + value_type value2; + value2.val = 1; + value2.is_initial = false; + functor.join(value1, value2); + EXPECT_EQ(value1.val, 1); + EXPECT_EQ(value1.is_initial, false); + + functor.init(value1); + functor.join(value2, value1); + EXPECT_EQ(value2.val, 1); + EXPECT_EQ(value2.is_initial, false); + + functor.init(value2); + functor.join(value2, value1); + EXPECT_EQ(value2.val, 0); + EXPECT_EQ(value2.is_initial, true); + + value1.val = 3; + value1.is_initial = false; + value2.val = 2; + value2.is_initial = false; + functor.join(value2, value1); + EXPECT_EQ(value2.val, 6); + EXPECT_EQ(value2.is_initial, false); + }; + + int dummy = 0; + using view_type = Kokkos::View; + view_type dummy_view("dummy_view", 0); + using unary_op_type = + KE::Impl::StdNumericScanIdentityReferenceUnaryFunctor; + { + using functor_type = KE::Impl::TransformInclusiveScanNoInitValueFunctor< + exespace, int, int, view_type, view_type, MultiplyFunctor, + unary_op_type>; + functor_type functor(dummy_view, dummy_view, {}, {}); + + test_lambda(functor); + } + + { + using functor_type = KE::Impl::TransformInclusiveScanWithInitValueFunctor< + exespace, int, int, view_type, view_type, MultiplyFunctor, + unary_op_type>; + functor_type functor(dummy_view, dummy_view, {}, {}, dummy); + + test_lambda(functor); + } +} + } // namespace TransformIncScan } // namespace stdalgos } // namespace Test From 0c681edd0bf829f818129627c6124807c8e15638 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Thu, 20 Apr 2023 17:00:14 -0400 Subject: [PATCH 379/496] SYCL: Use in-order queue for SYCL+Cuda --- algorithms/unit_tests/TestStdAlgorithmsCopyIf.cpp | 3 --- algorithms/unit_tests/TestStdAlgorithmsRemove.cpp | 3 --- algorithms/unit_tests/TestStdAlgorithmsRemoveCopy.cpp | 3 --- .../unit_tests/TestStdAlgorithmsRemoveCopyIf.cpp | 3 --- algorithms/unit_tests/TestStdAlgorithmsRemoveIf.cpp | 3 --- 
algorithms/unit_tests/TestStdAlgorithmsReplaceIf.cpp | 3 --- algorithms/unit_tests/TestStdAlgorithmsRotate.cpp | 3 --- algorithms/unit_tests/TestStdAlgorithmsShiftLeft.cpp | 3 --- algorithms/unit_tests/TestStdAlgorithmsShiftRight.cpp | 3 --- algorithms/unit_tests/TestStdAlgorithmsUnique.cpp | 3 --- algorithms/unit_tests/TestStdAlgorithmsUniqueCopy.cpp | 3 --- containers/unit_tests/TestScatterView.hpp | 7 ------- core/perf_test/PerfTest_ExecSpacePartitioning.cpp | 3 --- core/src/SYCL/Kokkos_SYCL_Instance.cpp | 10 +++++++++- core/src/SYCL/Kokkos_SYCL_Space.cpp | 8 +------- core/unit_test/TestCrs.hpp | 6 ------ 16 files changed, 10 insertions(+), 57 deletions(-) diff --git a/algorithms/unit_tests/TestStdAlgorithmsCopyIf.cpp b/algorithms/unit_tests/TestStdAlgorithmsCopyIf.cpp index a1d6548267..e21d50f69b 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsCopyIf.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsCopyIf.cpp @@ -270,9 +270,6 @@ void run_all_scenarios() { } TEST(std_algorithms_mod_seq_ops, copy_if) { -#if defined(KOKKOS_ENABLE_SYCL) && !defined(KOKKOS_ARCH_INTEL_GPU) - GTEST_SKIP() << "skipping for SYCL+Cuda"; -#endif run_all_scenarios(); run_all_scenarios(); } diff --git a/algorithms/unit_tests/TestStdAlgorithmsRemove.cpp b/algorithms/unit_tests/TestStdAlgorithmsRemove.cpp index e075ca78e0..8832d71f95 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsRemove.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsRemove.cpp @@ -195,9 +195,6 @@ void run_all_scenarios() { } TEST(std_algorithms_mod_seq_ops, remove) { -#if defined(KOKKOS_ENABLE_SYCL) && !defined(KOKKOS_ARCH_INTEL_GPU) - GTEST_SKIP() << "skipping for SYCL+Cuda"; -#endif run_all_scenarios(); run_all_scenarios(); } diff --git a/algorithms/unit_tests/TestStdAlgorithmsRemoveCopy.cpp b/algorithms/unit_tests/TestStdAlgorithmsRemoveCopy.cpp index 59fd63a0b1..949f8f60c9 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsRemoveCopy.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsRemoveCopy.cpp @@ -224,9 
+224,6 @@ void run_all_scenarios() { } TEST(std_algorithms_mod_seq_ops, remove_copy) { -#if defined(KOKKOS_ENABLE_SYCL) && !defined(KOKKOS_ARCH_INTEL_GPU) - GTEST_SKIP() << "skipping for SYCL+Cuda"; -#endif run_all_scenarios(); run_all_scenarios(); run_all_scenarios(); diff --git a/algorithms/unit_tests/TestStdAlgorithmsRemoveCopyIf.cpp b/algorithms/unit_tests/TestStdAlgorithmsRemoveCopyIf.cpp index c4d6e99f2a..9dc1e4a7e1 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsRemoveCopyIf.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsRemoveCopyIf.cpp @@ -208,9 +208,6 @@ void run_all_scenarios() { } TEST(std_algorithms_mod_seq_ops, remove_copy_if) { -#if defined(KOKKOS_ENABLE_SYCL) && !defined(KOKKOS_ARCH_INTEL_GPU) - GTEST_SKIP() << "skipping for SYCL+Cuda"; -#endif run_all_scenarios(); run_all_scenarios(); } diff --git a/algorithms/unit_tests/TestStdAlgorithmsRemoveIf.cpp b/algorithms/unit_tests/TestStdAlgorithmsRemoveIf.cpp index 2e96f8727e..e9d15f29d8 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsRemoveIf.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsRemoveIf.cpp @@ -192,9 +192,6 @@ void run_all_scenarios() { } TEST(std_algorithms_mod_seq_ops, remove_if) { -#if defined(KOKKOS_ENABLE_SYCL) && !defined(KOKKOS_ARCH_INTEL_GPU) - GTEST_SKIP() << "skipping for SYCL+Cuda"; -#endif run_all_scenarios(); run_all_scenarios(); } diff --git a/algorithms/unit_tests/TestStdAlgorithmsReplaceIf.cpp b/algorithms/unit_tests/TestStdAlgorithmsReplaceIf.cpp index 548eb347b2..f481144e1c 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsReplaceIf.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsReplaceIf.cpp @@ -216,9 +216,6 @@ void run_all_scenarios() { } TEST(std_algorithms_replace_ops_test, replace_if) { -#if defined(KOKKOS_ENABLE_SYCL) && !defined(KOKKOS_ARCH_INTEL_GPU) - GTEST_SKIP() << "skipping for SYCL+Cuda"; -#endif run_all_scenarios(); run_all_scenarios(); run_all_scenarios(); diff --git a/algorithms/unit_tests/TestStdAlgorithmsRotate.cpp 
b/algorithms/unit_tests/TestStdAlgorithmsRotate.cpp index 4de968c07c..a5a6f99bac 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsRotate.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsRotate.cpp @@ -234,9 +234,6 @@ void run_all_scenarios() { } TEST(std_algorithms_mod_seq_ops, rotate) { -#if defined(KOKKOS_ENABLE_SYCL) && !defined(KOKKOS_ARCH_INTEL_GPU) - GTEST_SKIP() << "skipping for SYCL+Cuda"; -#endif run_all_scenarios(); run_all_scenarios(); run_all_scenarios(); diff --git a/algorithms/unit_tests/TestStdAlgorithmsShiftLeft.cpp b/algorithms/unit_tests/TestStdAlgorithmsShiftLeft.cpp index f3e3fc6260..8e4ced9635 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsShiftLeft.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsShiftLeft.cpp @@ -202,9 +202,6 @@ void run_all_scenarios() { } TEST(std_algorithms_mod_seq_ops, shift_left) { -#if defined(KOKKOS_ENABLE_SYCL) && !defined(KOKKOS_ARCH_INTEL_GPU) - GTEST_SKIP() << "skipping for SYCL+Cuda"; -#endif run_all_scenarios(); run_all_scenarios(); run_all_scenarios(); diff --git a/algorithms/unit_tests/TestStdAlgorithmsShiftRight.cpp b/algorithms/unit_tests/TestStdAlgorithmsShiftRight.cpp index d6b631ea7a..a1614be027 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsShiftRight.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsShiftRight.cpp @@ -206,9 +206,6 @@ void run_all_scenarios() { } TEST(std_algorithms_mod_seq_ops, shift_right) { -#if defined(KOKKOS_ENABLE_SYCL) && !defined(KOKKOS_ARCH_INTEL_GPU) - GTEST_SKIP() << "skipping for SYCL+Cuda"; -#endif run_all_scenarios(); run_all_scenarios(); run_all_scenarios(); diff --git a/algorithms/unit_tests/TestStdAlgorithmsUnique.cpp b/algorithms/unit_tests/TestStdAlgorithmsUnique.cpp index 636e5f15ba..a810d31d82 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsUnique.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsUnique.cpp @@ -273,9 +273,6 @@ void run_all_scenarios() { } TEST(std_algorithms_mod_seq_ops, unique) { -#if defined(KOKKOS_ENABLE_SYCL) && 
!defined(KOKKOS_ARCH_INTEL_GPU) - GTEST_SKIP() << "skipping for SYCL+Cuda"; -#endif run_all_scenarios(); run_all_scenarios(); } diff --git a/algorithms/unit_tests/TestStdAlgorithmsUniqueCopy.cpp b/algorithms/unit_tests/TestStdAlgorithmsUniqueCopy.cpp index 9116ca263b..f609d8517e 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsUniqueCopy.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsUniqueCopy.cpp @@ -322,9 +322,6 @@ void run_all_scenarios() { } TEST(std_algorithms_mod_seq_ops, unique_copy) { -#if defined(KOKKOS_ENABLE_SYCL) && !defined(KOKKOS_ARCH_INTEL_GPU) - GTEST_SKIP() << "skipping for SYCL+Cuda"; -#endif run_all_scenarios(); run_all_scenarios(); } diff --git a/containers/unit_tests/TestScatterView.hpp b/containers/unit_tests/TestScatterView.hpp index 2238a43c8a..c9ad65cc2b 100644 --- a/containers/unit_tests/TestScatterView.hpp +++ b/containers/unit_tests/TestScatterView.hpp @@ -758,9 +758,6 @@ void test_scatter_view(int64_t n) { } TEST(TEST_CATEGORY, scatterview) { -#if defined(KOKKOS_ENABLE_SYCL) && !defined(KOKKOS_ARCH_INTEL_GPU) - GTEST_SKIP() << "skipping for SYCL+Cuda"; -#endif test_scatter_view( 10); @@ -801,10 +798,6 @@ TEST(TEST_CATEGORY, scatterview) { } TEST(TEST_CATEGORY, scatterview_devicetype) { -#if defined(KOKKOS_ENABLE_SYCL) && !defined(KOKKOS_ARCH_INTEL_GPU) - GTEST_SKIP() << "skipping for SYCL+Cuda"; -#endif - using device_type = Kokkos::Device; diff --git a/core/perf_test/PerfTest_ExecSpacePartitioning.cpp b/core/perf_test/PerfTest_ExecSpacePartitioning.cpp index 7115661d7d..d2a3d0b823 100644 --- a/core/perf_test/PerfTest_ExecSpacePartitioning.cpp +++ b/core/perf_test/PerfTest_ExecSpacePartitioning.cpp @@ -154,8 +154,6 @@ struct FunctorTeamReduce { } }; -// skip for SYCL+Cuda -#if !(defined(KOKKOS_ENABLE_SYCL) && !defined(KOKKOS_ARCH_INTEL_GPU)) static void OverlapRangePolicy(benchmark::State& state) { int N = state.range(0); int M = state.range(1); @@ -697,6 +695,5 @@ static void OverlapTeamPolicy(benchmark::State& state) { 
BENCHMARK(OverlapTeamPolicy) ->ArgNames({"N", "M", "R"}) ->Args({20, 1'000'000, 10}); -#endif // skip for SYCL+Cuda } // namespace Test diff --git a/core/src/SYCL/Kokkos_SYCL_Instance.cpp b/core/src/SYCL/Kokkos_SYCL_Instance.cpp index 24f38d7c06..ea7498565e 100644 --- a/core/src/SYCL/Kokkos_SYCL_Instance.cpp +++ b/core/src/SYCL/Kokkos_SYCL_Instance.cpp @@ -75,7 +75,15 @@ void SYCLInternal::initialize(const sycl::device& d) { Kokkos::Impl::throw_runtime_exception( "There was an asynchronous SYCL error!\n"); }; - initialize(sycl::queue{d, exception_handler}); + // FIXME SYCL+Cuda Apparently, submit_barrier doesn't quite work as expected + // for oneAPI 2023.0.0 on NVIDIA GPUs. +#if defined(KOKKOS_IMPL_ARCH_NVIDIA_GPU) + initialize( + sycl::queue{d, exception_handler, sycl::property::queue::in_order()}); +#else + initialize( + sycl::queue{d, exception_handler}); +#endif } // FIXME_SYCL diff --git a/core/src/SYCL/Kokkos_SYCL_Space.cpp b/core/src/SYCL/Kokkos_SYCL_Space.cpp index dab4f4b3bf..62af720b7c 100644 --- a/core/src/SYCL/Kokkos_SYCL_Space.cpp +++ b/core/src/SYCL/Kokkos_SYCL_Space.cpp @@ -40,13 +40,7 @@ void DeepCopySYCL(void* dst, const void* src, size_t n) { void DeepCopyAsyncSYCL(const Kokkos::Experimental::SYCL& instance, void* dst, const void* src, size_t n) { sycl::queue& q = *instance.impl_internal_space_instance()->m_queue; - // FIXME_SYCL memcpy doesn't respect submit_barrier which means that we need - // to actually fence the execution space to make sure the memcpy is properly - // enqueued when using out-of-order queues.
-#ifndef KOKKOS_ARCH_INTEL_GPU - q.wait_and_throw(); -#endif - auto event = q.memcpy(dst, src, n); + auto event = q.memcpy(dst, src, n); q.ext_oneapi_submit_barrier(std::vector{event}); } diff --git a/core/unit_test/TestCrs.hpp b/core/unit_test/TestCrs.hpp index 9efebb8a54..34fc4d0514 100644 --- a/core/unit_test/TestCrs.hpp +++ b/core/unit_test/TestCrs.hpp @@ -174,9 +174,6 @@ void test_constructor(std::int32_t nrows) { } // anonymous namespace TEST(TEST_CATEGORY, crs_count_fill) { -#if defined(KOKKOS_ENABLE_SYCL) && !defined(KOKKOS_ARCH_INTEL_GPU) - GTEST_SKIP() << "skipping for SYCL+Cuda"; -#endif test_count_fill(0); test_count_fill(1); test_count_fill(2); @@ -188,9 +185,6 @@ TEST(TEST_CATEGORY, crs_count_fill) { } TEST(TEST_CATEGORY, crs_copy_constructor) { -#if defined(KOKKOS_ENABLE_SYCL) && !defined(KOKKOS_ARCH_INTEL_GPU) - GTEST_SKIP() << "skipping for SYCL+Cuda"; -#endif test_constructor(0); test_constructor(1); test_constructor(2); From 94446348a8bc1ee6deadd1738a31d7dcdaa0d0b1 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Sat, 22 Apr 2023 09:52:18 -0400 Subject: [PATCH 380/496] perf_test is still not working --- core/perf_test/PerfTest_ExecSpacePartitioning.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/core/perf_test/PerfTest_ExecSpacePartitioning.cpp b/core/perf_test/PerfTest_ExecSpacePartitioning.cpp index d2a3d0b823..7115661d7d 100644 --- a/core/perf_test/PerfTest_ExecSpacePartitioning.cpp +++ b/core/perf_test/PerfTest_ExecSpacePartitioning.cpp @@ -154,6 +154,8 @@ struct FunctorTeamReduce { } }; +// skip for SYCL+Cuda +#if !(defined(KOKKOS_ENABLE_SYCL) && !defined(KOKKOS_ARCH_INTEL_GPU)) static void OverlapRangePolicy(benchmark::State& state) { int N = state.range(0); int M = state.range(1); @@ -695,5 +697,6 @@ static void OverlapTeamPolicy(benchmark::State& state) { BENCHMARK(OverlapTeamPolicy) ->ArgNames({"N", "M", "R"}) ->Args({20, 1'000'000, 10}); +#endif // skip for SYCL+Cuda } // namespace Test From 
06dbc151d3bb9807302ad24697d6a34c45f669ce Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Mon, 24 Apr 2023 15:16:39 +0000 Subject: [PATCH 381/496] Fix PerfTests by limiting GramSchmidt --- core/perf_test/PerfTestGramSchmidt.cpp | 11 +++++++++++ core/perf_test/PerfTest_ExecSpacePartitioning.cpp | 3 --- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/core/perf_test/PerfTestGramSchmidt.cpp b/core/perf_test/PerfTestGramSchmidt.cpp index 949cc07e6b..ddfa73d4ba 100644 --- a/core/perf_test/PerfTestGramSchmidt.cpp +++ b/core/perf_test/PerfTestGramSchmidt.cpp @@ -175,6 +175,16 @@ static void GramSchmidt(benchmark::State& state) { } } +// FIXME_SYCL SYCL+Cuda reports "an illegal memory access was encountered" +#if defined(KOKKOS_ENABLE_SYCL) && defined(KOKKOS_IMPL_ARCH_NVIDIA_GPU) +BENCHMARK(GramSchmidt) + ->ArgName("Count") + ->ArgsProduct({ + benchmark::CreateRange(1 << 10, 1 << 18, 2), + }) + ->UseManualTime() + ->Iterations(5); +#else BENCHMARK(GramSchmidt) ->ArgName("Count") ->ArgsProduct({ @@ -182,5 +192,6 @@ BENCHMARK(GramSchmidt) }) ->UseManualTime() ->Iterations(5); +#endif } // namespace Test diff --git a/core/perf_test/PerfTest_ExecSpacePartitioning.cpp b/core/perf_test/PerfTest_ExecSpacePartitioning.cpp index 7115661d7d..d2a3d0b823 100644 --- a/core/perf_test/PerfTest_ExecSpacePartitioning.cpp +++ b/core/perf_test/PerfTest_ExecSpacePartitioning.cpp @@ -154,8 +154,6 @@ struct FunctorTeamReduce { } }; -// skip for SYCL+Cuda -#if !(defined(KOKKOS_ENABLE_SYCL) && !defined(KOKKOS_ARCH_INTEL_GPU)) static void OverlapRangePolicy(benchmark::State& state) { int N = state.range(0); int M = state.range(1); @@ -697,6 +695,5 @@ static void OverlapTeamPolicy(benchmark::State& state) { BENCHMARK(OverlapTeamPolicy) ->ArgNames({"N", "M", "R"}) ->Args({20, 1'000'000, 10}); -#endif // skip for SYCL+Cuda } // namespace Test From a92e091631c26dde323cc7f1d1581b71f0b11d98 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Tue, 11 Apr 2023 17:35:07 +0000 Subject: [PATCH 
382/496] Only pass one wrapper object in SYCL reductions --- core/src/SYCL/Kokkos_SYCL_Parallel_Reduce.hpp | 100 +++++++++--------- core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp | 51 +++++---- 2 files changed, 75 insertions(+), 76 deletions(-) diff --git a/core/src/SYCL/Kokkos_SYCL_Parallel_Reduce.hpp b/core/src/SYCL/Kokkos_SYCL_Parallel_Reduce.hpp index 4bdedc64e1..0e2fee1da8 100644 --- a/core/src/SYCL/Kokkos_SYCL_Parallel_Reduce.hpp +++ b/core/src/SYCL/Kokkos_SYCL_Parallel_Reduce.hpp @@ -209,12 +209,11 @@ class ParallelReduce, p.space().impl_internal_space_instance()->m_mutexScratchSpace) {} private: - template + template sycl::event sycl_direct_launch( - const PolicyType& policy, const FunctorWrapper& functor_wrapper, - const ReducerWrapper& reducer_wrapper, - const std::vector& memcpy_events) const { + const PolicyType& policy, + const CombinedFunctorReducerWrapper& functor_reducer_wrapper, + const sycl::event& memcpy_event) const { // Convenience references const Kokkos::Experimental::SYCL& space = policy.space(); Kokkos::Experimental::Impl::SYCLInternal& instance = @@ -241,11 +240,13 @@ class ParallelReduce, auto parallel_reduce_event = q.submit([&](sycl::handler& cgh) { const auto begin = policy.begin(); - cgh.depends_on(memcpy_events); + cgh.depends_on(memcpy_event); cgh.single_task([=]() { - const FunctorType& functor = functor_wrapper.get_functor(); - const ReducerType& reducer = reducer_wrapper.get_functor(); - reference_type update = reducer.init(results_ptr); + const FunctorType& functor = + functor_reducer_wrapper.get_functor().get_functor(); + const ReducerType& reducer = + functor_reducer_wrapper.get_functor().get_reducer(); + reference_type update = reducer.init(results_ptr); if (size == 1) { if constexpr (std::is_void_v) functor(begin, update); @@ -284,8 +285,10 @@ class ParallelReduce, const auto global_id = wgroup_size * item.get_group_linear_id() * values_per_thread + local_id; - const FunctorType& functor = functor_wrapper.get_functor(); - const 
ReducerType& reducer = reducer_wrapper.get_functor(); + const FunctorType& functor = + functor_reducer_wrapper.get_functor().get_functor(); + const ReducerType& reducer = + functor_reducer_wrapper.get_functor().get_reducer(); using index_type = typename Policy::index_type; const auto upper_bound = std::min( @@ -423,7 +426,7 @@ class ParallelReduce, sycl::local_accessor local_mem( sycl::range<1>(wgroup_size) * std::max(value_count, 1u), cgh); - cgh.depends_on(memcpy_events); + cgh.depends_on(memcpy_event); auto reduction_lambda = reduction_lambda_factory(local_mem, num_teams_done, results_ptr); @@ -455,19 +458,16 @@ class ParallelReduce, *m_policy.space().impl_internal_space_instance(); using IndirectKernelMem = Kokkos::Experimental::Impl::SYCLInternal::IndirectKernelMem; - IndirectKernelMem& indirectKernelMem = instance.get_indirect_kernel_mem(); - IndirectKernelMem& indirectReducerMem = instance.get_indirect_kernel_mem(); - - auto functor_wrapper = Experimental::Impl::make_sycl_function_wrapper( - m_functor_reducer.get_functor(), indirectKernelMem); - auto reducer_wrapper = Experimental::Impl::make_sycl_function_wrapper( - m_functor_reducer.get_reducer(), indirectReducerMem); - - sycl::event event = sycl_direct_launch( - m_policy, functor_wrapper, reducer_wrapper, - {functor_wrapper.get_copy_event(), reducer_wrapper.get_copy_event()}); - functor_wrapper.register_event(event); - reducer_wrapper.register_event(event); + IndirectKernelMem& indirectKernelMem = instance.get_indirect_kernel_mem(); + + auto functor_reducer_wrapper = + Experimental::Impl::make_sycl_function_wrapper(m_functor_reducer, + indirectKernelMem); + + sycl::event event = + sycl_direct_launch(m_policy, functor_reducer_wrapper, + functor_reducer_wrapper.get_copy_event()); + functor_reducer_wrapper.register_event(event); } private: @@ -536,12 +536,11 @@ class ParallelReducem_mutexScratchSpace) {} private: - template + template sycl::event sycl_direct_launch( - const PolicyType& policy, const 
FunctorWrapper& functor_wrapper, - const ReducerWrapper& reducer_wrapper, - const std::vector& memcpy_events) const { + const PolicyType& policy, + const CombinedFunctorReducerWrapper& functor_reducer_wrapper, + const sycl::event& memcpy_event) const { // Convenience references Kokkos::Experimental::Impl::SYCLInternal& instance = *m_space.impl_internal_space_instance(); @@ -577,10 +576,12 @@ class ParallelReduce item) { - const auto local_id = item.get_local_linear_id(); - const FunctorType& functor = functor_wrapper.get_functor(); - const ReducerType& reducer = reducer_wrapper.get_functor(); + const auto local_id = item.get_local_linear_id(); + const FunctorType& functor = + functor_reducer_wrapper.get_functor().get_functor(); + const ReducerType& reducer = + functor_reducer_wrapper.get_functor().get_reducer(); // In the first iteration, we call functor to initialize the local // memory. Otherwise, the local memory is initialized with the @@ -751,19 +754,16 @@ class ParallelReduce, template sycl::event sycl_direct_launch(const Policy& policy, const FunctorWrapper& functor_wrapper, - const sycl::event& memcpy_events) const { + const sycl::event& memcpy_event) const { // Convenience references const Kokkos::Experimental::SYCL& space = policy.space(); sycl::queue& q = space.sycl_queue(); @@ -431,7 +431,7 @@ class ParallelFor, // be used gives a runtime error. 
// cgh.use_kernel_bundle(kernel_bundle); - cgh.depends_on(memcpy_events); + cgh.depends_on(memcpy_event); cgh.parallel_for( sycl::nd_range<2>( sycl::range<2>(m_team_size, m_league_size * final_vector_size), @@ -551,12 +551,11 @@ class ParallelReduce m_scratch_lock; int m_scratch_pool_id = -1; - template + template sycl::event sycl_direct_launch( - const PolicyType& policy, const FunctorWrapper& functor_wrapper, - const ReducerWrapper& reducer_wrapper, - const std::vector& memcpy_events) const { + const PolicyType& policy, + const CombinedFunctorReducerWrapper& functor_reducer_wrapper, + const sycl::event& memcpy_event) const { // Convenience references const Kokkos::Experimental::SYCL& space = policy.space(); Kokkos::Experimental::Impl::SYCLInternal& instance = @@ -593,12 +592,14 @@ class ParallelReduce const global_scratch_ptr = m_global_scratch_ptr; - cgh.depends_on(memcpy_events); + cgh.depends_on(memcpy_event); cgh.parallel_for( sycl::nd_range<2>(sycl::range<2>(1, 1), sycl::range<2>(1, 1)), [=](sycl::nd_item<2> item) { - const FunctorType& functor = functor_wrapper.get_functor(); - const ReducerType& reducer = reducer_wrapper.get_functor(); + const FunctorType& functor = + functor_reducer_wrapper.get_functor().get_functor(); + const ReducerType& reducer = + functor_reducer_wrapper.get_functor().get_reducer(); reference_type update = reducer.init(results_ptr); if (size == 1) { @@ -653,9 +654,11 @@ class ParallelReduce( local_mem[wgroup_size * std::max(value_count, 1u)]); - const auto local_id = item.get_local_linear_id(); - const FunctorType& functor = functor_wrapper.get_functor(); - const ReducerType& reducer = reducer_wrapper.get_functor(); + const auto local_id = item.get_local_linear_id(); + const FunctorType& functor = + functor_reducer_wrapper.get_functor().get_functor(); + const ReducerType& reducer = + functor_reducer_wrapper.get_functor().get_reducer(); if constexpr (ReducerType::static_value_size() == 0) { reference_type update = @@ -791,7 +794,7 @@ 
class ParallelReduce( @@ -822,20 +825,16 @@ class ParallelReduce Date: Mon, 24 Apr 2023 20:09:14 +0000 Subject: [PATCH 383/496] Explicitly cast to CombinedFunctorReducerType --- core/src/SYCL/Kokkos_SYCL_Parallel_Reduce.hpp | 34 +++++++++---------- core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp | 16 ++++----- 2 files changed, 25 insertions(+), 25 deletions(-) diff --git a/core/src/SYCL/Kokkos_SYCL_Parallel_Reduce.hpp b/core/src/SYCL/Kokkos_SYCL_Parallel_Reduce.hpp index 0e2fee1da8..a29e8010d8 100644 --- a/core/src/SYCL/Kokkos_SYCL_Parallel_Reduce.hpp +++ b/core/src/SYCL/Kokkos_SYCL_Parallel_Reduce.hpp @@ -242,11 +242,11 @@ class ParallelReduce, const auto begin = policy.begin(); cgh.depends_on(memcpy_event); cgh.single_task([=]() { - const FunctorType& functor = - functor_reducer_wrapper.get_functor().get_functor(); - const ReducerType& reducer = - functor_reducer_wrapper.get_functor().get_reducer(); - reference_type update = reducer.init(results_ptr); + const CombinedFunctorReducerType& functor_reducer = + functor_reducer_wrapper.get_functor(); + const FunctorType& functor = functor_reducer.get_functor(); + const ReducerType& reducer = functor_reducer.get_reducer(); + reference_type update = reducer.init(results_ptr); if (size == 1) { if constexpr (std::is_void_v) functor(begin, update); @@ -285,10 +285,10 @@ class ParallelReduce, const auto global_id = wgroup_size * item.get_group_linear_id() * values_per_thread + local_id; - const FunctorType& functor = - functor_reducer_wrapper.get_functor().get_functor(); - const ReducerType& reducer = - functor_reducer_wrapper.get_functor().get_reducer(); + const CombinedFunctorReducerType& functor_reducer = + functor_reducer_wrapper.get_functor(); + const FunctorType& functor = functor_reducer.get_functor(); + const ReducerType& reducer = functor_reducer.get_reducer(); using index_type = typename Policy::index_type; const auto upper_bound = std::min( @@ -578,10 +578,10 @@ class ParallelReduce item) { const auto local_id = 
item.get_local_linear_id(); - const FunctorType& functor = - functor_reducer_wrapper.get_functor().get_functor(); - const ReducerType& reducer = - functor_reducer_wrapper.get_functor().get_reducer(); + const CombinedFunctorReducerType& functor_reducer = + functor_reducer_wrapper.get_functor(); + const FunctorType& functor = functor_reducer.get_functor(); + const ReducerType& reducer = functor_reducer.get_reducer(); // In the first iteration, we call functor to initialize the local // memory. Otherwise, the local memory is initialized with the diff --git a/core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp b/core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp index c11e37f9b8..b543e94a0b 100644 --- a/core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp +++ b/core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp @@ -596,10 +596,10 @@ class ParallelReduce(sycl::range<2>(1, 1), sycl::range<2>(1, 1)), [=](sycl::nd_item<2> item) { - const FunctorType& functor = - functor_reducer_wrapper.get_functor().get_functor(); - const ReducerType& reducer = - functor_reducer_wrapper.get_functor().get_reducer(); + const CombinedFunctorReducerType& functor_reducer = + functor_reducer_wrapper.get_functor(); + const FunctorType& functor = functor_reducer.get_functor(); + const ReducerType& reducer = functor_reducer.get_reducer(); reference_type update = reducer.init(results_ptr); if (size == 1) { @@ -655,10 +655,10 @@ class ParallelReduce( local_mem[wgroup_size * std::max(value_count, 1u)]); const auto local_id = item.get_local_linear_id(); - const FunctorType& functor = - functor_reducer_wrapper.get_functor().get_functor(); - const ReducerType& reducer = - functor_reducer_wrapper.get_functor().get_reducer(); + const CombinedFunctorReducerType& functor_reducer = + functor_reducer_wrapper.get_functor(); + const FunctorType& functor = functor_reducer.get_functor(); + const ReducerType& reducer = functor_reducer.get_reducer(); if constexpr (ReducerType::static_value_size() == 0) { reference_type update = From 
4b27b7d9a0f47505a6a491781cc2e519fb6e705d Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Wed, 26 Apr 2023 15:09:34 -0400 Subject: [PATCH 384/496] Fix Kokkos_SIMD with AVX2 on 64-bit architectures (#6075) * Fix Kokkos_SIMD with AVX2 on 64-bit architectures * Restore value_type for mask type --- simd/src/Kokkos_SIMD_AVX2.hpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/simd/src/Kokkos_SIMD_AVX2.hpp b/simd/src/Kokkos_SIMD_AVX2.hpp index 86b944efa5..eacbfa2393 100644 --- a/simd/src/Kokkos_SIMD_AVX2.hpp +++ b/simd/src/Kokkos_SIMD_AVX2.hpp @@ -680,6 +680,8 @@ template <> class simd> { __m256i m_value; + static_assert(sizeof(long long) == 8); + public: using value_type = std::int64_t; using abi_type = simd_abi::avx2_fixed_size<4>; @@ -727,11 +729,13 @@ class simd> { } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, element_aligned_tag) { - m_value = _mm256_maskload_epi64(ptr, static_cast<__m256i>(mask_type(true))); + m_value = _mm256_maskload_epi64(reinterpret_cast(ptr), + static_cast<__m256i>(mask_type(true))); } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to( value_type* ptr, element_aligned_tag) const { - _mm256_maskstore_epi64(ptr, static_cast<__m256i>(mask_type(true)), m_value); + _mm256_maskstore_epi64(reinterpret_cast(ptr), + static_cast<__m256i>(mask_type(true)), m_value); } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator __m256i() const { From b3bb4a6cf72042545e7256d9eb84020fe01bdc47 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cezary=20Skrzy=C5=84ski?= Date: Wed, 26 Apr 2023 22:08:13 +0200 Subject: [PATCH 385/496] Update changelog (#6058) * Update changelog to 4.0.01 [ci skip] * fix based on comments --------- Co-authored-by: Francesco Rizzi --- CHANGELOG.md | 32 +++++++++++++++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 03cf75ff88..8045a5c0a1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,7 +1,37 @@ # Change Log 
+## [4.0.01](https://github.com/kokkos/kokkos/tree/4.0.01) (2023-04-14) +[Full Changelog](https://github.com/kokkos/kokkos/compare/4.0.00...4.0.01) + +### Backend and Architecture Enhancements: + +#### CUDA: + +- Allow NVCC 12 to compile using C++20 flag [\#6020](https://github.com/kokkos/kokkos/pull/6020) +- Add CUDA Ada architecture support [\#6022](https://github.com/kokkos/kokkos/pull/6022) + +#### HIP: + +- Add support for AMDGPU target NAVI31 / RX 7900 XT(X): gfx1100 [\#6021](https://github.com/kokkos/kokkos/pull/6021) +- HIP: Fix warning from `std::memcpy` [\#6019](https://github.com/kokkos/kokkos/pull/6019) + +#### SYCL: +- Fix `SYCLTeamMember` to take arguments for scratch sizes as `std::size_t` [\#5986](https://github.com/kokkos/kokkos/pull/5986) + +### General Enhancements +- Fixup 4.0 change log [\#6023](https://github.com/kokkos/kokkos/pull/6023) + +### Build System Changes +- Cherry-pick TriBITS update from Trilinos [\#6037](https://github.com/kokkos/kokkos/pull/6037) +- CMake: update package compatibility mode when building within Trilinos [\#6013](https://github.com/kokkos/kokkos/pull/6013) + +### Bug Fixes +- Fix an incorrectly returning size for SIMD uint64_t in AVX2 [\#6011](https://github.com/kokkos/kokkos/pull/6011) +- Desul atomics: wrong value for `desul::Impl::numeric_limits_max` [\#6018](https://github.com/kokkos/kokkos/pull/6018) +- Fix warning in some user code when using std::memcpy [\#6000](https://github.com/kokkos/kokkos/pull/6000) + ## [4.0.0](https://github.com/kokkos/kokkos/tree/4.0.0) (2023-02-21) -[Full Changelog](https://github.com/kokkos/kokkos/compare/3.7.01...4.0.0) +[Full Changelog](https://github.com/kokkos/kokkos/compare/3.7.01...4.0.00) ### Features: - Allow value types without default constructor in `Kokkos::View` with `Kokkos::WithoutInitializing` [\#5307](https://github.com/kokkos/kokkos/pull/5307) From d5fa56e1d0d860dbc720091ab2f5c6115363976a Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Thu, 27 Apr 2023 15:20:27 
+0000 Subject: [PATCH 386/496] Fix up SYCL execution space instance creation for Intel GPUs --- core/src/SYCL/Kokkos_SYCL_Instance.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/core/src/SYCL/Kokkos_SYCL_Instance.cpp b/core/src/SYCL/Kokkos_SYCL_Instance.cpp index ea7498565e..7f33a4f948 100644 --- a/core/src/SYCL/Kokkos_SYCL_Instance.cpp +++ b/core/src/SYCL/Kokkos_SYCL_Instance.cpp @@ -81,8 +81,7 @@ void SYCLInternal::initialize(const sycl::device& d) { initialize( sycl::queue{d, exception_handler, sycl::property::queue::in_order()}); #else - initialize( - sycl::queue{d, exception_handler); + initialize(sycl::queue{d, exception_handler}); #endif } From 55bbd9f175307a2ad42d205a3cebb5a32698545b Mon Sep 17 00:00:00 2001 From: Dong Hun Lee <59181952+ldh4@users.noreply.github.com> Date: Thu, 27 Apr 2023 20:57:47 -0600 Subject: [PATCH 387/496] Converted a shared_ptr to a host view in UnorderedMap (#6073) * Converted a shared_ptr in UnorderedMap to Kokkos::View * Apply suggestions from code review Co-authored-by: Daniel Arndt * quick adjustments * Ifdef'ed a lambda capture test out when cuda_lambda is set to be off * Changed back to using default execution space for the test unordered map * Applying comments from reviews * clang formatted * Fixed to use the correct execution space * Update containers/unit_tests/TestUnorderedMap.hpp Co-authored-by: Damien L-G * Specified execution policy type for rangepolicy --------- Co-authored-by: Daniel Arndt Co-authored-by: Dong Hun Lee Co-authored-by: Damien L-G --- containers/src/Kokkos_UnorderedMap.hpp | 12 +++---- containers/unit_tests/TestUnorderedMap.hpp | 39 ++++++++++++++++++++++ 2 files changed, 45 insertions(+), 6 deletions(-) diff --git a/containers/src/Kokkos_UnorderedMap.hpp b/containers/src/Kokkos_UnorderedMap.hpp index 377168fa55..aa62d32557 100644 --- a/containers/src/Kokkos_UnorderedMap.hpp +++ b/containers/src/Kokkos_UnorderedMap.hpp @@ -307,7 +307,7 @@ class UnorderedMap { : 
m_bounded_insert(true), m_hasher(hasher), m_equal_to(equal_to), - m_size(std::make_shared()), + m_size("UnorderedMap size"), m_available_indexes(calculate_capacity(capacity_hint)), m_hash_lists(view_alloc(WithoutInitializing, "UnorderedMap hash list"), Impl::find_hash_size(capacity())), @@ -347,7 +347,7 @@ class UnorderedMap { Kokkos::deep_copy(m_keys, tmp); } Kokkos::deep_copy(m_scalars, 0); - *m_size = 0; + m_size() = 0; } KOKKOS_INLINE_FUNCTION constexpr bool is_allocated() const { @@ -401,10 +401,10 @@ class UnorderedMap { size_type size() const { if (capacity() == 0u) return 0u; if (modified()) { - *m_size = m_available_indexes.count(); + m_size() = m_available_indexes.count(); reset_flag(modified_idx); } - return *m_size; + return m_size(); } /// \brief The current number of failed insert() calls. @@ -769,7 +769,7 @@ class UnorderedMap { tmp.m_bounded_insert = src.m_bounded_insert; tmp.m_hasher = src.m_hasher; tmp.m_equal_to = src.m_equal_to; - *tmp.m_size = *src.m_size; + tmp.m_size() = src.m_size(); tmp.m_available_indexes = bitset_type(src.capacity()); tmp.m_hash_lists = size_type_view( view_alloc(WithoutInitializing, "UnorderedMap hash list"), @@ -862,7 +862,7 @@ class UnorderedMap { bool m_bounded_insert; hasher_type m_hasher; equal_to_type m_equal_to; - std::shared_ptr m_size; + View m_size; bitset_type m_available_indexes; size_type_view m_hash_lists; size_type_view m_next_index; diff --git a/containers/unit_tests/TestUnorderedMap.hpp b/containers/unit_tests/TestUnorderedMap.hpp index c2d941b5bf..296c9f42ac 100644 --- a/containers/unit_tests/TestUnorderedMap.hpp +++ b/containers/unit_tests/TestUnorderedMap.hpp @@ -480,6 +480,45 @@ TEST(TEST_CATEGORY, UnorderedMap_consistent_size) { ASSERT_EQ(2u, m.size()); } +struct TestMapCopy { + using map_type = Kokkos::UnorderedMap; + map_type m_map; + + KOKKOS_FUNCTION + void test_insert_to_map_copy(map_type const &input_map, const int i) const { + auto map = input_map; + map.insert(i); + } + + KOKKOS_FUNCTION + 
void operator()(const int i) const { test_insert_to_map_copy(m_map, i); } +}; + +TEST(TEST_CATEGORY, UnorderedMap_shallow_copyable_on_device) { + TestMapCopy test_map_copy; + + Kokkos::parallel_for(Kokkos::RangePolicy(0, 1), + test_map_copy); + ASSERT_EQ(1u, test_map_copy.m_map.size()); +} + +#if !defined(KOKKOS_ENABLE_CUDA) || \ + (defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOS_ENABLE_CUDA_LAMBDA)) +void test_unordered_map_device_capture() { + TestMapCopy::map_type map; + + Kokkos::parallel_for( + Kokkos::RangePolicy(0, 1), + KOKKOS_LAMBDA(int const i) { map.insert(i); }); + + ASSERT_EQ(1u, map.size()); +} + +TEST(TEST_CATEGORY, UnorderedMap_lambda_capturable) { + test_unordered_map_device_capture(); +} +#endif + } // namespace Test #endif // KOKKOS_TEST_UNORDERED_MAP_HPP From de5c017eaf0111fbc81369ec4a23fe4ee7450c45 Mon Sep 17 00:00:00 2001 From: Seyong Lee Date: Fri, 28 Apr 2023 13:52:24 -0400 Subject: [PATCH 388/496] Update OpenACC FunctorAdapter (#6077) * Update OpenACC FunctorAdapter to be able to handle functors containing parallel loops. * Update OpenACC FunctorAdapter implementation to use enum types for template specialization. * Minor revision as suggested by the code review. 
--- .../OpenACC/Kokkos_OpenACC_FunctorAdapter.hpp | 45 ++++++++++++------- .../Kokkos_OpenACC_ParallelFor_MDRange.hpp | 4 +- .../Kokkos_OpenACC_ParallelFor_Range.hpp | 4 +- .../Kokkos_OpenACC_ParallelFor_Team.hpp | 7 ++- .../Kokkos_OpenACC_ParallelReduce_MDRange.hpp | 4 +- .../Kokkos_OpenACC_ParallelReduce_Range.hpp | 4 +- .../Kokkos_OpenACC_ParallelReduce_Team.hpp | 11 ++++- .../Kokkos_OpenACC_ParallelScan_Range.hpp | 5 ++- 8 files changed, 59 insertions(+), 25 deletions(-) diff --git a/core/src/OpenACC/Kokkos_OpenACC_FunctorAdapter.hpp b/core/src/OpenACC/Kokkos_OpenACC_FunctorAdapter.hpp index 1325e61e1d..f2fd1a3619 100644 --- a/core/src/OpenACC/Kokkos_OpenACC_FunctorAdapter.hpp +++ b/core/src/OpenACC/Kokkos_OpenACC_FunctorAdapter.hpp @@ -17,27 +17,40 @@ #ifndef KOKKOS_OPENACC_FUNCTOR_ADAPTER_HPP #define KOKKOS_OPENACC_FUNCTOR_ADAPTER_HPP +#include #include namespace Kokkos::Experimental::Impl { -template -class FunctorAdapter { - Functor m_functor; - using WorkTag = typename Policy::work_tag; - - public: - FunctorAdapter(Functor const &functor) : m_functor(functor) {} - - template - KOKKOS_FUNCTION void operator()(Args &&... args) const { - if constexpr (std::is_void_v) { - m_functor(static_cast(args)...); - } else { - m_functor(WorkTag(), static_cast(args)...); - } +enum class RoutineClause { worker, seq }; + +template +class FunctorAdapter; + +#define KOKKOS_IMPL_ACC_FUNCTOR_ADAPTER(CLAUSE) \ + template \ + class FunctorAdapter { \ + Functor m_functor; \ + using WorkTag = typename Policy::work_tag; \ + \ + public: \ + FunctorAdapter(Functor const &functor) : m_functor(functor) {} \ + \ + KOKKOS_IMPL_ACC_PRAGMA_HELPER(routine CLAUSE) \ + template \ + KOKKOS_FUNCTION void operator()(Args &&... 
args) const { \ + if constexpr (std::is_void_v) { \ + m_functor(static_cast(args)...); \ + } else { \ + m_functor(WorkTag(), static_cast(args)...); \ + } \ + } \ } -}; + +KOKKOS_IMPL_ACC_FUNCTOR_ADAPTER(worker); +KOKKOS_IMPL_ACC_FUNCTOR_ADAPTER(seq); + +#undef KOKKOS_IMPL_ACC_FUNCTOR_ADAPTER } // namespace Kokkos::Experimental::Impl diff --git a/core/src/OpenACC/Kokkos_OpenACC_ParallelFor_MDRange.hpp b/core/src/OpenACC/Kokkos_OpenACC_ParallelFor_MDRange.hpp index a55a18bc24..550436fe7b 100644 --- a/core/src/OpenACC/Kokkos_OpenACC_ParallelFor_MDRange.hpp +++ b/core/src/OpenACC/Kokkos_OpenACC_ParallelFor_MDRange.hpp @@ -640,7 +640,9 @@ template class Kokkos::Impl::ParallelFor, Kokkos::Experimental::OpenACC> { using Policy = MDRangePolicy; - Kokkos::Experimental::Impl::FunctorAdapter m_functor; + Kokkos::Experimental::Impl::FunctorAdapter< + Functor, Policy, Kokkos::Experimental::Impl::RoutineClause::seq> + m_functor; Policy m_policy; public: diff --git a/core/src/OpenACC/Kokkos_OpenACC_ParallelFor_Range.hpp b/core/src/OpenACC/Kokkos_OpenACC_ParallelFor_Range.hpp index ede93ec19e..6ddfc352fc 100644 --- a/core/src/OpenACC/Kokkos_OpenACC_ParallelFor_Range.hpp +++ b/core/src/OpenACC/Kokkos_OpenACC_ParallelFor_Range.hpp @@ -78,7 +78,9 @@ template class Kokkos::Impl::ParallelFor, Kokkos::Experimental::OpenACC> { using Policy = Kokkos::RangePolicy; - Kokkos::Experimental::Impl::FunctorAdapter m_functor; + Kokkos::Experimental::Impl::FunctorAdapter< + Functor, Policy, Kokkos::Experimental::Impl::RoutineClause::seq> + m_functor; Policy m_policy; using ScheduleType = Kokkos::Experimental::Impl::OpenACCScheduleType; diff --git a/core/src/OpenACC/Kokkos_OpenACC_ParallelFor_Team.hpp b/core/src/OpenACC/Kokkos_OpenACC_ParallelFor_Team.hpp index 1dc7b28912..b5cf670791 100644 --- a/core/src/OpenACC/Kokkos_OpenACC_ParallelFor_Team.hpp +++ b/core/src/OpenACC/Kokkos_OpenACC_ParallelFor_Team.hpp @@ -31,7 +31,9 @@ class Kokkos::Impl::ParallelFor, private: using Policy = 
Kokkos::Impl::TeamPolicyInternal; - Kokkos::Experimental::Impl::FunctorAdapter m_functor; + Kokkos::Experimental::Impl::FunctorAdapter< + FunctorType, Policy, Kokkos::Experimental::Impl::RoutineClause::seq> + m_functor; using Member = typename Policy::member_type; const Policy m_policy; @@ -130,7 +132,8 @@ class Kokkos::Impl::ParallelFor, private: using Policy = Kokkos::Impl::TeamPolicyInternal; - Kokkos::Experimental::Impl::FunctorAdapter m_functor; + Kokkos::Experimental::Impl::FunctorAdapter + m_functor; using Member = typename Policy::member_type; const Policy m_policy; diff --git a/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_MDRange.hpp b/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_MDRange.hpp index 121a2cfe3f..0ebd8b219f 100644 --- a/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_MDRange.hpp +++ b/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_MDRange.hpp @@ -76,7 +76,9 @@ class Kokkos::Impl::ParallelReduce( + Kokkos::Experimental::Impl::FunctorAdapter< + FunctorType, Policy, + Kokkos::Experimental::Impl::RoutineClause::seq>( m_functor_reducer.get_functor()), std::conditional_t< std::is_same_v, diff --git a/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_Range.hpp b/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_Range.hpp index 30f4797d83..e70b8997f0 100644 --- a/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_Range.hpp +++ b/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_Range.hpp @@ -74,7 +74,9 @@ class Kokkos::Impl::ParallelReduce( + Kokkos::Experimental::Impl::FunctorAdapter< + FunctorType, Policy, + Kokkos::Experimental::Impl::RoutineClause::seq>( m_functor_reducer.get_functor()), std::conditional_t< std::is_same_v, diff --git a/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_Team.hpp b/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_Team.hpp index 4276f0f167..d572072aba 100644 --- a/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_Team.hpp +++ b/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_Team.hpp @@ -21,6 +21,14 @@ #include #include +#ifdef 
KOKKOS_ENABLE_OPENACC_COLLAPSE_HIERARCHICAL_CONSTRUCTS +#define KOKKOS_IMPL_OPENACC_LOOP_CLAUSE \ + Kokkos::Experimental::Impl::RoutineClause::seq +#else +#define KOKKOS_IMPL_OPENACC_LOOP_CLAUSE \ + Kokkos::Experimental::Impl::RoutineClause::worker +#endif + //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- // Hierarchical Parallelism -> Team level implementation @@ -67,7 +75,8 @@ class Kokkos::Impl::ParallelReduce( + Kokkos::Experimental::Impl::FunctorAdapter< + FunctorType, Policy, KOKKOS_IMPL_OPENACC_LOOP_CLAUSE>( m_functor_reducer.get_functor()), std::conditional_t< std::is_same_v, diff --git a/core/src/OpenACC/Kokkos_OpenACC_ParallelScan_Range.hpp b/core/src/OpenACC/Kokkos_OpenACC_ParallelScan_Range.hpp index 82401fd021..56f9db0db8 100644 --- a/core/src/OpenACC/Kokkos_OpenACC_ParallelScan_Range.hpp +++ b/core/src/OpenACC/Kokkos_OpenACC_ParallelScan_Range.hpp @@ -63,8 +63,9 @@ class Kokkos::Impl::ParallelScan, } else { chunk_size = default_scan_chunk_size; } - const Kokkos::Experimental::Impl::FunctorAdapter functor( - m_functor); + const Kokkos::Experimental::Impl::FunctorAdapter< + Functor, Policy, Kokkos::Experimental::Impl::RoutineClause::seq> + functor(m_functor); const IndexType N = end - begin; const IndexType n_chunks = (N + chunk_size - 1) / chunk_size; Kokkos::View chunk_values( From 3cc9915feb81632b6f73c21a89eaaab791894cf0 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Thu, 20 Apr 2023 17:00:14 -0400 Subject: [PATCH 389/496] Improve SYCL parallel_scan --- core/src/SYCL/Kokkos_SYCL_Parallel_Scan.hpp | 234 ++++++++++---------- 1 file changed, 116 insertions(+), 118 deletions(-) diff --git a/core/src/SYCL/Kokkos_SYCL_Parallel_Scan.hpp b/core/src/SYCL/Kokkos_SYCL_Parallel_Scan.hpp index 5176c0f14e..6e8b67c011 100644 --- a/core/src/SYCL/Kokkos_SYCL_Parallel_Scan.hpp +++ b/core/src/SYCL/Kokkos_SYCL_Parallel_Scan.hpp @@ -32,21 +32,22 @@ namespace 
Impl { template void workgroup_scan(sycl::nd_item item, const FunctorType& final_reducer, sycl::local_accessor local_mem, - ValueType& local_value, unsigned int global_range) { + ValueType& local_value, int global_range) { // subgroup scans - auto sg = item.get_sub_group(); - const auto sg_group_id = sg.get_group_id()[0]; - const auto id_in_sg = sg.get_local_id()[0]; - for (unsigned int stride = 1; stride < global_range; stride <<= 1) { + auto sg = item.get_sub_group(); + const int sg_group_id = sg.get_group_id()[0]; + const int id_in_sg = sg.get_local_id()[0]; + + for (int stride = 1; stride < global_range; stride <<= 1) { auto tmp = sg.shuffle_up(local_value, stride); if (id_in_sg >= stride) final_reducer.join(&local_value, &tmp); } - const auto max_subgroup_size = sg.get_max_local_range()[0]; - const auto n_active_subgroups = + const int max_subgroup_size = sg.get_max_local_range()[0]; + const int n_active_subgroups = (global_range + max_subgroup_size - 1) / max_subgroup_size; - const auto local_range = sg.get_local_range()[0]; + const int local_range = sg.get_local_range()[0]; if (id_in_sg == local_range - 1 && sg_group_id < n_active_subgroups) local_mem[sg_group_id] = local_value; local_value = sg.shuffle_up(local_value, 1); @@ -56,14 +57,13 @@ void workgroup_scan(sycl::nd_item item, const FunctorType& final_reducer, // scan subgroup results using the first subgroup if (n_active_subgroups > 1) { if (sg_group_id == 0) { - const auto n_rounds = - (n_active_subgroups + local_range - 1) / local_range; - for (unsigned int round = 0; round < n_rounds; ++round) { - const unsigned int idx = id_in_sg + round * local_range; + const int n_rounds = (n_active_subgroups + local_range - 1) / local_range; + for (int round = 0; round < n_rounds; ++round) { + const int idx = id_in_sg + round * local_range; const auto upper_bound = std::min(local_range, n_active_subgroups - round * local_range); auto local_sg_value = local_mem[idx < n_active_subgroups ? 
idx : 0]; - for (unsigned int stride = 1; stride < upper_bound; stride <<= 1) { + for (int stride = 1; stride < upper_bound; stride <<= 1) { auto tmp = sg.shuffle_up(local_sg_value, stride); if (id_in_sg >= stride) { if (idx < n_active_subgroups) @@ -123,14 +123,29 @@ class ParallelScanSYCLBase { private: template - void scan_internal(sycl::queue& q, const FunctorWrapper& functor_wrapper, - pointer_type global_mem, std::size_t size) const { + sycl::event sycl_direct_launch(const FunctorWrapper& functor_wrapper, + sycl::event memcpy_event) const { + // Convenience references + const Kokkos::Experimental::SYCL& space = m_policy.space(); + Kokkos::Experimental::Impl::SYCLInternal& instance = + *space.impl_internal_space_instance(); + sycl::queue& q = space.sycl_queue(); + + const auto size = m_policy.end() - m_policy.begin(); + // FIXME_SYCL optimize constexpr size_t wgroup_size = 128; auto n_wgroups = (size + wgroup_size - 1) / wgroup_size; + auto global_mem = m_scratch_space; pointer_type group_results = global_mem + n_wgroups * wgroup_size; - auto local_scans = q.submit([&](sycl::handler& cgh) { + auto scratch_flags = static_cast>( + instance.scratch_flags(sizeof(unsigned int))); + + // Initialize global memory + auto initialize_global_memory = q.submit([&](sycl::handler& cgh) { + auto begin = m_policy.begin(); + // Store subgroup totals const auto min_subgroup_size = q.get_device() @@ -140,6 +155,9 @@ class ParallelScanSYCLBase { sycl::range<1>((wgroup_size + min_subgroup_size - 1) / min_subgroup_size), cgh); + sycl::local_accessor num_teams_done(1, cgh); + + cgh.depends_on(memcpy_event); cgh.parallel_for( sycl::nd_range<1>(n_wgroups * wgroup_size, wgroup_size), @@ -147,92 +165,66 @@ class ParallelScanSYCLBase { const CombinedFunctorReducer< FunctorType, typename Analysis::Reducer>& functor_reducer = functor_wrapper.get_functor(); + const FunctorType& functor = functor_reducer.get_functor(); const typename Analysis::Reducer& reducer = 
functor_reducer.get_reducer(); - const auto local_id = item.get_local_linear_id(); - const auto global_id = item.get_global_linear_id(); + const index_type local_id = item.get_local_linear_id(); + const index_type global_id = item.get_global_linear_id(); // Initialize local memory value_type local_value; - if (global_id < size) - local_value = global_mem[global_id]; - else - reducer.init(&local_value); + reducer.init(&local_value); + if (global_id < size) { + if constexpr (std::is_void::value) + functor(global_id + begin, local_value, false); + else + functor(WorkTag(), global_id + begin, local_value, false); + } workgroup_scan<>(item, reducer, local_mem, local_value, wgroup_size); - if (n_wgroups > 1 && local_id == wgroup_size - 1) - group_results[item.get_group_linear_id()] = - local_mem[item.get_sub_group().get_group_range()[0] - 1]; - // Write results to global memory if (global_id < size) global_mem[global_id] = local_value; - }); - }); - q.ext_oneapi_submit_barrier(std::vector{local_scans}); - - if (n_wgroups > 1) { - scan_internal(q, functor_wrapper, group_results, n_wgroups); - auto update_with_group_results = q.submit([&](sycl::handler& cgh) { - cgh.parallel_for( - sycl::nd_range<1>(n_wgroups * wgroup_size, wgroup_size), - [=](sycl::nd_item<1> item) { - const auto global_id = item.get_global_linear_id(); - const CombinedFunctorReducer - functor_reducer = functor_wrapper.get_functor(); - const typename Analysis::Reducer& reducer = - functor_reducer.get_reducer(); - if (global_id < size) - reducer.join(&global_mem[global_id], - &group_results[item.get_group_linear_id()]); - }); - }); - q.ext_oneapi_submit_barrier( - std::vector{update_with_group_results}); - } - } - template - sycl::event sycl_direct_launch(const FunctorWrapper& functor_wrapper, - sycl::event memcpy_event) const { - // Convenience references - const Kokkos::Experimental::SYCL& space = m_policy.space(); - sycl::queue& q = space.sycl_queue(); - - const std::size_t len = m_policy.end() - 
m_policy.begin(); - - // Initialize global memory - auto initialize_global_memory = q.submit([&](sycl::handler& cgh) { - auto global_mem = m_scratch_space; - auto begin = m_policy.begin(); + if (local_id == wgroup_size - 1) { + group_results[item.get_group_linear_id()] = + local_mem[item.get_sub_group().get_group_range()[0] - 1]; - cgh.depends_on(memcpy_event); - cgh.parallel_for(sycl::range<1>(len), [=](sycl::item<1> item) { - const typename Policy::index_type id = - static_cast(item.get_id()) + begin; - const CombinedFunctorReducer& - functor_reducer = functor_wrapper.get_functor(); - const typename Analysis::Reducer& reducer = - functor_reducer.get_reducer(); - - value_type update{}; - reducer.init(&update); - const FunctorType& functor = functor_reducer.get_functor(); - if constexpr (std::is_void::value) - functor(id, update, false); - else - functor(WorkTag(), id, update, false); - global_mem[id] = update; - }); + sycl::atomic_ref + scratch_flags_ref(*scratch_flags); + num_teams_done[0] = ++scratch_flags_ref; + } + item.barrier(sycl::access::fence_space::global_space); + if (num_teams_done[0] == n_wgroups) { + value_type total; + reducer.init(&total); + + for (unsigned int offset = 0; offset < n_wgroups; + offset += wgroup_size) { + index_type id = local_id + offset; + if (id < static_cast(n_wgroups)) + local_value = group_results[id]; + else + reducer.init(&local_value); + workgroup_scan<>(item, reducer, local_mem, local_value, + std::min(n_wgroups - offset, wgroup_size)); + if (id < static_cast(n_wgroups)) { + reducer.join(&local_value, &total); + group_results[id] = local_value; + } + reducer.join( + &total, + &local_mem[item.get_sub_group().get_group_range()[0] - 1]); + if (offset + wgroup_size < n_wgroups) + item.barrier(sycl::access::fence_space::global_space); + } + } + }); }); - q.ext_oneapi_submit_barrier( - std::vector{initialize_global_memory}); - - // Perform the actual exclusive scan - scan_internal(q, functor_wrapper, m_scratch_space, len); // 
Write results to global memory auto update_global_results = q.submit([&](sycl::handler& cgh) { @@ -241,21 +233,36 @@ class ParallelScanSYCLBase { // The compiler failed with CL_INVALID_ARG_VALUE if using m_result_ptr // directly. auto result_ptr = m_result_ptr_device_accessible ? m_result_ptr : nullptr; - cgh.parallel_for(sycl::range<1>(len), [=](sycl::item<1> item) { - auto global_id = item.get_id(0); - - value_type update = global_mem[global_id]; - const CombinedFunctorReducer& - functor_reducer = functor_wrapper.get_functor(); - const FunctorType& functor = functor_reducer.get_functor(); - if constexpr (std::is_void::value) - functor(global_id, update, true); - else - functor(WorkTag(), global_id, update, true); - global_mem[global_id] = update; - if (global_id == len - 1 && result_ptr_device_accessible) - *result_ptr = update; - }); + auto begin = m_policy.begin(); + + cgh.depends_on(initialize_global_memory); + + cgh.parallel_for( + sycl::nd_range<1>(n_wgroups * wgroup_size, wgroup_size), + [=](sycl::nd_item<1> item) { + const index_type global_id = item.get_global_linear_id(); + const CombinedFunctorReducer< + FunctorType, typename Analysis::Reducer>& functor_reducer = + functor_wrapper.get_functor(); + const FunctorType& functor = functor_reducer.get_functor(); + const typename Analysis::Reducer& reducer = + functor_reducer.get_reducer(); + + if (global_id < size) { + value_type update = global_mem[global_id]; + + reducer.join(&update, &group_results[item.get_group_linear_id()]); + + if constexpr (std::is_void::value) + functor(global_id + begin, update, true); + else + functor(WorkTag(), global_id + begin, update, true); + + global_mem[global_id] = update; + if (global_id == size - 1 && result_ptr_device_accessible) + *result_ptr = update; + } + }); }); q.ext_oneapi_submit_barrier( std::vector{update_global_results}); @@ -270,22 +277,13 @@ class ParallelScanSYCLBase { auto& instance = *m_policy.space().impl_internal_space_instance(); const std::size_t len = 
m_policy.end() - m_policy.begin(); - // Compute the total amount of memory we will need. We emulate the recursive - // structure that is used to do the actual scan. Essentially, we need to - // allocate memory for the whole range and then recursively for the reduced - // group results until only one group is left. - std::size_t total_memory = 0; - { - size_t wgroup_size = 128; - size_t n_nested_size = len; - size_t n_nested_wgroups; - do { - n_nested_wgroups = (n_nested_size + wgroup_size - 1) / wgroup_size; - n_nested_size = n_nested_wgroups; - total_memory += sizeof(value_type) * n_nested_wgroups * wgroup_size; - } while (n_nested_wgroups > 1); - total_memory += sizeof(value_type) * wgroup_size; - } + // Compute the total amount of memory we will need. + // We need to allocate memory for the whole range (rounded towards the next + // multiple of the wqorkgroup size) and for one element per workgroup that + // will contain the sum of the previous workgroups totals. + size_t wgroup_size = 128; + size_t n_wgroups = (len + wgroup_size - 1) / wgroup_size; + size_t total_memory = n_wgroups * (wgroup_size + 1) * sizeof(value_type); // FIXME_SYCL consider only storing one value per block and recreate initial // results in the end before doing the final pass From bdaa12cb919961ed7f0be7d496e535a34b9ec6c4 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Wed, 19 Apr 2023 15:26:34 +0000 Subject: [PATCH 390/496] Compiling with auto deduction of workgroup sizes --- core/src/SYCL/Kokkos_SYCL_Instance.cpp | 8 +- core/src/SYCL/Kokkos_SYCL_Parallel_Scan.hpp | 132 +++++++++++++------- 2 files changed, 87 insertions(+), 53 deletions(-) diff --git a/core/src/SYCL/Kokkos_SYCL_Instance.cpp b/core/src/SYCL/Kokkos_SYCL_Instance.cpp index 7f33a4f948..472baed744 100644 --- a/core/src/SYCL/Kokkos_SYCL_Instance.cpp +++ b/core/src/SYCL/Kokkos_SYCL_Instance.cpp @@ -280,11 +280,9 @@ sycl::device_ptr SYCLInternal::scratch_flags(const std::size_t size) { m_scratchFlags = 
reinterpret_cast(r->data()); } - m_queue->memset(m_scratchFlags, 0, m_scratchFlagsCount * sizeScratchGrain); - fence(*m_queue, - "Kokkos::Experimental::SYCLInternal::scratch_flags fence after " - "initializing m_scratchFlags", - m_instance_id); + auto memset_event = m_queue->memset(m_scratchFlags, 0, + m_scratchFlagsCount * sizeScratchGrain); + m_queue->ext_oneapi_submit_barrier(std::vector{memset_event}); return m_scratchFlags; } diff --git a/core/src/SYCL/Kokkos_SYCL_Parallel_Scan.hpp b/core/src/SYCL/Kokkos_SYCL_Parallel_Scan.hpp index 6e8b67c011..237ba47be3 100644 --- a/core/src/SYCL/Kokkos_SYCL_Parallel_Scan.hpp +++ b/core/src/SYCL/Kokkos_SYCL_Parallel_Scan.hpp @@ -124,7 +124,7 @@ class ParallelScanSYCLBase { private: template sycl::event sycl_direct_launch(const FunctorWrapper& functor_wrapper, - sycl::event memcpy_event) const { + sycl::event memcpy_event) { // Convenience references const Kokkos::Experimental::SYCL& space = m_policy.space(); Kokkos::Experimental::Impl::SYCLInternal& instance = @@ -133,35 +133,21 @@ class ParallelScanSYCLBase { const auto size = m_policy.end() - m_policy.begin(); - // FIXME_SYCL optimize - constexpr size_t wgroup_size = 128; - auto n_wgroups = (size + wgroup_size - 1) / wgroup_size; - auto global_mem = m_scratch_space; - pointer_type group_results = global_mem + n_wgroups * wgroup_size; - auto scratch_flags = static_cast>( instance.scratch_flags(sizeof(unsigned int))); - // Initialize global memory - auto initialize_global_memory = q.submit([&](sycl::handler& cgh) { - auto begin = m_policy.begin(); + const auto begin = m_policy.begin(); - // Store subgroup totals - const auto min_subgroup_size = - q.get_device() - .template get_info() - .front(); - sycl::local_accessor local_mem( - sycl::range<1>((wgroup_size + min_subgroup_size - 1) / - min_subgroup_size), - cgh); - sycl::local_accessor num_teams_done(1, cgh); - - cgh.depends_on(memcpy_event); + // Initialize global memory + auto scan_lambda_factory = + 
[&](sycl::local_accessor local_mem, + sycl::local_accessor num_teams_done, + sycl::device_ptr global_mem_, + sycl::device_ptr group_results_) { + auto lambda = [=](sycl::nd_item<1> item) { + auto global_mem = global_mem_; + auto group_results = group_results_; - cgh.parallel_for( - sycl::nd_range<1>(n_wgroups * wgroup_size, wgroup_size), - [=](sycl::nd_item<1> item) { const CombinedFunctorReducer< FunctorType, typename Analysis::Reducer>& functor_reducer = functor_wrapper.get_functor(); @@ -169,7 +155,10 @@ class ParallelScanSYCLBase { const typename Analysis::Reducer& reducer = functor_reducer.get_reducer(); - const index_type local_id = item.get_local_linear_id(); + const auto n_wgroups = item.get_group_range()[0]; + const int wgroup_size = item.get_local_range()[0]; + + const int local_id = item.get_local_linear_id(); const index_type global_id = item.get_global_linear_id(); // Initialize local memory @@ -210,8 +199,9 @@ class ParallelScanSYCLBase { local_value = group_results[id]; else reducer.init(&local_value); - workgroup_scan<>(item, reducer, local_mem, local_value, - std::min(n_wgroups - offset, wgroup_size)); + workgroup_scan<>( + item, reducer, local_mem, local_value, + std::min(n_wgroups - offset, wgroup_size)); if (id < static_cast(n_wgroups)) { reducer.join(&local_value, &total); group_results[id] = local_value; @@ -223,23 +213,83 @@ class ParallelScanSYCLBase { item.barrier(sycl::access::fence_space::global_space); } } - }); + }; + return lambda; + }; + + size_t wgroup_size; + size_t n_wgroups; + sycl::device_ptr global_mem; + sycl::device_ptr group_results; + + auto perform_work_group_scans = q.submit([&](sycl::handler& cgh) { + sycl::local_accessor num_teams_done(1, cgh); + + auto dummy_scan_lambda = + scan_lambda_factory({1, cgh}, num_teams_done, nullptr, nullptr); + + static sycl::kernel kernel = [&] { + sycl::kernel_id functor_kernel_id = + sycl::get_kernel_id(); + auto kernel_bundle = + sycl::get_kernel_bundle( + q.get_context(), 
std::vector{functor_kernel_id}); + return kernel_bundle.get_kernel(functor_kernel_id); + }(); + auto multiple = kernel.get_info( + q.get_device()); + auto max = + kernel.get_info( + q.get_device()); + + wgroup_size = static_cast(max / multiple) * multiple; + n_wgroups = (size + wgroup_size - 1) / wgroup_size; + + // Compute the total amount of memory we will need. + // We need to allocate memory for the whole range (rounded towards the + // next multiple of the workgroup size) and for one element per workgroup + // that will contain the sum of the previous workgroups totals. + // FIXME_SYCL consider only storing one value per block and recreate + // initial results in the end before doing the final pass + global_mem = + static_cast>(instance.scratch_space( + n_wgroups * (wgroup_size + 1) * sizeof(value_type))); + m_scratch_space = global_mem; + + group_results = global_mem + n_wgroups * wgroup_size; + + // Store subgroup totals in local space + const auto min_subgroup_size = + q.get_device() + .template get_info() + .front(); + sycl::local_accessor local_mem( + sycl::range<1>((wgroup_size + min_subgroup_size - 1) / + min_subgroup_size), + cgh); + + cgh.depends_on(memcpy_event); + + auto scan_lambda = scan_lambda_factory(local_mem, num_teams_done, + global_mem, group_results); + cgh.parallel_for(sycl::nd_range<1>(n_wgroups * wgroup_size, wgroup_size), + scan_lambda); }); // Write results to global memory auto update_global_results = q.submit([&](sycl::handler& cgh) { - auto global_mem = m_scratch_space; auto result_ptr_device_accessible = m_result_ptr_device_accessible; // The compiler failed with CL_INVALID_ARG_VALUE if using m_result_ptr // directly. auto result_ptr = m_result_ptr_device_accessible ? 
m_result_ptr : nullptr; - auto begin = m_policy.begin(); - cgh.depends_on(initialize_global_memory); + cgh.depends_on(perform_work_group_scans); cgh.parallel_for( sycl::nd_range<1>(n_wgroups * wgroup_size, wgroup_size), [=](sycl::nd_item<1> item) { + auto global_mem_copy = global_mem; const index_type global_id = item.get_global_linear_id(); const CombinedFunctorReducer< FunctorType, typename Analysis::Reducer>& functor_reducer = @@ -258,7 +308,7 @@ class ParallelScanSYCLBase { else functor(WorkTag(), global_id + begin, update, true); - global_mem[global_id] = update; + global_mem_copy[global_id] = update; if (global_id == size - 1 && result_ptr_device_accessible) *result_ptr = update; } @@ -274,21 +324,7 @@ class ParallelScanSYCLBase { void impl_execute(const PostFunctor& post_functor) { if (m_policy.begin() == m_policy.end()) return; - auto& instance = *m_policy.space().impl_internal_space_instance(); - const std::size_t len = m_policy.end() - m_policy.begin(); - - // Compute the total amount of memory we will need. - // We need to allocate memory for the whole range (rounded towards the next - // multiple of the wqorkgroup size) and for one element per workgroup that - // will contain the sum of the previous workgroups totals. 
- size_t wgroup_size = 128; - size_t n_wgroups = (len + wgroup_size - 1) / wgroup_size; - size_t total_memory = n_wgroups * (wgroup_size + 1) * sizeof(value_type); - - // FIXME_SYCL consider only storing one value per block and recreate initial - // results in the end before doing the final pass - m_scratch_space = static_cast>( - instance.scratch_space(total_memory)); + auto& instance = *m_policy.space().impl_internal_space_instance(); Kokkos::Experimental::Impl::SYCLInternal::IndirectKernelMem& indirectKernelMem = instance.get_indirect_kernel_mem(); From 56ef02c0cd45b62c39b5a789488467331822499d Mon Sep 17 00:00:00 2001 From: Seyong Lee Date: Mon, 1 May 2023 09:20:26 -0400 Subject: [PATCH 391/496] Disable failed bit manipulation tests when compiled by NVHPC (#6088) * NVHPC compiler failed to correctly translate accelerator region for tests in core/unit_test/TestBitManipulationBuiltins.hpp if the tested variable type is char or short; disable those failed tests. * Update so that the problematic bit manipulation tests are disabled only when NVHPC compiles for the OpenACC backend. 
* Fixed the missing indentation error regarding the SYCL tests --- .../unit_test/TestBitManipulationBuiltins.hpp | 152 ++++++++++++++---- 1 file changed, 124 insertions(+), 28 deletions(-) diff --git a/core/unit_test/TestBitManipulationBuiltins.hpp b/core/unit_test/TestBitManipulationBuiltins.hpp index bc2122e0ab..3b5e7a3db5 100644 --- a/core/unit_test/TestBitManipulationBuiltins.hpp +++ b/core/unit_test/TestBitManipulationBuiltins.hpp @@ -118,8 +118,16 @@ void test_bit_manip_countl_zero() { } TEST(TEST_CATEGORY, bit_manip_countl_zero) { - test_bit_manip_countl_zero(); - test_bit_manip_countl_zero(); +// FIXME_NVHPC: NVC++-W-0155-Compiler failed to translate accelerator region +#if defined(KOKKOS_ENABLE_OPENACC) && defined(KOKKOS_COMPILER_NVHPC) + if constexpr (!std::is_same_v) { +#endif + test_bit_manip_countl_zero(); + test_bit_manip_countl_zero(); +#if defined(KOKKOS_ENABLE_OPENACC) && defined(KOKKOS_COMPILER_NVHPC) + } +#endif test_bit_manip_countl_zero(); test_bit_manip_countl_zero(); test_bit_manip_countl_zero(); @@ -170,8 +178,16 @@ void test_bit_manip_countl_one() { } TEST(TEST_CATEGORY, bit_manip_countl_one) { - test_bit_manip_countl_one(); - test_bit_manip_countl_one(); +// FIXME_NVHPC: NVC++-W-0155-Compiler failed to translate accelerator region +#if defined(KOKKOS_ENABLE_OPENACC) && defined(KOKKOS_COMPILER_NVHPC) + if constexpr (!std::is_same_v) { +#endif + test_bit_manip_countl_one(); + test_bit_manip_countl_one(); +#if defined(KOKKOS_ENABLE_OPENACC) && defined(KOKKOS_COMPILER_NVHPC) + } +#endif test_bit_manip_countl_one(); test_bit_manip_countl_one(); test_bit_manip_countl_one(); @@ -205,12 +221,20 @@ void test_bit_manip_countr_zero() { } TEST(TEST_CATEGORY, bit_manip_countr_zero) { +// FIXME_NVHPC: NVC++-W-0155-Compiler failed to translate accelerator region +#if defined(KOKKOS_ENABLE_OPENACC) && defined(KOKKOS_COMPILER_NVHPC) + if constexpr (!std::is_same_v) { +#endif #if defined(KOKKOS_ENABLE_SYCL) && \ !defined(KOKKOS_ARCH_INTEL_GPU) // FIXME_SYCL 
returns wrong result - if (!std::is_same_v) + if (!std::is_same_v) +#endif + test_bit_manip_countr_zero(); + test_bit_manip_countr_zero(); +#if defined(KOKKOS_ENABLE_OPENACC) && defined(KOKKOS_COMPILER_NVHPC) + } #endif - test_bit_manip_countr_zero(); - test_bit_manip_countr_zero(); test_bit_manip_countr_zero(); test_bit_manip_countr_zero(); test_bit_manip_countr_zero(); @@ -243,12 +267,20 @@ void test_bit_manip_countr_one() { } TEST(TEST_CATEGORY, bit_manip_countr_one) { +// FIXME_NVHPC: NVC++-W-0155-Compiler failed to translate accelerator region +#if defined(KOKKOS_ENABLE_OPENACC) && defined(KOKKOS_COMPILER_NVHPC) + if constexpr (!std::is_same_v) { +#endif #if defined(KOKKOS_ENABLE_SYCL) && \ !defined(KOKKOS_ARCH_INTEL_GPU) // FIXME_SYCL returns wrong result - if (!std::is_same_v) + if (!std::is_same_v) +#endif + test_bit_manip_countr_one(); + test_bit_manip_countr_one(); +#if defined(KOKKOS_ENABLE_OPENACC) && defined(KOKKOS_COMPILER_NVHPC) + } #endif - test_bit_manip_countr_one(); - test_bit_manip_countr_one(); test_bit_manip_countr_one(); test_bit_manip_countr_one(); test_bit_manip_countr_one(); @@ -279,8 +311,16 @@ void test_bit_manip_popcount() { } TEST(TEST_CATEGORY, bit_manip_popcount) { - test_bit_manip_popcount(); - test_bit_manip_popcount(); +// FIXME_NVHPC: NVC++-W-0155-Compiler failed to translate accelerator region +#if defined(KOKKOS_ENABLE_OPENACC) && defined(KOKKOS_COMPILER_NVHPC) + if constexpr (!std::is_same_v) { +#endif + test_bit_manip_popcount(); + test_bit_manip_popcount(); +#if defined(KOKKOS_ENABLE_OPENACC) && defined(KOKKOS_COMPILER_NVHPC) + } +#endif test_bit_manip_popcount(); test_bit_manip_popcount(); test_bit_manip_popcount(); @@ -320,8 +360,16 @@ void test_bit_manip_has_single_bit() { } TEST(TEST_CATEGORY, bit_manip_has_single_bit) { - test_bit_manip_has_single_bit(); - test_bit_manip_has_single_bit(); +// FIXME_NVHPC: NVC++-W-0155-Compiler failed to translate accelerator region +#if defined(KOKKOS_ENABLE_OPENACC) && 
defined(KOKKOS_COMPILER_NVHPC) + if constexpr (!std::is_same_v) { +#endif + test_bit_manip_has_single_bit(); + test_bit_manip_has_single_bit(); +#if defined(KOKKOS_ENABLE_OPENACC) && defined(KOKKOS_COMPILER_NVHPC) + } +#endif test_bit_manip_has_single_bit(); test_bit_manip_has_single_bit(); test_bit_manip_has_single_bit(); @@ -355,8 +403,16 @@ void test_bit_manip_bit_floor() { } TEST(TEST_CATEGORY, bit_manip_bit_floor) { - test_bit_manip_bit_floor(); - test_bit_manip_bit_floor(); +// FIXME_NVHPC: NVC++-W-0155-Compiler failed to translate accelerator region +#if defined(KOKKOS_ENABLE_OPENACC) && defined(KOKKOS_COMPILER_NVHPC) + if constexpr (!std::is_same_v) { +#endif + test_bit_manip_bit_floor(); + test_bit_manip_bit_floor(); +#if defined(KOKKOS_ENABLE_OPENACC) && defined(KOKKOS_COMPILER_NVHPC) + } +#endif test_bit_manip_bit_floor(); test_bit_manip_bit_floor(); test_bit_manip_bit_floor(); @@ -395,8 +451,16 @@ void test_bit_manip_bit_ceil() { } TEST(TEST_CATEGORY, bit_manip_bit_ceil) { - test_bit_manip_bit_ceil(); - test_bit_manip_bit_ceil(); +// FIXME_NVHPC: NVC++-W-0155-Compiler failed to translate accelerator region +#if defined(KOKKOS_ENABLE_OPENACC) && defined(KOKKOS_COMPILER_NVHPC) + if constexpr (!std::is_same_v) { +#endif + test_bit_manip_bit_ceil(); + test_bit_manip_bit_ceil(); +#if defined(KOKKOS_ENABLE_OPENACC) && defined(KOKKOS_COMPILER_NVHPC) + } +#endif test_bit_manip_bit_ceil(); test_bit_manip_bit_ceil(); test_bit_manip_bit_ceil(); @@ -426,8 +490,16 @@ void test_bit_manip_bit_width() { } TEST(TEST_CATEGORY, bit_manip_bit_width) { - test_bit_manip_bit_width(); - test_bit_manip_bit_width(); +// FIXME_NVHPC: NVC++-W-0155-Compiler failed to translate accelerator region +#if defined(KOKKOS_ENABLE_OPENACC) && defined(KOKKOS_COMPILER_NVHPC) + if constexpr (!std::is_same_v) { +#endif + test_bit_manip_bit_width(); + test_bit_manip_bit_width(); +#if defined(KOKKOS_ENABLE_OPENACC) && defined(KOKKOS_COMPILER_NVHPC) + } +#endif test_bit_manip_bit_width(); 
test_bit_manip_bit_width(); test_bit_manip_bit_width(); @@ -541,8 +613,16 @@ void test_bit_manip_rotl() { } TEST(TEST_CATEGORY, bit_manip_rotl) { - test_bit_manip_rotl(); - test_bit_manip_rotl(); +// FIXME_NVHPC: NVC++-W-0155-Compiler failed to translate accelerator region +#if defined(KOKKOS_ENABLE_OPENACC) && defined(KOKKOS_COMPILER_NVHPC) + if constexpr (!std::is_same_v) { +#endif + test_bit_manip_rotl(); + test_bit_manip_rotl(); +#if defined(KOKKOS_ENABLE_OPENACC) && defined(KOKKOS_COMPILER_NVHPC) + } +#endif test_bit_manip_rotl(); test_bit_manip_rotl(); test_bit_manip_rotl(); @@ -595,8 +675,16 @@ void test_bit_manip_rotr() { } TEST(TEST_CATEGORY, bit_manip_rotr) { - test_bit_manip_rotr(); - test_bit_manip_rotr(); +// FIXME_NVHPC: NVC++-W-0155-Compiler failed to translate accelerator region +#if defined(KOKKOS_ENABLE_OPENACC) && defined(KOKKOS_COMPILER_NVHPC) + if constexpr (!std::is_same_v) { +#endif + test_bit_manip_rotr(); + test_bit_manip_rotr(); +#if defined(KOKKOS_ENABLE_OPENACC) && defined(KOKKOS_COMPILER_NVHPC) + } +#endif test_bit_manip_rotr(); test_bit_manip_rotr(); test_bit_manip_rotr(); @@ -658,10 +746,18 @@ void test_bit_manip_byteswap() { } TEST(TEST_CATEGORY, bit_manip_byeswap) { - test_bit_manip_byteswap(); - test_bit_manip_byteswap(); - test_bit_manip_byteswap(); - test_bit_manip_byteswap(); +// FIXME_NVHPC: NVC++-W-0155-Compiler failed to translate accelerator region +#if defined(KOKKOS_ENABLE_OPENACC) && defined(KOKKOS_COMPILER_NVHPC) + if constexpr (!std::is_same_v) { +#endif + test_bit_manip_byteswap(); + test_bit_manip_byteswap(); + test_bit_manip_byteswap(); + test_bit_manip_byteswap(); +#if defined(KOKKOS_ENABLE_OPENACC) && defined(KOKKOS_COMPILER_NVHPC) + } +#endif test_bit_manip_byteswap(); test_bit_manip_byteswap(); test_bit_manip_byteswap(); From e5490e1e11ece175ec16588c5a3062ece2fe1b33 Mon Sep 17 00:00:00 2001 From: Sergey Fedorov Date: Wed, 3 May 2023 06:42:43 +0800 Subject: [PATCH 392/496] Add support for Darwin 32-bit and PPC 
(#5916) * Do not use 64-bit static_asserts on 32-bit build * TestScan: case for 32-bit * Disable team scratch failing test on 32-bit, for now * CMakeLists: status message instead of a fatal error for 32-bit build * Kokkos_ClockTic: add implementation for ppc * Use KOKKOS_IMPL_32BIT instead of KOKKOS_32_BIT * Add 32bit build to GitHub CI * Limit ScatterView test to 1GB * Fix signed compariosn in TestVector.hpp * Indentation * Signed compare * Disable std::complex test * Disable repeated_team_reduce and deep_copy_conversion * Update test restriction Co-authored-by: Damien L-G * Update core/unit_test/TestTeamBasic.hpp * Add assert for sizeof(void*) * Try using integer comparison * Add reference for clock_tic_host implementation * Make FIXME_32BIT more uniform * Print platform information * Fix Kokkos_Core_fwd.hpp * Separate 32bit CI into its own workflow * Add new workflow file * Minimize workflow file * Try LIBDL=ON * Remove ccache commands and LIBDL --------- Co-authored-by: Daniel Arndt Co-authored-by: Damien L-G --- .../continuous-integration-workflow-32bit.yml | 37 +++++++++++++++++++ CMakeLists.txt | 7 +++- cmake/KokkosCore_config.h.in | 2 + containers/unit_tests/TestScatterView.hpp | 4 +- containers/unit_tests/TestVector.hpp | 4 +- core/src/Kokkos_Core_fwd.hpp | 10 +++-- core/src/impl/Kokkos_ClockTic.hpp | 21 +++++++++++ core/src/impl/Kokkos_Core.cpp | 6 +++ core/src/impl/Kokkos_StringManipulation.hpp | 2 +- core/src/impl/Kokkos_TaskBase.hpp | 3 +- core/unit_test/TestComplex.hpp | 4 ++ core/unit_test/TestDeepCopyAlignment.hpp | 3 ++ core/unit_test/TestTeamBasic.hpp | 3 ++ core/unit_test/TestTeamReductionScan.hpp | 4 ++ 14 files changed, 100 insertions(+), 10 deletions(-) create mode 100644 .github/workflows/continuous-integration-workflow-32bit.yml diff --git a/.github/workflows/continuous-integration-workflow-32bit.yml b/.github/workflows/continuous-integration-workflow-32bit.yml new file mode 100644 index 0000000000..7fab3b0e62 --- /dev/null +++ 
b/.github/workflows/continuous-integration-workflow-32bit.yml @@ -0,0 +1,37 @@ +name: github-Linux-32bit +on: [push, pull_request] + +concurrency: + group: ${ {github.event_name }}-${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: ${{github.event_name == 'pull_request'}} + +jobs: + CI-32bit: + name: Linux-32bit + runs-on: ubuntu-latest + container: + image: ghcr.io/kokkos/ci-containers/ubuntu:latest + steps: + - name: Checkout code + uses: actions/checkout@v3 + - name: install_multilib + run: sudo apt-get update && sudo apt-get install -y gcc-multilib g++-multilib gfortran-multilib + - name: Configure Kokkos + run: | + cmake -B builddir \ + -DKokkos_ENABLE_OPENMP=ON \ + -DKokkos_ENABLE_TESTS=ON \ + -DKokkos_ENABLE_BENCHMARKS=ON \ + -DKokkos_ENABLE_EXAMPLES=ON \ + -DKokkos_ENABLE_DEPRECATED_CODE_4=ON \ + -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \ + -DKokkos_ENABLE_COMPILER_WARNINGS=ON \ + -DCMAKE_CXX_FLAGS="-Werror -m32 -DKOKKOS_IMPL_32BIT" \ + -DCMAKE_CXX_COMPILER=g++ \ + -DCMAKE_BUILD_TYPE=RelWithDebInfo + - name: Build + run: | + cmake --build builddir --parallel 2 + - name: Tests + working-directory: builddir + run: ctest --output-on-failure diff --git a/CMakeLists.txt b/CMakeLists.txt index c82910708a..e1b77a2c09 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -138,7 +138,12 @@ IF (NOT CMAKE_SIZEOF_VOID_P) MESSAGE(FATAL_ERROR "Kokkos did not configure correctly and failed to validate compiler. The most likely cause is linkage errors during CMake compiler validation. 
Please consult the CMake error log shown below for the exact error during compiler validation") ENDIF() ELSEIF (NOT CMAKE_SIZEOF_VOID_P EQUAL 8) - MESSAGE(FATAL_ERROR "Kokkos assumes a 64-bit build; i.e., 8-byte pointers, but found ${CMAKE_SIZEOF_VOID_P}-byte pointers instead") + IF(CMAKE_SIZEOF_VOID_P EQUAL 4) + MESSAGE(WARNING "32-bit builds are experimental and not officially supported.") + SET(KOKKOS_IMPL_32BIT ON) + ELSE() + MESSAGE(FATAL_ERROR "Kokkos assumes a 64-bit build, i.e., 8-byte pointers, but found ${CMAKE_SIZEOF_VOID_P}-byte pointers instead;") + ENDIF() ENDIF() diff --git a/cmake/KokkosCore_config.h.in b/cmake/KokkosCore_config.h.in index d325443f3d..bcfa16d742 100644 --- a/cmake/KokkosCore_config.h.in +++ b/cmake/KokkosCore_config.h.in @@ -119,3 +119,5 @@ #cmakedefine KOKKOS_ARCH_NAVI #cmakedefine KOKKOS_ARCH_NAVI1030 #cmakedefine KOKKOS_ARCH_NAVI1100 + +#cmakedefine KOKKOS_IMPL_32BIT diff --git a/containers/unit_tests/TestScatterView.hpp b/containers/unit_tests/TestScatterView.hpp index c9ad65cc2b..733f43122c 100644 --- a/containers/unit_tests/TestScatterView.hpp +++ b/containers/unit_tests/TestScatterView.hpp @@ -726,9 +726,9 @@ void test_scatter_view(int64_t n) { } #endif // with hundreds of threads we were running out of memory. 
- // limit (n) so that duplication doesn't exceed 4GB + // limit (n) so that duplication doesn't exceed 1GB constexpr std::size_t maximum_allowed_total_bytes = - 4ull * 1024ull * 1024ull * 1024ull; + 1ull * 1024ull * 1024ull * 1024ull; std::size_t const maximum_allowed_copy_bytes = maximum_allowed_total_bytes / std::size_t(execution_space().concurrency()); diff --git a/containers/unit_tests/TestVector.hpp b/containers/unit_tests/TestVector.hpp index a22066f753..a7d341b789 100644 --- a/containers/unit_tests/TestVector.hpp +++ b/containers/unit_tests/TestVector.hpp @@ -52,7 +52,7 @@ struct test_vector_insert { it_return = a.insert(it, n + 5, scalar_type(5)); ASSERT_EQ(a.size(), n + 1 + n + 5); - ASSERT_EQ(std::distance(it_return, a.begin() + 17), 0u); + ASSERT_EQ(std::distance(it_return, a.begin() + 17), 0); Vector b; @@ -65,7 +65,7 @@ struct test_vector_insert { it_return = a.insert(it, b.begin(), b.end()); ASSERT_EQ(a.size(), n + 1 + n + 5 + 7); - ASSERT_EQ(std::distance(it_return, a.begin() + 27 + n), 0u); + ASSERT_EQ(std::distance(it_return, a.begin() + 27 + n), 0); // Testing insert at end via all three function interfaces a.insert(a.end(), 11); diff --git a/core/src/Kokkos_Core_fwd.hpp b/core/src/Kokkos_Core_fwd.hpp index 6546c875ed..883807f9d2 100644 --- a/core/src/Kokkos_Core_fwd.hpp +++ b/core/src/Kokkos_Core_fwd.hpp @@ -34,11 +34,15 @@ #endif //---------------------------------------------------------------------------- -// Have assumed a 64bit build (8byte pointers) throughout the code base. - +// Have assumed a 64-bit build (8-byte pointers) throughout the code base. +// 32-bit build allowed but unsupported. 
+#ifdef KOKKOS_IMPL_32BIT +static_assert(sizeof(void *) == 4, + "Kokkos assumes 64-bit build; i.e., 4-byte pointers"); +#else static_assert(sizeof(void *) == 8, "Kokkos assumes 64-bit build; i.e., 8-byte pointers"); - +#endif //---------------------------------------------------------------------------- namespace Kokkos { diff --git a/core/src/impl/Kokkos_ClockTic.hpp b/core/src/impl/Kokkos_ClockTic.hpp index 9e8c70076c..6e3d99ebd6 100644 --- a/core/src/impl/Kokkos_ClockTic.hpp +++ b/core/src/impl/Kokkos_ClockTic.hpp @@ -90,6 +90,27 @@ KOKKOS_IMPL_HOST_FUNCTION inline uint64_t clock_tic_host() noexcept { return (uint64_t)cycles; +#elif defined(__ppc__) + // see : pages.cs.wisc.edu/~legault/miniproj-736.pdf or + // cmssdt.cern.ch/lxr/source/FWCore/Utilities/interface/HRRealTime.h + + uint64_t result = 0; + uint32_t upper, lower, tmp; + + __asm__ volatile( + "0: \n" + "\tmftbu %0 \n" + "\tmftb %1 \n" + "\tmftbu %2 \n" + "\tcmpw %2, %0 \n" + "\tbne 0b \n" + : "=r"(upper), "=r"(lower), "=r"(tmp)); + result = upper; + result = result << 32; + result = result | lower; + + return (result); + #else return std::chrono::high_resolution_clock::now().time_since_epoch().count(); diff --git a/core/src/impl/Kokkos_Core.cpp b/core/src/impl/Kokkos_Core.cpp index 7b5b74369c..0278490d4b 100644 --- a/core/src/impl/Kokkos_Core.cpp +++ b/core/src/impl/Kokkos_Core.cpp @@ -772,6 +772,12 @@ void pre_initialize_internal(const Kokkos::InitializationSettings& settings) { #else declare_configuration_metadata("architecture", "GPU architecture", "none"); #endif + +#ifdef KOKKOS_IMPL_32BIT + declare_configuration_metadata("architecture", "platform", "32bit"); +#else + declare_configuration_metadata("architecture", "platform", "64bit"); +#endif } void post_initialize_internal(const Kokkos::InitializationSettings& settings) { diff --git a/core/src/impl/Kokkos_StringManipulation.hpp b/core/src/impl/Kokkos_StringManipulation.hpp index c7baf6d0c3..231cc2c39c 100644 --- 
a/core/src/impl/Kokkos_StringManipulation.hpp +++ b/core/src/impl/Kokkos_StringManipulation.hpp @@ -173,7 +173,7 @@ KOKKOS_FUNCTION constexpr to_chars_result to_chars_i(char *first, char *last, unsigned_val = Unsigned(~value) + Unsigned(1); } } - unsigned int const len = to_chars_len(unsigned_val); + std::ptrdiff_t const len = to_chars_len(unsigned_val); if (last - first < len) { return {last, errc::value_too_large}; } diff --git a/core/src/impl/Kokkos_TaskBase.hpp b/core/src/impl/Kokkos_TaskBase.hpp index 1c4c158217..ed548e99a8 100644 --- a/core/src/impl/Kokkos_TaskBase.hpp +++ b/core/src/impl/Kokkos_TaskBase.hpp @@ -206,6 +206,7 @@ class TaskBase { // the number of full task types that fit into a cache line. We'll leave it // here for now, though, since we're probably going to be ripping all of the // old TaskBase stuff out eventually anyway. +#ifndef KOKKOS_IMPL_32BIT constexpr size_t unpadded_task_base_size = 44 + 2 * sizeof(int16_t); // don't forget padding: constexpr size_t task_base_misalignment = @@ -229,7 +230,7 @@ static constexpr static_assert(sizeof(TaskBase) == expected_task_base_size, "Verifying expected sizeof(TaskBase)"); - +#endif // end Verify the size of TaskBase is as expected }}}2 //------------------------------------------------------------------------------ diff --git a/core/unit_test/TestComplex.hpp b/core/unit_test/TestComplex.hpp index 87085f3648..bcae2e1d81 100644 --- a/core/unit_test/TestComplex.hpp +++ b/core/unit_test/TestComplex.hpp @@ -519,9 +519,13 @@ TEST(TEST_CATEGORY, complex_operations_arithmetic_types_overloads) { ASSERT_EQ(Kokkos::conj(1), Kokkos::complex(1)); ASSERT_EQ(Kokkos::conj(2.f), Kokkos::complex(2.f)); ASSERT_EQ(Kokkos::conj(3.), Kokkos::complex(3.)); +// long double has size 12 but Kokkos::complex requires 2*sizeof(T) to be a +// power of two. 
+#ifndef KOKKOS_IMPL_32BIT ASSERT_EQ(Kokkos::conj(4.l), Kokkos::complex(4.l)); static_assert(( std::is_same>::value)); +#endif static_assert((std::is_same>::value)); static_assert((std::is_same(); } From 4846d47f57226750591c5a9cdc348f01784f4bf5 Mon Sep 17 00:00:00 2001 From: Phil Miller Date: Tue, 7 Mar 2023 12:06:29 -0800 Subject: [PATCH 393/496] Unconditionally enable CUDA extended lambda support --- cmake/kokkos_arch.cmake | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/cmake/kokkos_arch.cmake b/cmake/kokkos_arch.cmake index 2ed5d1c610..707125a673 100644 --- a/cmake/kokkos_arch.cmake +++ b/cmake/kokkos_arch.cmake @@ -162,11 +162,9 @@ ENDIF() #clear anything that might be in the cache GLOBAL_SET(KOKKOS_CUDA_OPTIONS) # Construct the Makefile options -IF (KOKKOS_ENABLE_CUDA_LAMBDA) - IF(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) - GLOBAL_APPEND(KOKKOS_CUDA_OPTIONS "-expt-extended-lambda") - GLOBAL_APPEND(KOKKOS_CUDA_OPTIONS "-Wext-lambda-captures-this") - ENDIF() +IF(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) + GLOBAL_APPEND(KOKKOS_CUDA_OPTIONS "-expt-extended-lambda") + GLOBAL_APPEND(KOKKOS_CUDA_OPTIONS "-Wext-lambda-captures-this") ENDIF() IF (KOKKOS_ENABLE_CUDA_CONSTEXPR) From a906356caa47bc8120bf34338c2eb84aa8559211 Mon Sep 17 00:00:00 2001 From: Phil Miller Date: Tue, 7 Mar 2023 12:10:03 -0800 Subject: [PATCH 394/496] Tentative arguments switch for nvcc 12+ --- cmake/kokkos_arch.cmake | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/cmake/kokkos_arch.cmake b/cmake/kokkos_arch.cmake index 707125a673..1be6453b07 100644 --- a/cmake/kokkos_arch.cmake +++ b/cmake/kokkos_arch.cmake @@ -163,7 +163,12 @@ ENDIF() GLOBAL_SET(KOKKOS_CUDA_OPTIONS) # Construct the Makefile options IF(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) - GLOBAL_APPEND(KOKKOS_CUDA_OPTIONS "-expt-extended-lambda") + # Extended lambda support was stabilized in nvcc 12 + IF(KOKKOS_COMPILER_VERSION_MAJOR EQUAL 11) + GLOBAL_APPEND(KOKKOS_CUDA_OPTIONS 
"-expt-extended-lambda") + ELSE() + GLOBAL_APPEND(KOKKOS_CUDA_OPTIONS "-extended-lambda") + ENDIF() GLOBAL_APPEND(KOKKOS_CUDA_OPTIONS "-Wext-lambda-captures-this") ENDIF() From ca9fd21788cd940dffd9dabaaf9c593575aa7e5e Mon Sep 17 00:00:00 2001 From: Phil Miller Date: Tue, 7 Mar 2023 12:22:51 -0800 Subject: [PATCH 395/496] Change Makefile.kokkos too --- Makefile.kokkos | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/Makefile.kokkos b/Makefile.kokkos index 1234f4cc9e..23bd8288fc 100644 --- a/Makefile.kokkos +++ b/Makefile.kokkos @@ -665,15 +665,13 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1) endif endif - ifeq ($(KOKKOS_INTERNAL_CUDA_USE_LAMBDA), 1) - ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_CUDA_LAMBDA") - KOKKOS_CXXFLAGS += -expt-extended-lambda - endif + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_CUDA_LAMBDA") + KOKKOS_CXXFLAGS += -expt-extended-lambda + endif - ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_CUDA_LAMBDA") - endif + ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_CUDA_LAMBDA") endif ifeq ($(KOKKOS_INTERNAL_CUDA_USE_CONSTEXPR), 1) From ddded0eb2837dea82c7aae7d1869ce46750a2644 Mon Sep 17 00:00:00 2001 From: Phil Miller Date: Wed, 29 Mar 2023 14:24:37 -0700 Subject: [PATCH 396/496] Implement CMake messages per team decision --- cmake/kokkos_arch.cmake | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/cmake/kokkos_arch.cmake b/cmake/kokkos_arch.cmake index 1be6453b07..2448c43f10 100644 --- a/cmake/kokkos_arch.cmake +++ b/cmake/kokkos_arch.cmake @@ -172,6 +172,14 @@ IF(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) GLOBAL_APPEND(KOKKOS_CUDA_OPTIONS "-Wext-lambda-captures-this") ENDIF() +IF(DEFINED KOKKOS_ENABLE_CUDA_LAMBDA) + IF(KOKKOS_ENABLE_CUDA_LAMBDA) + MESSAGE(DEPRECATION "CUDA 
extended lambda support is now always enabled. The option Kokkos_ENABLE_CUDA_LAMBDA will be removed") + ELSE() + MESSAGE(FATAL_ERROR "Support for disabling CUDA extended lambdas has been removed. Please unset Kokkos_ENABLE_CUDA_LAMBDA, or see #5964 if this is necessary for your application") + ENDIF() +ENDIF() + IF (KOKKOS_ENABLE_CUDA_CONSTEXPR) IF(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) GLOBAL_APPEND(KOKKOS_CUDA_OPTIONS "-expt-relaxed-constexpr") From 44702849197960d73c3c4481fb0709537ddc8c99 Mon Sep 17 00:00:00 2001 From: Phil Miller Date: Wed, 5 Apr 2023 10:54:14 -0700 Subject: [PATCH 397/496] Fix definitions and docs to remove CUDA Lambda option --- .jenkins | 4 ---- cmake/KokkosCore_config.h.in | 1 - cmake/kokkos_arch.cmake | 4 ++-- cmake/kokkos_enable_options.cmake | 10 +--------- core/src/Kokkos_Macros.hpp | 5 +++++ generate_makefile.bash | 6 ++---- scripts/trilinos-integration/waterman_cuda_env.sh | 2 +- scripts/trilinos-integration/white_cuda_env.sh | 2 +- 8 files changed, 12 insertions(+), 22 deletions(-) diff --git a/.jenkins b/.jenkins index b7591d1a23..a4d2f7cefc 100644 --- a/.jenkins +++ b/.jenkins @@ -84,7 +84,6 @@ pipeline { -DKokkos_ENABLE_DEPRECATED_CODE_4=OFF \ -DKokkos_ENABLE_TESTS=ON \ -DKokkos_ENABLE_CUDA=ON \ - -DKokkos_ENABLE_CUDA_LAMBDA=ON \ -DKokkos_ENABLE_OPENMP=ON \ .. && \ make -j8 && ctest --verbose''' @@ -313,7 +312,6 @@ pipeline { -DKokkos_ENABLE_TESTS=ON \ -DKokkos_ENABLE_BENCHMARKS=ON \ -DKokkos_ENABLE_CUDA=ON \ - -DKokkos_ENABLE_CUDA_LAMBDA=ON \ -DKokkos_ENABLE_TUNING=ON \ -DKokkos_ARCH_VOLTA70=ON \ .. 
&& \ @@ -386,7 +384,6 @@ pipeline { -DKokkos_ENABLE_COMPILER_WARNINGS=ON \ -DKokkos_ENABLE_OPENMP=OFF \ -DKokkos_ENABLE_CUDA=ON \ - -DKokkos_ENABLE_CUDA_LAMBDA=OFF \ -DKokkos_ENABLE_CUDA_UVM=ON \ -DKokkos_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE=ON \ -DKokkos_ENABLE_DEPRECATED_CODE_3=ON \ @@ -453,7 +450,6 @@ pipeline { -DKokkos_ENABLE_TESTS=ON \ -DKokkos_ENABLE_BENCHMARKS=ON \ -DKokkos_ENABLE_CUDA=ON \ - -DKokkos_ENABLE_CUDA_LAMBDA=ON \ -DKokkos_ENABLE_LIBDL=OFF \ .. && \ make -j8 && ctest --verbose && \ diff --git a/cmake/KokkosCore_config.h.in b/cmake/KokkosCore_config.h.in index bcfa16d742..2b1ea092e0 100644 --- a/cmake/KokkosCore_config.h.in +++ b/cmake/KokkosCore_config.h.in @@ -35,7 +35,6 @@ #cmakedefine KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE #cmakedefine KOKKOS_ENABLE_CUDA_UVM -#cmakedefine KOKKOS_ENABLE_CUDA_LAMBDA #cmakedefine KOKKOS_ENABLE_CUDA_CONSTEXPR #cmakedefine KOKKOS_ENABLE_IMPL_CUDA_MALLOC_ASYNC #cmakedefine KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE diff --git a/cmake/kokkos_arch.cmake b/cmake/kokkos_arch.cmake index 2448c43f10..2137344c5e 100644 --- a/cmake/kokkos_arch.cmake +++ b/cmake/kokkos_arch.cmake @@ -172,8 +172,8 @@ IF(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) GLOBAL_APPEND(KOKKOS_CUDA_OPTIONS "-Wext-lambda-captures-this") ENDIF() -IF(DEFINED KOKKOS_ENABLE_CUDA_LAMBDA) - IF(KOKKOS_ENABLE_CUDA_LAMBDA) +IF(DEFINED Kokkos_ENABLE_CUDA_LAMBDA) + IF(Kokkos_ENABLE_CUDA_LAMBDA) MESSAGE(DEPRECATION "CUDA extended lambda support is now always enabled. The option Kokkos_ENABLE_CUDA_LAMBDA will be removed") ELSE() MESSAGE(FATAL_ERROR "Support for disabling CUDA extended lambdas has been removed. 
Please unset Kokkos_ENABLE_CUDA_LAMBDA, or see #5964 if this is necessary for your application") diff --git a/cmake/kokkos_enable_options.cmake b/cmake/kokkos_enable_options.cmake index 7d8026989a..145815c7f2 100644 --- a/cmake/kokkos_enable_options.cmake +++ b/cmake/kokkos_enable_options.cmake @@ -67,14 +67,6 @@ mark_as_advanced(Kokkos_ENABLE_IMPL_MDSPAN) mark_as_advanced(Kokkos_ENABLE_MDSPAN_EXTERNAL) mark_as_advanced(Kokkos_ENABLE_IMPL_SKIP_COMPILER_MDSPAN) -IF (Trilinos_ENABLE_Kokkos AND TPL_ENABLE_CUDA) - SET(CUDA_LAMBDA_DEFAULT ON) -ELSEIF (KOKKOS_ENABLE_CUDA) - SET(CUDA_LAMBDA_DEFAULT ON) -ELSE() - SET(CUDA_LAMBDA_DEFAULT OFF) -ENDIF() -KOKKOS_ENABLE_OPTION(CUDA_LAMBDA ${CUDA_LAMBDA_DEFAULT} "Whether to activate experimental lambda features") IF (Trilinos_ENABLE_Kokkos) SET(COMPLEX_ALIGN_DEFAULT OFF) ELSE() @@ -123,7 +115,7 @@ FUNCTION(check_device_specific_options) ENDIF() ENDFUNCTION() -CHECK_DEVICE_SPECIFIC_OPTIONS(DEVICE CUDA OPTIONS CUDA_UVM CUDA_RELOCATABLE_DEVICE_CODE CUDA_LAMBDA CUDA_CONSTEXPR CUDA_LDG_INTRINSIC) +CHECK_DEVICE_SPECIFIC_OPTIONS(DEVICE CUDA OPTIONS CUDA_UVM CUDA_RELOCATABLE_DEVICE_CODE CUDA_CONSTEXPR CUDA_LDG_INTRINSIC) CHECK_DEVICE_SPECIFIC_OPTIONS(DEVICE HIP OPTIONS HIP_RELOCATABLE_DEVICE_CODE) CHECK_DEVICE_SPECIFIC_OPTIONS(DEVICE HPX OPTIONS IMPL_HPX_ASYNC_DISPATCH) diff --git a/core/src/Kokkos_Macros.hpp b/core/src/Kokkos_Macros.hpp index 8cc4a6efa3..f765efe68c 100644 --- a/core/src/Kokkos_Macros.hpp +++ b/core/src/Kokkos_Macros.hpp @@ -607,4 +607,9 @@ static constexpr bool kokkos_omp_on_host() { return false; } #define KOKKOS_IMPL_ENFORCE_EMPTY_BASE_OPTIMIZATION #endif +// This was previously defined from the configuration option which was removed +#if defined(KOKKOS_ENABLE_CUDA) +#define KOKKOS_ENABLE_CUDA_LAMBDA +#endif + #endif // #ifndef KOKKOS_MACROS_HPP diff --git a/generate_makefile.bash b/generate_makefile.bash index 018426c9b8..47321c5e14 100755 --- a/generate_makefile.bash +++ b/generate_makefile.bash @@ -64,9 +64,7 @@ 
get_kokkos_cuda_option_list() { for CUDA_ in $PARSE_CUDA_LST do CUDA_OPT_NAME= - if [ "${CUDA_}" == "enable_lambda" ]; then - CUDA_OPT_NAME=CUDA_LAMBDA - elif [ "${CUDA_}" == "rdc" ]; then + if [ "${CUDA_}" == "rdc" ]; then CUDA_OPT_NAME=CUDA_RELOCATABLE_DEVICE_CODE elif [ "${CUDA_}" == "force_uvm" ]; then CUDA_OPT_NAME=CUDA_UVM @@ -231,7 +229,7 @@ display_help_text() { echo " disable_profiling = do not compile with profiling hooks" echo " " echo "--with-cuda-options=[OPT]: Additional options to CUDA:" - echo " force_uvm, use_ldg, enable_lambda, rdc" + echo " force_uvm, use_ldg, rdc" echo "--with-hip-options=[OPT]: Additional options to HIP:" echo " rdc" echo "--with-hpx-options=[OPT]: Additional options to HPX:" diff --git a/scripts/trilinos-integration/waterman_cuda_env.sh b/scripts/trilinos-integration/waterman_cuda_env.sh index 445b4f9697..0301eb0717 100755 --- a/scripts/trilinos-integration/waterman_cuda_env.sh +++ b/scripts/trilinos-integration/waterman_cuda_env.sh @@ -30,7 +30,7 @@ export CUDA_LAUNCH_BLOCKING=1 export CUDA_MANAGED_FORCE_DEVICE_ALLOC=1 -export KOKKOS_EXTRA_FLAGS="-DKokkos_ENABLE_CUDA_LAMBDA=ON" +export KOKKOS_EXTRA_FLAGS="" scriptdir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" echo "DIR=$scriptdir" NVCC_WRAPPER=`realpath $scriptdir/../../bin/nvcc_wrapper` diff --git a/scripts/trilinos-integration/white_cuda_env.sh b/scripts/trilinos-integration/white_cuda_env.sh index f3745ede8c..4f6f4bcd9a 100755 --- a/scripts/trilinos-integration/white_cuda_env.sh +++ b/scripts/trilinos-integration/white_cuda_env.sh @@ -31,7 +31,7 @@ export CUDA_LAUNCH_BLOCKING=1 export CUDA_MANAGED_FORCE_DEVICE_ALLOC=1 -export KOKKOS_EXTRA_FLAGS="-DKokkos_ENABLE_CUDA_LAMBDA=ON" +export KOKKOS_EXTRA_FLAGS="" scriptdir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" NVCC_WRAPPER=`realpath $scriptdir/../../bin/nvcc_wrapper` export OMPI_CXX=$NVCC_WRAPPER From 51d7c720c658650b22e66ff96c0ffc7a5f95575e Mon Sep 17 00:00:00 2001 From: Phil Miller Date: Wed, 5 Apr 2023 
11:23:19 -0700 Subject: [PATCH 398/496] Don't fail to define broader 'lambdas are available' macro --- core/src/Kokkos_Macros.hpp | 7 ++----- core/src/setup/Kokkos_Setup_Cuda.hpp | 6 ------ 2 files changed, 2 insertions(+), 11 deletions(-) diff --git a/core/src/Kokkos_Macros.hpp b/core/src/Kokkos_Macros.hpp index f765efe68c..0c830007c1 100644 --- a/core/src/Kokkos_Macros.hpp +++ b/core/src/Kokkos_Macros.hpp @@ -547,6 +547,8 @@ static constexpr bool kokkos_omp_on_host() { return false; } #if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOS_ENABLE_DEPRECATED_CODE_4) #define KOKKOS_ENABLE_CUDA_LDG_INTRINSIC +// This was previously defined from the configuration option which was removed +#define KOKKOS_ENABLE_CUDA_LAMBDA #endif #define KOKKOS_INVALID_INDEX (~std::size_t(0)) @@ -607,9 +609,4 @@ static constexpr bool kokkos_omp_on_host() { return false; } #define KOKKOS_IMPL_ENFORCE_EMPTY_BASE_OPTIMIZATION #endif -// This was previously defined from the configuration option which was removed -#if defined(KOKKOS_ENABLE_CUDA) -#define KOKKOS_ENABLE_CUDA_LAMBDA -#endif - #endif // #ifndef KOKKOS_MACROS_HPP diff --git a/core/src/setup/Kokkos_Setup_Cuda.hpp b/core/src/setup/Kokkos_Setup_Cuda.hpp index c57f690ae1..1130485e84 100644 --- a/core/src/setup/Kokkos_Setup_Cuda.hpp +++ b/core/src/setup/Kokkos_Setup_Cuda.hpp @@ -53,15 +53,9 @@ #error "Cuda device capability >= 3.0 is required." 
#endif -#ifdef KOKKOS_ENABLE_CUDA_LAMBDA #define KOKKOS_LAMBDA [=] __host__ __device__ - #define KOKKOS_CLASS_LAMBDA [ =, *this ] __host__ __device__ -#else // !defined(KOKKOS_ENABLE_CUDA_LAMBDA) -#undef KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA -#endif // !defined(KOKKOS_ENABLE_CUDA_LAMBDA) - #define KOKKOS_IMPL_FORCEINLINE_FUNCTION __device__ __host__ __forceinline__ #define KOKKOS_IMPL_FORCEINLINE __forceinline__ #define KOKKOS_IMPL_INLINE_FUNCTION __device__ __host__ inline From 7e329998e142d2c9b842124872835fe0287d98f9 Mon Sep 17 00:00:00 2001 From: Phil Miller Date: Wed, 5 Apr 2023 12:13:34 -0700 Subject: [PATCH 399/496] Always expect KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA to be set --- core/unit_test/TestCompilerMacros.cpp | 6 ------ 1 file changed, 6 deletions(-) diff --git a/core/unit_test/TestCompilerMacros.cpp b/core/unit_test/TestCompilerMacros.cpp index b77368037e..5927d142de 100644 --- a/core/unit_test/TestCompilerMacros.cpp +++ b/core/unit_test/TestCompilerMacros.cpp @@ -28,15 +28,9 @@ #error "Only one host compiler macro can be defined" #endif -#if defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_CUDA_LAMBDA) -#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) -#error "Macro bug: KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA shouldn't be defined" -#endif -#else #if !defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) #error "Macro bug: KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA should be defined" #endif -#endif namespace TestCompilerMacros { From 4407f7b2e9d06e98b20f1eae11a601a357ee9f23 Mon Sep 17 00:00:00 2001 From: Phil Miller Date: Fri, 21 Apr 2023 11:46:25 -0600 Subject: [PATCH 400/496] Remove various test exclusions based on KOKKOS_ENABLE_CUDA_LAMBDA --- containers/unit_tests/TestErrorReporter.hpp | 2 -- containers/unit_tests/TestOffsetView.hpp | 14 -------------- core/perf_test/CMakeLists.txt | 2 -- core/perf_test/PerfTest_ViewAllocate.cpp | 2 -- core/perf_test/PerfTest_ViewCopy_Raw.cpp | 2 -- core/perf_test/PerfTest_ViewFill_Raw.cpp | 2 -- 
core/perf_test/PerfTest_ViewResize_Raw.cpp | 2 -- core/unit_test/TestMDRangeReduce.hpp | 2 -- core/unit_test/TestTeamMDRange.hpp | 6 ------ 9 files changed, 34 deletions(-) diff --git a/containers/unit_tests/TestErrorReporter.hpp b/containers/unit_tests/TestErrorReporter.hpp index 0003a29468..7d7765cf8c 100644 --- a/containers/unit_tests/TestErrorReporter.hpp +++ b/containers/unit_tests/TestErrorReporter.hpp @@ -149,7 +149,6 @@ struct ErrorReporterDriver : public ErrorReporterDriverBase { } }; -#if !defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_CUDA_LAMBDA) template struct ErrorReporterDriverUseLambda : public ErrorReporterDriverBase { @@ -178,7 +177,6 @@ struct ErrorReporterDriverUseLambda driver_base::check_expectations(reporter_capacity, test_size); } }; -#endif #ifdef KOKKOS_ENABLE_OPENMP struct ErrorReporterDriverNativeOpenMP diff --git a/containers/unit_tests/TestOffsetView.hpp b/containers/unit_tests/TestOffsetView.hpp index c133922e3d..c225d65b69 100644 --- a/containers/unit_tests/TestOffsetView.hpp +++ b/containers/unit_tests/TestOffsetView.hpp @@ -67,7 +67,6 @@ void test_offsetview_construction() { ASSERT_EQ(ov.extent(0), 5u); ASSERT_EQ(ov.extent(1), 5u); -#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) { Kokkos::Experimental::OffsetView offsetV1("OneDOffsetView", range0); @@ -149,7 +148,6 @@ void test_offsetview_construction() { } ASSERT_EQ(OVResult, answer) << "Bad data found in OffsetView"; -#endif { offset_view_type ovCopy(ov); @@ -184,7 +182,6 @@ void test_offsetview_construction() { range3_type rangePolicy3DZero(point3_type{{0, 0, 0}}, point3_type{{extent0, extent1, extent2}}); -#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) int view3DSum = 0; Kokkos::parallel_reduce( rangePolicy3DZero, @@ -207,7 +204,6 @@ void test_offsetview_construction() { ASSERT_EQ(view3DSum, offsetView3DSum) << "construction of OffsetView from View and begins array broken."; -#endif } view_type viewFromOV = ov.view(); @@ 
-232,7 +228,6 @@ void test_offsetview_construction() { view_type aView("aView", ov.extent(0), ov.extent(1)); Kokkos::deep_copy(aView, ov); -#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) int sum = 0; Kokkos::parallel_reduce( rangePolicy2D, @@ -242,7 +237,6 @@ void test_offsetview_construction() { sum); ASSERT_EQ(sum, 0) << "deep_copy(view, offsetView) broken."; -#endif } { // test view to offsetview deep copy @@ -251,7 +245,6 @@ void test_offsetview_construction() { Kokkos::deep_copy(aView, 99); Kokkos::deep_copy(ov, aView); -#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) int sum = 0; Kokkos::parallel_reduce( rangePolicy2D, @@ -261,7 +254,6 @@ void test_offsetview_construction() { sum); ASSERT_EQ(sum, 0) << "deep_copy(offsetView, view) broken."; -#endif } } @@ -429,7 +421,6 @@ void test_offsetview_subview() { ASSERT_EQ(offsetSubview.begin(1), 0); ASSERT_EQ(offsetSubview.end(1), 9); -#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) using range_type = Kokkos::MDRangePolicy, Kokkos::IndexType >; using point_type = typename range_type::point_type; @@ -455,7 +446,6 @@ void test_offsetview_subview() { sum); ASSERT_EQ(sum, 6 * (e0 - b0) * (e1 - b1)); -#endif } // slice 2 @@ -552,7 +542,6 @@ void test_offsetview_subview() { } } -#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) template KOKKOS_INLINE_FUNCTION T std_accumulate(InputIt first, InputIt last, T init, BinaryOperation op) { @@ -655,7 +644,6 @@ void test_offsetview_offsets_rank3() { ASSERT_EQ(0, errors); } -#endif TEST(TEST_CATEGORY, offsetview_construction) { test_offsetview_construction(); @@ -669,7 +657,6 @@ TEST(TEST_CATEGORY, offsetview_subview) { test_offsetview_subview(); } -#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) TEST(TEST_CATEGORY, offsetview_offsets_rank1) { test_offsetview_offsets_rank1(); } @@ -681,7 +668,6 @@ TEST(TEST_CATEGORY, offsetview_offsets_rank2) { TEST(TEST_CATEGORY, 
offsetview_offsets_rank3) { test_offsetview_offsets_rank3(); } -#endif } // namespace Test diff --git a/core/perf_test/CMakeLists.txt b/core/perf_test/CMakeLists.txt index 66319f43f5..2361e45ce6 100644 --- a/core/perf_test/CMakeLists.txt +++ b/core/perf_test/CMakeLists.txt @@ -173,12 +173,10 @@ KOKKOS_ADD_BENCHMARK( SOURCES ${BENCHMARK_SOURCES} ) -IF(NOT KOKKOS_ENABLE_CUDA OR KOKKOS_ENABLE_CUDA_LAMBDA) KOKKOS_ADD_BENCHMARK( Benchmark_Atomic_MinMax SOURCES test_atomic_minmax_simple.cpp ) -ENDIF() # FIXME_NVHPC IF(NOT KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) diff --git a/core/perf_test/PerfTest_ViewAllocate.cpp b/core/perf_test/PerfTest_ViewAllocate.cpp index 63f1d6b2c7..8ee69cfa59 100644 --- a/core/perf_test/PerfTest_ViewAllocate.cpp +++ b/core/perf_test/PerfTest_ViewAllocate.cpp @@ -217,7 +217,6 @@ BENCHMARK(ViewAllocate_Rank8) ->Arg(N) ->UseManualTime(); -#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) BENCHMARK(ViewAllocate_Raw) ->ArgName("N") ->Arg(N) @@ -227,6 +226,5 @@ BENCHMARK(ViewAllocate_Raw) ->ArgName("N") ->Arg(N) ->UseManualTime(); -#endif } // namespace Test diff --git a/core/perf_test/PerfTest_ViewCopy_Raw.cpp b/core/perf_test/PerfTest_ViewCopy_Raw.cpp index 67a8d7e555..e4db40e128 100644 --- a/core/perf_test/PerfTest_ViewCopy_Raw.cpp +++ b/core/perf_test/PerfTest_ViewCopy_Raw.cpp @@ -18,7 +18,6 @@ namespace Test { -#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) BENCHMARK(ViewDeepCopy_Raw) ->ArgName("N") ->Arg(10) @@ -38,6 +37,5 @@ BENCHMARK(ViewDeepCopy_Raw) ->ArgName("N") ->Arg(10) ->UseManualTime(); -#endif } // namespace Test diff --git a/core/perf_test/PerfTest_ViewFill_Raw.cpp b/core/perf_test/PerfTest_ViewFill_Raw.cpp index c11074d915..57bba83a9c 100644 --- a/core/perf_test/PerfTest_ViewFill_Raw.cpp +++ b/core/perf_test/PerfTest_ViewFill_Raw.cpp @@ -18,7 +18,6 @@ namespace Test { -#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) BENCHMARK(ViewFill_Raw) ->ArgName("N") ->Arg(N) @@ 
-28,6 +27,5 @@ BENCHMARK(ViewFill_Raw) ->ArgName("N") ->Arg(N) ->UseManualTime(); -#endif } // namespace Test diff --git a/core/perf_test/PerfTest_ViewResize_Raw.cpp b/core/perf_test/PerfTest_ViewResize_Raw.cpp index 2d1bcbb3ca..ab469cb647 100644 --- a/core/perf_test/PerfTest_ViewResize_Raw.cpp +++ b/core/perf_test/PerfTest_ViewResize_Raw.cpp @@ -18,7 +18,6 @@ namespace Test { -#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) BENCHMARK(ViewResize_NoInit_Raw) ->ArgName("N") ->Arg(N) @@ -30,6 +29,5 @@ BENCHMARK(ViewResize_NoInit_Raw) ->Arg(N) ->UseManualTime() ->Iterations(R); -#endif } // namespace Test diff --git a/core/unit_test/TestMDRangeReduce.hpp b/core/unit_test/TestMDRangeReduce.hpp index 007fa420c3..24bd3255fe 100644 --- a/core/unit_test/TestMDRangeReduce.hpp +++ b/core/unit_test/TestMDRangeReduce.hpp @@ -49,8 +49,6 @@ TEST(TEST_CATEGORY, mdrange_parallel_reduce_primitive_types) { #if defined(KOKKOS_ENABLE_OPENMPTARGET) GTEST_SKIP() << "FIXME OPENMPTARGET Tests of MDRange reduce over values " "smaller than int would fail"; -#elif defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_CUDA_LAMBDA) - GTEST_SKIP() << "Skipped ENABLE_CUDA_LAMBDA"; #else for (int bound : {0, 1, 7, 32, 65, 7000}) { for (int k = 0; k < bound; ++k) { diff --git a/core/unit_test/TestTeamMDRange.hpp b/core/unit_test/TestTeamMDRange.hpp index 8ac7e8338c..7f4068a09b 100644 --- a/core/unit_test/TestTeamMDRange.hpp +++ b/core/unit_test/TestTeamMDRange.hpp @@ -148,10 +148,6 @@ struct TestTeamMDParallelFor { } }; -// If KOKKOS_ENABLE_CUDA_LAMBDA is off, extended lambdas used in parallel_for -// and parallel_reduce in these tests will not compile correctly -#if !defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_CUDA_LAMBDA) - template struct TestTeamThreadMDRangeParallelFor : public TestTeamMDParallelFor { using TeamType = typename Kokkos::TeamPolicy::member_type; @@ -1963,7 +1959,5 @@ TEST(TEST_CATEGORY, TeamVectorMDRangeParallelReduce) { 
test_parallel_reduce_for_8D_TeamVectorMDRange(smallDims); } -#endif - } // namespace TeamMDRange } // namespace Test From c28472a6498f2a1c240fa2b8fe0adcd4475673df Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Wed, 19 Apr 2023 12:24:34 -0600 Subject: [PATCH 401/496] Update changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index c5222cdab8..43960e29ef 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -122,6 +122,7 @@ - Add missing `ReductionIdentity` specialization [\#5798](https://github.com/kokkos/kokkos/pull/5798) - Don't install standard algorithms headers multiple times [\#5670](https://github.com/kokkos/kokkos/pull/5670) - Fix max scratch size calculation for level 0 scratch in CUDA and HIP [\#5718](https://github.com/kokkos/kokkos/pull/5718) +- Fix excessive build times using Makefile.kokkos [\#6068](https://github.com/kokkos/kokkos/pull/6068) ## [3.7.01](https://github.com/kokkos/kokkos/tree/3.7.01) (2022-12-01) [Full Changelog](https://github.com/kokkos/kokkos/compare/3.7.00...3.7.01) From e8067d4bea4e89431e33c74d54264c9d45adc79b Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Wed, 3 May 2023 12:57:27 -0600 Subject: [PATCH 402/496] [ci skip] Fixup changelog Move 6068 entry to 4.0.01 Another fix for 4.0.0 changelog (fix link for tree) --- CHANGELOG.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 43960e29ef..5645cfe906 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -29,8 +29,9 @@ - Fix an incorrectly returning size for SIMD uint64_t in AVX2 [\#6011](https://github.com/kokkos/kokkos/pull/6011) - Desul atomics: wrong value for `desul::Impl::numeric_limits_max` [\#6018](https://github.com/kokkos/kokkos/pull/6018) - Fix warning in some user code when using std::memcpy [\#6000](https://github.com/kokkos/kokkos/pull/6000) +- Fix excessive build times using Makefile.kokkos [\#6068](https://github.com/kokkos/kokkos/pull/6068) -## 
[4.0.0](https://github.com/kokkos/kokkos/tree/4.0.0) (2023-02-21) +## [4.0.0](https://github.com/kokkos/kokkos/tree/4.0.00) (2023-02-21) [Full Changelog](https://github.com/kokkos/kokkos/compare/3.7.01...4.0.00) ### Features: @@ -122,7 +123,6 @@ - Add missing `ReductionIdentity` specialization [\#5798](https://github.com/kokkos/kokkos/pull/5798) - Don't install standard algorithms headers multiple times [\#5670](https://github.com/kokkos/kokkos/pull/5670) - Fix max scratch size calculation for level 0 scratch in CUDA and HIP [\#5718](https://github.com/kokkos/kokkos/pull/5718) -- Fix excessive build times using Makefile.kokkos [\#6068](https://github.com/kokkos/kokkos/pull/6068) ## [3.7.01](https://github.com/kokkos/kokkos/tree/3.7.01) (2022-12-01) [Full Changelog](https://github.com/kokkos/kokkos/compare/3.7.00...3.7.01) From d251954e06c006aeb1347c462783ae351bc04bd0 Mon Sep 17 00:00:00 2001 From: Phil Miller Date: Wed, 3 May 2023 13:40:08 -0600 Subject: [PATCH 403/496] Work around nvcc issue for view_mapping and add FIXME_NVCC comment --- core/unit_test/TestViewMapping_a.hpp | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/core/unit_test/TestViewMapping_a.hpp b/core/unit_test/TestViewMapping_a.hpp index 9173f0d431..dc576577c2 100644 --- a/core/unit_test/TestViewMapping_a.hpp +++ b/core/unit_test/TestViewMapping_a.hpp @@ -1038,16 +1038,16 @@ void test_view_mapping() { ASSERT_EQ(a.use_count(), 1); ASSERT_EQ(b.use_count(), 0); -#if !defined(KOKKOS_ENABLE_CUDA) || !defined(KOKKOS_ENABLE_CUDA_LAMBDA) - // Cannot launch host lambda when CUDA lambda is enabled. - using host_exec_space = typename Kokkos::Impl::HostMirror::Space::execution_space; int errors = 0; Kokkos::parallel_reduce( Kokkos::RangePolicy(0, 10), - KOKKOS_LAMBDA(int, int& e) { + // FIXME_NVCC: Cannot launch __host__ __device__ lambda on + // host when CUDA lambda is enabled, so use plain [=] instead + // of KOKKOS_LAMBDA + [=](int, int& e) { // an unmanaged copy. 
When the parallel dispatch accepts a move for // the lambda, this count should become 1. @@ -1058,7 +1058,6 @@ void test_view_mapping() { }, errors); ASSERT_EQ(errors, 0); -#endif // #if !defined( KOKKOS_ENABLE_CUDA_LAMBDA ) } } From 4b6d971dce856132961b1f178dcae1256b92b733 Mon Sep 17 00:00:00 2001 From: Rahulkumar Gayatri Date: Wed, 3 May 2023 13:33:48 -0700 Subject: [PATCH 404/496] OpenMPTarget: Update hierarchical parallelism. (#6043) * OpenMPTarget: Update hierarchical parallelism. * OpenMPTarget: Update initialize routine. * OpenMPTarget: Remove num_teams for Intel GPUs. * OpenMPTarget: fix comment. * OpenMPTarget: Oversubscribe number of teams. * OpenMPTarget: Move KOKKOS_IMPL_HIERARCHICAL_INTEL_GPU macro to a central location. * OpenMPTarget: Add num_teams clause for Intel GPUs too. * OpenMPTarget: Moving the undef for Intel GPUs into files that include the macro. * OpenMPTarget: Updated macro name and added to print_configuration. * OpenMPTarget: Adding impl to macro. * OpenMPTarget: Fix typo for Intel GPUs. * OpenMPTarget: Fix print_configuration. * OpenMPTarget: Rename variable names. * OpenMPTarget: clang format. 
--------- Co-authored-by: Rahulkumar Gayatri --- .../OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp | 17 +- .../Kokkos_OpenMPTarget_Instance.cpp | 30 +++- .../Kokkos_OpenMPTarget_Parallel.hpp | 9 +- .../Kokkos_OpenMPTarget_ParallelFor_Team.hpp | 66 ++++--- .../Kokkos_OpenMPTarget_Parallel_Common.hpp | 169 ++++++++++-------- 5 files changed, 190 insertions(+), 101 deletions(-) diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp index 45b8f42f17..02905572e1 100644 --- a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp +++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp @@ -67,6 +67,8 @@ void OpenMPTargetExec::verify_initialized(const char* const label) { msg.append(" ERROR: not initialized"); Kokkos::Impl::throw_runtime_exception(msg); } + OpenMPTargetExec::MAX_ACTIVE_THREADS = + Kokkos::Experimental::OpenMPTarget().concurrency(); } void* OpenMPTargetExec::m_scratch_ptr = nullptr; @@ -74,6 +76,7 @@ int64_t OpenMPTargetExec::m_scratch_size = 0; int* OpenMPTargetExec::m_lock_array = nullptr; uint64_t OpenMPTargetExec::m_lock_size = 0; uint32_t* OpenMPTargetExec::m_uniquetoken_ptr = nullptr; +int OpenMPTargetExec::MAX_ACTIVE_THREADS = 0; void OpenMPTargetExec::clear_scratch() { Kokkos::Experimental::OpenMPTargetSpace space; @@ -100,11 +103,23 @@ void OpenMPTargetExec::resize_scratch(int64_t team_size, int64_t shmem_size_L0, const int64_t shmem_size = shmem_size_L0 + shmem_size_L1; // L0 + L1 scratch memory per team. const int64_t padding = shmem_size * 10 / 100; // Padding per team. + + // Maximum active teams possible. + // The number should not exceed the maximum in-flight teams possible or the + // league_size. + int max_active_teams = + std::min(OpenMPTargetExec::MAX_ACTIVE_THREADS / team_size, league_size); + + // max_active_teams is the number of active teams on the given hardware. 
+ // We set the number of teams to be twice the number of max_active_teams for + // the compiler to pick the right number in its case. + omp_set_num_teams(max_active_teams * 2); + // Total amount of scratch memory allocated is depenedent // on the maximum number of in-flight teams possible. int64_t total_size = (shmem_size + OpenMPTargetExecTeamMember::TEAM_REDUCE_SIZE + padding) * - std::min(MAX_ACTIVE_THREADS / team_size, league_size); + max_active_teams * 2; if (total_size > m_scratch_size) { space.deallocate(m_scratch_ptr, m_scratch_size); diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp index 02f42ee2a6..3999920517 100644 --- a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp +++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp @@ -31,6 +31,7 @@ #include #include #include +#include #include @@ -66,13 +67,40 @@ void OpenMPTargetInternal::fence(const std::string& name, } } int OpenMPTargetInternal::concurrency() const { - return 128000; // FIXME_OPENMPTARGET + int max_threads = 2048 * 80; +#if defined(KOKKOS_IMPL_ARCH_NVIDIA_GPU) + int max_threads_sm = 2048; +#if defined(KOKKOS_ARCH_AMPERE86) + max_threads = max_threads_sm * 84; +#elif defined(KOKKOS_ARCH_AMPERE80) + max_threads = max_threads_sm * 108; +#elif defined(KOKKOS_ARCH_VOLTA72) + max_threads = max_threads_sm * 84; +#elif defined(KOKKOS_ARCH_VOLTA70) + max_threads = max_threads_sm * 80; +#elif defined(KOKKOS_ARCH_PASCAL60) || defined(KOKKOS_ARCH_PASCAL61) + max_threads = max_threads_sm * 60; +#endif +#elif defined(KOKKOS_ARCH_INTEL_GPU) +#pragma omp target map(max_threads) + { max_threads = omp_get_num_procs(); } + + // Multiply the number of processors with the SIMD length. 
+ max_threads *= 32; +#endif + + return max_threads; } const char* OpenMPTargetInternal::name() { return "OpenMPTarget"; } void OpenMPTargetInternal::print_configuration(std::ostream& os, bool /*verbose*/) const { // FIXME_OPENMPTARGET os << "Using OpenMPTarget\n"; +#if defined(KOKKOS_IMPL_OPENMPTARGET_HIERARCHICAL_INTEL_GPU) + os << "Defined KOKKOS_IMPL_OPENMPTARGET_HIERARCHICAL_INTEL_GPU: Workaround " + "for " + "hierarchical parallelism for Intel GPUs."; +#endif } void OpenMPTargetInternal::impl_finalize() { diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp index 50167e297b..9767d8e53e 100644 --- a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp +++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp @@ -26,6 +26,12 @@ #include #include "Kokkos_OpenMPTarget_Abort.hpp" +// Intel architectures prefer the classical hierarchical parallelism that relies +// on OpenMP. +#if defined(KOKKOS_ARCH_INTEL_GPU) +#define KOKKOS_IMPL_OPENMPTARGET_HIERARCHICAL_INTEL_GPU +#endif + //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- @@ -727,8 +733,7 @@ class OpenMPTargetExec { // teams possible is calculated based on NVIDIA's Volta GPU. In // future this value should be based on the chosen architecture for the // OpenMPTarget backend. 
- static constexpr int MAX_ACTIVE_THREADS = 2080 * 80; - static constexpr int MAX_ACTIVE_TEAMS = MAX_ACTIVE_THREADS / 32; + static int MAX_ACTIVE_THREADS; private: static void* scratch_ptr; diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_Team.hpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_Team.hpp index 12de3423f8..4aefbc96cd 100644 --- a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_Team.hpp +++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_Team.hpp @@ -115,44 +115,68 @@ class ParallelFor, // mode but works in the Debug mode. // Maximum active teams possible. - int max_active_teams = OpenMPTargetExec::MAX_ACTIVE_THREADS / team_size; - // nteams should not exceed the maximum in-flight teams possible. - const auto nteams = - league_size < max_active_teams ? league_size : max_active_teams; + int max_active_teams = omp_get_max_teams(); + + // FIXME_OPENMPTARGET: Although the maximum number of teams is set using the + // omp_set_num_teams in the resize_scratch routine, the call is not + // respected. Hence we need to use `num_teams` routine to restrict the + // number of teams generated to max_active_teams. Hopefully we can avoid the + // num_teams clause in the future and let compiler pick the right number of + // teams. This is not true for Intel architectures. // If the league size is <=0, do not launch the kernel. - if (nteams <= 0) return; + if (max_active_teams <= 0) return; // Performing our own scheduling of teams to avoid separation of code between // teams-distribute and parallel. Gave a 2x performance boost in test cases with // the clang compiler. atomic_compare_exchange can be avoided since the standard // guarantees that the number of teams specified in the `num_teams` clause is // always less than or equal to the maximum concurrently running teams. 
-#pragma omp target teams num_teams(nteams) thread_limit(team_size) \ - map(to \ - : a_functor) is_device_ptr(scratch_ptr) +#if !defined(KOKKOS_IMPL_OPENMPTARGET_HIERARCHICAL_INTEL_GPU) +#pragma omp target teams thread_limit(team_size) firstprivate(a_functor) \ + num_teams(max_active_teams) is_device_ptr(scratch_ptr) #pragma omp parallel { + if (omp_get_num_teams() > max_active_teams) + Kokkos::abort("`omp_set_num_teams` call was not respected.\n"); + const int blockIdx = omp_get_team_num(); const int gridDim = omp_get_num_teams(); // Iterate through the number of teams until league_size and assign the // league_id accordingly // Guarantee that the compilers respect the `num_teams` clause - if (gridDim <= nteams) { - for (int league_id = blockIdx; league_id < league_size; - league_id += gridDim) { - typename Policy::member_type team( - league_id, league_size, team_size, vector_length, scratch_ptr, - blockIdx, shmem_size_L0, shmem_size_L1); - if constexpr (std::is_void::value) - m_functor(team); - else - m_functor(TagType(), team); - } - } else - Kokkos::abort("`num_teams` clause was not respected.\n"); + for (int league_id = blockIdx; league_id < league_size; + league_id += gridDim) { + typename Policy::member_type team(league_id, league_size, team_size, + vector_length, scratch_ptr, blockIdx, + shmem_size_L0, shmem_size_L1); + if constexpr (std::is_void_v) + m_functor(team); + else + m_functor(TagType(), team); + } } +#else +#pragma omp target teams distribute firstprivate(a_functor) \ + is_device_ptr(scratch_ptr) num_teams(max_active_teams) \ + thread_limit(team_size) + for (int i = 0; i < league_size; i++) { +#pragma omp parallel + { + if (omp_get_num_teams() > max_active_teams) + Kokkos::abort("`omp_set_num_teams` call was not respected.\n"); + + typename Policy::member_type team(i, league_size, team_size, + vector_length, scratch_ptr, i, + shmem_size_L0, shmem_size_L1); + if constexpr (std::is_void_v) + m_functor(team); + else + m_functor(TagType(), team); + } 
+ } +#endif } public: diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_Common.hpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_Common.hpp index 2ce25f9ffd..ceb1337c58 100644 --- a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_Common.hpp +++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_Common.hpp @@ -106,7 +106,7 @@ struct ParallelReduceSpecialize, : f) reduction(custom \ : result) for (auto i = begin; i < end; ++i) { - if constexpr (std::is_void::value) { + if constexpr (std::is_void_v) { f(i, result); } else { f(TagType(), i, result); @@ -138,13 +138,14 @@ struct ParallelReduceSpecialize, ptr_on_device); return; } + // Case where reduction is on a native data type. if constexpr (std::is_arithmetic::value) { #pragma omp target teams distribute parallel for \ map(to:f) reduction(+: result) for (auto i = begin; i < end; ++i) - if constexpr (std::is_void::value) { + if constexpr (std::is_void_v) { f(i, result); } else { f(TagType(), i, result); @@ -156,7 +157,7 @@ struct ParallelReduceSpecialize, : result) for (auto i = begin; i < end; ++i) - if constexpr (std::is_void::value) { + if constexpr (std::is_void_v) { f(i, result); } else { f(TagType(), i, result); @@ -178,7 +179,7 @@ struct ParallelReduceSpecialize, } #pragma omp target teams distribute parallel for map(to:f) reduction(+:result[:NumReductions]) for (auto i = begin; i < end; ++i) { - if constexpr (std::is_void::value) { + if constexpr (std::is_void_v) { f(i, result); } else { f(TagType(), i, result); @@ -261,7 +262,7 @@ struct ParallelReduceSpecialize, // Accumulate partial results in thread specific storage. #pragma omp for simd for (auto i = team_begin; i < team_end; ++i) { - if constexpr (std::is_void::value) { + if constexpr (std::is_void_v) { f(i, result); } else { f(TagType(), i, result); @@ -355,42 +356,60 @@ struct ParallelReduceSpecialize, ValueType result = ValueType(); // Maximum active teams possible. 
- int max_active_teams = OpenMPTargetExec::MAX_ACTIVE_THREADS / team_size; - const auto nteams = - league_size < max_active_teams ? league_size : max_active_teams; + int max_active_teams = omp_get_max_teams(); // If the league size is <=0, do not launch the kernel. - if (nteams <= 0) return; + if (max_active_teams <= 0) return; #pragma omp declare reduction( \ custom:ValueType \ : OpenMPTargetReducerWrapper ::join(omp_out, omp_in)) \ initializer(OpenMPTargetReducerWrapper ::init(omp_priv)) -#pragma omp target teams num_teams(nteams) thread_limit(team_size) map(to \ - : f) \ - is_device_ptr(scratch_ptr) reduction(custom \ - : result) +#if !defined(KOKKOS_IMPL_OPENMPTARGET_HIERARCHICAL_INTEL_GPU) +#pragma omp target teams num_teams(max_active_teams) thread_limit(team_size) \ + firstprivate(f) is_device_ptr(scratch_ptr) reduction(custom \ + : result) #pragma omp parallel reduction(custom : result) { + if (omp_get_num_teams() > max_active_teams) + Kokkos::abort("`omp_set_num_teams` call was not respected.\n"); + const int blockIdx = omp_get_team_num(); const int gridDim = omp_get_num_teams(); // Guarantee that the compilers respect the `num_teams` clause - if (gridDim <= nteams) { - for (int league_id = blockIdx; league_id < league_size; - league_id += gridDim) { - typename PolicyType::member_type team( - league_id, league_size, team_size, vector_length, scratch_ptr, - blockIdx, shmem_size_L0, shmem_size_L1); - if constexpr (std::is_void::value) - f(team, result); - else - f(TagType(), team, result); - } - } else - Kokkos::abort("`num_teams` clause was not respected.\n"); + for (int league_id = blockIdx; league_id < league_size; + league_id += gridDim) { + typename PolicyType::member_type team( + league_id, league_size, team_size, vector_length, scratch_ptr, + blockIdx, shmem_size_L0, shmem_size_L1); + if constexpr (std::is_void_v) + f(team, result); + else + f(TagType(), team, result); + } + } +#else +#pragma omp target teams distribute firstprivate(f) 
is_device_ptr(scratch_ptr) \ + num_teams(max_active_teams) thread_limit(team_size) reduction(custom \ + : result) + for (int i = 0; i < league_size; i++) { +#pragma omp parallel reduction(custom : result) + { + if (omp_get_num_teams() > max_active_teams) + Kokkos::abort("`omp_set_num_teams` call was not respected.\n"); + + typename PolicyType::member_type team(i, league_size, team_size, + vector_length, scratch_ptr, i, + shmem_size_L0, shmem_size_L1); + if constexpr (std::is_void_v) + f(team, result); + else + f(TagType(), team, result); + } } +#endif // Copy results back to device if `parallel_reduce` is on a device view. ParReduceCopy::memcpy_result(result_ptr, &result, sizeof(ValueType), @@ -416,12 +435,10 @@ struct ParallelReduceSpecialize, void* scratch_ptr = OpenMPTargetExec::get_scratch_ptr(); // Maximum active teams possible. - int max_active_teams = OpenMPTargetExec::MAX_ACTIVE_THREADS / team_size; - const auto nteams = - league_size < max_active_teams ? league_size : max_active_teams; + int max_active_teams = omp_get_max_teams(); // If the league size is <=0, do not launch the kernel. - if (nteams <= 0) return; + if (max_active_teams <= 0) return; // Case where the number of reduction items is 1. if constexpr (NumReductions == 1) { @@ -429,55 +446,55 @@ struct ParallelReduceSpecialize, // Case where reduction is on a native data type. 
if constexpr (std::is_arithmetic::value) { -#pragma omp target teams num_teams(nteams) thread_limit(team_size) map(to \ +#pragma omp target teams num_teams(max_active_teams) thread_limit(team_size) map(to \ : f) \ is_device_ptr(scratch_ptr) reduction(+: result) #pragma omp parallel reduction(+ : result) { + if (omp_get_num_teams() > max_active_teams) + Kokkos::abort("`omp_set_num_teams` call was not respected.\n"); + const int blockIdx = omp_get_team_num(); const int gridDim = omp_get_num_teams(); // Guarantee that the compilers respect the `num_teams` clause - if (gridDim <= nteams) { - for (int league_id = blockIdx; league_id < league_size; - league_id += gridDim) { - typename PolicyType::member_type team( - league_id, league_size, team_size, vector_length, scratch_ptr, - blockIdx, shmem_size_L0, shmem_size_L1); - if constexpr (std::is_void::value) - f(team, result); - else - f(TagType(), team, result); - } - } else - Kokkos::abort("`num_teams` clause was not respected.\n"); + for (int league_id = blockIdx; league_id < league_size; + league_id += gridDim) { + typename PolicyType::member_type team( + league_id, league_size, team_size, vector_length, scratch_ptr, + blockIdx, shmem_size_L0, shmem_size_L1); + if constexpr (std::is_void_v) + f(team, result); + else + f(TagType(), team, result); + } } } else { // Case where the reduction is on a non-native data type. 
#pragma omp declare reduction(custom:ValueType : omp_out += omp_in) -#pragma omp target teams num_teams(nteams) thread_limit(team_size) map(to \ - : f) \ - is_device_ptr(scratch_ptr) reduction(custom \ - : result) +#pragma omp target teams num_teams(max_active_teams) thread_limit(team_size) \ + map(to \ + : f) is_device_ptr(scratch_ptr) reduction(custom \ + : result) #pragma omp parallel reduction(custom : result) { + if (omp_get_num_teams() > max_active_teams) + Kokkos::abort("`omp_set_num_teams` call was not respected.\n"); + const int blockIdx = omp_get_team_num(); const int gridDim = omp_get_num_teams(); // Guarantee that the compilers respect the `num_teams` clause - if (gridDim <= nteams) { - for (int league_id = blockIdx; league_id < league_size; - league_id += gridDim) { - typename PolicyType::member_type team( - league_id, league_size, team_size, vector_length, scratch_ptr, - blockIdx, shmem_size_L0, shmem_size_L1); - if constexpr (std::is_void::value) - f(team, result); - else - f(TagType(), team, result); - } - } else - Kokkos::abort("`num_teams` clause was not respected.\n"); + for (int league_id = blockIdx; league_id < league_size; + league_id += gridDim) { + typename PolicyType::member_type team( + league_id, league_size, team_size, vector_length, scratch_ptr, + blockIdx, shmem_size_L0, shmem_size_L1); + if constexpr (std::is_void_v) + f(team, result); + else + f(TagType(), team, result); + } } } @@ -487,28 +504,28 @@ struct ParallelReduceSpecialize, } else { ValueType result[NumReductions] = {}; // Case where the reduction is on an array. 
-#pragma omp target teams num_teams(nteams) thread_limit(team_size) map(to \ +#pragma omp target teams num_teams(max_active_teams) thread_limit(team_size) map(to \ : f) \ is_device_ptr(scratch_ptr) reduction(+ : result[:NumReductions]) #pragma omp parallel reduction(+ : result[:NumReductions]) { + if (omp_get_num_teams() > max_active_teams) + Kokkos::abort("`omp_set_num_teams` call was not respected.\n"); + const int blockIdx = omp_get_team_num(); const int gridDim = omp_get_num_teams(); // Guarantee that the compilers respect the `num_teams` clause - if (gridDim <= nteams) { - for (int league_id = blockIdx; league_id < league_size; - league_id += gridDim) { - typename PolicyType::member_type team( - league_id, league_size, team_size, vector_length, scratch_ptr, - blockIdx, shmem_size_L0, shmem_size_L1); - if constexpr (std::is_void::value) - f(team, result); - else - f(TagType(), team, result); - } - } else - Kokkos::abort("`num_teams` clause was not respected.\n"); + for (int league_id = blockIdx; league_id < league_size; + league_id += gridDim) { + typename PolicyType::member_type team( + league_id, league_size, team_size, vector_length, scratch_ptr, + blockIdx, shmem_size_L0, shmem_size_L1); + if constexpr (std::is_void_v) + f(team, result); + else + f(TagType(), team, result); + } } // Copy results back to device if `parallel_reduce` is on a device view. 
@@ -593,7 +610,7 @@ struct ParallelReduceSpecialize, typename PolicyType::member_type team( league_id, league_size, team_size, vector_length, scratch_ptr, team_num, shmem_size_L0, shmem_size_L1); - if constexpr (std::is_void::value) { + if constexpr (std::is_void_v) { f(team, result); } else { f(TagType(), team, result); From 7a166d2e4c76f4dd09486dbbbbed7637eb56e213 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Wed, 3 May 2023 18:39:12 -0400 Subject: [PATCH 405/496] Enable OpenMP in CUDA-11.0-NVCC-RDC to test DEPRECATED_CODE_3=ON (#5978) * Enable OpenMP in CUDA-11.0-NVCC-RDC to test DEPRECATED_CODE_3=ON * Drop build_cmake_installed_different_compiler * KOKKOS_EANBLE_DEPRECATED_CODE_3: impl_thread_pool_size is not static * Use gcc-8.4.0 jenkins CI instead --- .jenkins | 1 + core/unit_test/TestDefaultDeviceTypeInit.hpp | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.jenkins b/.jenkins index b7591d1a23..fcbcf56073 100644 --- a/.jenkins +++ b/.jenkins @@ -489,6 +489,7 @@ pipeline { -DCMAKE_CXX_FLAGS=-Werror \ -DKokkos_ARCH_NATIVE=ON \ -DKokkos_ENABLE_COMPILER_WARNINGS=ON \ + -DKokkos_ENABLE_DEPRECATED_CODE_3=ON \ -DKokkos_ENABLE_DEPRECATED_CODE_4=ON \ -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \ -DKokkos_ENABLE_TESTS=ON \ diff --git a/core/unit_test/TestDefaultDeviceTypeInit.hpp b/core/unit_test/TestDefaultDeviceTypeInit.hpp index 7ae73b14d3..929c91db4e 100644 --- a/core/unit_test/TestDefaultDeviceTypeInit.hpp +++ b/core/unit_test/TestDefaultDeviceTypeInit.hpp @@ -262,7 +262,7 @@ void check_correct_initialization(const Kokkos::InitArguments& argstruct) { #endif } - ASSERT_EQ(Kokkos::HostSpace::execution_space::impl_thread_pool_size(), + ASSERT_EQ(Kokkos::HostSpace::execution_space().impl_thread_pool_size(), expected_nthreads); #ifdef KOKKOS_ENABLE_CUDA From a45cc1eff0e461942a66a13345fe215eee059c2c Mon Sep 17 00:00:00 2001 From: Francesco Rizzi Date: Thu, 4 May 2023 03:24:37 +0200 Subject: [PATCH 406/496] fix ternary op in subset of std algorithms 
not working with nvhpc (#6095) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix std algorithms for nvhpc This fixes tests disabled in fb8179f4bae685e8fc29c9fdd890b41e4c8b92ff Co-authored-by: Cezary Skrzyński * fix index type * improve comments * address PR comments * fix comments per CI suggestions * revert is_sort_until --- .../impl/Kokkos_AdjacentFind.hpp | 11 +++++---- .../std_algorithms/impl/Kokkos_FindEnd.hpp | 8 ++++--- .../impl/Kokkos_FindFirstOf.hpp | 9 +++---- .../impl/Kokkos_FindIfOrNot.hpp | 9 +++---- .../impl/Kokkos_IsPartitioned.hpp | 8 +++++-- .../impl/Kokkos_LexicographicalCompare.hpp | 14 ++++++----- .../std_algorithms/impl/Kokkos_Mismatch.hpp | 9 +++---- .../impl/Kokkos_PartitionPoint.hpp | 11 +++++---- .../src/std_algorithms/impl/Kokkos_Search.hpp | 8 ++++--- .../std_algorithms/impl/Kokkos_SearchN.hpp | 8 ++++--- .../TestStdAlgorithmsAdjacentFind.cpp | 6 ----- .../TestStdAlgorithmsAllAnyNoneOf.cpp | 6 ----- .../unit_tests/TestStdAlgorithmsFind.cpp | 6 ----- .../unit_tests/TestStdAlgorithmsFindEnd.cpp | 6 ----- .../TestStdAlgorithmsFindFirstOf.cpp | 6 ----- .../TestStdAlgorithmsIsSortedUntil.cpp | 6 ----- ...estStdAlgorithmsLexicographicalCompare.cpp | 6 ----- .../unit_tests/TestStdAlgorithmsMismatch.cpp | 6 ----- .../TestStdAlgorithmsPartitioningOps.cpp | 24 ------------------- .../unit_tests/TestStdAlgorithmsSearch.cpp | 6 ----- .../unit_tests/TestStdAlgorithmsSearch_n.cpp | 6 ----- 21 files changed, 57 insertions(+), 122 deletions(-) diff --git a/algorithms/src/std_algorithms/impl/Kokkos_AdjacentFind.hpp b/algorithms/src/std_algorithms/impl/Kokkos_AdjacentFind.hpp index cc6b63f028..dd785e603b 100644 --- a/algorithms/src/std_algorithms/impl/Kokkos_AdjacentFind.hpp +++ b/algorithms/src/std_algorithms/impl/Kokkos_AdjacentFind.hpp @@ -42,12 +42,13 @@ struct StdAdjacentFindFunctor { const auto& next_value = m_first[i + 1]; const bool are_equal = m_p(my_value, next_value); - auto rv = - are_equal - ? 
red_value_type{i} - : red_value_type{::Kokkos::reduction_identity::min()}; + // FIXME_NVHPC using a ternary operator causes problems + red_value_type value = {::Kokkos::reduction_identity::min()}; + if (are_equal) { + value.min_loc_true = i; + } - m_reducer.join(red_value, rv); + m_reducer.join(red_value, value); } KOKKOS_FUNCTION diff --git a/algorithms/src/std_algorithms/impl/Kokkos_FindEnd.hpp b/algorithms/src/std_algorithms/impl/Kokkos_FindEnd.hpp index 3fa41af8ea..3ec64fa43d 100644 --- a/algorithms/src/std_algorithms/impl/Kokkos_FindEnd.hpp +++ b/algorithms/src/std_algorithms/impl/Kokkos_FindEnd.hpp @@ -59,9 +59,11 @@ struct StdFindEndFunctor { } } - const auto rv = - found ? red_value_type{i} - : red_value_type{::Kokkos::reduction_identity::max()}; + // FIXME_NVHPC using a ternary operator causes problems + red_value_type rv = {::Kokkos::reduction_identity::max()}; + if (found) { + rv.max_loc_true = i; + } m_reducer.join(red_value, rv); } diff --git a/algorithms/src/std_algorithms/impl/Kokkos_FindFirstOf.hpp b/algorithms/src/std_algorithms/impl/Kokkos_FindFirstOf.hpp index df10da2fd5..5f22d2ad13 100644 --- a/algorithms/src/std_algorithms/impl/Kokkos_FindFirstOf.hpp +++ b/algorithms/src/std_algorithms/impl/Kokkos_FindFirstOf.hpp @@ -52,10 +52,11 @@ struct StdFindFirstOfFunctor { } } - const auto rv = - found ? 
red_value_type{i} - : red_value_type{::Kokkos::reduction_identity::min()}; - + // FIXME_NVHPC using a ternary operator causes problems + red_value_type rv = {::Kokkos::reduction_identity::min()}; + if (found) { + rv.min_loc_true = i; + } m_reducer.join(red_value, rv); } diff --git a/algorithms/src/std_algorithms/impl/Kokkos_FindIfOrNot.hpp b/algorithms/src/std_algorithms/impl/Kokkos_FindIfOrNot.hpp index f7ec4b1110..9c0b0c0ccd 100644 --- a/algorithms/src/std_algorithms/impl/Kokkos_FindIfOrNot.hpp +++ b/algorithms/src/std_algorithms/impl/Kokkos_FindIfOrNot.hpp @@ -44,10 +44,11 @@ struct StdFindIfOrNotFunctor { // if doing find_if_not, look for when predicate is false const bool found_condition = is_find_if ? m_p(my_value) : !m_p(my_value); - auto rv = - found_condition - ? red_value_type{i} - : red_value_type{::Kokkos::reduction_identity::min()}; + // FIXME_NVHPC using a ternary operator causes problems + red_value_type rv = {::Kokkos::reduction_identity::min()}; + if (found_condition) { + rv.min_loc_true = i; + } m_reducer.join(red_value, rv); } diff --git a/algorithms/src/std_algorithms/impl/Kokkos_IsPartitioned.hpp b/algorithms/src/std_algorithms/impl/Kokkos_IsPartitioned.hpp index 92a22f3c3a..0fe2d246ff 100644 --- a/algorithms/src/std_algorithms/impl/Kokkos_IsPartitioned.hpp +++ b/algorithms/src/std_algorithms/impl/Kokkos_IsPartitioned.hpp @@ -43,8 +43,12 @@ struct StdIsPartitionedFunctor { ::Kokkos::reduction_identity::min(); constexpr index_type m_red_id_max = ::Kokkos::reduction_identity::max(); - auto rv = predicate_value ? 
red_value_type{i, m_red_id_min} - : red_value_type{m_red_id_max, i}; + + // FIXME_NVHPC using a ternary operator causes problems + red_value_type rv = {m_red_id_max, i}; + if (predicate_value) { + rv = {i, m_red_id_min}; + } m_reducer.join(redValue, rv); } diff --git a/algorithms/src/std_algorithms/impl/Kokkos_LexicographicalCompare.hpp b/algorithms/src/std_algorithms/impl/Kokkos_LexicographicalCompare.hpp index 170ec9f291..ad7f59232e 100644 --- a/algorithms/src/std_algorithms/impl/Kokkos_LexicographicalCompare.hpp +++ b/algorithms/src/std_algorithms/impl/Kokkos_LexicographicalCompare.hpp @@ -63,12 +63,14 @@ struct StdLexicographicalCompareFunctor { const auto& my_value1 = m_first1[i]; const auto& my_value2 = m_first2[i]; - bool different = m_comparator(my_value1, my_value2) || - m_comparator(my_value2, my_value1); - auto rv = - different - ? red_value_type{i} - : red_value_type{::Kokkos::reduction_identity::min()}; + const bool different = m_comparator(my_value1, my_value2) || + m_comparator(my_value2, my_value1); + + // FIXME_NVHPC using a ternary operator causes problems + red_value_type rv = {::Kokkos::reduction_identity::min()}; + if (different) { + rv.min_loc_true = i; + } m_reducer.join(red_value, rv); } diff --git a/algorithms/src/std_algorithms/impl/Kokkos_Mismatch.hpp b/algorithms/src/std_algorithms/impl/Kokkos_Mismatch.hpp index 9d2e31f63f..b742684467 100644 --- a/algorithms/src/std_algorithms/impl/Kokkos_Mismatch.hpp +++ b/algorithms/src/std_algorithms/impl/Kokkos_Mismatch.hpp @@ -42,10 +42,11 @@ struct StdMismatchRedFunctor { const auto& my_value1 = m_first1[i]; const auto& my_value2 = m_first2[i]; - auto rv = - !m_predicate(my_value1, my_value2) - ? 
red_value_type{i} - : red_value_type{::Kokkos::reduction_identity::min()}; + // FIXME_NVHPC using a ternary operator causes problems + red_value_type rv = {i}; + if (m_predicate(my_value1, my_value2)) { + rv = {::Kokkos::reduction_identity::min()}; + } m_reducer.join(red_value, rv); } diff --git a/algorithms/src/std_algorithms/impl/Kokkos_PartitionPoint.hpp b/algorithms/src/std_algorithms/impl/Kokkos_PartitionPoint.hpp index 2d0ae2aac6..c9517f6977 100644 --- a/algorithms/src/std_algorithms/impl/Kokkos_PartitionPoint.hpp +++ b/algorithms/src/std_algorithms/impl/Kokkos_PartitionPoint.hpp @@ -39,10 +39,13 @@ struct StdPartitionPointFunctor { KOKKOS_FUNCTION void operator()(const index_type i, red_value_type& redValue) const { const auto predicate_value = m_p(m_first[i]); - auto rv = - predicate_value - ? red_value_type{::Kokkos::reduction_identity::min()} - : red_value_type{i}; + + // FIXME_NVHPC using a ternary operator causes problems + red_value_type rv = {i}; + if (predicate_value) { + rv = {::Kokkos::reduction_identity::min()}; + } + m_reducer.join(redValue, rv); } diff --git a/algorithms/src/std_algorithms/impl/Kokkos_Search.hpp b/algorithms/src/std_algorithms/impl/Kokkos_Search.hpp index a612a57231..2780151f29 100644 --- a/algorithms/src/std_algorithms/impl/Kokkos_Search.hpp +++ b/algorithms/src/std_algorithms/impl/Kokkos_Search.hpp @@ -60,9 +60,11 @@ struct StdSearchFunctor { } } - const auto rv = - found ? 
red_value_type{i} - : red_value_type{::Kokkos::reduction_identity::min()}; + // FIXME_NVHPC using a ternary operator causes problems + red_value_type rv = {::Kokkos::reduction_identity::min()}; + if (found) { + rv = {i}; + } m_reducer.join(red_value, rv); } diff --git a/algorithms/src/std_algorithms/impl/Kokkos_SearchN.hpp b/algorithms/src/std_algorithms/impl/Kokkos_SearchN.hpp index 0d3b6bc706..98640136d4 100644 --- a/algorithms/src/std_algorithms/impl/Kokkos_SearchN.hpp +++ b/algorithms/src/std_algorithms/impl/Kokkos_SearchN.hpp @@ -59,9 +59,11 @@ struct StdSearchNFunctor { } } - const auto rv = - found ? red_value_type{i} - : red_value_type{::Kokkos::reduction_identity::min()}; + // FIXME_NVHPC using a ternary operator causes problems + red_value_type rv = {::Kokkos::reduction_identity::min()}; + if (found) { + rv.min_loc_true = i; + } m_reducer.join(red_value, rv); } diff --git a/algorithms/unit_tests/TestStdAlgorithmsAdjacentFind.cpp b/algorithms/unit_tests/TestStdAlgorithmsAdjacentFind.cpp index ee34761265..6fc9d583f3 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsAdjacentFind.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsAdjacentFind.cpp @@ -287,12 +287,6 @@ void run_all_scenarios() { } TEST(std_algorithms_nonmod_seq_ops, adjacent_find) { -#if defined(KOKKOS_ENABLE_CUDA) && \ - defined(KOKKOS_COMPILER_NVHPC) // FIXME_NVHPC - if constexpr (std::is_same_v) { - GTEST_SKIP() << "FIXME wrong result"; - } -#endif run_all_scenarios(); run_all_scenarios(); run_all_scenarios(); diff --git a/algorithms/unit_tests/TestStdAlgorithmsAllAnyNoneOf.cpp b/algorithms/unit_tests/TestStdAlgorithmsAllAnyNoneOf.cpp index 1c39a4735e..cccc0f6c18 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsAllAnyNoneOf.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsAllAnyNoneOf.cpp @@ -147,12 +147,6 @@ void run_all_scenarios() { } TEST(std_algorithms_all_any_none_of_test, test) { -#if defined(KOKKOS_ENABLE_CUDA) && \ - defined(KOKKOS_COMPILER_NVHPC) // FIXME_NVHPC - if constexpr 
(std::is_same_v) { - GTEST_SKIP() << "FIXME wrong result"; - } -#endif run_all_scenarios(); run_all_scenarios(); run_all_scenarios(); diff --git a/algorithms/unit_tests/TestStdAlgorithmsFind.cpp b/algorithms/unit_tests/TestStdAlgorithmsFind.cpp index 3b8b5e85af..5407bab224 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsFind.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsFind.cpp @@ -151,12 +151,6 @@ void run_all_scenarios() { } TEST(std_algorithms_find_test, test) { -#if defined(KOKKOS_ENABLE_CUDA) && \ - defined(KOKKOS_COMPILER_NVHPC) // FIXME_NVHPC - if constexpr (std::is_same_v) { - GTEST_SKIP() << "FIXME wrong result"; - } -#endif run_all_scenarios(); run_all_scenarios(); run_all_scenarios(); diff --git a/algorithms/unit_tests/TestStdAlgorithmsFindEnd.cpp b/algorithms/unit_tests/TestStdAlgorithmsFindEnd.cpp index ddc4bc1ba6..c9e213962b 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsFindEnd.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsFindEnd.cpp @@ -348,12 +348,6 @@ void run_all_scenarios() { } TEST(std_algorithms_non_mod_seq_ops, find_end) { -#if defined(KOKKOS_ENABLE_CUDA) && \ - defined(KOKKOS_COMPILER_NVHPC) // FIXME_NVHPC - if constexpr (std::is_same_v) { - GTEST_SKIP() << "FIXME wrong result"; - } -#endif run_all_scenarios(); run_all_scenarios(); } diff --git a/algorithms/unit_tests/TestStdAlgorithmsFindFirstOf.cpp b/algorithms/unit_tests/TestStdAlgorithmsFindFirstOf.cpp index c2f7a2fdb8..e9141bd27b 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsFindFirstOf.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsFindFirstOf.cpp @@ -264,12 +264,6 @@ void run_all_scenarios() { } TEST(std_algorithms_non_mod_seq_ops, find_first_of) { -#if defined(KOKKOS_ENABLE_CUDA) && \ - defined(KOKKOS_COMPILER_NVHPC) // FIXME_NVHPC - if constexpr (std::is_same_v) { - GTEST_SKIP() << "FIXME wrong result"; - } -#endif run_all_scenarios(); run_all_scenarios(); } diff --git a/algorithms/unit_tests/TestStdAlgorithmsIsSortedUntil.cpp 
b/algorithms/unit_tests/TestStdAlgorithmsIsSortedUntil.cpp index ce8669a84f..6053c6ca57 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsIsSortedUntil.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsIsSortedUntil.cpp @@ -185,12 +185,6 @@ void run_is_sorted_until_all_scenarios() { } TEST(std_algorithms_sorting_ops_test, is_sorted_until) { -#if defined(KOKKOS_ENABLE_CUDA) && \ - defined(KOKKOS_COMPILER_NVHPC) // FIXME_NVHPC - if constexpr (std::is_same_v) { - GTEST_SKIP() << "FIXME wrong result"; - } -#endif run_is_sorted_until_all_scenarios(); run_is_sorted_until_all_scenarios(); run_is_sorted_until_all_scenarios(); diff --git a/algorithms/unit_tests/TestStdAlgorithmsLexicographicalCompare.cpp b/algorithms/unit_tests/TestStdAlgorithmsLexicographicalCompare.cpp index 2acd4934ac..2d4f1afdd0 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsLexicographicalCompare.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsLexicographicalCompare.cpp @@ -140,12 +140,6 @@ void run_all_scenarios() { } TEST(std_algorithms_lexicographical_compare_test, test) { -#if defined(KOKKOS_ENABLE_CUDA) && \ - defined(KOKKOS_COMPILER_NVHPC) // FIXME_NVHPC - if constexpr (std::is_same_v) { - GTEST_SKIP() << "FIXME wrong result"; - } -#endif // FIXME: should this disable only custom comparator tests? 
#if !defined KOKKOS_ENABLE_OPENMPTARGET run_all_scenarios(); diff --git a/algorithms/unit_tests/TestStdAlgorithmsMismatch.cpp b/algorithms/unit_tests/TestStdAlgorithmsMismatch.cpp index bb4b6fb2a2..774329eef7 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsMismatch.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsMismatch.cpp @@ -189,12 +189,6 @@ void run_all_scenarios() { } TEST(std_algorithms_mismatch_test, test) { -#if defined(KOKKOS_ENABLE_CUDA) && \ - defined(KOKKOS_COMPILER_NVHPC) // FIXME_NVHPC - if constexpr (std::is_same_v) { - GTEST_SKIP() << "FIXME wrong result"; - } -#endif run_all_scenarios(); run_all_scenarios(); } diff --git a/algorithms/unit_tests/TestStdAlgorithmsPartitioningOps.cpp b/algorithms/unit_tests/TestStdAlgorithmsPartitioningOps.cpp index 1bfb536c2c..94ec278af1 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsPartitioningOps.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsPartitioningOps.cpp @@ -148,12 +148,6 @@ struct std_algorithms_partitioning_test : public std_algorithms_test { }; TEST_F(std_algorithms_partitioning_test, is_partitioned_trivial) { -#if defined(KOKKOS_ENABLE_CUDA) && \ - defined(KOKKOS_COMPILER_NVHPC) // FIXME_NVHPC - if constexpr (std::is_same_v) { - GTEST_SKIP() << "FIXME wrong result"; - } -#endif IsNegativeFunctor p; const auto result1 = KE::is_partitioned(exespace(), KE::cbegin(m_static_view), KE::cbegin(m_static_view), p); @@ -169,12 +163,6 @@ TEST_F(std_algorithms_partitioning_test, is_partitioned_trivial) { } TEST_F(std_algorithms_partitioning_test, is_partitioned_accepting_iterators) { -#if defined(KOKKOS_ENABLE_CUDA) && \ - defined(KOKKOS_COMPILER_NVHPC) // FIXME_NVHPC - if constexpr (std::is_same_v) { - GTEST_SKIP() << "FIXME wrong result"; - } -#endif const IsNegativeFunctor p; for (int id = 0; id < FixtureViews::Count; ++id) { @@ -196,12 +184,6 @@ TEST_F(std_algorithms_partitioning_test, is_partitioned_accepting_iterators) { } TEST_F(std_algorithms_partitioning_test, is_partitioned_accepting_view) { 
-#if defined(KOKKOS_ENABLE_CUDA) && \ - defined(KOKKOS_COMPILER_NVHPC) // FIXME_NVHPC - if constexpr (std::is_same_v) { - GTEST_SKIP() << "FIXME wrong result"; - } -#endif const IsNegativeFunctor p; for (int id = 0; id < FixtureViews::Count; ++id) { @@ -220,12 +202,6 @@ TEST_F(std_algorithms_partitioning_test, is_partitioned_accepting_view) { } TEST_F(std_algorithms_partitioning_test, partition_point) { -#if defined(KOKKOS_ENABLE_CUDA) && \ - defined(KOKKOS_COMPILER_NVHPC) // FIXME_NVHPC - if constexpr (std::is_same_v) { - GTEST_SKIP() << "FIXME wrong result"; - } -#endif const IsNegativeFunctor p; for (int id = 0; id < FixtureViews::Count; ++id) { diff --git a/algorithms/unit_tests/TestStdAlgorithmsSearch.cpp b/algorithms/unit_tests/TestStdAlgorithmsSearch.cpp index ab4bf50713..c25b82a245 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsSearch.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsSearch.cpp @@ -325,12 +325,6 @@ void run_all_scenarios() { } TEST(std_algorithms_non_mod_seq_ops, search) { -#if defined(KOKKOS_ENABLE_CUDA) && \ - defined(KOKKOS_COMPILER_NVHPC) // FIXME_NVHPC - if constexpr (std::is_same_v) { - GTEST_SKIP() << "FIXME wrong result"; - } -#endif run_all_scenarios(); run_all_scenarios(); } diff --git a/algorithms/unit_tests/TestStdAlgorithmsSearch_n.cpp b/algorithms/unit_tests/TestStdAlgorithmsSearch_n.cpp index a6fe9c1e89..68e2b1bf0f 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsSearch_n.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsSearch_n.cpp @@ -297,12 +297,6 @@ void run_all_scenarios() { } TEST(std_algorithms_non_mod_seq_ops, search_n) { -#if defined(KOKKOS_ENABLE_CUDA) && \ - defined(KOKKOS_COMPILER_NVHPC) // FIXME_NVHPC - if constexpr (std::is_same_v) { - GTEST_SKIP() << "FIXME wrong result"; - } -#endif run_all_scenarios(); run_all_scenarios(); } From ab41ef8a41760e882fa5c161f31b60e0a43f4f49 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Wed, 3 May 2023 17:00:18 -0400 Subject: [PATCH 407/496] Add implementation of bit_cast 
in --- core/src/Kokkos_BitManipulation.hpp | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/core/src/Kokkos_BitManipulation.hpp b/core/src/Kokkos_BitManipulation.hpp index 50cd92c986..04c9d4b2d2 100644 --- a/core/src/Kokkos_BitManipulation.hpp +++ b/core/src/Kokkos_BitManipulation.hpp @@ -20,6 +20,7 @@ #include #include #include // CHAR_BIT +#include //memcpy #include namespace Kokkos::Impl { @@ -98,6 +99,19 @@ inline constexpr bool is_standard_unsigned_integer_type_v = namespace Kokkos { +// +template +KOKKOS_FUNCTION std::enable_if_t && + std::is_trivially_copyable_v, + To> +bit_cast(From const& from) noexcept { + To to; + memcpy(&to, &from, sizeof(To)); + return to; +} +// + // template KOKKOS_FUNCTION constexpr std::enable_if_t, T> byteswap( From 71ee48ffd9c65bb624a8cff6181000418f7dcabf Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Wed, 3 May 2023 17:03:05 -0400 Subject: [PATCH 408/496] Add compile time tests for the constraints on the bit_cast function template --- core/unit_test/TestBitManipulation.cpp | 58 ++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) diff --git a/core/unit_test/TestBitManipulation.cpp b/core/unit_test/TestBitManipulation.cpp index 407596c2bb..08fc5f6283 100644 --- a/core/unit_test/TestBitManipulation.cpp +++ b/core/unit_test/TestBitManipulation.cpp @@ -484,3 +484,61 @@ static_assert(test_byteswap2()); // #undef TEST_BIT_MANIPULATION + +// +template +constexpr auto test_bit_cast() -> typename std::is_same< + decltype(Kokkos::bit_cast(std::declval())), + To>::value_type { + static_assert( + std::is_same_v< + decltype(Kokkos::bit_cast(std::declval())), To>); + return true; +} +template +constexpr X test_bit_cast(...) 
{ + return {}; +} + +namespace TypesNotTheSameSize { +struct To { + char a; +}; +struct From { + char b; + char c; +}; +static_assert(test_bit_cast().did_not_match()); +} // namespace TypesNotTheSameSize + +namespace ToNotTriviallyCopyable { +struct To { + char a; + To(To const &); +}; +struct From { + char b; +}; +static_assert(test_bit_cast().did_not_match()); +} // namespace ToNotTriviallyCopyable + +namespace FromNotTriviallyCopyable { +struct To { + char a; +}; +struct From { + char b; + From(From const &); +}; +static_assert(test_bit_cast().did_not_match()); +} // namespace FromNotTriviallyCopyable + +namespace ReturnTypeIllFormed { +struct From { + char a; + char b; +}; +static_assert(test_bit_cast().did_not_match()); +static_assert(test_bit_cast().did_not_match()); +} // namespace ReturnTypeIllFormed + // From ddf55c1d59d0e1305af12f35280c5591d4c81ed7 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Wed, 3 May 2023 17:13:15 -0400 Subject: [PATCH 409/496] Add the Experimental:: builtin variant (just defer to regular bit_cast) --- core/src/Kokkos_BitManipulation.hpp | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/core/src/Kokkos_BitManipulation.hpp b/core/src/Kokkos_BitManipulation.hpp index 04c9d4b2d2..f1dd4c12e2 100644 --- a/core/src/Kokkos_BitManipulation.hpp +++ b/core/src/Kokkos_BitManipulation.hpp @@ -392,6 +392,15 @@ KOKKOS_IMPL_HOST_FUNCTION namespace Kokkos::Experimental { +template +KOKKOS_FUNCTION std::enable_if_t && + std::is_trivially_copyable_v, + To> +bit_cast_builtin(From const& from) noexcept { + return bit_cast(from); // no benefit to call the _builtin variant +} + template KOKKOS_FUNCTION std::enable_if_t, T> byteswap_builtin( T x) noexcept { From e8a44e579f587a064052b2d5fc3f7fcc74e2a920 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Wed, 3 May 2023 22:52:06 -0400 Subject: [PATCH 410/496] Add runtime tests for bit_cast --- .../unit_test/TestBitManipulationBuiltins.hpp | 87 +++++++++++++++++++ 1 file changed, 87 insertions(+) diff 
--git a/core/unit_test/TestBitManipulationBuiltins.hpp b/core/unit_test/TestBitManipulationBuiltins.hpp index 3b5e7a3db5..935bb93ff6 100644 --- a/core/unit_test/TestBitManipulationBuiltins.hpp +++ b/core/unit_test/TestBitManipulationBuiltins.hpp @@ -765,3 +765,90 @@ TEST(TEST_CATEGORY, bit_manip_byeswap) { test_bit_manip_byteswap(); test_bit_manip_byteswap(); } + +// CUDA doesn't provide memcpy +KOKKOS_FUNCTION int my_memcmp(void const* lhs, void const* rhs, size_t count) { + auto u1 = static_cast(lhs); + auto u2 = static_cast(rhs); + while (count-- != 0) { + if (*u1 != *u2) { + return (*u1 < *u2) ? -1 : +1; + } + ++u1; + ++u2; + } + return 0; +} + +template +struct TestBitCastFunction { + TestBitCastFunction() { run(); } + void run() const { + int errors = 0; + Kokkos::parallel_reduce(Kokkos::RangePolicy(0, 1), *this, errors); + ASSERT_EQ(errors, 0) << "Failed check no error for bit_cast()"; + } + template + static KOKKOS_FUNCTION bool check(const From& from) { + using Kokkos::Experimental::bit_cast_builtin; + return bit_cast_builtin(bit_cast_builtin(from)) == from; + } + + KOKKOS_FUNCTION void operator()(int, int& e) const { + using Kokkos::bit_cast; + if (bit_cast(123) != 123) { + ++e; + KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed check #1\n"); + } + if (bit_cast(123u) != 123) { + ++e; + KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed check #2\n"); + } + if (bit_cast(~0u) != ~0) { + ++e; + KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed check #3\n"); + } + if constexpr (sizeof(int) == sizeof(float)) { + if (!check(12.34f)) { + ++e; + KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed check #4\n"); + } + } + if constexpr (sizeof(unsigned long long) == sizeof(double)) { + if (!check(123.456)) { + ++e; + KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed check #5\n"); + } + } + + struct S { + int i; + + KOKKOS_FUNCTION bool operator==(const char* s) const { + return my_memcmp(&i, s, sizeof(i)) == 0; + } + }; + char arr[sizeof(int)]; + char arr2[sizeof(int)]; + for (size_t i = 0; i < sizeof(int); ++i) { + arr[i] = i 
+ 1; + arr2[i] = (i + 1) * -(i % 2); + } + if (!(bit_cast(arr) == arr)) { + ++e; + KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed check #6\n"); + } + if (!(bit_cast(arr2) == arr2)) { + ++e; + KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed check #7\n"); + } + } +}; + +TEST(TEST_CATEGORY, bit_manip_bit_cast) { + using Kokkos::bit_cast; + ASSERT_EQ(bit_cast(123), 123); + ASSERT_EQ(bit_cast(123u), 123); + ASSERT_EQ(bit_cast(~0u), ~0); + TestBitCastFunction(); +} From eff2716b89487e94408e213dfd70d910c9de0726 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Wed, 3 May 2023 22:55:38 -0400 Subject: [PATCH 411/496] Use Kokkos::bit_cast in SIMD instead of rolling its own --- simd/src/Kokkos_SIMD_AVX2.hpp | 1 + simd/src/Kokkos_SIMD_AVX512.hpp | 1 + simd/src/Kokkos_SIMD_Common.hpp | 8 -------- 3 files changed, 2 insertions(+), 8 deletions(-) diff --git a/simd/src/Kokkos_SIMD_AVX2.hpp b/simd/src/Kokkos_SIMD_AVX2.hpp index eacbfa2393..8577ebede8 100644 --- a/simd/src/Kokkos_SIMD_AVX2.hpp +++ b/simd/src/Kokkos_SIMD_AVX2.hpp @@ -21,6 +21,7 @@ #include #include +#include // bit_cast #include diff --git a/simd/src/Kokkos_SIMD_AVX512.hpp b/simd/src/Kokkos_SIMD_AVX512.hpp index b44666c770..8da7120640 100644 --- a/simd/src/Kokkos_SIMD_AVX512.hpp +++ b/simd/src/Kokkos_SIMD_AVX512.hpp @@ -21,6 +21,7 @@ #include #include +#include // bit_cast #include diff --git a/simd/src/Kokkos_SIMD_Common.hpp b/simd/src/Kokkos_SIMD_Common.hpp index c29d49fb3a..2159a0e933 100644 --- a/simd/src/Kokkos_SIMD_Common.hpp +++ b/simd/src/Kokkos_SIMD_Common.hpp @@ -26,14 +26,6 @@ namespace Kokkos { namespace Experimental { -template -[[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION constexpr To bit_cast( - From const& src) { - To dst; - std::memcpy(&dst, &src, sizeof(To)); - return dst; -} - template class simd; From 432988bcdadd348c7951b910b71872e3ac139981 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Wed, 3 May 2023 23:17:45 -0400 Subject: [PATCH 412/496] Clang-format glitch --- core/unit_test/TestBitManipulation.cpp | 4 ++-- 1 file 
changed, 2 insertions(+), 2 deletions(-) diff --git a/core/unit_test/TestBitManipulation.cpp b/core/unit_test/TestBitManipulation.cpp index 08fc5f6283..560c407b16 100644 --- a/core/unit_test/TestBitManipulation.cpp +++ b/core/unit_test/TestBitManipulation.cpp @@ -30,7 +30,7 @@ struct X { static_assert(test_##FUNC((bool)0).did_not_match()); \ static_assert(test_##FUNC((int)0).did_not_match()); \ static_assert(test_##FUNC((float)0).did_not_match()); \ - static_assert(test_##FUNC((void*)0).did_not_match()) + static_assert(test_##FUNC((void *)0).did_not_match()) // template @@ -442,7 +442,7 @@ constexpr auto test_byteswap(T x) -> decltype(Kokkos::byteswap(x)) { constexpr X test_byteswap(...) { return {}; } -static_assert(test_byteswap((void*)0).did_not_match()); // NOLINT +static_assert(test_byteswap((void *)0).did_not_match()); // NOLINT static_assert(test_byteswap((float)0).did_not_match()); constexpr char c2[2] = {}; static_assert(test_byteswap(c2).did_not_match()); From 5c2d948b0a94934e7ea48c2e9971caadde7dfbd5 Mon Sep 17 00:00:00 2001 From: "romin.tomasetti" Date: Thu, 23 Mar 2023 12:52:18 +0000 Subject: [PATCH 413/496] view(uvm): fence if need in allocation (#6005) --- containers/src/Kokkos_DualView.hpp | 7 +-- containers/src/Kokkos_DynRankView.hpp | 25 --------- containers/src/Kokkos_OffsetView.hpp | 25 --------- .../unit_tests/TestWithoutInitializing.hpp | 54 +++++++++---------- core/src/Cuda/Kokkos_CudaSpace.cpp | 11 ++-- core/src/Kokkos_View.hpp | 25 --------- 6 files changed, 35 insertions(+), 112 deletions(-) diff --git a/containers/src/Kokkos_DualView.hpp b/containers/src/Kokkos_DualView.hpp index 07256b4464..bef2149f4c 100644 --- a/containers/src/Kokkos_DualView.hpp +++ b/containers/src/Kokkos_DualView.hpp @@ -239,7 +239,8 @@ class DualView : public ViewTraits { : modified_flags(t_modified_flags("DualView::modified_flags")), d_view(arg_prop, n0, n1, n2, n3, n4, n5, n6, n7) { // without UVM, host View mirrors - if (Kokkos::Impl::has_type::value) + if 
constexpr (Kokkos::Impl::has_type::value) h_view = Kokkos::create_mirror_view(Kokkos::WithoutInitializing, d_view); else h_view = Kokkos::create_mirror_view(d_view); @@ -576,8 +577,8 @@ class DualView : public ViewTraits { impl_report_host_sync(); } } - if (std::is_same::value) { + if constexpr (std::is_same::value) { typename t_dev::execution_space().fence( "Kokkos::DualView<>::sync: fence after syncing DualView"); typename t_host::execution_space().fence( diff --git a/containers/src/Kokkos_DynRankView.hpp b/containers/src/Kokkos_DynRankView.hpp index 5b47323cb7..864362eda7 100644 --- a/containers/src/Kokkos_DynRankView.hpp +++ b/containers/src/Kokkos_DynRankView.hpp @@ -1089,37 +1089,12 @@ class DynRankView : public ViewTraits { "execution space"); } -//------------------------------------------------------------ -#if defined(KOKKOS_ENABLE_CUDA) - // If allocating in CudaUVMSpace must fence before and after - // the allocation to protect against possible concurrent access - // on the CPU and the GPU. - // Fence using the trait's executon space (which will be Kokkos::Cuda) - // to avoid incomplete type errors from usng Kokkos::Cuda directly. 
- if (std::is_same::value) { - typename traits::device_type::memory_space::execution_space().fence( - "Kokkos::DynRankView<>::DynRankView: fence before UVM allocation"); - } -#endif - //------------------------------------------------------------ - Kokkos::Impl::SharedAllocationRecord<>* record = m_map.allocate_shared( prop_copy, Impl::DynRankDimTraits:: template createLayout(arg_prop, arg_layout), Impl::ViewCtorProp::has_execution_space); -//------------------------------------------------------------ -#if defined(KOKKOS_ENABLE_CUDA) - if (std::is_same::value) { - typename traits::device_type::memory_space::execution_space().fence( - "Kokkos::DynRankView<>::DynRankView: fence after UVM allocation"); - } -#endif - //------------------------------------------------------------ - // Setup and initialization complete, start tracking m_track.assign_allocated_record_to_uninitialized(record); } diff --git a/containers/src/Kokkos_OffsetView.hpp b/containers/src/Kokkos_OffsetView.hpp index 22b65f3f9f..876a0d8b98 100644 --- a/containers/src/Kokkos_OffsetView.hpp +++ b/containers/src/Kokkos_OffsetView.hpp @@ -1191,35 +1191,10 @@ class OffsetView : public ViewTraits { "execution space"); } - //------------------------------------------------------------ -#if defined(KOKKOS_ENABLE_CUDA) - // If allocating in CudaUVMSpace must fence before and after - // the allocation to protect against possible concurrent access - // on the CPU and the GPU. - // Fence using the trait's executon space (which will be Kokkos::Cuda) - // to avoid incomplete type errors from usng Kokkos::Cuda directly. 
- if (std::is_same::value) { - typename traits::device_type::memory_space::execution_space().fence( - "Kokkos::OffsetView::OffsetView(): fence before UVM allocation"); - } -#endif - //------------------------------------------------------------ - Kokkos::Impl::SharedAllocationRecord<>* record = m_map.allocate_shared( prop_copy, arg_layout, Kokkos::Impl::ViewCtorProp::has_execution_space); - //------------------------------------------------------------ -#if defined(KOKKOS_ENABLE_CUDA) - if (std::is_same::value) { - typename traits::device_type::memory_space::execution_space().fence( - "Kokkos::OffsetView::OffsetView(): fence after UVM allocation"); - } -#endif - //------------------------------------------------------------ - // Setup and initialization complete, start tracking m_track.assign_allocated_record_to_uninitialized(record); diff --git a/containers/unit_tests/TestWithoutInitializing.hpp b/containers/unit_tests/TestWithoutInitializing.hpp index 0554ddd1a5..7201cd402a 100644 --- a/containers/unit_tests/TestWithoutInitializing.hpp +++ b/containers/unit_tests/TestWithoutInitializing.hpp @@ -24,6 +24,19 @@ #include <../../core/unit_test/tools/include/ToolTestingUtilities.hpp> +/// Some tests are skipped for @c CudaUVM memory space. +/// @todo To be revised according to the future of @c KOKKOS_ENABLE_CUDA_UVM. 
+///@{ +#ifdef KOKKOS_ENABLE_CUDA +#define GTEST_SKIP_IF_CUDAUVM_MEMORY_SPACE \ + if constexpr (std::is_same_v) \ + GTEST_SKIP() << "skipping since CudaUVMSpace requires additional fences"; +#else +#define GTEST_SKIP_IF_CUDAUVM_MEMORY_SPACE +#endif +///@} + TEST(TEST_CATEGORY, resize_realloc_no_init_dualview) { using namespace Kokkos::Test::Tools; listen_tool_events(Config::DisableAll(), Config::EnableKernels()); @@ -125,11 +138,7 @@ TEST(TEST_CATEGORY, resize_exec_space_dualview) { } TEST(TEST_CATEGORY, realloc_exec_space_dualview) { -#ifdef KOKKOS_ENABLE_CUDA - if (std::is_same::value) - GTEST_SKIP() << "skipping since CudaUVMSpace requires additional fences"; -#endif + GTEST_SKIP_IF_CUDAUVM_MEMORY_SPACE using namespace Kokkos::Test::Tools; listen_tool_events(Config::DisableAll(), Config::EnableFences()); @@ -221,11 +230,8 @@ TEST(TEST_CATEGORY, resize_exec_space_dynrankview) { } TEST(TEST_CATEGORY, realloc_exec_space_dynrankview) { -#ifdef KOKKOS_ENABLE_CUDA - if (std::is_same::value) - GTEST_SKIP() << "skipping since CudaUVMSpace requires additional fences"; -#endif + GTEST_SKIP_IF_CUDAUVM_MEMORY_SPACE + // FIXME_THREADS The Threads backend fences every parallel_for #ifdef KOKKOS_ENABLE_THREADS if (std::is_same::value) @@ -363,11 +369,8 @@ TEST(TEST_CATEGORY, resize_exec_space_scatterview) { } TEST(TEST_CATEGORY, realloc_exec_space_scatterview) { -#ifdef KOKKOS_ENABLE_CUDA - if (std::is_same::value) - GTEST_SKIP() << "skipping since CudaUVMSpace requires additional fences"; -#endif + GTEST_SKIP_IF_CUDAUVM_MEMORY_SPACE + // FIXME_THREADS The Threads backend fences every parallel_for #ifdef KOKKOS_ENABLE_THREADS if (std::is_same::value) @@ -477,11 +480,8 @@ TEST(TEST_CATEGORY, create_mirror_no_init_dynrankview_viewctor) { } TEST(TEST_CATEGORY, create_mirror_view_and_copy_dynrankview) { -#ifdef KOKKOS_ENABLE_CUDA - if (std::is_same::value) - return; -#endif + GTEST_SKIP_IF_CUDAUVM_MEMORY_SPACE + using namespace Kokkos::Test::Tools; 
listen_tool_events(Config::DisableAll(), Config::EnableKernels(), Config::EnableFences()); @@ -584,11 +584,8 @@ TEST(TEST_CATEGORY, create_mirror_no_init_offsetview_view_ctor) { } TEST(TEST_CATEGORY, create_mirror_view_and_copy_offsetview) { -#ifdef KOKKOS_ENABLE_CUDA - if (std::is_same::value) - return; -#endif + GTEST_SKIP_IF_CUDAUVM_MEMORY_SPACE + using namespace Kokkos::Test::Tools; listen_tool_events(Config::DisableAll(), Config::EnableKernels(), Config::EnableFences()); @@ -659,11 +656,8 @@ TEST(TEST_CATEGORY, create_mirror_no_init_dynamicview) { } TEST(TEST_CATEGORY, create_mirror_view_and_copy_dynamicview) { -#ifdef KOKKOS_ENABLE_CUDA - if (std::is_same::value) - return; -#endif + GTEST_SKIP_IF_CUDAUVM_MEMORY_SPACE + using namespace Kokkos::Test::Tools; listen_tool_events(Config::DisableAll(), Config::EnableKernels(), Config::EnableFences()); diff --git a/core/src/Cuda/Kokkos_CudaSpace.cpp b/core/src/Cuda/Kokkos_CudaSpace.cpp index 5fb4f86414..45c5ddaf2a 100644 --- a/core/src/Cuda/Kokkos_CudaSpace.cpp +++ b/core/src/Cuda/Kokkos_CudaSpace.cpp @@ -172,10 +172,11 @@ void *impl_allocate_common(const Cuda &exec_space, const char *arg_label, if (exec_space_provided) { cudaStream_t stream = exec_space.cuda_stream(); error_code = cudaMallocAsync(&ptr, arg_alloc_size, stream); - KOKKOS_IMPL_CUDA_SAFE_CALL(cudaStreamSynchronize(stream)); + exec_space.fence("Kokkos::Cuda: backend fence after async malloc"); } else { error_code = cudaMallocAsync(&ptr, arg_alloc_size, 0); - KOKKOS_IMPL_CUDA_SAFE_CALL(cudaDeviceSynchronize()); + Impl::cuda_device_synchronize( + "Kokkos::Cuda: backend fence after async malloc"); } } else { error_code = cudaMalloc(&ptr, arg_alloc_size); @@ -324,9 +325,11 @@ void CudaSpace::impl_deallocate( #error CUDART_VERSION undefined! 
#elif (defined(KOKKOS_ENABLE_IMPL_CUDA_MALLOC_ASYNC) && CUDART_VERSION >= 11020) if (arg_alloc_size >= memory_threshold_g) { - KOKKOS_IMPL_CUDA_SAFE_CALL(cudaDeviceSynchronize()); + Impl::cuda_device_synchronize( + "Kokkos::Cuda: backend fence before async free"); KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFreeAsync(arg_alloc_ptr, 0)); - KOKKOS_IMPL_CUDA_SAFE_CALL(cudaDeviceSynchronize()); + Impl::cuda_device_synchronize( + "Kokkos::Cuda: backend fence after async free"); } else { KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFree(arg_alloc_ptr)); } diff --git a/core/src/Kokkos_View.hpp b/core/src/Kokkos_View.hpp index 098991bf0c..bcbb28014c 100644 --- a/core/src/Kokkos_View.hpp +++ b/core/src/Kokkos_View.hpp @@ -1418,34 +1418,9 @@ class View : public ViewTraits { std::is_same::value, i0, i1, i2, i3, i4, i5, i6, i7, alloc_name); -//------------------------------------------------------------ -#if defined(KOKKOS_ENABLE_CUDA) - // If allocating in CudaUVMSpace must fence before and after - // the allocation to protect against possible concurrent access - // on the CPU and the GPU. - // Fence using the trait's execution space (which will be Kokkos::Cuda) - // to avoid incomplete type errors from using Kokkos::Cuda directly. 
- if (std::is_same::value) { - typename traits::device_type::memory_space::execution_space().fence( - "Kokkos::View<...>::View: fence before allocating UVM"); - } -#endif - //------------------------------------------------------------ - Kokkos::Impl::SharedAllocationRecord<>* record = m_map.allocate_shared( prop_copy, arg_layout, Impl::ViewCtorProp::has_execution_space); -//------------------------------------------------------------ -#if defined(KOKKOS_ENABLE_CUDA) - if (std::is_same::value) { - typename traits::device_type::memory_space::execution_space().fence( - "Kokkos::View<...>::View: fence after allocating UVM"); - } -#endif - //------------------------------------------------------------ - // Setup and initialization complete, start tracking m_track.m_tracker.assign_allocated_record_to_uninitialized(record); } From 7533cb407ac8fbd4890b7b6bc56f62c454657e4d Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Thu, 4 May 2023 08:32:56 -0400 Subject: [PATCH 414/496] Disable tests that fail at runtime with NVHPC (likely not liking the class declaration within the body of the functor) --- core/unit_test/TestBitManipulationBuiltins.hpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/core/unit_test/TestBitManipulationBuiltins.hpp b/core/unit_test/TestBitManipulationBuiltins.hpp index 935bb93ff6..90aa4073a7 100644 --- a/core/unit_test/TestBitManipulationBuiltins.hpp +++ b/core/unit_test/TestBitManipulationBuiltins.hpp @@ -821,6 +821,12 @@ struct TestBitCastFunction { } } +#if defined(KOKKOS_ENABLE_CUDA) && \ + defined(KOKKOS_COMPILER_NVHPC) // FIXME_NVHPC + if constexpr (std::is_same_v) { + return; + } +#endif struct S { int i; From 26ae798fda2cf83c299540e5051940c537cb787d Mon Sep 17 00:00:00 2001 From: Francesco Rizzi Date: Thu, 4 May 2023 18:57:30 +0200 Subject: [PATCH 415/496] change impl of `is_sorted_until` to use reduce (#6097) * change impl of is_sorted_until to use reduce * address comments --- .../impl/Kokkos_IsSortedUntil.hpp | 84 ++++++++----------- 
1 file changed, 36 insertions(+), 48 deletions(-) diff --git a/algorithms/src/std_algorithms/impl/Kokkos_IsSortedUntil.hpp b/algorithms/src/std_algorithms/impl/Kokkos_IsSortedUntil.hpp index fe52e18a33..2a0c112bf5 100644 --- a/algorithms/src/std_algorithms/impl/Kokkos_IsSortedUntil.hpp +++ b/algorithms/src/std_algorithms/impl/Kokkos_IsSortedUntil.hpp @@ -28,33 +28,30 @@ namespace Kokkos { namespace Experimental { namespace Impl { -template +template struct StdIsSortedUntilFunctor { using index_type = typename IteratorType::difference_type; + using value_type = typename ReducerType::value_type; + IteratorType m_first; - IndicatorViewType m_indicator; ComparatorType m_comparator; + ReducerType m_reducer; KOKKOS_FUNCTION - void operator()(const index_type i, int& update, const bool final) const { + void operator()(const index_type i, value_type& reduction_result) const { const auto& val_i = m_first[i]; const auto& val_ip1 = m_first[i + 1]; - if (m_comparator(val_ip1, val_i)) { - ++update; - } - - if (final) { - m_indicator(i) = update; + m_reducer.join(reduction_result, i); } } KOKKOS_FUNCTION - StdIsSortedUntilFunctor(IteratorType _first1, IndicatorViewType indicator, - ComparatorType comparator) - : m_first(std::move(_first1)), - m_indicator(std::move(indicator)), - m_comparator(std::move(comparator)) {} + StdIsSortedUntilFunctor(IteratorType first, ComparatorType comparator, + ReducerType reducer) + : m_first(std::move(first)), + m_comparator(std::move(comparator)), + m_reducer(std::move(reducer)) {} }; template @@ -73,40 +70,31 @@ IteratorType is_sorted_until_impl(const std::string& label, } /* - use scan and a helper "indicator" view - such that we scan the data and fill the indicator with - partial sum that is always 0 unless we find a pair that - breaks the sorting, so in that case the indicator will - have a 1 starting at the location where the sorting breaks. - So finding that 1 means finding the location we want. 
- */ - - // aliases - using indicator_value_type = std::size_t; - using indicator_view_type = - ::Kokkos::View; - using functor_type = - StdIsSortedUntilFunctor; - - // do scan - // use num_elements-1 because each index handles i and i+1 - const auto num_elements_minus_one = num_elements - 1; - indicator_view_type indicator("is_sorted_until_indicator_helper", - num_elements_minus_one); - ::Kokkos::parallel_scan( - label, RangePolicy(ex, 0, num_elements_minus_one), - functor_type(first, indicator, std::move(comp))); - - // try to find the first sentinel value, which indicates - // where the sorting condition breaks - namespace KE = ::Kokkos::Experimental; - constexpr indicator_value_type sentinel_value = 1; - auto r = - KE::find(ex, KE::cbegin(indicator), KE::cend(indicator), sentinel_value); - const auto shift = r - ::Kokkos::Experimental::cbegin(indicator); - - return first + (shift + 1); + Do a par_reduce computing the *min* index that breaks the sorting. + If such an index is found, then the range is sorted until that element. + If no such index is found, then the range is sorted until the end. 
+ */ + using index_type = typename IteratorType::difference_type; + index_type reduction_result; + ::Kokkos::Min reducer(reduction_result); + ::Kokkos::parallel_reduce( + label, + // use num_elements-1 because each index handles i and i+1 + RangePolicy(ex, 0, num_elements - 1), + // use CTAD + StdIsSortedUntilFunctor(first, comp, reducer), reducer); + + /* If the reduction result is equal to the initial value, + it means the range is sorted until the end */ + index_type reduction_result_init; + reducer.init(reduction_result_init); + if (reduction_result == reduction_result_init) { + return last; + } else { + /* If such an index is found, then the range is sorted until there and + we need to return an iterator past the element found so do +1 */ + return first + (reduction_result + 1); + } } template From 8dc8f49730c6019c0fc32d583f40ed1885509696 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Thu, 4 May 2023 18:50:25 -0400 Subject: [PATCH 416/496] Fix typo and remove accidentally committed assertions Co-authored-by: Daniel Arndt --- core/unit_test/TestBitManipulationBuiltins.hpp | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/core/unit_test/TestBitManipulationBuiltins.hpp b/core/unit_test/TestBitManipulationBuiltins.hpp index 90aa4073a7..9adf22774a 100644 --- a/core/unit_test/TestBitManipulationBuiltins.hpp +++ b/core/unit_test/TestBitManipulationBuiltins.hpp @@ -766,7 +766,7 @@ TEST(TEST_CATEGORY, bit_manip_byeswap) { test_bit_manip_byteswap(); } -// CUDA doesn't provide memcpy +// CUDA doesn't provide memcmp KOKKOS_FUNCTION int my_memcmp(void const* lhs, void const* rhs, size_t count) { auto u1 = static_cast(lhs); auto u2 = static_cast(rhs); @@ -852,9 +852,5 @@ struct TestBitCastFunction { }; TEST(TEST_CATEGORY, bit_manip_bit_cast) { - using Kokkos::bit_cast; - ASSERT_EQ(bit_cast(123), 123); - ASSERT_EQ(bit_cast(123u), 123); - ASSERT_EQ(bit_cast(~0u), ~0); TestBitCastFunction(); } From e247508644d8e1262ffb92289c9bc474342ea37a Mon Sep 17 
00:00:00 2001 From: Dong Hun Lee <59181952+ldh4@users.noreply.github.com> Date: Fri, 5 May 2023 07:08:01 -0600 Subject: [PATCH 417/496] Added multiple reducers support for team-level parallel reduce (#5727) * Added interfaces and unit-tests for combined reducers supports for TeamThreadRange, ThreadVectorRange, TeamVectorRange * fixed warnings from unit tests * Fixed errors from CI tests * Added n=0 test cases * fixed warnings from unit tests * Removed team combined reducers unit-test file from OpenACC and OpenMPTarget tests list * quick syntax fix * Adjusted unit test skip conditions for openmptarget and openacc * Put in a macro guard to check if KOKKOS_ENABLE_CUDA_LAMBDA is defined * Addressing comments from reviews * Update core/src/impl/Kokkos_Combined_Reducer.hpp Co-authored-by: Daniel Arndt * Update core/unit_test/TestTeamCombinedReducers.hpp Co-authored-by: Damien L-G * Converted write_one_value_back_on_device to a static function * Clang-formatted * git rebasing * Removed unnecessary fences from parallel_reduce_impl * Adjusted unit tests based on feedbacks * Adjusted expect_eq values in the unit tests * Removed a few ternary conditions --------- Co-authored-by: Daniel Arndt Co-authored-by: Damien L-G --- core/src/impl/Kokkos_Combined_Reducer.hpp | 142 +++--- core/unit_test/CMakeLists.txt | 3 + core/unit_test/TestTeamCombinedReducers.hpp | 515 ++++++++++++++++++++ 3 files changed, 596 insertions(+), 64 deletions(-) create mode 100644 core/unit_test/TestTeamCombinedReducers.hpp diff --git a/core/src/impl/Kokkos_Combined_Reducer.hpp b/core/src/impl/Kokkos_Combined_Reducer.hpp index d7304779c7..d1c443ab08 100644 --- a/core/src/impl/Kokkos_Combined_Reducer.hpp +++ b/core/src/impl/Kokkos_Combined_Reducer.hpp @@ -210,6 +210,8 @@ struct CombinedReducerImpl, Space, ...); } + KOKKOS_FUNCTION auto& reference() const { return *m_value_view.data(); } + // TODO figure out if we also need to call through to final KOKKOS_FUNCTION @@ -247,6 +249,22 @@ struct 
CombinedReducerImpl, Space, ...); } + + template + KOKKOS_FUNCTION static void write_one_value_back_on_device( + View const& inputView, typename View::const_value_type& value) noexcept { + *inputView.data() = value; + } + + template + KOKKOS_FUNCTION void write_value_back_to_original_references_on_device( + value_type const& value, + CombinedReducers const&... reducers_that_reference_original_values) noexcept { + (write_one_value_back_on_device( + reducers_that_reference_original_values.view(), + value.template get()), + ...); + } }; // Apparently this can't be an alias template because of a bug/unimplemented @@ -466,9 +484,9 @@ KOKKOS_INLINE_FUNCTION constexpr auto make_combined_reducer( //---------------------------------------- } -template +template KOKKOS_INLINE_FUNCTION constexpr auto make_wrapped_combined_functor( - Functor const& functor, Space, ReferencesOrViewsOrReducers&&...) { + Functor const& functor, ReferencesOrViewsOrReducers&&...) { //---------------------------------------- return CombinedReductionFunctorWrapper< Functor, Space, @@ -478,6 +496,32 @@ KOKKOS_INLINE_FUNCTION constexpr auto make_wrapped_combined_functor( template using functor_has_value_t = typename FunctorType::value_type; + +template +KOKKOS_INLINE_FUNCTION void parallel_reduce_combined_reducers_impl( + BoundaryStructType const& boundaries, Functor const& functor, + ReturnType1&& returnType1, ReturnType2&& returnType2, + ReturnTypes&&... 
returnTypes) noexcept { + using mem_space_type = typename MemberType::execution_space::memory_space; + + auto combined_value = Impl::make_combined_reducer_value( + returnType1, returnType2, returnTypes...); + + auto combined_functor = Impl::make_wrapped_combined_functor( + functor, returnType1, returnType2, returnTypes...); + + auto combined_reducer = Impl::make_combined_reducer( + combined_value, returnType1, returnType2, returnTypes...); + + parallel_reduce(boundaries, combined_functor, combined_reducer); + + combined_reducer.write_value_back_to_original_references_on_device( + combined_value, Impl::_make_reducer_from_arg(returnType1), + Impl::_make_reducer_from_arg(returnType2), + Impl::_make_reducer_from_arg(returnTypes)...); +} + } // end namespace Impl //============================================================================== @@ -509,8 +553,8 @@ auto parallel_reduce(std::string const& label, PolicyType const& policy, auto combined_reducer = Impl::make_combined_reducer( value, returnType1, returnType2, returnTypes...); - auto combined_functor = Impl::make_wrapped_combined_functor( - functor, space_type{}, returnType1, returnType2, returnTypes...); + auto combined_functor = Impl::make_wrapped_combined_functor( + functor, returnType1, returnType2, returnTypes...); using combined_functor_type = decltype(combined_functor); static_assert( @@ -577,66 +621,36 @@ void parallel_reduce(size_t n, Functor const& functor, //------------------------------------------------------------------------------ // {{{2 -// Copied three times because that's the best way we have right now to match -// Impl::TeamThreadRangeBoundariesStruct, -// Impl::ThreadVectorRangeBoundariesStruct, and -// Impl::TeamVectorRangeBoundariesStruct. 
-// TODO make these work after restructuring - -// template -// KOKKOS_INLINE_FUNCTION void parallel_reduce( -// std::string const& label, -// Impl::TeamThreadRangeBoundariesStruct const& -// boundaries, Functor const& functor, ReturnType1&& returnType1, -// ReturnType2&& returnType2, ReturnTypes&&... returnTypes) noexcept { -// const auto combined_reducer = -// Impl::make_combined_reducer( -// returnType1, returnType2, returnTypes...); -// -// auto combined_functor = Impl::make_wrapped_combined_functor( -// functor, Kokkos::AnonymousSpace{}, returnType1, returnType2, -// returnTypes...); -// -// parallel_reduce(label, boundaries, combined_functor, combined_reducer); -//} -// -// template -// KOKKOS_INLINE_FUNCTION void parallel_reduce( -// std::string const& label, -// Impl::ThreadVectorRangeBoundariesStruct const& -// boundaries, -// Functor const& functor, ReturnType1&& returnType1, -// ReturnType2&& returnType2, ReturnTypes&&... returnTypes) noexcept { -// const auto combined_reducer = -// Impl::make_combined_reducer( -// returnType1, returnType2, returnTypes...); -// -// auto combined_functor = Impl::make_wrapped_combined_functor( -// functor, Kokkos::AnonymousSpace{}, returnType1, returnType2, -// returnTypes...); -// -// parallel_reduce(label, boundaries, combined_functor, combined_reducer); -//} - -// template -// KOKKOS_INLINE_FUNCTION void parallel_reduce( -// std::string const& label, -// Impl::TeamVectorRangeBoundariesStruct const& -// boundaries, Functor const& functor, ReturnType1&& returnType1, -// ReturnType2&& returnType2, ReturnTypes&&... 
returnTypes) noexcept { -// const auto combined_reducer = -// Impl::make_combined_reducer( -// returnType1, returnType2, returnTypes...); -// -// auto combined_functor = Impl::make_wrapped_combined_functor( -// functor, Kokkos::AnonymousSpace{}, returnType1, returnType2, -// returnTypes...); -// -// parallel_reduce(label, boundaries, combined_functor, combined_reducer); -//} +template +KOKKOS_INLINE_FUNCTION void parallel_reduce( + Impl::TeamThreadRangeBoundariesStruct const& boundaries, + Functor const& functor, ReturnType1&& returnType1, + ReturnType2&& returnType2, ReturnTypes&&... returnTypes) noexcept { + Impl::parallel_reduce_combined_reducers_impl( + boundaries, functor, returnType1, returnType2, returnTypes...); +} + +template +KOKKOS_INLINE_FUNCTION void parallel_reduce( + Impl::ThreadVectorRangeBoundariesStruct const& + boundaries, + Functor const& functor, ReturnType1&& returnType1, + ReturnType2&& returnType2, ReturnTypes&&... returnTypes) noexcept { + Impl::parallel_reduce_combined_reducers_impl( + boundaries, functor, returnType1, returnType2, returnTypes...); +} + +template +KOKKOS_INLINE_FUNCTION void parallel_reduce( + Impl::TeamVectorRangeBoundariesStruct const& boundaries, + Functor const& functor, ReturnType1&& returnType1, + ReturnType2&& returnType2, ReturnTypes&&... 
returnTypes) noexcept { + Impl::parallel_reduce_combined_reducers_impl( + boundaries, functor, returnType1, returnType2, returnTypes...); +} // end Team overloads }}}2 //------------------------------------------------------------------------------ diff --git a/core/unit_test/CMakeLists.txt b/core/unit_test/CMakeLists.txt index 9fbe7e92e0..94a3ac285f 100644 --- a/core/unit_test/CMakeLists.txt +++ b/core/unit_test/CMakeLists.txt @@ -203,6 +203,7 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;OpenMPTarget;OpenACC;HIP;SYCL) SET(${Tag}_SOURCES2A) foreach(Name TeamBasic + TeamCombinedReducers TeamMDRange TeamPolicyConstructors TeamReductionScan @@ -356,6 +357,7 @@ endforeach() if(Kokkos_ENABLE_OPENMPTARGET) list(REMOVE_ITEM OpenMPTarget_SOURCES ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_Other.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_TeamCombinedReducers.cpp ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_TeamReductionScan.cpp ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_WorkGraph.cpp ) @@ -373,6 +375,7 @@ if(Kokkos_ENABLE_OPENACC) ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_MDRangePolicyConstructors.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_Other.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_RangePolicyConstructors.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_TeamCombinedReducers.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_TeamMDRange.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_TeamPolicyConstructors.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_TeamReductionScan.cpp diff --git a/core/unit_test/TestTeamCombinedReducers.hpp b/core/unit_test/TestTeamCombinedReducers.hpp new file mode 100644 index 0000000000..47c2f666c9 --- /dev/null +++ b/core/unit_test/TestTeamCombinedReducers.hpp @@ -0,0 +1,515 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 
4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include +#include + +namespace { + +// Extended lambdas in parallel_for and parallel_reduce will not compile if +// KOKKOS_ENABLE_CUDA_LAMBDA is off +#if !defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_CUDA_LAMBDA) + +struct TeamTeamCombinedReducer { + public: + void test_team_thread_range_only_scalars(const int n) { + auto policy = Kokkos::TeamPolicy(1, Kokkos::AUTO); + using team_member_type = decltype(policy)::member_type; + + auto teamView = Kokkos::View("view"); + + Kokkos::parallel_for( + policy, KOKKOS_LAMBDA(team_member_type const& team) { + auto teamThreadRange = Kokkos::TeamThreadRange(team, n); + int teamResult0, teamResult1, teamResult2, teamResult3; + + Kokkos::parallel_reduce( + teamThreadRange, + [=](int const& i, int& localVal0, int& localVal1, int& localVal2, + int& localVal3) { + localVal0 += 1; + localVal1 += i + 1; + localVal2 += (i + 1) * n; + localVal3 += n; + }, + teamResult0, teamResult1, teamResult2, teamResult3); + + Kokkos::single(Kokkos::PerTeam(team), [=]() { + teamView(0) = teamResult0; + teamView(1) = teamResult1; + teamView(2) = teamResult2; + teamView(3) = teamResult3; + }); + }); + + auto hostView = Kokkos::create_mirror_view_and_copy( + Kokkos::DefaultHostExecutionSpace(), teamView); + + if (n == 0) { + EXPECT_EQ(Kokkos::reduction_identity::sum(), hostView(0)); + EXPECT_EQ(Kokkos::reduction_identity::sum(), hostView(1)); + EXPECT_EQ(Kokkos::reduction_identity::sum(), hostView(2)); + EXPECT_EQ(Kokkos::reduction_identity::sum(), hostView(3)); + } else { + EXPECT_EQ(n, hostView(0)); + EXPECT_EQ((n 
* (n + 1) / 2), hostView(1)); + EXPECT_EQ(n * n * (n + 1) / 2, hostView(2)); + EXPECT_EQ(n * n, hostView(3)); + } + } + + void test_team_thread_range_only_builtin(const int n) { + auto policy = Kokkos::TeamPolicy(1, Kokkos::AUTO); + using team_member_type = decltype(policy)::member_type; + + auto teamView = Kokkos::View("view"); + + Kokkos::parallel_for( + policy, KOKKOS_LAMBDA(team_member_type const& team) { + auto teamThreadRange = Kokkos::TeamThreadRange(team, n); + int teamResult0, teamResult1, teamResult2, teamResult3; + + Kokkos::parallel_reduce( + teamThreadRange, + [=](int const& i, int& localVal0, int& localVal1, int& localVal2, + int& localVal3) { + localVal0 += i + 1; + localVal1 *= n; + localVal2 = (localVal2 > (i + 1)) ? (i + 1) : localVal2; + localVal3 = (localVal3 < (i + 1)) ? (i + 1) : localVal3; + }, + Kokkos::Sum(teamResult0), Kokkos::Prod(teamResult1), + Kokkos::Min(teamResult2), Kokkos::Max(teamResult3)); + + Kokkos::single(Kokkos::PerTeam(team), [=]() { + teamView(0) = teamResult0; + teamView(1) = teamResult1; + teamView(2) = teamResult2; + teamView(3) = teamResult3; + }); + }); + + auto hostView = Kokkos::create_mirror_view_and_copy( + Kokkos::DefaultHostExecutionSpace(), teamView); + + if (n == 0) { + EXPECT_EQ(Kokkos::reduction_identity::sum(), hostView(0)); + EXPECT_EQ(Kokkos::reduction_identity::prod(), hostView(1)); + EXPECT_EQ(Kokkos::reduction_identity::min(), hostView(2)); + EXPECT_EQ(Kokkos::reduction_identity::max(), hostView(3)); + } else { + EXPECT_EQ((n * (n + 1) / 2), hostView(0)); + EXPECT_EQ(std::pow(n, n), hostView(1)); + EXPECT_EQ(1, hostView(2)); + EXPECT_EQ(n, hostView(3)); + } + } + + void test_team_thread_range_combined_reducers(const int n) { + auto policy = Kokkos::TeamPolicy(1, Kokkos::AUTO); + using team_member_type = decltype(policy)::member_type; + + auto teamView = Kokkos::View("view", 4); + + Kokkos::parallel_for( + policy, KOKKOS_LAMBDA(team_member_type const& team) { + auto teamThreadRange = 
Kokkos::TeamThreadRange(team, n); + int teamResult0, teamResult1, teamResult2, teamResult3; + + Kokkos::parallel_reduce( + teamThreadRange, + [=](int const& i, int& localVal0, int& localVal1, int& localVal2, + int& localVal3) { + localVal0 += i + 1; + localVal1 += i + 1; + localVal2 = (localVal2 < (i + 1)) ? (i + 1) : localVal2; + localVal3 += n; + }, + teamResult0, Kokkos::Sum(teamResult1), + Kokkos::Max(teamResult2), teamResult3); + + Kokkos::single(Kokkos::PerTeam(team), [=]() { + teamView(0) = teamResult0; + teamView(1) = teamResult1; + teamView(2) = teamResult2; + teamView(3) = teamResult3; + }); + }); + + auto hostView = Kokkos::create_mirror_view_and_copy( + Kokkos::DefaultHostExecutionSpace(), teamView); + + if (n == 0) { + EXPECT_EQ(Kokkos::reduction_identity::sum(), hostView(0)); + EXPECT_EQ(Kokkos::reduction_identity::sum(), hostView(1)); + EXPECT_EQ(Kokkos::reduction_identity::max(), hostView(2)); + EXPECT_EQ(Kokkos::reduction_identity::sum(), hostView(3)); + } else { + EXPECT_EQ((n * (n + 1) / 2), hostView(0)); + EXPECT_EQ((n * (n + 1) / 2), hostView(1)); + EXPECT_EQ(n, hostView(2)); + EXPECT_EQ(n * n, hostView(3)); + } + } + + void test_thread_vector_range_only_scalars(const int n) { + auto policy = Kokkos::TeamPolicy(1, Kokkos::AUTO); + using team_member_type = decltype(policy)::member_type; + + auto teamView = Kokkos::View("view"); + + Kokkos::parallel_for( + policy, KOKKOS_LAMBDA(team_member_type const& team) { + auto teamThreadRange = Kokkos::TeamThreadRange(team, 1); + auto threadVectorRange = Kokkos::ThreadVectorRange(team, n); + int teamResult0, teamResult1, teamResult2, teamResult3; + + Kokkos::parallel_for(teamThreadRange, [&](int const&) { + Kokkos::parallel_reduce( + threadVectorRange, + [=](int const& i, int& localVal0, int& localVal1, + int& localVal2, int& localVal3) { + localVal0 += 1; + localVal1 += i + 1; + localVal2 += (i + 1) * n; + localVal3 += n; + }, + teamResult0, teamResult1, teamResult2, teamResult3); + + teamView(0) = 
teamResult0; + teamView(1) = teamResult1; + teamView(2) = teamResult2; + teamView(3) = teamResult3; + }); + }); + + auto hostView = Kokkos::create_mirror_view_and_copy( + Kokkos::DefaultHostExecutionSpace(), teamView); + + if (n == 0) { + EXPECT_EQ(Kokkos::reduction_identity::sum(), hostView(0)); + EXPECT_EQ(Kokkos::reduction_identity::sum(), hostView(1)); + EXPECT_EQ(Kokkos::reduction_identity::sum(), hostView(2)); + EXPECT_EQ(Kokkos::reduction_identity::sum(), hostView(3)); + } else { + EXPECT_EQ(n, hostView(0)); + EXPECT_EQ((n * (n + 1) / 2), hostView(1)); + EXPECT_EQ(n * n * (n + 1) / 2, hostView(2)); + EXPECT_EQ(n * n, hostView(3)); + } + } + + void test_thread_vector_range_only_builtin(const int n) { + auto policy = Kokkos::TeamPolicy(1, Kokkos::AUTO); + using team_member_type = decltype(policy)::member_type; + + auto teamView = Kokkos::View("view"); + + Kokkos::parallel_for( + policy, KOKKOS_LAMBDA(team_member_type const& team) { + auto teamThreadRange = Kokkos::TeamThreadRange(team, 1); + auto threadVectorRange = Kokkos::ThreadVectorRange(team, n); + int teamResult0, teamResult1, teamResult2, teamResult3; + + Kokkos::parallel_for(teamThreadRange, [&](int const&) { + Kokkos::parallel_reduce( + threadVectorRange, + [=](int const& i, int& localVal0, int& localVal1, + int& localVal2, int& localVal3) { + localVal0 += i + 1; + localVal1 *= n; + localVal2 = (localVal2 > (i + 1)) ? (i + 1) : localVal2; + localVal3 = (localVal3 < (i + 1)) ? 
(i + 1) : localVal3; + }, + Kokkos::Sum(teamResult0), Kokkos::Prod(teamResult1), + Kokkos::Min(teamResult2), Kokkos::Max(teamResult3)); + + teamView(0) = teamResult0; + teamView(1) = teamResult1; + teamView(2) = teamResult2; + teamView(3) = teamResult3; + }); + }); + + auto hostView = Kokkos::create_mirror_view_and_copy( + Kokkos::DefaultHostExecutionSpace(), teamView); + + if (n == 0) { + EXPECT_EQ(Kokkos::reduction_identity::sum(), hostView(0)); + EXPECT_EQ(Kokkos::reduction_identity::prod(), hostView(1)); + EXPECT_EQ(Kokkos::reduction_identity::min(), hostView(2)); + EXPECT_EQ(Kokkos::reduction_identity::max(), hostView(3)); + } else { + EXPECT_EQ((n * (n + 1) / 2), hostView(0)); + EXPECT_EQ(std::pow(n, n), hostView(1)); + EXPECT_EQ(1, hostView(2)); + EXPECT_EQ(n, hostView(3)); + } + } + + void test_thread_vector_range_combined_reducers(const int n) { + auto policy = Kokkos::TeamPolicy(1, Kokkos::AUTO); + using team_member_type = decltype(policy)::member_type; + + auto teamView = Kokkos::View("view"); + + Kokkos::parallel_for( + policy, KOKKOS_LAMBDA(team_member_type const& team) { + auto teamThreadRange = Kokkos::TeamThreadRange(team, 1); + auto threadVectorRange = Kokkos::ThreadVectorRange(team, n); + int teamResult0, teamResult1, teamResult2, teamResult3; + + Kokkos::parallel_for(teamThreadRange, [&](int const&) { + Kokkos::parallel_reduce( + threadVectorRange, + [=](int const& i, int& localVal0, int& localVal1, + int& localVal2, int& localVal3) { + localVal0 *= n; + localVal1 += i + 1; + localVal2 = (localVal2 > (i + 1)) ? 
(i + 1) : localVal2; + localVal3 += n; + }, + Kokkos::Prod(teamResult0), teamResult1, + Kokkos::Min(teamResult2), teamResult3); + + teamView(0) = teamResult0; + teamView(1) = teamResult1; + teamView(2) = teamResult2; + teamView(3) = teamResult3; + }); + }); + + auto hostView = Kokkos::create_mirror_view_and_copy( + Kokkos::DefaultHostExecutionSpace(), teamView); + + if (n == 0) { + EXPECT_EQ(Kokkos::reduction_identity::prod(), hostView(0)); + EXPECT_EQ(Kokkos::reduction_identity::sum(), hostView(1)); + EXPECT_EQ(Kokkos::reduction_identity::min(), hostView(2)); + EXPECT_EQ(Kokkos::reduction_identity::sum(), hostView(3)); + } else { + EXPECT_EQ(std::pow(n, n), hostView(0)); + EXPECT_EQ((n * (n + 1) / 2), hostView(1)); + EXPECT_EQ(1, hostView(2)); + EXPECT_EQ(n * n, hostView(3)); + } + } + + void test_team_vector_range_only_scalars(const int n) { + auto policy = Kokkos::TeamPolicy(1, Kokkos::AUTO); + using team_member_type = decltype(policy)::member_type; + + auto teamView = Kokkos::View("view"); + + Kokkos::parallel_for( + policy, KOKKOS_LAMBDA(team_member_type const& team) { + auto teamVectorRange = Kokkos::TeamVectorRange(team, n); + int teamResult0, teamResult1, teamResult2, teamResult3; + + Kokkos::parallel_reduce( + teamVectorRange, + [=](int const& i, int& localVal0, int& localVal1, int& localVal2, + int& localVal3) { + localVal0 += 1; + localVal1 += i + 1; + localVal2 += (i + 1) * n; + localVal3 += n; + }, + teamResult0, teamResult1, teamResult2, teamResult3); + + Kokkos::single(Kokkos::PerTeam(team), [=]() { + teamView(0) = teamResult0; + teamView(1) = teamResult1; + teamView(2) = teamResult2; + teamView(3) = teamResult3; + }); + }); + + auto hostView = Kokkos::create_mirror_view_and_copy( + Kokkos::DefaultHostExecutionSpace(), teamView); + + if (n == 0) { + EXPECT_EQ(Kokkos::reduction_identity::sum(), hostView(0)); + EXPECT_EQ(Kokkos::reduction_identity::sum(), hostView(1)); + EXPECT_EQ(Kokkos::reduction_identity::sum(), hostView(2)); + 
EXPECT_EQ(Kokkos::reduction_identity::sum(), hostView(3)); + } else { + EXPECT_EQ(n, hostView(0)); + EXPECT_EQ((n * (n + 1) / 2), hostView(1)); + EXPECT_EQ(n * n * (n + 1) / 2, hostView(2)); + EXPECT_EQ(n * n, hostView(3)); + } + } + + void test_team_vector_range_only_builtin(const int n) { + auto policy = Kokkos::TeamPolicy(1, Kokkos::AUTO); + using team_member_type = decltype(policy)::member_type; + + auto teamView = Kokkos::View("view"); + + Kokkos::parallel_for( + policy, KOKKOS_LAMBDA(team_member_type const& team) { + auto teamVectorRange = Kokkos::TeamVectorRange(team, n); + int teamResult0, teamResult1, teamResult2, teamResult3; + + Kokkos::parallel_reduce( + teamVectorRange, + [=](int const& i, int& localVal0, int& localVal1, int& localVal2, + int& localVal3) { + localVal0 += i + 1; + localVal1 *= n; + localVal2 = (localVal2 > (i + 1)) ? (i + 1) : localVal2; + localVal3 = (localVal3 < (i + 1)) ? (i + 1) : localVal3; + }, + Kokkos::Sum(teamResult0), Kokkos::Prod(teamResult1), + Kokkos::Min(teamResult2), Kokkos::Max(teamResult3)); + + Kokkos::single(Kokkos::PerTeam(team), [=]() { + teamView(0) = teamResult0; + teamView(1) = teamResult1; + teamView(2) = teamResult2; + teamView(3) = teamResult3; + }); + }); + + auto hostView = Kokkos::create_mirror_view_and_copy( + Kokkos::DefaultHostExecutionSpace(), teamView); + + if (n == 0) { + EXPECT_EQ(Kokkos::reduction_identity::sum(), hostView(0)); + EXPECT_EQ(Kokkos::reduction_identity::prod(), hostView(1)); + EXPECT_EQ(Kokkos::reduction_identity::min(), hostView(2)); + EXPECT_EQ(Kokkos::reduction_identity::max(), hostView(3)); + } else { + EXPECT_EQ((n * (n + 1) / 2), hostView(0)); + EXPECT_EQ(std::pow(n, n), hostView(1)); + EXPECT_EQ(1, hostView(2)); + EXPECT_EQ(n, hostView(3)); + } + } + + void test_team_vector_range_combined_reducers(const int n) { + auto policy = Kokkos::TeamPolicy(1, Kokkos::AUTO); + using team_member_type = decltype(policy)::member_type; + + auto teamView = Kokkos::View("view"); + + 
Kokkos::parallel_for( + policy, KOKKOS_LAMBDA(team_member_type const& team) { + auto teamVectorRange = Kokkos::TeamVectorRange(team, n); + int teamResult0, teamResult1, teamResult2, teamResult3; + + Kokkos::parallel_reduce( + teamVectorRange, + [=](int const& i, int& localVal0, int& localVal1, int& localVal2, + int& localVal3) { + localVal0 += i + 1; + localVal1 += i + 1; + localVal2 = (localVal2 < (i + 1)) ? (i + 1) : localVal2; + localVal3 += n; + }, + teamResult0, Kokkos::Sum(teamResult1), + Kokkos::Max(teamResult2), teamResult3); + + Kokkos::single(Kokkos::PerTeam(team), [=]() { + teamView(0) = teamResult0; + teamView(1) = teamResult1; + teamView(2) = teamResult2; + teamView(3) = teamResult3; + }); + }); + + auto hostView = Kokkos::create_mirror_view_and_copy( + Kokkos::DefaultHostExecutionSpace(), teamView); + + if (n == 0) { + EXPECT_EQ(Kokkos::reduction_identity::sum(), hostView(0)); + EXPECT_EQ(Kokkos::reduction_identity::sum(), hostView(1)); + EXPECT_EQ(Kokkos::reduction_identity::max(), hostView(2)); + EXPECT_EQ(Kokkos::reduction_identity::sum(), hostView(3)); + } else { + EXPECT_EQ((n * (n + 1) / 2), hostView(0)); + EXPECT_EQ((n * (n + 1) / 2), hostView(1)); + EXPECT_EQ(n, hostView(2)); + EXPECT_EQ(n * n, hostView(3)); + } + } +}; + +TEST(TEST_CATEGORY, team_thread_range_combined_reducers) { +#if defined(KOKKOS_ENABLE_OPENMPTARGET) + if constexpr (std::is_same_v) + GTEST_SKIP() << "team_reduce with a generic reducer is not implemented for " + << TEST_EXECSPACE::name(); + +#elif defined(KOKKOS_ENABLE_OPENACC) + if constexpr (std::is_same_v) + GTEST_SKIP() << "team_reduce with a generic reducer is not implemented for " + << TEST_EXECSPACE::name(); +#endif + + TeamTeamCombinedReducer tester; + tester.test_team_thread_range_only_scalars(5); + tester.test_team_thread_range_only_builtin(7); + tester.test_team_thread_range_combined_reducers(0); + tester.test_team_thread_range_combined_reducers(9); +} + +TEST(TEST_CATEGORY, thread_vector_range_combined_reducers) 
{ +#if defined(KOKKOS_ENABLE_OPENMPTARGET) + if constexpr (std::is_same_v) + GTEST_SKIP() << "team_reduce with a generic reducer is not implemented for " + << TEST_EXECSPACE::name(); + +#elif defined(KOKKOS_ENABLE_OPENACC) + if constexpr (std::is_same_v) + GTEST_SKIP() << "team_reduce with a generic reducer is not implemented for " + << TEST_EXECSPACE::name(); +#endif + + TeamTeamCombinedReducer tester; + tester.test_thread_vector_range_only_scalars(5); + tester.test_thread_vector_range_only_builtin(7); + tester.test_thread_vector_range_combined_reducers(0); + tester.test_thread_vector_range_combined_reducers(9); +} + +TEST(TEST_CATEGORY, team_vector_range_combined_reducers) { +#ifdef KOKKOS_ENABLE_OPENMPTARGET // FIXME_OPENMPTARGET + if constexpr (std::is_same_v) + GTEST_SKIP() << "team_reduce with a generic reducer is not implemented for " + << TEST_EXECSPACE::name(); +#endif + +#ifdef KOKKOS_ENABLE_OPENACC // FIXME_OPENACC + if constexpr (std::is_same_v) + GTEST_SKIP() << "team_reduce with a generic reducer is not implemented for " + << TEST_EXECSPACE::name(); +#endif + + TeamTeamCombinedReducer tester; + tester.test_team_vector_range_only_scalars(5); + tester.test_team_vector_range_only_builtin(7); + tester.test_team_vector_range_combined_reducers(0); + tester.test_team_vector_range_combined_reducers(9); +} + +#endif + +} // namespace From ea134de4816d0dd0cc052325ffe327c93f755c46 Mon Sep 17 00:00:00 2001 From: Christian Trott Date: Fri, 5 May 2023 12:05:35 -0600 Subject: [PATCH 418/496] Work around NVHPC issue with enum types --- .../TestJoinBackwardCompatibility.hpp | 20 ++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/core/unit_test/TestJoinBackwardCompatibility.hpp b/core/unit_test/TestJoinBackwardCompatibility.hpp index 60da9e0713..865b71d03f 100644 --- a/core/unit_test/TestJoinBackwardCompatibility.hpp +++ b/core/unit_test/TestJoinBackwardCompatibility.hpp @@ -21,13 +21,16 @@ // unimplemented reduction features namespace { 
+// FIXME_NVHPC errors out when using enums here +// NVC++-F-0000-Internal compiler error. process_acc_put_dinit: unexpected +// datatype 5339 +#ifndef KOKKOS_COMPILER_NVHPC enum MyErrorCode { no_error = 0b000, error_operator_plus_equal = 0b001, error_operator_plus_equal_volatile = 0b010, error_join_volatile = 0b100, expected_join_volatile = 0b1000 - }; KOKKOS_FUNCTION constexpr MyErrorCode operator|(MyErrorCode lhs, @@ -36,6 +39,17 @@ KOKKOS_FUNCTION constexpr MyErrorCode operator|(MyErrorCode lhs, static_cast(rhs)); } +#else + +using MyErrorCode = unsigned; +constexpr MyErrorCode no_error = 0b000; +constexpr MyErrorCode error_operator_plus_equal = 0b001; +constexpr MyErrorCode error_operator_plus_equal_volatile = 0b010; +constexpr MyErrorCode error_join_volatile = 0b100; +constexpr MyErrorCode expected_join_volatile = 0b1000; + +#endif + static_assert((no_error | error_operator_plus_equal_volatile) == error_operator_plus_equal_volatile, ""); @@ -130,10 +144,6 @@ void test_join_backward_compatibility() { } TEST(TEST_CATEGORY, join_backward_compatibility) { -#if defined(KOKKOS_ENABLE_CUDA) && \ - defined(KOKKOS_COMPILER_NVHPC) // FIXME_NVHPC - GTEST_SKIP() << "FIXME wrong result"; -#endif test_join_backward_compatibility(); } From 60b982ad090d7ef6686c61181db0cae6983ca95b Mon Sep 17 00:00:00 2001 From: Christian Trott Date: Fri, 5 May 2023 12:06:29 -0600 Subject: [PATCH 419/496] Work around NVHPC 23.x issues Works around long double usage in device code being treated stricter, and now needing a workaround similar to NVCC for implementation of traits and constants. 
--- core/unit_test/TestMathematicalConstants.hpp | 2 +- core/unit_test/TestNumericTraits.hpp | 10 +++++++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/core/unit_test/TestMathematicalConstants.hpp b/core/unit_test/TestMathematicalConstants.hpp index f52bfeaff7..b9b350c9b7 100644 --- a/core/unit_test/TestMathematicalConstants.hpp +++ b/core/unit_test/TestMathematicalConstants.hpp @@ -63,7 +63,7 @@ struct TestMathematicalConstants { KOKKOS_FUNCTION void use_on_device() const { #if defined(KOKKOS_COMPILER_NVCC) || defined(KOKKOS_ENABLE_OPENMPTARGET) || \ - defined(KOKKOS_ENABLE_OPENACC) + defined(KOKKOS_ENABLE_OPENACC) || defined(KOKKOS_COMPILER_NVHPC) take_by_value(Trait::value); #else (void)take_address_of(Trait::value); diff --git a/core/unit_test/TestNumericTraits.hpp b/core/unit_test/TestNumericTraits.hpp index 94e67f73da..fc6c6bba6c 100644 --- a/core/unit_test/TestNumericTraits.hpp +++ b/core/unit_test/TestNumericTraits.hpp @@ -40,7 +40,14 @@ struct extrema { DEFINE_EXTREMA(float, -FLT_MAX, FLT_MAX); DEFINE_EXTREMA(double, -DBL_MAX, DBL_MAX); + +// FIXME_NVHPC: with 23.3 using long double in KOKKOS_FUNCTION is hard error +#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_COMPILER_NVHPC) DEFINE_EXTREMA(long double, -LDBL_MAX, LDBL_MAX); +#else + static long double min(long double) { return -LDBL_MAX; } + static long double max(long double) { return LDBL_MAX; } +#endif #undef DEFINE_EXTREMA }; @@ -163,7 +170,8 @@ struct TestNumericTraits { } KOKKOS_FUNCTION void use_on_device() const { -#if defined(KOKKOS_COMPILER_NVCC) || defined(KOKKOS_ENABLE_OPENMPTARGET) +#if defined(KOKKOS_COMPILER_NVCC) || defined(KOKKOS_COMPILER_NVHPC) || \ + defined(KOKKOS_ENABLE_OPENMPTARGET) || defined(KOKKOS_ENABLE_OPENACC) take_by_value(trait::value); #else (void)take_address_of(trait::value); From 5fa72b590e4ebdf3b93898934c465c8d2f972ef6 Mon Sep 17 00:00:00 2001 From: "Roscoe A. 
Bartlett" Date: Fri, 5 May 2023 16:59:09 -0400 Subject: [PATCH 420/496] Kokkos: Remove TriBITS Kokkos subpackages (trilinos/Trilinos#11545) (#6104) * Kokkos: Remove TriBITS subpackages (#11545) * Removed the listing of subpackages from kokkos/cmake/Dependencies.cmake * Remove the now-unused files kokkos/[core,containers,algorithms,simd]/cmake/Dependencies.cmake * Removed TriBITS macros for a package with subpackages and replace with those for a package with no subpackages. Also, removed all subpackage macros. * Changed kokkos_process_subpackage() to just call add_subdirectory(). * Added prefix 'Core' to several tests in kokkos/Core/unit_tests/CMakeLists.txt now that prefix is 'Kokkos_' * Added prefix 'Containers' to several tests in kokkos/containers/unit_tests/CMakeLists.txt and kokkos/containers/performance_tests/CMakeLists.txt now that prefix is 'Kokkos_' * Change name of the kokkos/containers/performance_tests/CMakeLists.txt file test 'PerformanceTest_XXX' to 'ContainersPerformanceTest_XXX'. * Added prefix 'Algorithms' to several tests in kokkos/algorithms/unit_tests/CMakeLists.txt now that prefix is 'Kokkos_' * Removed the usage of tribits_configure_file() and wrapper kokkos_configure_file() and just call configure_file(). The location of PACKAGE_SORUCE_DIR changed so the calls to tribits_configure_file() no longer worked. (Also, these X_config.h.in files were not using any of the TriBITS-supported features that needed the calling of tribits_configure_file() so there was no reason to not just call raw configure_file().) SQUASH AGINST: Kokkos: Remove TriBITS subpackages (#11545) * Fix native build of Kokkos after removing subpackages (trilinos/Trilinos#11545) This restores the building of the raw CMake build of Kokkos after the refactoring to remove TriBITS subpackages. 
* Kokkos: Remove last of subpackage stuff, fix for tests enable (trilinos/Trilinos#11545) This gives a full passing build and tests with the Trilinos PR GenConfig clang-11.0.1 build configuration. * Fixup update target name in python test script that gets configured --------- Co-authored-by: Damien L-G --- CMakeLists.txt | 11 +- algorithms/CMakeLists.txt | 8 -- algorithms/cmake/Dependencies.cmake | 5 - algorithms/src/CMakeLists.txt | 7 +- algorithms/unit_tests/CMakeLists.txt | 6 +- cmake/Dependencies.cmake | 12 +- cmake/fake_tribits.cmake | 13 --- cmake/kokkos_tribits.cmake | 51 +-------- containers/CMakeLists.txt | 6 - containers/cmake/Dependencies.cmake | 5 - containers/performance_tests/CMakeLists.txt | 2 +- containers/src/CMakeLists.txt | 5 +- containers/unit_tests/CMakeLists.txt | 4 +- core/CMakeLists.txt | 6 - core/cmake/Dependencies.cmake | 6 - core/unit_test/CMakeLists.txt | 120 ++++++++++---------- core/unit_test/TestDeviceAndThreads.py | 4 +- example/CMakeLists.txt | 8 -- simd/CMakeLists.txt | 5 - simd/cmake/Dependencies.cmake | 5 - 20 files changed, 88 insertions(+), 201 deletions(-) delete mode 100644 algorithms/cmake/Dependencies.cmake delete mode 100644 containers/cmake/Dependencies.cmake delete mode 100644 core/cmake/Dependencies.cmake delete mode 100644 simd/cmake/Dependencies.cmake diff --git a/CMakeLists.txt b/CMakeLists.txt index e1b77a2c09..01ba9f8a18 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -5,8 +5,8 @@ if( "${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_BINARY_DIR}" ) message( FATAL_ERROR "FATAL: In-source builds are not allowed. You should create a separate directory for build files and delete CMakeCache.txt." 
) endif() -if (COMMAND TRIBITS_PACKAGE_DECL) - TRIBITS_PACKAGE_DECL(Kokkos) +if (COMMAND TRIBITS_PACKAGE) + TRIBITS_PACKAGE(Kokkos) endif() # We want to determine if options are given with the wrong case @@ -37,6 +37,8 @@ IF(COMMAND TRIBITS_PACKAGE_DECL) SET(KOKKOS_HAS_TRILINOS ON) ELSE() SET(KOKKOS_HAS_TRILINOS OFF) + SET(PACKAGE_NAME Kokkos) + SET(PACKAGE_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}") ENDIF() # Is this build a subdirectory of another project GET_DIRECTORY_PROPERTY(HAS_PARENT PARENT_DIRECTORY) @@ -299,10 +301,6 @@ IF (KOKKOS_HAS_TRILINOS) $<$:${KOKKOS_ALL_COMPILE_OPTIONS}>) ENDIF() -if (NOT COMMAND TRIBITS_PACKAGE_DECL) - KOKKOS_PACKAGE_DECL() -endif() - #------------------------------------------------------------------------------ # @@ -316,7 +314,6 @@ KOKKOS_PROCESS_SUBPACKAGES() # E) If Kokkos itself is enabled, process the Kokkos package # -KOKKOS_PACKAGE_DEF() KOKKOS_EXCLUDE_AUTOTOOLS_FILES() KOKKOS_PACKAGE_POSTPROCESS() KOKKOS_CONFIGURE_CORE() diff --git a/algorithms/CMakeLists.txt b/algorithms/CMakeLists.txt index f32363dc9a..ab557ab66a 100644 --- a/algorithms/CMakeLists.txt +++ b/algorithms/CMakeLists.txt @@ -1,7 +1,3 @@ - - -KOKKOS_SUBPACKAGE(Algorithms) - IF (NOT Kokkos_INSTALL_TESTING) ADD_SUBDIRECTORY(src) ENDIF() @@ -9,7 +5,3 @@ ENDIF() IF(NOT ((KOKKOS_ENABLE_OPENMPTARGET OR KOKKOS_ENABLE_OPENACC) AND KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC)) KOKKOS_ADD_TEST_DIRECTORIES(unit_tests) ENDIF() - -KOKKOS_SUBPACKAGE_POSTPROCESS() - - diff --git a/algorithms/cmake/Dependencies.cmake b/algorithms/cmake/Dependencies.cmake deleted file mode 100644 index c36b62523f..0000000000 --- a/algorithms/cmake/Dependencies.cmake +++ /dev/null @@ -1,5 +0,0 @@ -TRIBITS_PACKAGE_DEFINE_DEPENDENCIES( - LIB_REQUIRED_PACKAGES KokkosCore KokkosContainers - LIB_OPTIONAL_TPLS Pthread CUDA HWLOC - TEST_OPTIONAL_TPLS CUSPARSE - ) diff --git a/algorithms/src/CMakeLists.txt b/algorithms/src/CMakeLists.txt index 606d83d18b..8f3066b6de 100644 --- a/algorithms/src/CMakeLists.txt 
+++ b/algorithms/src/CMakeLists.txt @@ -1,5 +1,6 @@ - -KOKKOS_CONFIGURE_FILE(${PACKAGE_NAME}_config.h) +CONFIGURE_FILE( + ${PACKAGE_SOURCE_DIR}/algorithms/cmake/KokkosAlgorithms_config.h.in + ${CMAKE_CURRENT_BINARY_DIR}/KokkosAlgorithms_config.h ) #I have to leave these here for tribits KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) @@ -9,7 +10,7 @@ KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) FILE(GLOB ALGO_HEADERS *.hpp) FILE(GLOB ALGO_SOURCES *.cpp) -LIST(APPEND ALGO_HEADERS ${CMAKE_CURRENT_BINARY_DIR}/${PACKAGE_NAME}_config.h) +LIST(APPEND ALGO_HEADERS ${CMAKE_CURRENT_BINARY_DIR}/KokkosAlgorithms_config.h) APPEND_GLOB(ALGO_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/std_algorithms/*.hpp) APPEND_GLOB(ALGO_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/std_algorithms/impl/*.hpp) diff --git a/algorithms/unit_tests/CMakeLists.txt b/algorithms/unit_tests/CMakeLists.txt index 0fe9c2006e..2d81e70179 100644 --- a/algorithms/unit_tests/CMakeLists.txt +++ b/algorithms/unit_tests/CMakeLists.txt @@ -164,7 +164,7 @@ endif() # when compiling for Intel's Xe-HP GPUs. if(NOT (KOKKOS_ENABLE_OPENMPTARGET AND KOKKOS_CXX_COMPILER_ID STREQUAL IntelLLVM)) KOKKOS_ADD_EXECUTABLE_AND_TEST( - UnitTest_RandomAndSort + AlgorithmsUnitTest_RandomAndSort SOURCES UnitTestMain.cpp ${SOURCES_A} @@ -173,7 +173,7 @@ endif() foreach(ID A;B;C;D;E) KOKKOS_ADD_EXECUTABLE_AND_TEST( - UnitTest_StdSet_${ID} + AlgorithmsUnitTest_StdSet_${ID} SOURCES UnitTestMain.cpp ${STDALGO_SOURCES_${ID}} @@ -184,7 +184,7 @@ endforeach() # when compiling for Intel's Xe-HP GPUs. 
if(NOT (KOKKOS_ENABLE_OPENMPTARGET AND KOKKOS_CXX_COMPILER_ID STREQUAL IntelLLVM)) KOKKOS_ADD_EXECUTABLE( - UnitTest_StdAlgoCompileOnly + AlgorithmsUnitTest_StdAlgoCompileOnly SOURCES TestStdAlgorithmsCompileOnly.cpp ) endif() diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index 23b473ce24..611c089b2e 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -1,10 +1,6 @@ TRIBITS_PACKAGE_DEFINE_DEPENDENCIES( - SUBPACKAGES_DIRS_CLASSIFICATIONS_OPTREQS - #SubPackageName Directory Class Req/Opt - # - # New Kokkos subpackages: - Core core PS REQUIRED - Containers containers PS OPTIONAL - Algorithms algorithms PS OPTIONAL - Simd simd PT OPTIONAL + LIB_OPTIONAL_TPLS Pthread CUDA HWLOC DLlib + TEST_OPTIONAL_TPLS CUSPARSE ) + +TRIBITS_TPL_TENTATIVELY_ENABLE(DLlib) diff --git a/cmake/fake_tribits.cmake b/cmake/fake_tribits.cmake index 71e85e915c..39822b8aad 100644 --- a/cmake/fake_tribits.cmake +++ b/cmake/fake_tribits.cmake @@ -57,19 +57,6 @@ MACRO(PREPEND_TARGET_SET VARNAME TARGET_NAME TYPE) ENDMACRO() endif() - -FUNCTION(KOKKOS_CONFIGURE_FILE PACKAGE_NAME_CONFIG_FILE) - if (KOKKOS_HAS_TRILINOS) - TRIBITS_CONFIGURE_FILE(${PACKAGE_NAME_CONFIG_FILE}) - else() - # Configure the file - CONFIGURE_FILE( - ${PACKAGE_SOURCE_DIR}/cmake/${PACKAGE_NAME_CONFIG_FILE}.in - ${CMAKE_CURRENT_BINARY_DIR}/${PACKAGE_NAME_CONFIG_FILE} - ) - endif() -ENDFUNCTION() - MACRO(ADD_INTERFACE_LIBRARY LIB_NAME) FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/dummy.cpp "") ADD_LIBRARY(${LIB_NAME} STATIC ${CMAKE_CURRENT_BINARY_DIR}/dummy.cpp) diff --git a/cmake/kokkos_tribits.cmake b/cmake/kokkos_tribits.cmake index 0f39551423..01ea7efd8a 100644 --- a/cmake/kokkos_tribits.cmake +++ b/cmake/kokkos_tribits.cmake @@ -44,53 +44,12 @@ IF (KOKKOS_HAS_TRILINOS) ENDIF() ENDIF() -MACRO(KOKKOS_SUBPACKAGE NAME) - if (KOKKOS_HAS_TRILINOS) - TRIBITS_SUBPACKAGE(${NAME}) - else() - SET(PACKAGE_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}) - SET(PARENT_PACKAGE_NAME ${PACKAGE_NAME}) - SET(PACKAGE_NAME 
${PACKAGE_NAME}${NAME}) - STRING(TOUPPER ${PACKAGE_NAME} PACKAGE_NAME_UC) - SET(${PACKAGE_NAME}_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}) - #ADD_INTERFACE_LIBRARY(PACKAGE_${PACKAGE_NAME}) - #GLOBAL_SET(${PACKAGE_NAME}_LIBS "") - endif() -ENDMACRO() - -MACRO(KOKKOS_SUBPACKAGE_POSTPROCESS) - if (KOKKOS_HAS_TRILINOS) - TRIBITS_SUBPACKAGE_POSTPROCESS() - endif() -ENDMACRO() - -MACRO(KOKKOS_PACKAGE_DECL) - - if (KOKKOS_HAS_TRILINOS) - TRIBITS_PACKAGE_DECL(Kokkos) - else() - SET(PACKAGE_NAME Kokkos) - SET(${PACKAGE_NAME}_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}) - STRING(TOUPPER ${PACKAGE_NAME} PACKAGE_NAME_UC) - endif() - - #SET(TRIBITS_DEPS_DIR "${CMAKE_SOURCE_DIR}/cmake/deps") - #FILE(GLOB TPLS_FILES "${TRIBITS_DEPS_DIR}/*.cmake") - #FOREACH(TPL_FILE ${TPLS_FILES}) - # TRIBITS_PROCESS_TPL_DEP_FILE(${TPL_FILE}) - #ENDFOREACH() - -ENDMACRO() - - MACRO(KOKKOS_PROCESS_SUBPACKAGES) - if (KOKKOS_HAS_TRILINOS) - TRIBITS_PROCESS_SUBPACKAGES() - else() - ADD_SUBDIRECTORY(core) - ADD_SUBDIRECTORY(containers) - ADD_SUBDIRECTORY(algorithms) - ADD_SUBDIRECTORY(simd) + ADD_SUBDIRECTORY(core) + ADD_SUBDIRECTORY(containers) + ADD_SUBDIRECTORY(algorithms) + ADD_SUBDIRECTORY(simd) + if (NOT KOKKOS_HAS_TRILINOS) ADD_SUBDIRECTORY(example) endif() ENDMACRO() diff --git a/containers/CMakeLists.txt b/containers/CMakeLists.txt index b450c27209..0857d7007b 100644 --- a/containers/CMakeLists.txt +++ b/containers/CMakeLists.txt @@ -1,7 +1,3 @@ - - -KOKKOS_SUBPACKAGE(Containers) - IF (NOT Kokkos_INSTALL_TESTING) ADD_SUBDIRECTORY(src) ENDIF() @@ -11,5 +7,3 @@ IF(NOT KOKKOS_ENABLE_OPENACC) KOKKOS_ADD_TEST_DIRECTORIES(unit_tests) KOKKOS_ADD_TEST_DIRECTORIES(performance_tests) ENDIF() - -KOKKOS_SUBPACKAGE_POSTPROCESS() diff --git a/containers/cmake/Dependencies.cmake b/containers/cmake/Dependencies.cmake deleted file mode 100644 index 1d71d8af34..0000000000 --- a/containers/cmake/Dependencies.cmake +++ /dev/null @@ -1,5 +0,0 @@ -TRIBITS_PACKAGE_DEFINE_DEPENDENCIES( - LIB_REQUIRED_PACKAGES KokkosCore - 
LIB_OPTIONAL_TPLS Pthread CUDA HWLOC - TEST_OPTIONAL_TPLS CUSPARSE - ) diff --git a/containers/performance_tests/CMakeLists.txt b/containers/performance_tests/CMakeLists.txt index 4f1eeacdad..e325e45e85 100644 --- a/containers/performance_tests/CMakeLists.txt +++ b/containers/performance_tests/CMakeLists.txt @@ -16,7 +16,7 @@ foreach(Tag Threads;OpenMP;Cuda;HPX;HIP) ) KOKKOS_ADD_EXECUTABLE_AND_TEST( - PerformanceTest_${Tag} + ContainersPerformanceTest_${Tag} SOURCES ${SOURCES} ) endif() diff --git a/containers/src/CMakeLists.txt b/containers/src/CMakeLists.txt index cdbc6527fd..3e18a6d129 100644 --- a/containers/src/CMakeLists.txt +++ b/containers/src/CMakeLists.txt @@ -1,5 +1,6 @@ - -KOKKOS_CONFIGURE_FILE(${PACKAGE_NAME}_config.h) +CONFIGURE_FILE( + ${PACKAGE_SOURCE_DIR}/containers/cmake/KokkosContainers_config.h.in + ${CMAKE_CURRENT_BINARY_DIR}/KokkosContainers_config.h ) #need these here for now KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) diff --git a/containers/unit_tests/CMakeLists.txt b/containers/unit_tests/CMakeLists.txt index 261d9dcd42..d9f4d0d384 100644 --- a/containers/unit_tests/CMakeLists.txt +++ b/containers/unit_tests/CMakeLists.txt @@ -43,7 +43,7 @@ foreach(Tag Threads;Serial;OpenMP;HPX;Cuda;HIP;SYCL) LIST(REMOVE_ITEM UnitTestSources ${dir}/TestCuda_DynViewAPI_generic.cpp) endif() - KOKKOS_ADD_EXECUTABLE_AND_TEST(UnitTest_${Tag} SOURCES ${UnitTestSources}) + KOKKOS_ADD_EXECUTABLE_AND_TEST(ContainersUnitTest_${Tag} SOURCES ${UnitTestSources}) endif() endforeach() @@ -51,7 +51,7 @@ SET(COMPILE_ONLY_SOURCES TestCreateMirror.cpp ) KOKKOS_ADD_EXECUTABLE( - TestCompileOnly + ContainersTestCompileOnly SOURCES TestCompileMain.cpp ${COMPILE_ONLY_SOURCES} diff --git a/core/CMakeLists.txt b/core/CMakeLists.txt index b78eb05e26..0917928001 100644 --- a/core/CMakeLists.txt +++ b/core/CMakeLists.txt @@ -1,7 +1,3 @@ - - -KOKKOS_SUBPACKAGE(Core) - IF (NOT Kokkos_INSTALL_TESTING) ADD_SUBDIRECTORY(src) ENDIF() @@ -24,5 +20,3 @@ ENDFUNCTION() 
KOKKOS_ADD_TEST_DIRECTORIES(unit_test) KOKKOS_ADD_BENCHMARK_DIRECTORY(perf_test) - -KOKKOS_SUBPACKAGE_POSTPROCESS() diff --git a/core/cmake/Dependencies.cmake b/core/cmake/Dependencies.cmake deleted file mode 100644 index 611c089b2e..0000000000 --- a/core/cmake/Dependencies.cmake +++ /dev/null @@ -1,6 +0,0 @@ -TRIBITS_PACKAGE_DEFINE_DEPENDENCIES( - LIB_OPTIONAL_TPLS Pthread CUDA HWLOC DLlib - TEST_OPTIONAL_TPLS CUSPARSE - ) - -TRIBITS_TPL_TENTATIVELY_ENABLE(DLlib) diff --git a/core/unit_test/CMakeLists.txt b/core/unit_test/CMakeLists.txt index 94a3ac285f..0b48eba9ea 100644 --- a/core/unit_test/CMakeLists.txt +++ b/core/unit_test/CMakeLists.txt @@ -3,7 +3,7 @@ # IF(NOT GTest_FOUND) # fallback to internal gtest - SET(GTEST_SOURCE_DIR ${${PARENT_PACKAGE_NAME}_SOURCE_DIR}/tpls/gtest) + SET(GTEST_SOURCE_DIR ${Kokkos_SOURCE_DIR}/tpls/gtest) #need here for tribits KOKKOS_INCLUDE_DIRECTORIES(${GTEST_SOURCE_DIR}) @@ -95,7 +95,7 @@ IF(KOKKOS_HAS_TRILINOS) LIST(REMOVE_ITEM COMPILE_ONLY_SOURCES TestInterOp.cpp) ENDIF() KOKKOS_ADD_EXECUTABLE( - TestCompileOnly + CoreTestCompileOnly SOURCES TestCompileMain.cpp ${COMPILE_ONLY_SOURCES} @@ -554,20 +554,20 @@ endif() if(Kokkos_ENABLE_SERIAL) KOKKOS_ADD_EXECUTABLE_AND_TEST( - UnitTest_Serial1 + CoreUnitTest_Serial1 SOURCES UnitTestMainInit.cpp ${Serial_SOURCES1} serial/TestSerial_Task.cpp ) KOKKOS_ADD_EXECUTABLE_AND_TEST( - UnitTest_Serial2 + CoreUnitTest_Serial2 SOURCES UnitTestMainInit.cpp ${Serial_SOURCES2} ) KOKKOS_ADD_EXECUTABLE_AND_TEST( - UnitTest_SerialGraph + CoreUnitTest_SerialGraph SOURCES UnitTestMainInit.cpp serial/TestSerial_Graph.cpp @@ -576,7 +576,7 @@ endif() if(Kokkos_ENABLE_THREADS) KOKKOS_ADD_EXECUTABLE_AND_TEST( - UnitTest_Threads + CoreUnitTest_Threads SOURCES ${Threads_SOURCES} UnitTestMainInit.cpp ) @@ -588,20 +588,20 @@ if (Kokkos_ENABLE_OPENMP) openmp/TestOpenMP_PartitionMaster.cpp ) KOKKOS_ADD_EXECUTABLE_AND_TEST( - UnitTest_OpenMP + CoreUnitTest_OpenMP SOURCES UnitTestMainInit.cpp ${OpenMP_SOURCES} 
${OpenMP_EXTRA_SOURCES} ) KOKKOS_ADD_EXECUTABLE_AND_TEST( - UnitTest_OpenMPInterOp + CoreUnitTest_OpenMPInterOp SOURCES UnitTestMain.cpp openmp/TestOpenMP_InterOp.cpp ) KOKKOS_ADD_EXECUTABLE_AND_TEST( - UnitTest_OpenMPGraph + CoreUnitTest_OpenMPGraph SOURCES UnitTestMainInit.cpp openmp/TestOpenMP_Graph.cpp @@ -610,20 +610,20 @@ endif() if(Kokkos_ENABLE_HPX) KOKKOS_ADD_EXECUTABLE_AND_TEST( - UnitTest_HPX + CoreUnitTest_HPX SOURCES UnitTestMainInit.cpp ${HPX_SOURCES} hpx/TestHPX_Task.cpp ) KOKKOS_ADD_EXECUTABLE_AND_TEST( - UnitTest_HPXInterOp + CoreUnitTest_HPXInterOp SOURCES UnitTestMain.cpp hpx/TestHPX_InterOp.cpp ) KOKKOS_ADD_EXECUTABLE_AND_TEST( - UnitTest_HPX_IndependentInstances + CoreUnitTest_HPX_IndependentInstances SOURCES UnitTestMainInit.cpp hpx/TestHPX_IndependentInstances.cpp @@ -636,7 +636,7 @@ endif() if(Kokkos_ENABLE_OPENMPTARGET) KOKKOS_ADD_EXECUTABLE_AND_TEST( - UnitTest_OpenMPTarget + CoreUnitTest_OpenMPTarget SOURCES UnitTestMainInit.cpp ${OpenMPTarget_SOURCES} @@ -645,7 +645,7 @@ endif() if(Kokkos_ENABLE_OPENACC) KOKKOS_ADD_EXECUTABLE_AND_TEST( - UnitTest_OpenACC + CoreUnitTest_OpenACC SOURCES UnitTestMainInit.cpp ${OpenACC_SOURCES} @@ -654,7 +654,7 @@ endif() if(Kokkos_ENABLE_CUDA) KOKKOS_ADD_EXECUTABLE_AND_TEST( - UnitTest_Cuda1 + CoreUnitTest_Cuda1 SOURCES UnitTestMainInit.cpp ${Cuda_SOURCES1} @@ -662,14 +662,14 @@ if(Kokkos_ENABLE_CUDA) ) KOKKOS_ADD_EXECUTABLE_AND_TEST( - UnitTest_Cuda2 + CoreUnitTest_Cuda2 SOURCES UnitTestMainInit.cpp ${Cuda_SOURCES2} ) KOKKOS_ADD_EXECUTABLE_AND_TEST( - UnitTest_Cuda3 + CoreUnitTest_Cuda3 SOURCES UnitTestMainInit.cpp cuda/TestCuda_Task.cpp @@ -680,7 +680,7 @@ if(Kokkos_ENABLE_CUDA) ) KOKKOS_ADD_EXECUTABLE_AND_TEST( - UnitTest_CudaTimingBased + CoreUnitTest_CudaTimingBased SOURCES UnitTestMainInit.cpp cuda/TestCuda_DebugSerialExecution.cpp @@ -688,19 +688,19 @@ if(Kokkos_ENABLE_CUDA) ) KOKKOS_ADD_EXECUTABLE_AND_TEST( - UnitTest_CudaInterOpInit + CoreUnitTest_CudaInterOpInit SOURCES UnitTestMain.cpp 
cuda/TestCuda_InterOp_Init.cpp ) KOKKOS_ADD_EXECUTABLE_AND_TEST( - UnitTest_CudaInterOpStreams + CoreUnitTest_CudaInterOpStreams SOURCES UnitTestMain.cpp cuda/TestCuda_InterOp_Streams.cpp ) KOKKOS_ADD_EXECUTABLE_AND_TEST( - UnitTest_CudaGraph + CoreUnitTest_CudaGraph SOURCES UnitTestMainInit.cpp cuda/TestCuda_Graph.cpp @@ -709,7 +709,7 @@ endif() if(Kokkos_ENABLE_HIP) KOKKOS_ADD_EXECUTABLE_AND_TEST( - UnitTest_HIP + CoreUnitTest_HIP SOURCES UnitTestMainInit.cpp ${HIP_SOURCES} @@ -721,13 +721,13 @@ if(Kokkos_ENABLE_HIP) hip/TestHIP_BlocksizeDeduction.cpp ) KOKKOS_ADD_EXECUTABLE_AND_TEST( - UnitTest_HIPInterOpInit + CoreUnitTest_HIPInterOpInit SOURCES UnitTestMain.cpp hip/TestHIP_InterOp_Init.cpp ) KOKKOS_ADD_EXECUTABLE_AND_TEST( - UnitTest_HIPInterOpStreams + CoreUnitTest_HIPInterOpStreams SOURCES UnitTestMain.cpp hip/TestHIP_InterOp_Streams.cpp @@ -746,49 +746,49 @@ if(Kokkos_ENABLE_SYCL) ) KOKKOS_ADD_EXECUTABLE_AND_TEST( - UnitTest_SYCL1A + CoreUnitTest_SYCL1A SOURCES UnitTestMainInit.cpp ${SYCL_SOURCES1A} ) KOKKOS_ADD_EXECUTABLE_AND_TEST( - UnitTest_SYCL1B + CoreUnitTest_SYCL1B SOURCES UnitTestMainInit.cpp ${SYCL_SOURCES1B} ) KOKKOS_ADD_EXECUTABLE_AND_TEST( - UnitTest_SYCL2A + CoreUnitTest_SYCL2A SOURCES UnitTestMainInit.cpp ${SYCL_SOURCES2A} ) KOKKOS_ADD_EXECUTABLE_AND_TEST( - UnitTest_SYCL2B + CoreUnitTest_SYCL2B SOURCES UnitTestMainInit.cpp ${SYCL_SOURCES2B} ) KOKKOS_ADD_EXECUTABLE_AND_TEST( - UnitTest_SYCL2C + CoreUnitTest_SYCL2C SOURCES UnitTestMainInit.cpp ${SYCL_SOURCES2C} ) KOKKOS_ADD_EXECUTABLE_AND_TEST( - UnitTest_SYCL2D + CoreUnitTest_SYCL2D SOURCES UnitTestMainInit.cpp ${SYCL_SOURCES2D} ) KOKKOS_ADD_EXECUTABLE_AND_TEST( - UnitTest_SYCL3 + CoreUnitTest_SYCL3 SOURCES UnitTestMainInit.cpp # FIXME_SYCL @@ -799,19 +799,19 @@ if(Kokkos_ENABLE_SYCL) ) KOKKOS_ADD_EXECUTABLE_AND_TEST( - UnitTest_SYCLInterOpInit + CoreUnitTest_SYCLInterOpInit SOURCES UnitTestMain.cpp sycl/TestSYCL_InterOp_Init.cpp ) KOKKOS_ADD_EXECUTABLE_AND_TEST( - 
UnitTest_SYCLInterOpInit_Context + CoreUnitTest_SYCLInterOpInit_Context SOURCES UnitTestMainInit.cpp sycl/TestSYCL_InterOp_Init_Context.cpp ) KOKKOS_ADD_EXECUTABLE_AND_TEST( - UnitTest_SYCLInterOpStreams + CoreUnitTest_SYCLInterOpStreams SOURCES UnitTestMain.cpp sycl/TestSYCL_InterOp_Streams.cpp @@ -860,19 +860,19 @@ if ((KOKKOS_ENABLE_OPENMPTARGET OR KOKKOS_ENABLE_OPENACC) AND KOKKOS_CXX_COMPILE endif() KOKKOS_ADD_EXECUTABLE_AND_TEST( - UnitTest_Default + CoreUnitTest_Default SOURCES ${DEFAULT_DEVICE_SOURCES} ) KOKKOS_ADD_EXECUTABLE_AND_TEST( - UnitTest_LegionInitialization + CoreUnitTest_LegionInitialization SOURCES UnitTestMain.cpp TestLegionInitialization.cpp ) KOKKOS_ADD_EXECUTABLE_AND_TEST( - UnitTest_PushFinalizeHook + CoreUnitTest_PushFinalizeHook SOURCES UnitTest_PushFinalizeHook.cpp ) @@ -880,7 +880,7 @@ KOKKOS_ADD_EXECUTABLE_AND_TEST( # This test is intended for development and debugging by putting code # into TestDefaultDeviceDevelop.cpp. By default its empty. KOKKOS_ADD_EXECUTABLE_AND_TEST( - UnitTest_Develop + CoreUnitTest_Develop SOURCES UnitTestMainInit.cpp default/TestDefaultDeviceDevelop.cpp @@ -895,7 +895,7 @@ KOKKOS_ADD_TEST_EXECUTABLE( push_finalize_hook_terminate SOURCES UnitTest_PushFinalizeHook_terminate.cpp ) -KOKKOS_ADD_ADVANCED_TEST( UnitTest_PushFinalizeHook_terminate +KOKKOS_ADD_ADVANCED_TEST( CoreUnitTest_PushFinalizeHook_terminate TEST_0 EXEC push_finalize_hook_terminate NUM_MPI_PROCS 1 @@ -905,24 +905,24 @@ KOKKOS_ADD_ADVANCED_TEST( UnitTest_PushFinalizeHook_terminate ) if(KOKKOS_ENABLE_TUNING) KOKKOS_ADD_EXECUTABLE_AND_TEST( - UnitTest_TuningBuiltins + CoreUnitTest_TuningBuiltins SOURCES tools/TestBuiltinTuners.cpp ) KOKKOS_ADD_EXECUTABLE_AND_TEST( - UnitTest_TuningBasics + CoreUnitTest_TuningBasics SOURCES tools/TestTuning.cpp ) KOKKOS_ADD_EXECUTABLE_AND_TEST( - UnitTest_CategoricalTuner + CoreUnitTest_CategoricalTuner SOURCES tools/TestCategoricalTuner.cpp ) endif() if((NOT Kokkos_ENABLE_OPENMPTARGET) AND (NOT 
Kokkos_ENABLE_OPENACC)) KOKKOS_ADD_EXECUTABLE_AND_TEST( - UnitTest_LogicalSpaces + CoreUnitTest_LogicalSpaces SOURCES tools/TestLogicalSpaces.cpp ) @@ -942,18 +942,18 @@ KOKKOS_ADD_ADVANCED_TEST( UnitTest_PushFinalizeHook_terminate endif() KOKKOS_ADD_EXECUTABLE_AND_TEST( - UnitTest_KokkosP + CoreUnitTest_KokkosP SOURCES ${KOKKOSP_SOURCES} ) if(KOKKOS_ENABLE_LIBDL) KOKKOS_ADD_EXECUTABLE_AND_TEST( - UnitTest_ToolIndependence + CoreUnitTest_ToolIndependence SOURCES tools/TestIndependence.cpp ) TARGET_COMPILE_DEFINITIONS( - KokkosCore_UnitTest_ToolIndependence PUBLIC + Kokkos_CoreUnitTest_ToolIndependence PUBLIC KOKKOS_TOOLS_INDEPENDENT_BUILD ) KOKKOS_ADD_TEST_LIBRARY( @@ -983,7 +983,7 @@ KOKKOS_ADD_ADVANCED_TEST( UnitTest_PushFinalizeHook_terminate TOOL kokkosprinter-tool ARGS --kokkos-tools-help PASS_REGULAR_EXPRESSION - "kokkosp_init_library::kokkosp_print_help:KokkosCore_ProfilingAllCalls::kokkosp_finalize_library::") + "kokkosp_init_library::kokkosp_print_help:Kokkos_ProfilingAllCalls::kokkosp_finalize_library::") # check help works via direct library specification KOKKOS_ADD_TEST( @@ -993,7 +993,7 @@ KOKKOS_ADD_ADVANCED_TEST( UnitTest_PushFinalizeHook_terminate ARGS --kokkos-tools-help --kokkos-tools-libs=$ PASS_REGULAR_EXPRESSION - "kokkosp_init_library::kokkosp_print_help:KokkosCore_ProfilingAllCalls::kokkosp_finalize_library::") + "kokkosp_init_library::kokkosp_print_help:Kokkos_ProfilingAllCalls::kokkosp_finalize_library::") KOKKOS_ADD_TEST( SKIP_TRIBITS @@ -1001,7 +1001,7 @@ KOKKOS_ADD_ADVANCED_TEST( UnitTest_PushFinalizeHook_terminate EXE ProfilingAllCalls TOOL kokkosprinter-tool ARGS --kokkos-tools-args="-c test delimit" - PASS_REGULAR_EXPRESSION "kokkosp_init_library::kokkosp_parse_args:4:KokkosCore_ProfilingAllCalls:-c:test:delimit::.*::kokkosp_allocate_data:${MEMSPACE_REGEX}:source:${ADDRESS_REGEX}:40::kokkosp_begin_parallel_for:Kokkos::View::initialization [[]source] via 
memset:[0-9]+:0::kokkosp_end_parallel_for:0::kokkosp_allocate_data:${MEMSPACE_REGEX}:destination:${ADDRESS_REGEX}:40::kokkosp_begin_parallel_for:Kokkos::View::initialization [[]destination] via memset:[0-9]+:0::kokkosp_end_parallel_for:0::kokkosp_begin_deep_copy:${MEMSPACE_REGEX}:destination:${ADDRESS_REGEX}:${MEMSPACE_REGEX}:source:${ADDRESS_REGEX}:40::.*kokkosp_end_deep_copy::kokkosp_begin_parallel_for:parallel_for:${SIZE_REGEX}:0::kokkosp_end_parallel_for:0::kokkosp_begin_parallel_reduce:parallel_reduce:${SIZE_REGEX}:1${SKIP_SCRATCH_INITIALIZATION_REGEX}::kokkosp_end_parallel_reduce:1::kokkosp_begin_parallel_scan:parallel_scan:${SIZE_REGEX}:2::kokkosp_end_parallel_scan:2::kokkosp_push_profile_region:push_region::kokkosp_pop_profile_region::kokkosp_create_profile_section:created_section:3::kokkosp_start_profile_section:3::kokkosp_stop_profile_section:3::kokkosp_destroy_profile_section:3::kokkosp_profile_event:profiling_event::kokkosp_declare_metadata:dogs:good::kokkosp_deallocate_data:${MEMSPACE_REGEX}:destination:${ADDRESS_REGEX}:40::kokkosp_deallocate_data:${MEMSPACE_REGEX}:source:${ADDRESS_REGEX}:40::kokkosp_finalize_library::" + PASS_REGULAR_EXPRESSION "kokkosp_init_library::kokkosp_parse_args:4:Kokkos_ProfilingAllCalls:-c:test:delimit::.*::kokkosp_allocate_data:${MEMSPACE_REGEX}:source:${ADDRESS_REGEX}:40::kokkosp_begin_parallel_for:Kokkos::View::initialization [[]source] via memset:[0-9]+:0::kokkosp_end_parallel_for:0::kokkosp_allocate_data:${MEMSPACE_REGEX}:destination:${ADDRESS_REGEX}:40::kokkosp_begin_parallel_for:Kokkos::View::initialization [[]destination] via 
memset:[0-9]+:0::kokkosp_end_parallel_for:0::kokkosp_begin_deep_copy:${MEMSPACE_REGEX}:destination:${ADDRESS_REGEX}:${MEMSPACE_REGEX}:source:${ADDRESS_REGEX}:40::.*kokkosp_end_deep_copy::kokkosp_begin_parallel_for:parallel_for:${SIZE_REGEX}:0::kokkosp_end_parallel_for:0::kokkosp_begin_parallel_reduce:parallel_reduce:${SIZE_REGEX}:1${SKIP_SCRATCH_INITIALIZATION_REGEX}::kokkosp_end_parallel_reduce:1::kokkosp_begin_parallel_scan:parallel_scan:${SIZE_REGEX}:2::kokkosp_end_parallel_scan:2::kokkosp_push_profile_region:push_region::kokkosp_pop_profile_region::kokkosp_create_profile_section:created_section:3::kokkosp_start_profile_section:3::kokkosp_stop_profile_section:3::kokkosp_destroy_profile_section:3::kokkosp_profile_event:profiling_event::kokkosp_declare_metadata:dogs:good::kokkosp_deallocate_data:${MEMSPACE_REGEX}:destination:${ADDRESS_REGEX}:40::kokkosp_deallocate_data:${MEMSPACE_REGEX}:source:${ADDRESS_REGEX}:40::kokkosp_finalize_library::" ) # Above will test that leading/trailing quotes are stripped bc ctest cmd args is: @@ -1018,7 +1018,7 @@ KOKKOS_ADD_ADVANCED_TEST( UnitTest_PushFinalizeHook_terminate EXE ProfilingAllCalls ARGS [=[--kokkos-tools-args=-c test delimit]=] --kokkos-tools-libs=$ - PASS_REGULAR_EXPRESSION "kokkosp_init_library::kokkosp_parse_args:4:KokkosCore_ProfilingAllCalls:-c:test:delimit::.*::kokkosp_allocate_data:${MEMSPACE_REGEX}:source:${ADDRESS_REGEX}:40::kokkosp_begin_parallel_for:Kokkos::View::initialization [[]source] via memset:[0-9]+:0::kokkosp_end_parallel_for:0::kokkosp_allocate_data:${MEMSPACE_REGEX}:destination:${ADDRESS_REGEX}:40::kokkosp_begin_parallel_for:Kokkos::View::initialization [[]destination] via 
memset:[0-9]+:0::kokkosp_end_parallel_for:0::kokkosp_begin_deep_copy:${MEMSPACE_REGEX}:destination:${ADDRESS_REGEX}:${MEMSPACE_REGEX}:source:${ADDRESS_REGEX}:40::.*kokkosp_end_deep_copy::kokkosp_begin_parallel_for:parallel_for:${SIZE_REGEX}:0::kokkosp_end_parallel_for:0::kokkosp_begin_parallel_reduce:parallel_reduce:${SIZE_REGEX}:1${SKIP_SCRATCH_INITIALIZATION_REGEX}::kokkosp_end_parallel_reduce:1::kokkosp_begin_parallel_scan:parallel_scan:${SIZE_REGEX}:2::kokkosp_end_parallel_scan:2::kokkosp_push_profile_region:push_region::kokkosp_pop_profile_region::kokkosp_create_profile_section:created_section:3::kokkosp_start_profile_section:3::kokkosp_stop_profile_section:3::kokkosp_destroy_profile_section:3::kokkosp_profile_event:profiling_event::kokkosp_declare_metadata:dogs:good::kokkosp_deallocate_data:${MEMSPACE_REGEX}:destination:${ADDRESS_REGEX}:40::kokkosp_deallocate_data:${MEMSPACE_REGEX}:source:${ADDRESS_REGEX}:40::kokkosp_finalize_library::" + PASS_REGULAR_EXPRESSION "kokkosp_init_library::kokkosp_parse_args:4:Kokkos_ProfilingAllCalls:-c:test:delimit::.*::kokkosp_allocate_data:${MEMSPACE_REGEX}:source:${ADDRESS_REGEX}:40::kokkosp_begin_parallel_for:Kokkos::View::initialization [[]source] via memset:[0-9]+:0::kokkosp_end_parallel_for:0::kokkosp_allocate_data:${MEMSPACE_REGEX}:destination:${ADDRESS_REGEX}:40::kokkosp_begin_parallel_for:Kokkos::View::initialization [[]destination] via 
memset:[0-9]+:0::kokkosp_end_parallel_for:0::kokkosp_begin_deep_copy:${MEMSPACE_REGEX}:destination:${ADDRESS_REGEX}:${MEMSPACE_REGEX}:source:${ADDRESS_REGEX}:40::.*kokkosp_end_deep_copy::kokkosp_begin_parallel_for:parallel_for:${SIZE_REGEX}:0::kokkosp_end_parallel_for:0::kokkosp_begin_parallel_reduce:parallel_reduce:${SIZE_REGEX}:1${SKIP_SCRATCH_INITIALIZATION_REGEX}::kokkosp_end_parallel_reduce:1::kokkosp_begin_parallel_scan:parallel_scan:${SIZE_REGEX}:2::kokkosp_end_parallel_scan:2::kokkosp_push_profile_region:push_region::kokkosp_pop_profile_region::kokkosp_create_profile_section:created_section:3::kokkosp_start_profile_section:3::kokkosp_stop_profile_section:3::kokkosp_destroy_profile_section:3::kokkosp_profile_event:profiling_event::kokkosp_declare_metadata:dogs:good::kokkosp_deallocate_data:${MEMSPACE_REGEX}:destination:${ADDRESS_REGEX}:40::kokkosp_deallocate_data:${MEMSPACE_REGEX}:source:${ADDRESS_REGEX}:40::kokkosp_finalize_library::" ) endif() #KOKKOS_ENABLE_LIBDL if(NOT KOKKOS_HAS_TRILINOS) @@ -1036,7 +1036,7 @@ KOKKOS_ADD_TEST_EXECUTABLE( # to work correctly with shared libraries KOKKOS_SET_EXE_PROPERTY(StackTraceTestExec ENABLE_EXPORTS ON) -KOKKOS_ADD_TEST( NAME UnitTest_StackTraceTest +KOKKOS_ADD_TEST( NAME CoreUnitTest_StackTraceTest EXE StackTraceTestExec FAIL_REGULAR_EXPRESSION "FAILED" ) @@ -1045,7 +1045,7 @@ endif() if(Kokkos_ENABLE_DEPRECATED_CODE_3) foreach(INITTESTS_NUM RANGE 1 18) KOKKOS_ADD_EXECUTABLE_AND_TEST( - UnitTest_DefaultInit_${INITTESTS_NUM} + CoreUnitTest_DefaultInit_${INITTESTS_NUM} SOURCES UnitTestMain.cpp default/TestDefaultDeviceTypeInit_${INITTESTS_NUM}.cpp ) endforeach(INITTESTS_NUM) @@ -1053,7 +1053,7 @@ endif() if (KOKKOS_ENABLE_HWLOC) KOKKOS_ADD_EXECUTABLE_AND_TEST( - UnitTest_HWLOC + CoreUnitTest_HWLOC SOURCES UnitTestMain.cpp TestHWLOC.cpp ) endif() @@ -1109,12 +1109,12 @@ FOREACH (DEVICE ${KOKKOS_ENABLED_DEVICES}) ENDFOREACH() KOKKOS_ADD_EXECUTABLE_AND_TEST( - UnitTest_CTestDevice + CoreUnitTest_CTestDevice SOURCES 
UnitTestMain.cpp TestCTestDevice.cpp ) KOKKOS_ADD_EXECUTABLE_AND_TEST( - UnitTest_CMakePassCmdLineArgs + CoreUnitTest_CMakePassCmdLineArgs SOURCES UnitTest_CMakePassCmdLineArgs.cpp ARGS "one 2 THREE" ) @@ -1122,22 +1122,22 @@ KOKKOS_ADD_EXECUTABLE_AND_TEST( # This test is not properly set up to run within Trilinos if (NOT KOKKOS_HAS_TRILINOS) SET_SOURCE_FILES_PROPERTIES(UnitTest_DeviceAndThreads.cpp PROPERTIES LANGUAGE ${KOKKOS_COMPILE_LANGUAGE}) - add_executable(KokkosCore_UnitTest_DeviceAndThreads UnitTest_DeviceAndThreads.cpp) - target_link_libraries(KokkosCore_UnitTest_DeviceAndThreads Kokkos::kokkoscore) + add_executable(Kokkos_CoreUnitTest_DeviceAndThreads UnitTest_DeviceAndThreads.cpp) + target_link_libraries(Kokkos_CoreUnitTest_DeviceAndThreads Kokkos::kokkoscore) find_package(Python3 COMPONENTS Interpreter) if(Python3_Interpreter_FOUND AND Python3_VERSION VERSION_GREATER_EQUAL 3.7) if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.20) set(USE_SOURCE_PERMISSIONS_WHEN_SUPPORTED USE_SOURCE_PERMISSIONS) endif() file(GENERATE - OUTPUT $/TestDeviceAndThreads.py + OUTPUT $/TestDeviceAndThreads.py INPUT TestDeviceAndThreads.py ${USE_SOURCE_PERMISSIONS_WHEN_SUPPORTED} ) if(NOT Kokkos_ENABLE_OPENMPTARGET) # FIXME_OPENMPTARGET does not select the right device add_test( - NAME KokkosCore_UnitTest_DeviceAndThreads - COMMAND ${Python3_EXECUTABLE} -m unittest -v $/TestDeviceAndThreads.py + NAME Kokkos_CoreUnitTest_DeviceAndThreads + COMMAND ${Python3_EXECUTABLE} -m unittest -v $/TestDeviceAndThreads.py ) endif() endif() diff --git a/core/unit_test/TestDeviceAndThreads.py b/core/unit_test/TestDeviceAndThreads.py index fd70e3ff68..1d3ff8eea7 100644 --- a/core/unit_test/TestDeviceAndThreads.py +++ b/core/unit_test/TestDeviceAndThreads.py @@ -18,8 +18,8 @@ import unittest import subprocess -PREFIX = "$" -EXECUTABLE = "$" +PREFIX = "$" +EXECUTABLE = "$" COMMAND = "/".join([PREFIX, EXECUTABLE]) diff --git a/example/CMakeLists.txt b/example/CMakeLists.txt index 7ecaec0f24..3920dc9a27 
100644 --- a/example/CMakeLists.txt +++ b/example/CMakeLists.txt @@ -1,10 +1,2 @@ - - -# Subpackage name must match what appears in kokkos/cmake/Dependencies.cmake -# -KOKKOS_SUBPACKAGE(Example) - KOKKOS_ADD_EXAMPLE_DIRECTORIES(query_device) KOKKOS_ADD_EXAMPLE_DIRECTORIES(tutorial) - -KOKKOS_SUBPACKAGE_POSTPROCESS() diff --git a/simd/CMakeLists.txt b/simd/CMakeLists.txt index 83557e61e6..59e09b85ac 100644 --- a/simd/CMakeLists.txt +++ b/simd/CMakeLists.txt @@ -1,10 +1,5 @@ - -KOKKOS_SUBPACKAGE(Simd) - IF (NOT Kokkos_INSTALL_TESTING) ADD_SUBDIRECTORY(src) ENDIF() KOKKOS_ADD_TEST_DIRECTORIES(unit_tests) - -KOKKOS_SUBPACKAGE_POSTPROCESS() diff --git a/simd/cmake/Dependencies.cmake b/simd/cmake/Dependencies.cmake deleted file mode 100644 index 1d71d8af34..0000000000 --- a/simd/cmake/Dependencies.cmake +++ /dev/null @@ -1,5 +0,0 @@ -TRIBITS_PACKAGE_DEFINE_DEPENDENCIES( - LIB_REQUIRED_PACKAGES KokkosCore - LIB_OPTIONAL_TPLS Pthread CUDA HWLOC - TEST_OPTIONAL_TPLS CUSPARSE - ) From 7ef7d02a30f4b15ead9150bd9ac1d8ff5a2ed527 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Fri, 5 May 2023 17:10:25 -0400 Subject: [PATCH 421/496] Drop pointless Kokkos{Algorithms,Containers}_config.h files They are never included, not copied over to the install directory. They also have no counterpart in the hand-rolled makefiles. 
--- algorithms/cmake/KokkosAlgorithms_config.h.in | 4 ---- algorithms/src/CMakeLists.txt | 5 ----- containers/cmake/KokkosContainers_config.h.in | 4 ---- containers/src/CMakeLists.txt | 4 ---- 4 files changed, 17 deletions(-) delete mode 100644 algorithms/cmake/KokkosAlgorithms_config.h.in delete mode 100644 containers/cmake/KokkosContainers_config.h.in diff --git a/algorithms/cmake/KokkosAlgorithms_config.h.in b/algorithms/cmake/KokkosAlgorithms_config.h.in deleted file mode 100644 index 67334b70f3..0000000000 --- a/algorithms/cmake/KokkosAlgorithms_config.h.in +++ /dev/null @@ -1,4 +0,0 @@ -#ifndef KOKKOS_ALGORITHMS_CONFIG_H -#define KOKKOS_ALGORITHMS_CONFIG_H - -#endif diff --git a/algorithms/src/CMakeLists.txt b/algorithms/src/CMakeLists.txt index 8f3066b6de..1695778947 100644 --- a/algorithms/src/CMakeLists.txt +++ b/algorithms/src/CMakeLists.txt @@ -1,7 +1,3 @@ -CONFIGURE_FILE( - ${PACKAGE_SOURCE_DIR}/algorithms/cmake/KokkosAlgorithms_config.h.in - ${CMAKE_CURRENT_BINARY_DIR}/KokkosAlgorithms_config.h ) - #I have to leave these here for tribits KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) @@ -10,7 +6,6 @@ KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) FILE(GLOB ALGO_HEADERS *.hpp) FILE(GLOB ALGO_SOURCES *.cpp) -LIST(APPEND ALGO_HEADERS ${CMAKE_CURRENT_BINARY_DIR}/KokkosAlgorithms_config.h) APPEND_GLOB(ALGO_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/std_algorithms/*.hpp) APPEND_GLOB(ALGO_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/std_algorithms/impl/*.hpp) diff --git a/containers/cmake/KokkosContainers_config.h.in b/containers/cmake/KokkosContainers_config.h.in deleted file mode 100644 index d91fdda1e3..0000000000 --- a/containers/cmake/KokkosContainers_config.h.in +++ /dev/null @@ -1,4 +0,0 @@ -#ifndef KOKKOS_CONTAINERS_CONFIG_H -#define KOKKOS_CONTAINERS_CONFIG_H - -#endif diff --git a/containers/src/CMakeLists.txt b/containers/src/CMakeLists.txt index 3e18a6d129..b7d85ebf11 100644 --- 
a/containers/src/CMakeLists.txt +++ b/containers/src/CMakeLists.txt @@ -1,7 +1,3 @@ -CONFIGURE_FILE( - ${PACKAGE_SOURCE_DIR}/containers/cmake/KokkosContainers_config.h.in - ${CMAKE_CURRENT_BINARY_DIR}/KokkosContainers_config.h ) - #need these here for now KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) From d7c06c4335c2329c17881f207a4675ea11b8ea6e Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Fri, 5 May 2023 22:36:14 -0400 Subject: [PATCH 422/496] Revert "Merge pull request #5964 from PhilMiller/cuda-lambda-default" This reverts commit 945281ac05a25dfafff121f9fcc72134ba06115a, reversing changes made to a45cc1eff0e461942a66a13345fe215eee059c2c. --- .jenkins | 4 ++++ Makefile.kokkos | 14 ++++++++------ cmake/KokkosCore_config.h.in | 1 + cmake/kokkos_arch.cmake | 17 +++-------------- cmake/kokkos_enable_options.cmake | 10 +++++++++- containers/unit_tests/TestErrorReporter.hpp | 2 ++ containers/unit_tests/TestOffsetView.hpp | 14 ++++++++++++++ core/perf_test/CMakeLists.txt | 2 ++ core/perf_test/PerfTest_ViewAllocate.cpp | 2 ++ core/perf_test/PerfTest_ViewCopy_Raw.cpp | 2 ++ core/perf_test/PerfTest_ViewFill_Raw.cpp | 2 ++ core/perf_test/PerfTest_ViewResize_Raw.cpp | 2 ++ core/src/Kokkos_Macros.hpp | 2 -- core/src/setup/Kokkos_Setup_Cuda.hpp | 6 ++++++ core/unit_test/TestCompilerMacros.cpp | 6 ++++++ core/unit_test/TestMDRangeReduce.hpp | 2 ++ core/unit_test/TestTeamMDRange.hpp | 6 ++++++ core/unit_test/TestViewMapping_a.hpp | 9 +++++---- generate_makefile.bash | 6 ++++-- .../trilinos-integration/waterman_cuda_env.sh | 2 +- scripts/trilinos-integration/white_cuda_env.sh | 2 +- 21 files changed, 82 insertions(+), 31 deletions(-) diff --git a/.jenkins b/.jenkins index b95ba7ff0b..fcbcf56073 100644 --- a/.jenkins +++ b/.jenkins @@ -84,6 +84,7 @@ pipeline { -DKokkos_ENABLE_DEPRECATED_CODE_4=OFF \ -DKokkos_ENABLE_TESTS=ON \ -DKokkos_ENABLE_CUDA=ON \ + -DKokkos_ENABLE_CUDA_LAMBDA=ON \ -DKokkos_ENABLE_OPENMP=ON \ 
.. && \ make -j8 && ctest --verbose''' @@ -312,6 +313,7 @@ pipeline { -DKokkos_ENABLE_TESTS=ON \ -DKokkos_ENABLE_BENCHMARKS=ON \ -DKokkos_ENABLE_CUDA=ON \ + -DKokkos_ENABLE_CUDA_LAMBDA=ON \ -DKokkos_ENABLE_TUNING=ON \ -DKokkos_ARCH_VOLTA70=ON \ .. && \ @@ -384,6 +386,7 @@ pipeline { -DKokkos_ENABLE_COMPILER_WARNINGS=ON \ -DKokkos_ENABLE_OPENMP=OFF \ -DKokkos_ENABLE_CUDA=ON \ + -DKokkos_ENABLE_CUDA_LAMBDA=OFF \ -DKokkos_ENABLE_CUDA_UVM=ON \ -DKokkos_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE=ON \ -DKokkos_ENABLE_DEPRECATED_CODE_3=ON \ @@ -450,6 +453,7 @@ pipeline { -DKokkos_ENABLE_TESTS=ON \ -DKokkos_ENABLE_BENCHMARKS=ON \ -DKokkos_ENABLE_CUDA=ON \ + -DKokkos_ENABLE_CUDA_LAMBDA=ON \ -DKokkos_ENABLE_LIBDL=OFF \ .. && \ make -j8 && ctest --verbose && \ diff --git a/Makefile.kokkos b/Makefile.kokkos index 23bd8288fc..1234f4cc9e 100644 --- a/Makefile.kokkos +++ b/Makefile.kokkos @@ -665,13 +665,15 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1) endif endif - ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_CUDA_LAMBDA") - KOKKOS_CXXFLAGS += -expt-extended-lambda - endif + ifeq ($(KOKKOS_INTERNAL_CUDA_USE_LAMBDA), 1) + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_CUDA_LAMBDA") + KOKKOS_CXXFLAGS += -expt-extended-lambda + endif - ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_CUDA_LAMBDA") + ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_CUDA_LAMBDA") + endif endif ifeq ($(KOKKOS_INTERNAL_CUDA_USE_CONSTEXPR), 1) diff --git a/cmake/KokkosCore_config.h.in b/cmake/KokkosCore_config.h.in index 2b1ea092e0..bcfa16d742 100644 --- a/cmake/KokkosCore_config.h.in +++ b/cmake/KokkosCore_config.h.in @@ -35,6 +35,7 @@ #cmakedefine KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE #cmakedefine KOKKOS_ENABLE_CUDA_UVM +#cmakedefine KOKKOS_ENABLE_CUDA_LAMBDA #cmakedefine 
KOKKOS_ENABLE_CUDA_CONSTEXPR #cmakedefine KOKKOS_ENABLE_IMPL_CUDA_MALLOC_ASYNC #cmakedefine KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE diff --git a/cmake/kokkos_arch.cmake b/cmake/kokkos_arch.cmake index 2137344c5e..2ed5d1c610 100644 --- a/cmake/kokkos_arch.cmake +++ b/cmake/kokkos_arch.cmake @@ -162,21 +162,10 @@ ENDIF() #clear anything that might be in the cache GLOBAL_SET(KOKKOS_CUDA_OPTIONS) # Construct the Makefile options -IF(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) - # Extended lambda support was stabilized in nvcc 12 - IF(KOKKOS_COMPILER_VERSION_MAJOR EQUAL 11) +IF (KOKKOS_ENABLE_CUDA_LAMBDA) + IF(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) GLOBAL_APPEND(KOKKOS_CUDA_OPTIONS "-expt-extended-lambda") - ELSE() - GLOBAL_APPEND(KOKKOS_CUDA_OPTIONS "-extended-lambda") - ENDIF() - GLOBAL_APPEND(KOKKOS_CUDA_OPTIONS "-Wext-lambda-captures-this") -ENDIF() - -IF(DEFINED Kokkos_ENABLE_CUDA_LAMBDA) - IF(Kokkos_ENABLE_CUDA_LAMBDA) - MESSAGE(DEPRECATION "CUDA extended lambda support is now always enabled. The option Kokkos_ENABLE_CUDA_LAMBDA will be removed") - ELSE() - MESSAGE(FATAL_ERROR "Support for disabling CUDA extended lambdas has been removed. 
Please unset Kokkos_ENABLE_CUDA_LAMBDA, or see #5964 if this is necessary for your application") + GLOBAL_APPEND(KOKKOS_CUDA_OPTIONS "-Wext-lambda-captures-this") ENDIF() ENDIF() diff --git a/cmake/kokkos_enable_options.cmake b/cmake/kokkos_enable_options.cmake index 145815c7f2..7d8026989a 100644 --- a/cmake/kokkos_enable_options.cmake +++ b/cmake/kokkos_enable_options.cmake @@ -67,6 +67,14 @@ mark_as_advanced(Kokkos_ENABLE_IMPL_MDSPAN) mark_as_advanced(Kokkos_ENABLE_MDSPAN_EXTERNAL) mark_as_advanced(Kokkos_ENABLE_IMPL_SKIP_COMPILER_MDSPAN) +IF (Trilinos_ENABLE_Kokkos AND TPL_ENABLE_CUDA) + SET(CUDA_LAMBDA_DEFAULT ON) +ELSEIF (KOKKOS_ENABLE_CUDA) + SET(CUDA_LAMBDA_DEFAULT ON) +ELSE() + SET(CUDA_LAMBDA_DEFAULT OFF) +ENDIF() +KOKKOS_ENABLE_OPTION(CUDA_LAMBDA ${CUDA_LAMBDA_DEFAULT} "Whether to activate experimental lambda features") IF (Trilinos_ENABLE_Kokkos) SET(COMPLEX_ALIGN_DEFAULT OFF) ELSE() @@ -115,7 +123,7 @@ FUNCTION(check_device_specific_options) ENDIF() ENDFUNCTION() -CHECK_DEVICE_SPECIFIC_OPTIONS(DEVICE CUDA OPTIONS CUDA_UVM CUDA_RELOCATABLE_DEVICE_CODE CUDA_CONSTEXPR CUDA_LDG_INTRINSIC) +CHECK_DEVICE_SPECIFIC_OPTIONS(DEVICE CUDA OPTIONS CUDA_UVM CUDA_RELOCATABLE_DEVICE_CODE CUDA_LAMBDA CUDA_CONSTEXPR CUDA_LDG_INTRINSIC) CHECK_DEVICE_SPECIFIC_OPTIONS(DEVICE HIP OPTIONS HIP_RELOCATABLE_DEVICE_CODE) CHECK_DEVICE_SPECIFIC_OPTIONS(DEVICE HPX OPTIONS IMPL_HPX_ASYNC_DISPATCH) diff --git a/containers/unit_tests/TestErrorReporter.hpp b/containers/unit_tests/TestErrorReporter.hpp index 7d7765cf8c..0003a29468 100644 --- a/containers/unit_tests/TestErrorReporter.hpp +++ b/containers/unit_tests/TestErrorReporter.hpp @@ -149,6 +149,7 @@ struct ErrorReporterDriver : public ErrorReporterDriverBase { } }; +#if !defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_CUDA_LAMBDA) template struct ErrorReporterDriverUseLambda : public ErrorReporterDriverBase { @@ -177,6 +178,7 @@ struct ErrorReporterDriverUseLambda driver_base::check_expectations(reporter_capacity, test_size); 
} }; +#endif #ifdef KOKKOS_ENABLE_OPENMP struct ErrorReporterDriverNativeOpenMP diff --git a/containers/unit_tests/TestOffsetView.hpp b/containers/unit_tests/TestOffsetView.hpp index c225d65b69..c133922e3d 100644 --- a/containers/unit_tests/TestOffsetView.hpp +++ b/containers/unit_tests/TestOffsetView.hpp @@ -67,6 +67,7 @@ void test_offsetview_construction() { ASSERT_EQ(ov.extent(0), 5u); ASSERT_EQ(ov.extent(1), 5u); +#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) { Kokkos::Experimental::OffsetView offsetV1("OneDOffsetView", range0); @@ -148,6 +149,7 @@ void test_offsetview_construction() { } ASSERT_EQ(OVResult, answer) << "Bad data found in OffsetView"; +#endif { offset_view_type ovCopy(ov); @@ -182,6 +184,7 @@ void test_offsetview_construction() { range3_type rangePolicy3DZero(point3_type{{0, 0, 0}}, point3_type{{extent0, extent1, extent2}}); +#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) int view3DSum = 0; Kokkos::parallel_reduce( rangePolicy3DZero, @@ -204,6 +207,7 @@ void test_offsetview_construction() { ASSERT_EQ(view3DSum, offsetView3DSum) << "construction of OffsetView from View and begins array broken."; +#endif } view_type viewFromOV = ov.view(); @@ -228,6 +232,7 @@ void test_offsetview_construction() { view_type aView("aView", ov.extent(0), ov.extent(1)); Kokkos::deep_copy(aView, ov); +#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) int sum = 0; Kokkos::parallel_reduce( rangePolicy2D, @@ -237,6 +242,7 @@ void test_offsetview_construction() { sum); ASSERT_EQ(sum, 0) << "deep_copy(view, offsetView) broken."; +#endif } { // test view to offsetview deep copy @@ -245,6 +251,7 @@ void test_offsetview_construction() { Kokkos::deep_copy(aView, 99); Kokkos::deep_copy(ov, aView); +#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) int sum = 0; Kokkos::parallel_reduce( rangePolicy2D, @@ -254,6 +261,7 @@ void test_offsetview_construction() { sum); ASSERT_EQ(sum, 0) << 
"deep_copy(offsetView, view) broken."; +#endif } } @@ -421,6 +429,7 @@ void test_offsetview_subview() { ASSERT_EQ(offsetSubview.begin(1), 0); ASSERT_EQ(offsetSubview.end(1), 9); +#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) using range_type = Kokkos::MDRangePolicy, Kokkos::IndexType >; using point_type = typename range_type::point_type; @@ -446,6 +455,7 @@ void test_offsetview_subview() { sum); ASSERT_EQ(sum, 6 * (e0 - b0) * (e1 - b1)); +#endif } // slice 2 @@ -542,6 +552,7 @@ void test_offsetview_subview() { } } +#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) template KOKKOS_INLINE_FUNCTION T std_accumulate(InputIt first, InputIt last, T init, BinaryOperation op) { @@ -644,6 +655,7 @@ void test_offsetview_offsets_rank3() { ASSERT_EQ(0, errors); } +#endif TEST(TEST_CATEGORY, offsetview_construction) { test_offsetview_construction(); @@ -657,6 +669,7 @@ TEST(TEST_CATEGORY, offsetview_subview) { test_offsetview_subview(); } +#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) TEST(TEST_CATEGORY, offsetview_offsets_rank1) { test_offsetview_offsets_rank1(); } @@ -668,6 +681,7 @@ TEST(TEST_CATEGORY, offsetview_offsets_rank2) { TEST(TEST_CATEGORY, offsetview_offsets_rank3) { test_offsetview_offsets_rank3(); } +#endif } // namespace Test diff --git a/core/perf_test/CMakeLists.txt b/core/perf_test/CMakeLists.txt index 2361e45ce6..66319f43f5 100644 --- a/core/perf_test/CMakeLists.txt +++ b/core/perf_test/CMakeLists.txt @@ -173,10 +173,12 @@ KOKKOS_ADD_BENCHMARK( SOURCES ${BENCHMARK_SOURCES} ) +IF(NOT KOKKOS_ENABLE_CUDA OR KOKKOS_ENABLE_CUDA_LAMBDA) KOKKOS_ADD_BENCHMARK( Benchmark_Atomic_MinMax SOURCES test_atomic_minmax_simple.cpp ) +ENDIF() # FIXME_NVHPC IF(NOT KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) diff --git a/core/perf_test/PerfTest_ViewAllocate.cpp b/core/perf_test/PerfTest_ViewAllocate.cpp index 8ee69cfa59..63f1d6b2c7 100644 --- a/core/perf_test/PerfTest_ViewAllocate.cpp +++ 
b/core/perf_test/PerfTest_ViewAllocate.cpp @@ -217,6 +217,7 @@ BENCHMARK(ViewAllocate_Rank8) ->Arg(N) ->UseManualTime(); +#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) BENCHMARK(ViewAllocate_Raw) ->ArgName("N") ->Arg(N) @@ -226,5 +227,6 @@ BENCHMARK(ViewAllocate_Raw) ->ArgName("N") ->Arg(N) ->UseManualTime(); +#endif } // namespace Test diff --git a/core/perf_test/PerfTest_ViewCopy_Raw.cpp b/core/perf_test/PerfTest_ViewCopy_Raw.cpp index e4db40e128..67a8d7e555 100644 --- a/core/perf_test/PerfTest_ViewCopy_Raw.cpp +++ b/core/perf_test/PerfTest_ViewCopy_Raw.cpp @@ -18,6 +18,7 @@ namespace Test { +#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) BENCHMARK(ViewDeepCopy_Raw) ->ArgName("N") ->Arg(10) @@ -37,5 +38,6 @@ BENCHMARK(ViewDeepCopy_Raw) ->ArgName("N") ->Arg(10) ->UseManualTime(); +#endif } // namespace Test diff --git a/core/perf_test/PerfTest_ViewFill_Raw.cpp b/core/perf_test/PerfTest_ViewFill_Raw.cpp index 57bba83a9c..c11074d915 100644 --- a/core/perf_test/PerfTest_ViewFill_Raw.cpp +++ b/core/perf_test/PerfTest_ViewFill_Raw.cpp @@ -18,6 +18,7 @@ namespace Test { +#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) BENCHMARK(ViewFill_Raw) ->ArgName("N") ->Arg(N) @@ -27,5 +28,6 @@ BENCHMARK(ViewFill_Raw) ->ArgName("N") ->Arg(N) ->UseManualTime(); +#endif } // namespace Test diff --git a/core/perf_test/PerfTest_ViewResize_Raw.cpp b/core/perf_test/PerfTest_ViewResize_Raw.cpp index ab469cb647..2d1bcbb3ca 100644 --- a/core/perf_test/PerfTest_ViewResize_Raw.cpp +++ b/core/perf_test/PerfTest_ViewResize_Raw.cpp @@ -18,6 +18,7 @@ namespace Test { +#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) BENCHMARK(ViewResize_NoInit_Raw) ->ArgName("N") ->Arg(N) @@ -29,5 +30,6 @@ BENCHMARK(ViewResize_NoInit_Raw) ->Arg(N) ->UseManualTime() ->Iterations(R); +#endif } // namespace Test diff --git a/core/src/Kokkos_Macros.hpp b/core/src/Kokkos_Macros.hpp index 0c830007c1..8cc4a6efa3 100644 --- 
a/core/src/Kokkos_Macros.hpp +++ b/core/src/Kokkos_Macros.hpp @@ -547,8 +547,6 @@ static constexpr bool kokkos_omp_on_host() { return false; } #if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOS_ENABLE_DEPRECATED_CODE_4) #define KOKKOS_ENABLE_CUDA_LDG_INTRINSIC -// This was previously defined from the configuration option which was removed -#define KOKKOS_ENABLE_CUDA_LAMBDA #endif #define KOKKOS_INVALID_INDEX (~std::size_t(0)) diff --git a/core/src/setup/Kokkos_Setup_Cuda.hpp b/core/src/setup/Kokkos_Setup_Cuda.hpp index 1130485e84..c57f690ae1 100644 --- a/core/src/setup/Kokkos_Setup_Cuda.hpp +++ b/core/src/setup/Kokkos_Setup_Cuda.hpp @@ -53,9 +53,15 @@ #error "Cuda device capability >= 3.0 is required." #endif +#ifdef KOKKOS_ENABLE_CUDA_LAMBDA #define KOKKOS_LAMBDA [=] __host__ __device__ + #define KOKKOS_CLASS_LAMBDA [ =, *this ] __host__ __device__ +#else // !defined(KOKKOS_ENABLE_CUDA_LAMBDA) +#undef KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA +#endif // !defined(KOKKOS_ENABLE_CUDA_LAMBDA) + #define KOKKOS_IMPL_FORCEINLINE_FUNCTION __device__ __host__ __forceinline__ #define KOKKOS_IMPL_FORCEINLINE __forceinline__ #define KOKKOS_IMPL_INLINE_FUNCTION __device__ __host__ inline diff --git a/core/unit_test/TestCompilerMacros.cpp b/core/unit_test/TestCompilerMacros.cpp index 5927d142de..b77368037e 100644 --- a/core/unit_test/TestCompilerMacros.cpp +++ b/core/unit_test/TestCompilerMacros.cpp @@ -28,9 +28,15 @@ #error "Only one host compiler macro can be defined" #endif +#if defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_CUDA_LAMBDA) +#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) +#error "Macro bug: KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA shouldn't be defined" +#endif +#else #if !defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) #error "Macro bug: KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA should be defined" #endif +#endif namespace TestCompilerMacros { diff --git a/core/unit_test/TestMDRangeReduce.hpp b/core/unit_test/TestMDRangeReduce.hpp index 24bd3255fe..007fa420c3 100644 --- 
a/core/unit_test/TestMDRangeReduce.hpp +++ b/core/unit_test/TestMDRangeReduce.hpp @@ -49,6 +49,8 @@ TEST(TEST_CATEGORY, mdrange_parallel_reduce_primitive_types) { #if defined(KOKKOS_ENABLE_OPENMPTARGET) GTEST_SKIP() << "FIXME OPENMPTARGET Tests of MDRange reduce over values " "smaller than int would fail"; +#elif defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_CUDA_LAMBDA) + GTEST_SKIP() << "Skipped ENABLE_CUDA_LAMBDA"; #else for (int bound : {0, 1, 7, 32, 65, 7000}) { for (int k = 0; k < bound; ++k) { diff --git a/core/unit_test/TestTeamMDRange.hpp b/core/unit_test/TestTeamMDRange.hpp index 7f4068a09b..8ac7e8338c 100644 --- a/core/unit_test/TestTeamMDRange.hpp +++ b/core/unit_test/TestTeamMDRange.hpp @@ -148,6 +148,10 @@ struct TestTeamMDParallelFor { } }; +// If KOKKOS_ENABLE_CUDA_LAMBDA is off, extended lambdas used in parallel_for +// and parallel_reduce in these tests will not compile correctly +#if !defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_CUDA_LAMBDA) + template struct TestTeamThreadMDRangeParallelFor : public TestTeamMDParallelFor { using TeamType = typename Kokkos::TeamPolicy::member_type; @@ -1959,5 +1963,7 @@ TEST(TEST_CATEGORY, TeamVectorMDRangeParallelReduce) { test_parallel_reduce_for_8D_TeamVectorMDRange(smallDims); } +#endif + } // namespace TeamMDRange } // namespace Test diff --git a/core/unit_test/TestViewMapping_a.hpp b/core/unit_test/TestViewMapping_a.hpp index dc576577c2..9173f0d431 100644 --- a/core/unit_test/TestViewMapping_a.hpp +++ b/core/unit_test/TestViewMapping_a.hpp @@ -1038,16 +1038,16 @@ void test_view_mapping() { ASSERT_EQ(a.use_count(), 1); ASSERT_EQ(b.use_count(), 0); +#if !defined(KOKKOS_ENABLE_CUDA) || !defined(KOKKOS_ENABLE_CUDA_LAMBDA) + // Cannot launch host lambda when CUDA lambda is enabled. 
+ using host_exec_space = typename Kokkos::Impl::HostMirror::Space::execution_space; int errors = 0; Kokkos::parallel_reduce( Kokkos::RangePolicy(0, 10), - // FIXME_NVCC: Cannot launch __host__ __device__ lambda on - // host when CUDA lambda is enabled, so use plain [=] instead - // of KOKKOS_LAMBDA - [=](int, int& e) { + KOKKOS_LAMBDA(int, int& e) { // an unmanaged copy. When the parallel dispatch accepts a move for // the lambda, this count should become 1. @@ -1058,6 +1058,7 @@ void test_view_mapping() { }, errors); ASSERT_EQ(errors, 0); +#endif // #if !defined( KOKKOS_ENABLE_CUDA_LAMBDA ) } } diff --git a/generate_makefile.bash b/generate_makefile.bash index 47321c5e14..018426c9b8 100755 --- a/generate_makefile.bash +++ b/generate_makefile.bash @@ -64,7 +64,9 @@ get_kokkos_cuda_option_list() { for CUDA_ in $PARSE_CUDA_LST do CUDA_OPT_NAME= - if [ "${CUDA_}" == "rdc" ]; then + if [ "${CUDA_}" == "enable_lambda" ]; then + CUDA_OPT_NAME=CUDA_LAMBDA + elif [ "${CUDA_}" == "rdc" ]; then CUDA_OPT_NAME=CUDA_RELOCATABLE_DEVICE_CODE elif [ "${CUDA_}" == "force_uvm" ]; then CUDA_OPT_NAME=CUDA_UVM @@ -229,7 +231,7 @@ display_help_text() { echo " disable_profiling = do not compile with profiling hooks" echo " " echo "--with-cuda-options=[OPT]: Additional options to CUDA:" - echo " force_uvm, use_ldg, rdc" + echo " force_uvm, use_ldg, enable_lambda, rdc" echo "--with-hip-options=[OPT]: Additional options to HIP:" echo " rdc" echo "--with-hpx-options=[OPT]: Additional options to HPX:" diff --git a/scripts/trilinos-integration/waterman_cuda_env.sh b/scripts/trilinos-integration/waterman_cuda_env.sh index 0301eb0717..445b4f9697 100755 --- a/scripts/trilinos-integration/waterman_cuda_env.sh +++ b/scripts/trilinos-integration/waterman_cuda_env.sh @@ -30,7 +30,7 @@ export CUDA_LAUNCH_BLOCKING=1 export CUDA_MANAGED_FORCE_DEVICE_ALLOC=1 -export KOKKOS_EXTRA_FLAGS="" +export KOKKOS_EXTRA_FLAGS="-DKokkos_ENABLE_CUDA_LAMBDA=ON" scriptdir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && 
pwd )" echo "DIR=$scriptdir" NVCC_WRAPPER=`realpath $scriptdir/../../bin/nvcc_wrapper` diff --git a/scripts/trilinos-integration/white_cuda_env.sh b/scripts/trilinos-integration/white_cuda_env.sh index 4f6f4bcd9a..f3745ede8c 100755 --- a/scripts/trilinos-integration/white_cuda_env.sh +++ b/scripts/trilinos-integration/white_cuda_env.sh @@ -31,7 +31,7 @@ export CUDA_LAUNCH_BLOCKING=1 export CUDA_MANAGED_FORCE_DEVICE_ALLOC=1 -export KOKKOS_EXTRA_FLAGS="" +export KOKKOS_EXTRA_FLAGS="-DKokkos_ENABLE_CUDA_LAMBDA=ON" scriptdir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" NVCC_WRAPPER=`realpath $scriptdir/../../bin/nvcc_wrapper` export OMPI_CXX=$NVCC_WRAPPER From 1c0e3bf3fefa016405bbfa92bc8f0a35f79ceed8 Mon Sep 17 00:00:00 2001 From: Seyong Lee Date: Mon, 8 May 2023 09:00:36 -0400 Subject: [PATCH 423/496] Update the OpenACC parallel_reduce() constructs with Range/MDRange/Team (#6072) * Update the OpenACC parallel_reduce() constructs with Range/MDRange/Team Policy to support reductions on device data. * Update as suggested by the code review. * Add comments as suggested by the code review. * Undo the unit test CMake change. * Update the OpenACC parallel_reduce() implementations to correctly handle the cases where the number of iterations is zero. Update reduction-related unit tests to disable unsupported tests for the OpenACC backend. Update CMakeLists.txt in the unit test to enable reduction-related unit tests supported by the OpenACC backend. * Re-enabled supported unit tests. * Disable TestOpenACC_Reducers_a.cpp since it fails when compiled by NVHPC V22.5 or older * Disable unsupported unit test. 
--- .../Kokkos_OpenACC_ParallelReduce_MDRange.hpp | 33 +++++++++++++--- .../Kokkos_OpenACC_ParallelReduce_Range.hpp | 33 +++++++++++++--- .../Kokkos_OpenACC_ParallelReduce_Team.hpp | 38 ++++++++++++++---- core/unit_test/CMakeLists.txt | 39 +------------------ core/unit_test/TestReduce.hpp | 6 +++ core/unit_test/TestReducers.hpp | 18 +++++++++ 6 files changed, 112 insertions(+), 55 deletions(-) diff --git a/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_MDRange.hpp b/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_MDRange.hpp index 0ebd8b219f..2c7793dc11 100644 --- a/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_MDRange.hpp +++ b/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_MDRange.hpp @@ -51,6 +51,7 @@ class Kokkos::Impl::ParallelReduce @@ -58,22 +59,32 @@ class Kokkos::Impl::ParallelReduce::accessible) {} void execute() const { static_assert(1 < Policy::rank && Policy::rank < 7); static_assert(Policy::inner_direction == Iterate::Left || Policy::inner_direction == Iterate::Right); constexpr int rank = Policy::rank; + ValueType val; + const ReducerType& reducer = m_functor_reducer.get_reducer(); + reducer.init(&val); + for (int i = 0; i < rank; ++i) { if (m_policy.m_lower[i] >= m_policy.m_upper[i]) { + if (m_result_ptr_on_device) { + acc_memcpy_to_device(m_result_ptr, &val, sizeof(ValueType)); + } else { + *m_result_ptr = val; + } return; } } - ValueType val; - const ReducerType& reducer = m_functor_reducer.get_reducer(); - reducer.init(&val); + int const async_arg = m_policy.space().acc_async_queue(); Kokkos::Experimental::Impl::OpenACCParallelReduceMDRangeHelper( Kokkos::Experimental::Impl::FunctorAdapter< @@ -85,8 +96,20 @@ class Kokkos::Impl::ParallelReduce, typename ReducerType::functor_type>(val), m_policy); + // OpenACC backend supports only built-in Reducer types; thus + // reducer.final() below is a no-op. 
reducer.final(&val); - *m_result_ptr = val; + // acc_wait(async_arg) in the below if-else statements is needed because the + // above OpenACC compute kernel can be executed asynchronously and val is a + // local host variable. + if (m_result_ptr_on_device) { + acc_memcpy_to_device_async(m_result_ptr, &val, sizeof(ValueType), + async_arg); + acc_wait(async_arg); + } else { + acc_wait(async_arg); + *m_result_ptr = val; + } } }; diff --git a/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_Range.hpp b/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_Range.hpp index e70b8997f0..b61a05a8ee 100644 --- a/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_Range.hpp +++ b/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_Range.hpp @@ -52,6 +52,7 @@ class Kokkos::Impl::ParallelReduce @@ -59,19 +60,29 @@ class Kokkos::Impl::ParallelReduce::accessible) {} void execute() const { auto const begin = m_policy.begin(); auto const end = m_policy.end(); + ValueType val; + ReducerType const& reducer = m_functor_reducer.get_reducer(); + reducer.init(&val); + if (end <= begin) { + if (m_result_ptr_on_device == false) { + *m_result_ptr = val; + } else { + acc_memcpy_to_device(m_result_ptr, &val, sizeof(ValueType)); + } return; } - ValueType val; - ReducerType const& reducer = m_functor_reducer.get_reducer(); - reducer.init(&val); + int const async_arg = m_policy.space().acc_async_queue(); Kokkos::Experimental::Impl::OpenACCParallelReduceHelper( Kokkos::Experimental::Impl::FunctorAdapter< @@ -83,8 +94,20 @@ class Kokkos::Impl::ParallelReduce, typename ReducerType::functor_type>(val), m_policy); + // OpenACC backend supports only built-in Reducer types; thus + // reducer.final() below is a no-op. reducer.final(&val); - *m_result_ptr = val; + // acc_wait(async_arg) in the below if-else statements is needed because the + // above OpenACC compute kernel can be executed asynchronously and val is a + // local host variable. 
+ if (m_result_ptr_on_device == false) { + acc_wait(async_arg); + *m_result_ptr = val; + } else { + acc_memcpy_to_device_async(m_result_ptr, &val, sizeof(ValueType), + async_arg); + acc_wait(async_arg); + } } }; diff --git a/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_Team.hpp b/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_Team.hpp index d572072aba..3223ce3f9a 100644 --- a/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_Team.hpp +++ b/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_Team.hpp @@ -63,6 +63,7 @@ class Kokkos::Impl::ParallelReduce, - Sum, typename ReducerType::functor_type>(tmp), + Sum, typename ReducerType::functor_type>(val), m_policy); - reducer.final(&tmp); - - m_result_ptr[0] = tmp; + // OpenACC backend supports only built-in Reducer types; thus + // reducer.final() below is a no-op. + reducer.final(&val); + // acc_wait(async_arg) in the below if-else statements is needed because the + // above OpenACC compute kernel can be executed asynchronously and val is a + // local host variable. 
+ if (m_result_ptr_on_device == false) { + acc_wait(async_arg); + *m_result_ptr = val; + } else { + acc_memcpy_to_device_async(m_result_ptr, &val, sizeof(value_type), + async_arg); + acc_wait(async_arg); + } } template @@ -93,7 +114,10 @@ class Kokkos::Impl::ParallelReduce::accessible) {} }; namespace Kokkos { diff --git a/core/unit_test/CMakeLists.txt b/core/unit_test/CMakeLists.txt index 0b48eba9ea..2c4262e3ff 100644 --- a/core/unit_test/CMakeLists.txt +++ b/core/unit_test/CMakeLists.txt @@ -367,23 +367,16 @@ if(Kokkos_ENABLE_OPENACC) list(REMOVE_ITEM OpenACC_SOURCES ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_AtomicOperations_complexdouble.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_AtomicOperations_complexfloat.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_CommonPolicyConstructors.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_CommonPolicyInterface.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_Crs.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_JoinBackwardCompatibility.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_LocalDeepCopy.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_MDRangePolicyConstructors.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_Other.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_RangePolicyConstructors.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_TeamCombinedReducers.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_TeamMDRange.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_TeamPolicyConstructors.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_TeamReductionScan.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_TeamScan.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_TeamVectorRange.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_ViewAPI_e.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_ViewCopy_a.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_ViewCopy_b.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_ViewMapping_subview.cpp 
${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_ViewOfClass.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_WorkGraph.cpp @@ -492,61 +485,31 @@ IF(KOKKOS_ENABLE_OPENACC AND KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_Atomics.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_AtomicViews.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_BlockSizeDeduction.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_CommonPolicyConstructors.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_CommonPolicyInterface.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_DeepCopyAlignment.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_HostSharedPtr.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_HostSharedPtrAccessOnDevice.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_MathematicalFunctions.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_MathematicalFunctions1.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_MathematicalFunctions2.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_MathematicalFunctions3.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_MDRange_a.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_MDRange_b.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_MDRange_c.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_MDRange_d.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_MDRange_e.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_MDRange_f.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_MDRange_g.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_MDRangePolicyConstructors.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_NumericTraits.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_RangePolicy.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_RangePolicyConstructors.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_RangePolicyRequire.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_Reduce.cpp - 
${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_Reducers_a.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_Reducers_b.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_Reducers_c.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_Reducers_a.cpp #fails if NVHPC V22.5 or lower. ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_Reducers_d.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_Reducers_e.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_Reductions.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_Reductions_DeviceView.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_SubView_a.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_SubView_b.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_SubView_c01.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_SubView_c02.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_SubView_c03.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_SubView_c04.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_SubView_c05.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_SubView_c06.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_SubView_c07.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_SubView_c08.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_SubView_c09.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_SubView_c10.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_SubView_c11.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_SubView_c12.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_SubView_c13.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_TeamBasic.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_TeamPolicyConstructors.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_TeamScratch.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_TeamTeamSize.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_TeamVectorRange.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_UniqueToken.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_ViewAPI_a.cpp - 
${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_ViewAPI_b.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_ViewAPI_c.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_ViewAPI_d.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_ViewAPI_f.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_ViewMapping_b.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_ViewResize.cpp ) diff --git a/core/unit_test/TestReduce.hpp b/core/unit_test/TestReduce.hpp index 4cf30f6fbe..e1aa851f10 100644 --- a/core/unit_test/TestReduce.hpp +++ b/core/unit_test/TestReduce.hpp @@ -369,7 +369,10 @@ class TestReduceDynamic { TestReduceDynamic(const size_type nwork) { run_test_dynamic(nwork); +#ifndef KOKKOS_ENABLE_OPENACC + // FIXME_OPENACC - OpenACC (V3.3) does not support custom reductions. run_test_dynamic_minmax(nwork); +#endif run_test_dynamic_final(nwork); } @@ -542,6 +545,8 @@ TEST(TEST_CATEGORY, int64_t_reduce_dynamic_view) { // FIXME_OPENMPTARGET: Not yet implemented. #ifndef KOKKOS_ENABLE_OPENMPTARGET +// FIXME_OPENACC: Not yet implemented. +#ifndef KOKKOS_ENABLE_OPENACC TEST(TEST_CATEGORY, int_combined_reduce) { using functor_type = CombinedReduceFunctorSameType; constexpr uint64_t nw = 1000; @@ -619,4 +624,5 @@ TEST(TEST_CATEGORY, int_combined_reduce_mixed) { } } #endif +#endif } // namespace Test diff --git a/core/unit_test/TestReducers.hpp b/core/unit_test/TestReducers.hpp index 633b203afe..621cb28c9e 100644 --- a/core/unit_test/TestReducers.hpp +++ b/core/unit_test/TestReducers.hpp @@ -982,14 +982,23 @@ struct TestReducers { test_sum(10001); test_prod(35); test_min(10003); +#if !defined(KOKKOS_ENABLE_OPENACC) + // FIXME_OPENACC - OpenACC (V3.3) does not support custom reductions. test_minloc(10003); +#endif test_max(10007); +#if !defined(KOKKOS_ENABLE_OPENACC) + // FIXME_OPENACC - OpenACC (V3.3) does not support custom reductions. 
test_maxloc(10007); +#endif #if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_COMPILER_CLANG) && \ (KOKKOS_COMPILER_CLANG < 1300) // FIXME_OPENMPTARGET - The minmaxloc test fails llvm <= 13 version. #else +#if !defined(KOKKOS_ENABLE_OPENACC) + // FIXME_OPENACC - OpenACC (V3.3) does not support custom reductions. test_minmaxloc(10007); +#endif #endif } @@ -1000,14 +1009,23 @@ struct TestReducers { test_sum(10001); test_prod(sizeof(Scalar) > 4 ? 35 : 19); // avoid int overflow (see above) test_min(10003); +#if !defined(KOKKOS_ENABLE_OPENACC) + // FIXME_OPENACC - OpenACC (V3.3) does not support custom reductions. test_minloc(10003); +#endif test_max(10007); +#if !defined(KOKKOS_ENABLE_OPENACC) + // FIXME_OPENACC - OpenACC (V3.3) does not support custom reductions. test_maxloc(10007); +#endif #if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_COMPILER_CLANG) && \ (KOKKOS_COMPILER_CLANG < 1300) // FIXME_OPENMPTARGET - The minmaxloc test fails llvm <= 13 version. #else +#if !defined(KOKKOS_ENABLE_OPENACC) + // FIXME_OPENACC - OpenACC (V3.3) does not support custom reductions. 
test_minmaxloc(10007); +#endif #endif test_BAnd(35); test_BOr(35); From 798efc5a38e8e57a4982bc235a301ccd46a12a4d Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Mon, 8 May 2023 12:04:45 -0400 Subject: [PATCH 424/496] Always pass -extended-lambda option to NVCC and force Kokkos_ENABLE_CUDA_LAMBDA ON --- Makefile.kokkos | 14 ++++++-------- cmake/KokkosCore_config.h.in | 2 +- cmake/kokkos_arch.cmake | 8 +++----- cmake/kokkos_enable_options.cmake | 19 +++++++++++-------- 4 files changed, 21 insertions(+), 22 deletions(-) diff --git a/Makefile.kokkos b/Makefile.kokkos index 1234f4cc9e..dd272ad55a 100644 --- a/Makefile.kokkos +++ b/Makefile.kokkos @@ -665,15 +665,13 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1) endif endif - ifeq ($(KOKKOS_INTERNAL_CUDA_USE_LAMBDA), 1) - ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_CUDA_LAMBDA") - KOKKOS_CXXFLAGS += -expt-extended-lambda - endif + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_CUDA_LAMBDA") + KOKKOS_CXXFLAGS += -extended-lambda + endif - ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_CUDA_LAMBDA") - endif + ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_CUDA_LAMBDA") endif ifeq ($(KOKKOS_INTERNAL_CUDA_USE_CONSTEXPR), 1) diff --git a/cmake/KokkosCore_config.h.in b/cmake/KokkosCore_config.h.in index bcfa16d742..97af9ff88c 100644 --- a/cmake/KokkosCore_config.h.in +++ b/cmake/KokkosCore_config.h.in @@ -35,7 +35,7 @@ #cmakedefine KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE #cmakedefine KOKKOS_ENABLE_CUDA_UVM -#cmakedefine KOKKOS_ENABLE_CUDA_LAMBDA +#cmakedefine KOKKOS_ENABLE_CUDA_LAMBDA // deprecated #cmakedefine KOKKOS_ENABLE_CUDA_CONSTEXPR #cmakedefine KOKKOS_ENABLE_IMPL_CUDA_MALLOC_ASYNC #cmakedefine KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE diff --git a/cmake/kokkos_arch.cmake 
b/cmake/kokkos_arch.cmake index 2ed5d1c610..46bc0dc260 100644 --- a/cmake/kokkos_arch.cmake +++ b/cmake/kokkos_arch.cmake @@ -162,11 +162,9 @@ ENDIF() #clear anything that might be in the cache GLOBAL_SET(KOKKOS_CUDA_OPTIONS) # Construct the Makefile options -IF (KOKKOS_ENABLE_CUDA_LAMBDA) - IF(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) - GLOBAL_APPEND(KOKKOS_CUDA_OPTIONS "-expt-extended-lambda") - GLOBAL_APPEND(KOKKOS_CUDA_OPTIONS "-Wext-lambda-captures-this") - ENDIF() +IF(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) + GLOBAL_APPEND(KOKKOS_CUDA_OPTIONS "-extended-lambda") + GLOBAL_APPEND(KOKKOS_CUDA_OPTIONS "-Wext-lambda-captures-this") ENDIF() IF (KOKKOS_ENABLE_CUDA_CONSTEXPR) diff --git a/cmake/kokkos_enable_options.cmake b/cmake/kokkos_enable_options.cmake index 7d8026989a..96b9413999 100644 --- a/cmake/kokkos_enable_options.cmake +++ b/cmake/kokkos_enable_options.cmake @@ -67,14 +67,7 @@ mark_as_advanced(Kokkos_ENABLE_IMPL_MDSPAN) mark_as_advanced(Kokkos_ENABLE_MDSPAN_EXTERNAL) mark_as_advanced(Kokkos_ENABLE_IMPL_SKIP_COMPILER_MDSPAN) -IF (Trilinos_ENABLE_Kokkos AND TPL_ENABLE_CUDA) - SET(CUDA_LAMBDA_DEFAULT ON) -ELSEIF (KOKKOS_ENABLE_CUDA) - SET(CUDA_LAMBDA_DEFAULT ON) -ELSE() - SET(CUDA_LAMBDA_DEFAULT OFF) -ENDIF() -KOKKOS_ENABLE_OPTION(CUDA_LAMBDA ${CUDA_LAMBDA_DEFAULT} "Whether to activate experimental lambda features") +KOKKOS_ENABLE_OPTION(CUDA_LAMBDA ON "Whether to allow lambda expressions on the device with NVCC **DEPRECATED**") IF (Trilinos_ENABLE_Kokkos) SET(COMPLEX_ALIGN_DEFAULT OFF) ELSE() @@ -168,6 +161,16 @@ IF(Kokkos_ENABLE_CUDA_LDG_INTRINSIC) MESSAGE(FATAL_ERROR "Kokkos_ENABLE_CUDA_LDG_INTRINSIC has been removed. LDG intrinsics are always enabled.") ENDIF() ENDIF() +IF(NOT Kokkos_ENABLE_CUDA_LAMBDA) + IF(KOKKOS_ENABLE_DEPRECATED_CODE_4) + MESSAGE(DEPRECATION "Setting Kokkos_ENABLE_CUDA_LAMBDA is deprecated. Lambda expressions in device code are always enabled. 
Forcing -DKokkos_ENABLE_CUDA_LAMBDA=ON") + set(Kokkos_ENABLE_CUDA_LAMBDA ON CACHE BOOL "Kokkos turned Cuda lambda support ON!" FORCE) + set(KOKKOS_ENABLE_CUDA_LAMBDA ON) + ELSE() + MESSAGE(FATAL_ERROR "Kokkos_ENABLE_CUDA_LAMBDA has been removed. Lambda expressions in device code always enabled.") + ENDIF() +ENDIF() + IF(DEFINED Kokkos_ENABLE_IMPL_DESUL_ATOMICS) MESSAGE(WARNING "Kokkos_ENABLE_IMPL_DESUL_ATOMICS option has been removed. Desul atomics cannot be disabled.") From cfbaf28e03a10f913ff78e7f12dae0638723a7b5 Mon Sep 17 00:00:00 2001 From: Thomas Conrad Clevenger Date: Mon, 8 May 2023 13:28:13 -0600 Subject: [PATCH 425/496] Reorganize ZeroMemset (#6087) * Reorg ZeroMemset - Move fwd decl to new impl/Kokkos_ZeroMemset_fwd.hpp - Move Cuda ZeroMemset to Cuda/Kokkos_Cuda_ZeroMemset.hpp - Move HIP ZeroMemset to HIP/Kokkos_HIP_ZeroMemset.hpp * Make new headers self-contained * Remove left over #endif Co-authored-by: Daniel Arndt * Separate default implementaion * Move SYCL and Serial impl to separate file * Fixes - indent - remove unecessary iostream - make new headers self contained * Change hostspace default file - name change: impl/Kokkos_ZeroMemset.hpp -> impl/Kokkos_HostSpace_ZeroMemset.hpp - move default exec impl back to copy views * Test: Use fwd decl in Kokkos_ViewMapping.hpp * template on ViewType * Add include for default HostSpace impl * Remove typename keyword in ZeroMemset input Co-authored-by: Damien L-G * fix indent --------- Co-authored-by: Daniel Arndt Co-authored-by: Damien L-G --- core/src/Cuda/Kokkos_Cuda.hpp | 22 -------- core/src/Cuda/Kokkos_Cuda_ZeroMemset.hpp | 46 ++++++++++++++++ core/src/HIP/Kokkos_HIP.hpp | 20 ------- core/src/HIP/Kokkos_HIP_ZeroMemset.hpp | 46 ++++++++++++++++ core/src/Kokkos_CopyViews.hpp | 15 +++--- core/src/Kokkos_Core_fwd.hpp | 3 -- core/src/Kokkos_HostSpace.hpp | 21 -------- core/src/SYCL/Kokkos_SYCL_DeepCopy.hpp | 20 ------- core/src/SYCL/Kokkos_SYCL_ZeroMemset.hpp | 47 +++++++++++++++++ 
core/src/Serial/Kokkos_Serial.hpp | 17 ------ core/src/Serial/Kokkos_Serial_ZeroMemset.hpp | 50 ++++++++++++++++++ core/src/decl/Kokkos_Declare_CUDA.hpp | 1 + core/src/decl/Kokkos_Declare_HIP.hpp | 1 + core/src/decl/Kokkos_Declare_SERIAL.hpp | 1 + core/src/decl/Kokkos_Declare_SYCL.hpp | 1 + core/src/impl/Kokkos_HostSpace_ZeroMemset.hpp | 52 +++++++++++++++++++ core/src/impl/Kokkos_ViewMapping.hpp | 11 ++-- core/src/impl/Kokkos_ZeroMemset_fwd.hpp | 29 +++++++++++ 18 files changed, 289 insertions(+), 114 deletions(-) create mode 100644 core/src/Cuda/Kokkos_Cuda_ZeroMemset.hpp create mode 100644 core/src/HIP/Kokkos_HIP_ZeroMemset.hpp create mode 100644 core/src/SYCL/Kokkos_SYCL_ZeroMemset.hpp create mode 100644 core/src/Serial/Kokkos_Serial_ZeroMemset.hpp create mode 100644 core/src/impl/Kokkos_HostSpace_ZeroMemset.hpp create mode 100644 core/src/impl/Kokkos_ZeroMemset_fwd.hpp diff --git a/core/src/Cuda/Kokkos_Cuda.hpp b/core/src/Cuda/Kokkos_Cuda.hpp index 3e237a65db..6c78a7984d 100644 --- a/core/src/Cuda/Kokkos_Cuda.hpp +++ b/core/src/Cuda/Kokkos_Cuda.hpp @@ -241,28 +241,6 @@ struct DeviceTypeTraits { }; } // namespace Experimental } // namespace Tools - -namespace Impl { - -template -struct ZeroMemset { - ZeroMemset(const Kokkos::Cuda& exec_space_instance, - const View& dst, - typename View::const_value_type&) { - KOKKOS_IMPL_CUDA_SAFE_CALL(cudaMemsetAsync( - dst.data(), 0, - dst.size() * sizeof(typename View::value_type), - exec_space_instance.cuda_stream())); - } - - ZeroMemset(const View& dst, - typename View::const_value_type&) { - KOKKOS_IMPL_CUDA_SAFE_CALL( - cudaMemset(dst.data(), 0, - dst.size() * sizeof(typename View::value_type))); - } -}; -} // namespace Impl } // namespace Kokkos /*--------------------------------------------------------------------------*/ diff --git a/core/src/Cuda/Kokkos_Cuda_ZeroMemset.hpp b/core/src/Cuda/Kokkos_Cuda_ZeroMemset.hpp new file mode 100644 index 0000000000..001cf0cce0 --- /dev/null +++ 
b/core/src/Cuda/Kokkos_Cuda_ZeroMemset.hpp @@ -0,0 +1,46 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#ifndef KOKKOS_CUDA_ZEROMEMSET_HPP +#define KOKKOS_CUDA_ZEROMEMSET_HPP + +#include +#include +#include + +namespace Kokkos { +namespace Impl { + +template +struct ZeroMemset> { + ZeroMemset(const Kokkos::Cuda& exec_space_instance, const View& dst, + typename View::const_value_type&) { + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaMemsetAsync( + dst.data(), 0, dst.size() * sizeof(typename View::value_type), + exec_space_instance.cuda_stream())); + } + + ZeroMemset(const View& dst, + typename View::const_value_type&) { + KOKKOS_IMPL_CUDA_SAFE_CALL( + cudaMemset(dst.data(), 0, + dst.size() * sizeof(typename View::value_type))); + } +}; + +} // namespace Impl +} // namespace Kokkos + +#endif // !defined(KOKKOS_CUDA_ZEROMEMSET_HPP) diff --git a/core/src/HIP/Kokkos_HIP.hpp b/core/src/HIP/Kokkos_HIP.hpp index d48be32280..d7cc25ffe1 100644 --- a/core/src/HIP/Kokkos_HIP.hpp +++ b/core/src/HIP/Kokkos_HIP.hpp @@ -137,26 +137,6 @@ struct DeviceTypeTraits { }; } // namespace Experimental } // namespace Tools - -namespace Impl { -template -struct ZeroMemset { - ZeroMemset(const HIP& exec_space, const View& dst, - typename View::const_value_type&) { - KOKKOS_IMPL_HIP_SAFE_CALL(hipMemsetAsync( - dst.data(), 0, - dst.size() * sizeof(typename View::value_type), - exec_space.hip_stream())); - } - - ZeroMemset(const View& dst, - typename View::const_value_type&) { - KOKKOS_IMPL_HIP_SAFE_CALL( - 
hipMemset(dst.data(), 0, - dst.size() * sizeof(typename View::value_type))); - } -}; -} // namespace Impl } // namespace Kokkos #endif diff --git a/core/src/HIP/Kokkos_HIP_ZeroMemset.hpp b/core/src/HIP/Kokkos_HIP_ZeroMemset.hpp new file mode 100644 index 0000000000..5c40d0fbc8 --- /dev/null +++ b/core/src/HIP/Kokkos_HIP_ZeroMemset.hpp @@ -0,0 +1,46 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#ifndef KOKKOS_HIP_ZEROMEMSET_HPP +#define KOKKOS_HIP_ZEROMEMSET_HPP + +#include +#include +#include + +namespace Kokkos { +namespace Impl { + +template +struct ZeroMemset> { + ZeroMemset(const HIP& exec_space, const View& dst, + typename View::const_value_type&) { + KOKKOS_IMPL_HIP_SAFE_CALL(hipMemsetAsync( + dst.data(), 0, dst.size() * sizeof(typename View::value_type), + exec_space.hip_stream())); + } + + ZeroMemset(const View& dst, + typename View::const_value_type&) { + KOKKOS_IMPL_HIP_SAFE_CALL( + hipMemset(dst.data(), 0, + dst.size() * sizeof(typename View::value_type))); + } +}; + +} // namespace Impl +} // namespace Kokkos + +#endif // !defined(KOKKOS_HIP_ZEROMEMSET_HPP) diff --git a/core/src/Kokkos_CopyViews.hpp b/core/src/Kokkos_CopyViews.hpp index 7bc07ff54c..02b9cc8bd1 100644 --- a/core/src/Kokkos_CopyViews.hpp +++ b/core/src/Kokkos_CopyViews.hpp @@ -25,6 +25,7 @@ static_assert(false, #include #include #include +#include //---------------------------------------------------------------------------- 
//---------------------------------------------------------------------------- @@ -1326,15 +1327,15 @@ inline void contiguous_fill( exec_space); } -template +// Default implementation for execution spaces that don't provide a definition +template struct ZeroMemset { - ZeroMemset(const ExecutionSpace& exec_space, const View& dst, - typename ViewTraits::const_value_type& value) { + ZeroMemset(const ExecutionSpace& exec_space, const ViewType& dst, + typename ViewType::const_value_type& value) { contiguous_fill(exec_space, dst, value); } - ZeroMemset(const View& dst, - typename ViewTraits::const_value_type& value) { + ZeroMemset(const ViewType& dst, typename ViewType::const_value_type& value) { contiguous_fill(ExecutionSpace(), dst, value); } }; @@ -1351,7 +1352,7 @@ contiguous_fill_or_memset( // leading to the significant performance issues #ifndef KOKKOS_ARCH_A64FX if (Impl::is_zero_byte(value)) - ZeroMemset(exec_space, dst, value); + ZeroMemset>(exec_space, dst, value); else #endif contiguous_fill(exec_space, dst, value); @@ -1383,7 +1384,7 @@ contiguous_fill_or_memset( // leading to the significant performance issues #ifndef KOKKOS_ARCH_A64FX if (Impl::is_zero_byte(value)) - ZeroMemset(dst, value); + ZeroMemset>(dst, value); else #endif contiguous_fill(exec_space_type(), dst, value); diff --git a/core/src/Kokkos_Core_fwd.hpp b/core/src/Kokkos_Core_fwd.hpp index 883807f9d2..525e8ed140 100644 --- a/core/src/Kokkos_Core_fwd.hpp +++ b/core/src/Kokkos_Core_fwd.hpp @@ -297,9 +297,6 @@ template struct DeepCopy; -template -struct ZeroMemset; - template diff --git a/core/src/Kokkos_HostSpace.hpp b/core/src/Kokkos_HostSpace.hpp index 0c8cd43a04..4990337ab9 100644 --- a/core/src/Kokkos_HostSpace.hpp +++ b/core/src/Kokkos_HostSpace.hpp @@ -241,27 +241,6 @@ namespace Kokkos { namespace Impl { -template -struct ZeroMemset { - ZeroMemset(const typename HostSpace::execution_space& exec, - const View& dst, - typename View::const_value_type&) { - // Host spaces, except for HPX, are 
synchronous and we need to fence for HPX - // since we can't properly enqueue a std::memset otherwise. - // We can't use exec.fence() directly since we don't have a full definition - // of HostSpace here. - hostspace_fence(exec); - using ValueType = typename View::value_type; - std::memset(dst.data(), 0, sizeof(ValueType) * dst.size()); - } - - ZeroMemset(const View& dst, - typename View::const_value_type&) { - using ValueType = typename View::value_type; - std::memset(dst.data(), 0, sizeof(ValueType) * dst.size()); - } -}; - template <> struct DeepCopy { DeepCopy(void* dst, const void* src, size_t n) { diff --git a/core/src/SYCL/Kokkos_SYCL_DeepCopy.hpp b/core/src/SYCL/Kokkos_SYCL_DeepCopy.hpp index c8285584b3..afc7eebd38 100644 --- a/core/src/SYCL/Kokkos_SYCL_DeepCopy.hpp +++ b/core/src/SYCL/Kokkos_SYCL_DeepCopy.hpp @@ -27,26 +27,6 @@ namespace Kokkos { namespace Impl { -template -struct ZeroMemset { - ZeroMemset(const Kokkos::Experimental::SYCL& exec_space, - const View& dst, - typename View::const_value_type&) { - auto event = exec_space.impl_internal_space_instance()->m_queue->memset( - dst.data(), 0, - dst.size() * sizeof(typename View::value_type)); - exec_space.impl_internal_space_instance() - ->m_queue->ext_oneapi_submit_barrier(std::vector{event}); - } - - ZeroMemset(const View& dst, - typename View::const_value_type&) { - Experimental::Impl::SYCLInternal::singleton().m_queue->memset( - dst.data(), 0, - dst.size() * sizeof(typename View::value_type)); - } -}; - void DeepCopySYCL(void* dst, const void* src, size_t n); void DeepCopyAsyncSYCL(const Kokkos::Experimental::SYCL& instance, void* dst, const void* src, size_t n); diff --git a/core/src/SYCL/Kokkos_SYCL_ZeroMemset.hpp b/core/src/SYCL/Kokkos_SYCL_ZeroMemset.hpp new file mode 100644 index 0000000000..883c323f2f --- /dev/null +++ b/core/src/SYCL/Kokkos_SYCL_ZeroMemset.hpp @@ -0,0 +1,47 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 
4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_SYCL_ZEROMEMSET_HPP +#define KOKKOS_SYCL_ZEROMEMSET_HPP + +#include +#include + +namespace Kokkos { +namespace Impl { + +template +struct ZeroMemset> { + ZeroMemset(const Kokkos::Experimental::SYCL& exec_space, + const View& dst, + typename View::const_value_type&) { + auto event = exec_space.impl_internal_space_instance()->m_queue->memset( + dst.data(), 0, dst.size() * sizeof(typename View::value_type)); + exec_space.impl_internal_space_instance() + ->m_queue->ext_oneapi_submit_barrier(std::vector{event}); + } + + ZeroMemset(const View& dst, + typename View::const_value_type&) { + Experimental::Impl::SYCLInternal::singleton().m_queue->memset( + dst.data(), 0, dst.size() * sizeof(typename View::value_type)); + } +}; + +} // namespace Impl +} // namespace Kokkos + +#endif // !defined(KOKKOS_SYCL_ZEROMEMSET_HPP) diff --git a/core/src/Serial/Kokkos_Serial.hpp b/core/src/Serial/Kokkos_Serial.hpp index ede3c96b8b..2ade37705e 100644 --- a/core/src/Serial/Kokkos_Serial.hpp +++ b/core/src/Serial/Kokkos_Serial.hpp @@ -207,23 +207,6 @@ struct DeviceTypeTraits { namespace Kokkos { namespace Impl { -// We only need to provide a specialization for Serial if there is a host -// parallel execution space since the specialization for -// DefaultHostExecutionSpace is defined elsewhere. 
-struct DummyExecutionSpace; -template -struct ZeroMemset< - std::conditional_t::value, - Serial, DummyExecutionSpace>, - DT, DP...> : public ZeroMemset { - using Base = ZeroMemset; - using Base::Base; - - ZeroMemset(const Serial&, const View& dst, - typename View::const_value_type& value) - : Base(dst, value) {} -}; - template <> struct MemorySpaceAccess { diff --git a/core/src/Serial/Kokkos_Serial_ZeroMemset.hpp b/core/src/Serial/Kokkos_Serial_ZeroMemset.hpp new file mode 100644 index 0000000000..3ec2dfbcfa --- /dev/null +++ b/core/src/Serial/Kokkos_Serial_ZeroMemset.hpp @@ -0,0 +1,50 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_SERIAL_ZEROMEMSET_HPP +#define KOKKOS_SERIAL_ZEROMEMSET_HPP + +#include +#include +#include + +#include + +namespace Kokkos { +namespace Impl { + +// We only need to provide a specialization for Serial if there is a host +// parallel execution space since the specialization for +// DefaultHostExecutionSpace is defined elsewhere. 
+struct DummyExecutionSpace; +template +struct ZeroMemset< + std::conditional_t::value, + Serial, DummyExecutionSpace>, + View> + : public ZeroMemset> { + using Base = ZeroMemset>; + using Base::Base; + + ZeroMemset(const Serial&, const View& dst, + typename View::const_value_type& value) + : Base(dst, value) {} +}; + +} // namespace Impl +} // namespace Kokkos + +#endif // !defined(KOKKOS_SERIAL_ZEROMEMSET_HPP) diff --git a/core/src/decl/Kokkos_Declare_CUDA.hpp b/core/src/decl/Kokkos_Declare_CUDA.hpp index 79d432a35e..ebdf2c8211 100644 --- a/core/src/decl/Kokkos_Declare_CUDA.hpp +++ b/core/src/decl/Kokkos_Declare_CUDA.hpp @@ -31,6 +31,7 @@ #include #include #include +#include #endif #endif diff --git a/core/src/decl/Kokkos_Declare_HIP.hpp b/core/src/decl/Kokkos_Declare_HIP.hpp index 09ea882600..e115f7051f 100644 --- a/core/src/decl/Kokkos_Declare_HIP.hpp +++ b/core/src/decl/Kokkos_Declare_HIP.hpp @@ -30,6 +30,7 @@ #include #include #include +#include namespace Kokkos { namespace Experimental { diff --git a/core/src/decl/Kokkos_Declare_SERIAL.hpp b/core/src/decl/Kokkos_Declare_SERIAL.hpp index 6095901f05..86b044bee5 100644 --- a/core/src/decl/Kokkos_Declare_SERIAL.hpp +++ b/core/src/decl/Kokkos_Declare_SERIAL.hpp @@ -20,6 +20,7 @@ #if defined(KOKKOS_ENABLE_SERIAL) #include #include +#include #endif #endif diff --git a/core/src/decl/Kokkos_Declare_SYCL.hpp b/core/src/decl/Kokkos_Declare_SYCL.hpp index aa884c1065..a810bb0c76 100644 --- a/core/src/decl/Kokkos_Declare_SYCL.hpp +++ b/core/src/decl/Kokkos_Declare_SYCL.hpp @@ -28,6 +28,7 @@ #include #include #include +#include #endif #endif diff --git a/core/src/impl/Kokkos_HostSpace_ZeroMemset.hpp b/core/src/impl/Kokkos_HostSpace_ZeroMemset.hpp new file mode 100644 index 0000000000..b373167127 --- /dev/null +++ b/core/src/impl/Kokkos_HostSpace_ZeroMemset.hpp @@ -0,0 +1,52 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 
4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_HOSTSPACE_ZEROMEMSET_HPP +#define KOKKOS_HOSTSPACE_ZEROMEMSET_HPP + +#include +#include +#include + +#include + +namespace Kokkos { +namespace Impl { + +template +struct ZeroMemset> { + ZeroMemset(const HostSpace::execution_space& exec, const View& dst, + typename View::const_value_type&) { + // Host spaces, except for HPX, are synchronous and we need to fence for HPX + // since we can't properly enqueue a std::memset otherwise. + // We can't use exec.fence() directly since we don't have a full definition + // of HostSpace here. + hostspace_fence(exec); + using ValueType = typename View::value_type; + std::memset(dst.data(), 0, sizeof(ValueType) * dst.size()); + } + + ZeroMemset(const View& dst, + typename View::const_value_type&) { + using ValueType = typename View::value_type; + std::memset(dst.data(), 0, sizeof(ValueType) * dst.size()); + } +}; + +} // end namespace Impl +} // end namespace Kokkos + +#endif // KOKKOS_HOSTSPACE_ZEROMEMSET_HPP diff --git a/core/src/impl/Kokkos_ViewMapping.hpp b/core/src/impl/Kokkos_ViewMapping.hpp index 232a05bbc4..877004ae4c 100644 --- a/core/src/impl/Kokkos_ViewMapping.hpp +++ b/core/src/impl/Kokkos_ViewMapping.hpp @@ -32,6 +32,7 @@ #include #include #include +#include //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- @@ -2918,8 +2919,9 @@ struct ViewValueFunctor { "Kokkos::View::initialization [" + name + "] via memset", Kokkos::Profiling::Experimental::device_id(space), &kpID); 
} - (void)ZeroMemset>( + (void)ZeroMemset< + ExecSpace, Kokkos::View>>( space, Kokkos::View>(ptr, n), @@ -3055,8 +3057,9 @@ struct ViewValueFunctor { Kokkos::Profiling::Experimental::device_id(space), &kpID); } - (void)ZeroMemset>( + (void)ZeroMemset< + ExecSpace, Kokkos::View>>( space, Kokkos::View>(ptr, n), diff --git a/core/src/impl/Kokkos_ZeroMemset_fwd.hpp b/core/src/impl/Kokkos_ZeroMemset_fwd.hpp new file mode 100644 index 0000000000..f36e72e914 --- /dev/null +++ b/core/src/impl/Kokkos_ZeroMemset_fwd.hpp @@ -0,0 +1,29 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_ZEROMEMSET_FWD_HPP +#define KOKKOS_ZEROMEMSET_FWD_HPP + +namespace Kokkos { +namespace Impl { + +template +struct ZeroMemset; + +} // namespace Impl +} // namespace Kokkos + +#endif // #ifndef KOKKOS_ZEROMEMSET_FWD_HPP From 0954a1b0a28063211c02b83adb387c19ef2827d1 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Mon, 8 May 2023 18:15:05 -0400 Subject: [PATCH 426/496] Drop CUDA_LAMBDA guards in Cuda headers --- core/src/setup/Kokkos_Setup_Cuda.hpp | 6 ------ 1 file changed, 6 deletions(-) diff --git a/core/src/setup/Kokkos_Setup_Cuda.hpp b/core/src/setup/Kokkos_Setup_Cuda.hpp index c57f690ae1..1130485e84 100644 --- a/core/src/setup/Kokkos_Setup_Cuda.hpp +++ b/core/src/setup/Kokkos_Setup_Cuda.hpp @@ -53,15 +53,9 @@ #error "Cuda device capability >= 3.0 is required." 
#endif -#ifdef KOKKOS_ENABLE_CUDA_LAMBDA #define KOKKOS_LAMBDA [=] __host__ __device__ - #define KOKKOS_CLASS_LAMBDA [ =, *this ] __host__ __device__ -#else // !defined(KOKKOS_ENABLE_CUDA_LAMBDA) -#undef KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA -#endif // !defined(KOKKOS_ENABLE_CUDA_LAMBDA) - #define KOKKOS_IMPL_FORCEINLINE_FUNCTION __device__ __host__ __forceinline__ #define KOKKOS_IMPL_FORCEINLINE __forceinline__ #define KOKKOS_IMPL_INLINE_FUNCTION __device__ __host__ inline From 417a6ee735d650bd20116a1e999e6854594e8582 Mon Sep 17 00:00:00 2001 From: Christian Trott Date: Fri, 5 May 2023 12:07:52 -0600 Subject: [PATCH 427/496] Work around NVHPC 23.x not dealing with __isGlobal --- tpls/desul/include/desul/atomics/cuda/CUDA_asm.hpp | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/tpls/desul/include/desul/atomics/cuda/CUDA_asm.hpp b/tpls/desul/include/desul/atomics/cuda/CUDA_asm.hpp index 96e0dfa269..9471862a6b 100644 --- a/tpls/desul/include/desul/atomics/cuda/CUDA_asm.hpp +++ b/tpls/desul/include/desul/atomics/cuda/CUDA_asm.hpp @@ -2,10 +2,15 @@ namespace desul { namespace Impl { // Choose the variant of atomics we are using later +// The __isGlobal intrinsic was only introduced in CUDA 11.2 +// It also stopped working in NVC++ 23.1 - it works in 22.11 +// this is a bug in NVHPC, not treating CUDA intrinsics correctly +// FIXME_NVHPC #if !defined(DESUL_IMPL_ATOMIC_CUDA_PTX_PREDICATE) && \ !defined(DESUL_IMPL_ATOMIC_CUDA_PTX_ISGLOBAL) -#if (__CUDACC_VER_MAJOR__ > 11) || \ - ((__CUDACC_VER_MAJOR__ == 11) && (__CUDACC_VER_MINOR__ > 1)) +#if ((__CUDACC_VER_MAJOR__ > 11) || \ + ((__CUDACC_VER_MAJOR__ == 11) && (__CUDACC_VER_MINOR__ > 1))) && \ + (!defined(__NVCOMPILER) || __NVCOMPILER_MAJOR__ < 23) #define DESUL_IMPL_ATOMIC_CUDA_PTX_ISGLOBAL #else #define DESUL_IMPL_ATOMIC_CUDA_PTX_PREDICATE From b82161b2d4797580652a2a3fd9964573e4bc336b Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Mon, 8 May 2023 21:54:53 -0400 Subject: [PATCH 428/496] Drop 
unused cmake macros --- cmake/fake_tribits.cmake | 30 ------------------------------ 1 file changed, 30 deletions(-) diff --git a/cmake/fake_tribits.cmake b/cmake/fake_tribits.cmake index 39822b8aad..5a62fdce6b 100644 --- a/cmake/fake_tribits.cmake +++ b/cmake/fake_tribits.cmake @@ -11,21 +11,6 @@ FUNCTION(ASSERT_DEFINED VARS) ENDFOREACH() ENDFUNCTION() -MACRO(KOKKOS_ADD_OPTION_AND_DEFINE USER_OPTION_NAME MACRO_DEFINE_NAME DOCSTRING DEFAULT_VALUE ) -SET( ${USER_OPTION_NAME} "${DEFAULT_VALUE}" CACHE BOOL "${DOCSTRING}" ) -IF(NOT ${MACRO_DEFINE_NAME} STREQUAL "") - IF(${USER_OPTION_NAME}) - GLOBAL_SET(${MACRO_DEFINE_NAME} ON) - ELSE() - GLOBAL_SET(${MACRO_DEFINE_NAME} OFF) - ENDIF() -ENDIF() -ENDMACRO() - -MACRO(GLOBAL_OVERWRITE VARNAME VALUE TYPE) - SET(${VARNAME} ${VALUE} CACHE ${TYPE} "" FORCE) -ENDMACRO() - IF (NOT KOKKOS_HAS_TRILINOS) MACRO(APPEND_GLOB VAR) FILE(GLOB LOCAL_TMP_VAR ${ARGN}) @@ -40,21 +25,6 @@ MACRO(PREPEND_GLOBAL_SET VARNAME) ASSERT_DEFINED(${VARNAME}) GLOBAL_SET(${VARNAME} ${ARGN} ${${VARNAME}}) ENDMACRO() - -MACRO(PREPEND_TARGET_SET VARNAME TARGET_NAME TYPE) - IF(TYPE STREQUAL "REQUIRED") - SET(REQUIRED TRUE) - ELSE() - SET(REQUIRED FALSE) - ENDIF() - IF(TARGET ${TARGET_NAME}) - PREPEND_GLOBAL_SET(${VARNAME} ${TARGET_NAME}) - ELSE() - IF(REQUIRED) - MESSAGE(FATAL_ERROR "Missing dependency ${TARGET_NAME}") - ENDIF() - ENDIF() -ENDMACRO() endif() MACRO(ADD_INTERFACE_LIBRARY LIB_NAME) From ef5d44707e5fb36c634e0ae1ce94035ec1279b07 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Mon, 8 May 2023 21:55:10 -0400 Subject: [PATCH 429/496] Fixup cmake style --- cmake/fake_tribits.cmake | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cmake/fake_tribits.cmake b/cmake/fake_tribits.cmake index 5a62fdce6b..eedae14d45 100644 --- a/cmake/fake_tribits.cmake +++ b/cmake/fake_tribits.cmake @@ -11,7 +11,7 @@ FUNCTION(ASSERT_DEFINED VARS) ENDFOREACH() ENDFUNCTION() -IF (NOT KOKKOS_HAS_TRILINOS) +IF(NOT KOKKOS_HAS_TRILINOS) MACRO(APPEND_GLOB VAR) 
FILE(GLOB LOCAL_TMP_VAR ${ARGN}) LIST(APPEND ${VAR} ${LOCAL_TMP_VAR}) @@ -25,7 +25,7 @@ MACRO(PREPEND_GLOBAL_SET VARNAME) ASSERT_DEFINED(${VARNAME}) GLOBAL_SET(${VARNAME} ${ARGN} ${${VARNAME}}) ENDMACRO() -endif() +ENDIF() MACRO(ADD_INTERFACE_LIBRARY LIB_NAME) FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/dummy.cpp "") From 81ce338ac8f7fd63d5cf0648263356923f5cbc70 Mon Sep 17 00:00:00 2001 From: Francesco Rizzi Date: Tue, 9 May 2023 12:57:21 +0200 Subject: [PATCH 430/496] use ASSERT_EQ in all std algorithms tests --- .../unit_tests/TestRandomAccessIterator.cpp | 22 ++--- .../TestStdAlgorithmsAdjacentDifference.cpp | 10 +-- .../TestStdAlgorithmsAdjacentFind.cpp | 2 +- .../unit_tests/TestStdAlgorithmsCommon.hpp | 8 +- .../TestStdAlgorithmsConstraints.cpp | 4 +- .../unit_tests/TestStdAlgorithmsCopyIf.cpp | 68 +++++++-------- .../unit_tests/TestStdAlgorithmsCount.cpp | 20 ++--- .../TestStdAlgorithmsExclusiveScan.cpp | 38 ++++---- .../unit_tests/TestStdAlgorithmsFind.cpp | 30 +++---- .../unit_tests/TestStdAlgorithmsFindEnd.cpp | 8 +- .../TestStdAlgorithmsFindFirstOf.cpp | 8 +- .../unit_tests/TestStdAlgorithmsForEach.cpp | 8 +- .../TestStdAlgorithmsInclusiveScan.cpp | 38 ++++---- .../TestStdAlgorithmsIsSortedUntil.cpp | 16 ++-- ...estStdAlgorithmsLexicographicalCompare.cpp | 20 ++--- .../TestStdAlgorithmsMinMaxElementOps.cpp | 20 ++--- .../unit_tests/TestStdAlgorithmsMismatch.cpp | 16 ++-- .../unit_tests/TestStdAlgorithmsModOps.cpp | 34 ++++---- .../unit_tests/TestStdAlgorithmsModSeqOps.cpp | 86 +++++++++---------- .../TestStdAlgorithmsMoveBackward.cpp | 8 +- .../unit_tests/TestStdAlgorithmsNumerics.cpp | 72 ++++++++-------- .../TestStdAlgorithmsPartitionCopy.cpp | 36 ++++---- .../TestStdAlgorithmsPartitioningOps.cpp | 18 ++-- .../unit_tests/TestStdAlgorithmsRemove.cpp | 4 +- .../TestStdAlgorithmsRemoveCopy.cpp | 4 +- .../TestStdAlgorithmsRemoveCopyIf.cpp | 4 +- .../unit_tests/TestStdAlgorithmsRemoveIf.cpp | 4 +- .../unit_tests/TestStdAlgorithmsReplace.cpp | 24 +++--- 
.../TestStdAlgorithmsReplaceCopy.cpp | 56 ++++++------ .../TestStdAlgorithmsReplaceCopyIf.cpp | 56 ++++++------ .../unit_tests/TestStdAlgorithmsReplaceIf.cpp | 2 +- .../unit_tests/TestStdAlgorithmsReverse.cpp | 2 +- .../unit_tests/TestStdAlgorithmsRotate.cpp | 4 +- .../TestStdAlgorithmsRotateCopy.cpp | 10 +-- .../unit_tests/TestStdAlgorithmsSearch.cpp | 8 +- .../unit_tests/TestStdAlgorithmsSearch_n.cpp | 8 +- .../unit_tests/TestStdAlgorithmsShiftLeft.cpp | 4 +- .../TestStdAlgorithmsShiftRight.cpp | 4 +- ...estStdAlgorithmsTransformExclusiveScan.cpp | 30 +++---- ...estStdAlgorithmsTransformInclusiveScan.cpp | 30 +++---- .../TestStdAlgorithmsTransformUnaryOp.cpp | 10 +-- .../unit_tests/TestStdAlgorithmsUnique.cpp | 10 +-- .../TestStdAlgorithmsUniqueCopy.cpp | 70 +++++++-------- 43 files changed, 467 insertions(+), 467 deletions(-) diff --git a/algorithms/unit_tests/TestRandomAccessIterator.cpp b/algorithms/unit_tests/TestRandomAccessIterator.cpp index 439d171c8a..fd3a875b1e 100644 --- a/algorithms/unit_tests/TestRandomAccessIterator.cpp +++ b/algorithms/unit_tests/TestRandomAccessIterator.cpp @@ -54,7 +54,7 @@ void test_random_access_it_verify(IteratorType it, ValueType gold_value) { Kokkos::parallel_for("_std_algo_copy", 1, cf); auto v_h = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), checkView); - EXPECT_EQ(v_h(), gold_value); + ASSERT_EQ(v_h(), gold_value); } TEST_F(random_access_iterator_test, dereference) { @@ -96,9 +96,9 @@ void test_random_access_it_subscript_op_verify(IteratorType it) { auto v_h = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), checkView); - EXPECT_EQ(v_h(0), (value_t)0); - EXPECT_EQ(v_h(1), (value_t)1); - EXPECT_EQ(v_h(2), (value_t)2); + ASSERT_EQ(v_h(0), (value_t)0); + ASSERT_EQ(v_h(1), (value_t)1); + ASSERT_EQ(v_h(2), (value_t)2); } TEST_F(random_access_iterator_test, subscript_operator) { @@ -188,9 +188,9 @@ TEST_F(random_access_iterator_test, operatorsSet4) { auto it7 = KE::Impl::RandomAccessIterator(m_static_view, 
3); auto it8 = KE::Impl::RandomAccessIterator(m_dynamic_view, 3); auto it9 = KE::Impl::RandomAccessIterator(m_strided_view, 3); - EXPECT_EQ(it1, it7); - EXPECT_EQ(it2, it8); - EXPECT_EQ(it3, it9); + ASSERT_EQ(it1, it7); + ASSERT_EQ(it2, it8); + ASSERT_EQ(it3, it9); EXPECT_GE(it1, it7); EXPECT_GE(it2, it8); EXPECT_GE(it3, it9); @@ -205,16 +205,16 @@ TEST_F(random_access_iterator_test, assignment_operator) { EXPECT_NE(it1, it2); it2 = it1; - EXPECT_EQ(it1, it2); + ASSERT_EQ(it1, it2); } TEST_F(random_access_iterator_test, distance) { auto first = KE::begin(m_dynamic_view); auto last = KE::end(m_dynamic_view); - EXPECT_EQ(0, KE::distance(first, first)); - EXPECT_EQ(1, KE::distance(first, first + 1)); - EXPECT_EQ(m_dynamic_view.extent(0), size_t(KE::distance(first, last))); + ASSERT_EQ(0, KE::distance(first, first)); + ASSERT_EQ(1, KE::distance(first, first + 1)); + ASSERT_EQ(m_dynamic_view.extent(0), size_t(KE::distance(first, last))); } } // namespace stdalgos diff --git a/algorithms/unit_tests/TestStdAlgorithmsAdjacentDifference.cpp b/algorithms/unit_tests/TestStdAlgorithmsAdjacentDifference.cpp index d414d524b6..75ad533f6e 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsAdjacentDifference.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsAdjacentDifference.cpp @@ -157,7 +157,7 @@ void verify_data(TestViewType test_view, GoldViewType gold) { const auto gold_h = create_mirror_view_and_copy(Kokkos::HostSpace(), gold); for (std::size_t i = 0; i < test_view.extent(0); ++i) { - EXPECT_EQ(gold_h(i), test_view_dc_h(i)); + ASSERT_EQ(gold_h(i), test_view_dc_h(i)); } } @@ -197,7 +197,7 @@ void run_single_scenario(const InfoType& scenario_info, auto res1 = KE::adjacent_difference(exespace(), KE::cbegin(view_from), KE::cend(view_from), KE::begin(view_dest), args...); - EXPECT_EQ(res1, KE::end(view_dest)); + ASSERT_EQ(res1, KE::end(view_dest)); verify_data(view_dest, gold); } @@ -207,7 +207,7 @@ void run_single_scenario(const InfoType& scenario_info, auto res2 = 
KE::adjacent_difference( "label", exespace(), KE::cbegin(view_from), KE::cend(view_from), KE::begin(view_dest), args...); - EXPECT_EQ(res2, KE::end(view_dest)); + ASSERT_EQ(res2, KE::end(view_dest)); verify_data(view_dest, gold); } @@ -216,7 +216,7 @@ void run_single_scenario(const InfoType& scenario_info, create_view(Tag{}, view_ext, "adj_diff_dest_view"); auto res3 = KE::adjacent_difference(exespace(), view_from, view_dest, args...); - EXPECT_EQ(res3, KE::end(view_dest)); + ASSERT_EQ(res3, KE::end(view_dest)); verify_data(view_dest, gold); } @@ -225,7 +225,7 @@ void run_single_scenario(const InfoType& scenario_info, create_view(Tag{}, view_ext, "adj_diff_dest_view"); auto res4 = KE::adjacent_difference("label", exespace(), view_from, view_dest, args...); - EXPECT_EQ(res4, KE::end(view_dest)); + ASSERT_EQ(res4, KE::end(view_dest)); verify_data(view_dest, gold); } diff --git a/algorithms/unit_tests/TestStdAlgorithmsAdjacentFind.cpp b/algorithms/unit_tests/TestStdAlgorithmsAdjacentFind.cpp index 6fc9d583f3..fa4ff48dbe 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsAdjacentFind.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsAdjacentFind.cpp @@ -229,7 +229,7 @@ void verify(DiffType my_diff, ViewType view, Args... 
args) { my_std_adjacent_find(KE::cbegin(view_h), KE::cend(view_h), args...); const auto std_diff = std_r - KE::cbegin(view_h); - EXPECT_EQ(my_diff, std_diff); + ASSERT_EQ(my_diff, std_diff); } template diff --git a/algorithms/unit_tests/TestStdAlgorithmsCommon.hpp b/algorithms/unit_tests/TestStdAlgorithmsCommon.hpp index 694676a878..5b30b9eda7 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsCommon.hpp +++ b/algorithms/unit_tests/TestStdAlgorithmsCommon.hpp @@ -110,7 +110,7 @@ verify_values(ValueType expected, const ViewType view) { "Non-matching value types of view and reference value"); auto view_h = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), view); for (std::size_t i = 0; i < view_h.extent(0); i++) { - EXPECT_EQ(expected, view_h(i)); + ASSERT_EQ(expected, view_h(i)); } } @@ -130,7 +130,7 @@ verify_values(ValueType expected, const ViewType view) { auto view_h = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), tmpView); for (std::size_t i = 0; i < view_h.extent(0); i++) { - EXPECT_EQ(expected, view_h(i)); + ASSERT_EQ(expected, view_h(i)); } } @@ -147,7 +147,7 @@ compare_views(ViewType1 expected, const ViewType2 actual) { Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), actual); for (std::size_t i = 0; i < expected_h.extent(0); i++) { - EXPECT_EQ(expected_h(i), actual_h(i)); + ASSERT_EQ(expected_h(i), actual_h(i)); } } @@ -171,7 +171,7 @@ compare_views(ViewType1 expected, const ViewType2 actual) { Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), expected); for (std::size_t i = 0; i < expected_h.extent(0); i++) { - EXPECT_EQ(expected_h(i), actual_h(i)); + ASSERT_EQ(expected_h(i), actual_h(i)); } } diff --git a/algorithms/unit_tests/TestStdAlgorithmsConstraints.cpp b/algorithms/unit_tests/TestStdAlgorithmsConstraints.cpp index 5d55199801..386d533f7a 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsConstraints.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsConstraints.cpp @@ -42,8 +42,8 @@ TEST(std_algorithms, 
is_admissible_to_std_algorithms) { using strided_view_1d_t = Kokkos::View; Kokkos::LayoutStride layout1d{extent0, 2}; strided_view_1d_t strided_view_1d{"std-algo-test-1d-strided-view", layout1d}; - EXPECT_EQ(layout1d.dimension[0], 13u); - EXPECT_EQ(layout1d.stride[0], 2u); + ASSERT_EQ(layout1d.dimension[0], 13u); + ASSERT_EQ(layout1d.stride[0], 2u); // they are admissible KE::Impl::static_assert_is_admissible_to_kokkos_std_algorithms( static_view_1d); diff --git a/algorithms/unit_tests/TestStdAlgorithmsCopyIf.cpp b/algorithms/unit_tests/TestStdAlgorithmsCopyIf.cpp index e21d50f69b..5778e37be0 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsCopyIf.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsCopyIf.cpp @@ -135,49 +135,49 @@ void verify_data(const std::string& name, ViewTypeFrom view_from, } else if (name == "one-element-a") { - EXPECT_EQ(view_test_h(0), static_cast(0)); + ASSERT_EQ(view_test_h(0), static_cast(0)); } else if (name == "one-element-b") { - EXPECT_EQ(view_test_h(0), static_cast(2)); + ASSERT_EQ(view_test_h(0), static_cast(2)); } else if (name == "two-elements-a") { - EXPECT_EQ(view_test_h(0), static_cast(2)); - EXPECT_EQ(view_test_h(1), static_cast(0)); + ASSERT_EQ(view_test_h(0), static_cast(2)); + ASSERT_EQ(view_test_h(1), static_cast(0)); } else if (name == "two-elements-b") { - EXPECT_EQ(view_test_h(0), static_cast(2)); - EXPECT_EQ(view_test_h(1), static_cast(0)); + ASSERT_EQ(view_test_h(0), static_cast(2)); + ASSERT_EQ(view_test_h(1), static_cast(0)); } else if (name == "small-a") { - EXPECT_EQ(view_test_h(0), static_cast(-4)); - EXPECT_EQ(view_test_h(1), static_cast(-2)); - EXPECT_EQ(view_test_h(2), static_cast(0)); - EXPECT_EQ(view_test_h(3), static_cast(2)); - EXPECT_EQ(view_test_h(4), static_cast(4)); - EXPECT_EQ(view_test_h(5), static_cast(0)); - EXPECT_EQ(view_test_h(6), static_cast(0)); - EXPECT_EQ(view_test_h(7), static_cast(0)); - EXPECT_EQ(view_test_h(8), static_cast(0)); + ASSERT_EQ(view_test_h(0), static_cast(-4)); + 
ASSERT_EQ(view_test_h(1), static_cast(-2)); + ASSERT_EQ(view_test_h(2), static_cast(0)); + ASSERT_EQ(view_test_h(3), static_cast(2)); + ASSERT_EQ(view_test_h(4), static_cast(4)); + ASSERT_EQ(view_test_h(5), static_cast(0)); + ASSERT_EQ(view_test_h(6), static_cast(0)); + ASSERT_EQ(view_test_h(7), static_cast(0)); + ASSERT_EQ(view_test_h(8), static_cast(0)); } else if (name == "small-b") { - EXPECT_EQ(view_test_h(0), static_cast(22)); - EXPECT_EQ(view_test_h(1), static_cast(-12)); - EXPECT_EQ(view_test_h(2), static_cast(22)); - EXPECT_EQ(view_test_h(3), static_cast(-12)); - EXPECT_EQ(view_test_h(4), static_cast(22)); - EXPECT_EQ(view_test_h(5), static_cast(-12)); - EXPECT_EQ(view_test_h(6), static_cast(22)); - EXPECT_EQ(view_test_h(7), static_cast(-12)); - EXPECT_EQ(view_test_h(8), static_cast(22)); - EXPECT_EQ(view_test_h(9), static_cast(-12)); - EXPECT_EQ(view_test_h(10), static_cast(22)); - EXPECT_EQ(view_test_h(11), static_cast(-12)); - EXPECT_EQ(view_test_h(12), static_cast(22)); + ASSERT_EQ(view_test_h(0), static_cast(22)); + ASSERT_EQ(view_test_h(1), static_cast(-12)); + ASSERT_EQ(view_test_h(2), static_cast(22)); + ASSERT_EQ(view_test_h(3), static_cast(-12)); + ASSERT_EQ(view_test_h(4), static_cast(22)); + ASSERT_EQ(view_test_h(5), static_cast(-12)); + ASSERT_EQ(view_test_h(6), static_cast(22)); + ASSERT_EQ(view_test_h(7), static_cast(-12)); + ASSERT_EQ(view_test_h(8), static_cast(22)); + ASSERT_EQ(view_test_h(9), static_cast(-12)); + ASSERT_EQ(view_test_h(10), static_cast(22)); + ASSERT_EQ(view_test_h(11), static_cast(-12)); + ASSERT_EQ(view_test_h(12), static_cast(22)); } else if (name == "medium" || name == "large") { @@ -190,14 +190,14 @@ void verify_data(const std::string& name, ViewTypeFrom view_from, std::size_t count = 0; for (std::size_t i = 0; i < view_from_h.extent(0); ++i) { if (pred(view_from_h(i))) { - EXPECT_EQ(view_test_h(count), view_from_h(i)); + ASSERT_EQ(view_test_h(count), view_from_h(i)); count++; } } // all other entries of test view 
should be zero for (; count < view_test_h.extent(0); ++count) { // std::cout << count << '\n'; - EXPECT_EQ(view_test_h(count), value_type(0)); + ASSERT_EQ(view_test_h(count), value_type(0)); } } @@ -226,7 +226,7 @@ void run_single_scenario(const InfoType& scenario_info) { auto rit = KE::copy_if(exespace(), KE::cbegin(view_from), KE::cend(view_from), KE::begin(view_dest), pred); verify_data(name, view_from, view_dest, pred); - EXPECT_EQ(rit, (KE::begin(view_dest) + n)); + ASSERT_EQ(rit, (KE::begin(view_dest) + n)); } { @@ -235,7 +235,7 @@ void run_single_scenario(const InfoType& scenario_info) { auto rit = KE::copy_if("label", exespace(), KE::cbegin(view_from), KE::cend(view_from), KE::begin(view_dest), pred); verify_data(name, view_from, view_dest, pred); - EXPECT_EQ(rit, (KE::begin(view_dest) + n)); + ASSERT_EQ(rit, (KE::begin(view_dest) + n)); } { @@ -243,7 +243,7 @@ void run_single_scenario(const InfoType& scenario_info) { auto view_dest = create_view(Tag{}, view_ext, "copy_if_dest"); auto rit = KE::copy_if(exespace(), view_from, view_dest, pred); verify_data(name, view_from, view_dest, pred); - EXPECT_EQ(rit, (KE::begin(view_dest) + n)); + ASSERT_EQ(rit, (KE::begin(view_dest) + n)); } { @@ -251,7 +251,7 @@ void run_single_scenario(const InfoType& scenario_info) { auto view_dest = create_view(Tag{}, view_ext, "copy_if_dest"); auto rit = KE::copy_if("label", exespace(), view_from, view_dest, pred); verify_data(name, view_from, view_dest, pred); - EXPECT_EQ(rit, (KE::begin(view_dest) + n)); + ASSERT_EQ(rit, (KE::begin(view_dest) + n)); } Kokkos::fence(); diff --git a/algorithms/unit_tests/TestStdAlgorithmsCount.cpp b/algorithms/unit_tests/TestStdAlgorithmsCount.cpp index 9423d2e15a..32e9883709 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsCount.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsCount.cpp @@ -35,13 +35,13 @@ void test_count(const ViewType view) { const value_t count_value = 0; const auto std_result = std::count(KE::cbegin(expected), 
KE::cend(expected), count_value); - EXPECT_EQ(view.extent(0), size_t(std_result)); + ASSERT_EQ(view.extent(0), size_t(std_result)); // pass const iterators - EXPECT_EQ(std_result, KE::count(exespace(), KE::cbegin(view), + ASSERT_EQ(std_result, KE::count(exespace(), KE::cbegin(view), KE::cend(view), count_value)); // pass view - EXPECT_EQ(std_result, KE::count(exespace(), view, count_value)); + ASSERT_EQ(std_result, KE::count(exespace(), view, count_value)); } { @@ -50,10 +50,10 @@ void test_count(const ViewType view) { std::count(KE::cbegin(expected), KE::cend(expected), count_value); // pass iterators - EXPECT_EQ(std_result, KE::count("label", exespace(), KE::begin(view), + ASSERT_EQ(std_result, KE::count("label", exespace(), KE::begin(view), KE::end(view), count_value)); // pass view - EXPECT_EQ(std_result, KE::count("label", exespace(), view, count_value)); + ASSERT_EQ(std_result, KE::count("label", exespace(), view, count_value)); } } @@ -67,24 +67,24 @@ void test_count_if(const ViewType view) { // no positive elements (all zeroes) const auto predicate = IsPositiveFunctor(); - EXPECT_EQ(0, + ASSERT_EQ(0, std::count_if(KE::begin(expected), KE::end(expected), predicate)); // pass iterators - EXPECT_EQ( + ASSERT_EQ( 0, KE::count_if(exespace(), KE::begin(view), KE::end(view), predicate)); // pass view - EXPECT_EQ(0, KE::count_if(exespace(), view, predicate)); + ASSERT_EQ(0, KE::count_if(exespace(), view, predicate)); fill_views_inc(view, expected); const auto std_result = std::count_if(KE::begin(expected), KE::end(expected), predicate); // pass const iterators - EXPECT_EQ(std_result, KE::count_if("label", exespace(), KE::cbegin(view), + ASSERT_EQ(std_result, KE::count_if("label", exespace(), KE::cbegin(view), KE::cend(view), predicate)); // pass view - EXPECT_EQ(std_result, KE::count_if("label", exespace(), view, predicate)); + ASSERT_EQ(std_result, KE::count_if("label", exespace(), view, predicate)); } template diff --git 
a/algorithms/unit_tests/TestStdAlgorithmsExclusiveScan.cpp b/algorithms/unit_tests/TestStdAlgorithmsExclusiveScan.cpp index 8cd6097ee6..799de8b0c4 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsExclusiveScan.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsExclusiveScan.cpp @@ -157,7 +157,7 @@ void verify_data(ViewType1 data_view, // contains data // << gold_h(i) << " " << test_view_h(i) << " " // << std::abs(gold_h(i) - test_view_h(i)) << std::endl; if (std::is_same::value) { - EXPECT_EQ(gold_h(i), test_view_h(i)); + ASSERT_EQ(gold_h(i), test_view_h(i)); } else { const auto error = std::abs(static_cast(gold_h(i) - test_view_h(i))); @@ -213,7 +213,7 @@ void run_single_scenario_default_op(const InfoType& scenario_info, auto r = KE::exclusive_scan(exespace(), KE::cbegin(view_from), KE::cend(view_from), KE::begin(view_dest), init_value); - EXPECT_EQ(r, KE::end(view_dest)); + ASSERT_EQ(r, KE::end(view_dest)); verify_data(view_from, view_dest, init_value, default_op()); } @@ -222,14 +222,14 @@ void run_single_scenario_default_op(const InfoType& scenario_info, auto r = KE::exclusive_scan("label", exespace(), KE::cbegin(view_from), KE::cend(view_from), KE::begin(view_dest), init_value); - EXPECT_EQ(r, KE::end(view_dest)); + ASSERT_EQ(r, KE::end(view_dest)); verify_data(view_from, view_dest, init_value, default_op()); } { fill_zero(view_dest); auto r = KE::exclusive_scan(exespace(), view_from, view_dest, init_value); - EXPECT_EQ(r, KE::end(view_dest)); + ASSERT_EQ(r, KE::end(view_dest)); verify_data(view_from, view_dest, init_value, default_op()); } @@ -237,7 +237,7 @@ void run_single_scenario_default_op(const InfoType& scenario_info, fill_zero(view_dest); auto r = KE::exclusive_scan("label", exespace(), view_from, view_dest, init_value); - EXPECT_EQ(r, KE::end(view_dest)); + ASSERT_EQ(r, KE::end(view_dest)); verify_data(view_from, view_dest, init_value, default_op()); } @@ -263,7 +263,7 @@ void run_single_scenario_custom_op(const InfoType& scenario_info, auto r = 
KE::exclusive_scan(exespace(), KE::cbegin(view_from), KE::cend(view_from), KE::begin(view_dest), init_value, bop); - EXPECT_EQ(r, KE::end(view_dest)); + ASSERT_EQ(r, KE::end(view_dest)); verify_data(view_from, view_dest, init_value, bop); } @@ -272,7 +272,7 @@ void run_single_scenario_custom_op(const InfoType& scenario_info, auto r = KE::exclusive_scan("label", exespace(), KE::cbegin(view_from), KE::cend(view_from), KE::begin(view_dest), init_value, bop); - EXPECT_EQ(r, KE::end(view_dest)); + ASSERT_EQ(r, KE::end(view_dest)); verify_data(view_from, view_dest, init_value, bop); } @@ -280,7 +280,7 @@ void run_single_scenario_custom_op(const InfoType& scenario_info, fill_zero(view_dest); auto r = KE::exclusive_scan(exespace(), view_from, view_dest, init_value, bop); - EXPECT_EQ(r, KE::end(view_dest)); + ASSERT_EQ(r, KE::end(view_dest)); verify_data(view_from, view_dest, init_value, bop); } @@ -288,7 +288,7 @@ void run_single_scenario_custom_op(const InfoType& scenario_info, fill_zero(view_dest); auto r = KE::exclusive_scan("label", exespace(), view_from, view_dest, init_value, bop); - EXPECT_EQ(r, KE::end(view_dest)); + ASSERT_EQ(r, KE::end(view_dest)); verify_data(view_from, view_dest, init_value, bop); } @@ -355,33 +355,33 @@ TEST(std_algorithms_numeric_ops_test, exclusive_scan_functor) { value_type value1; functor.init(value1); - EXPECT_EQ(value1.val, 0); - EXPECT_EQ(value1.is_initial, true); + ASSERT_EQ(value1.val, 0); + ASSERT_EQ(value1.is_initial, true); value_type value2; value2.val = 1; value2.is_initial = false; functor.join(value1, value2); - EXPECT_EQ(value1.val, 1); - EXPECT_EQ(value1.is_initial, false); + ASSERT_EQ(value1.val, 1); + ASSERT_EQ(value1.is_initial, false); functor.init(value1); functor.join(value2, value1); - EXPECT_EQ(value2.val, 1); - EXPECT_EQ(value2.is_initial, false); + ASSERT_EQ(value2.val, 1); + ASSERT_EQ(value2.is_initial, false); functor.init(value2); functor.join(value2, value1); - EXPECT_EQ(value2.val, 0); - 
EXPECT_EQ(value2.is_initial, true); + ASSERT_EQ(value2.val, 0); + ASSERT_EQ(value2.is_initial, true); value1.val = 1; value1.is_initial = false; value2.val = 2; value2.is_initial = false; functor.join(value2, value1); - EXPECT_EQ(value2.val, 3); - EXPECT_EQ(value2.is_initial, false); + ASSERT_EQ(value2.val, 3); + ASSERT_EQ(value2.is_initial, false); } } // namespace EScan diff --git a/algorithms/unit_tests/TestStdAlgorithmsFind.cpp b/algorithms/unit_tests/TestStdAlgorithmsFind.cpp index 5407bab224..2692df6982 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsFind.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsFind.cpp @@ -34,14 +34,14 @@ void test_find(const ViewType view) { constexpr value_t find_value = 13; // value not found, return last - EXPECT_EQ(KE::end(expected), + ASSERT_EQ(KE::end(expected), std::find(KE::begin(expected), KE::end(expected), find_value)); // pass const iterators, returns const iterator - EXPECT_EQ(KE::cend(view), + ASSERT_EQ(KE::cend(view), KE::find(exespace(), KE::cbegin(view), KE::cend(view), find_value)); // pass view, returns iterator - EXPECT_EQ(KE::end(view), KE::find(exespace(), view, find_value)); + ASSERT_EQ(KE::end(view), KE::find(exespace(), view, find_value)); fill_views_inc(view, expected); @@ -50,10 +50,10 @@ void test_find(const ViewType view) { auto distance = std::distance(KE::begin(expected), std_result); // pass iterators, returns iterator - EXPECT_EQ(KE::begin(view) + distance, + ASSERT_EQ(KE::begin(view) + distance, KE::find(exespace(), KE::begin(view), KE::end(view), find_value)); // pass view, returns iterator - EXPECT_EQ(KE::begin(view) + distance, KE::find(exespace(), view, find_value)); + ASSERT_EQ(KE::begin(view) + distance, KE::find(exespace(), view, find_value)); } template @@ -67,15 +67,15 @@ void test_find_if(const ViewType view) { const auto not_equals_zero = NotEqualsZeroFunctor(); // value not found, return last - EXPECT_EQ( + ASSERT_EQ( KE::end(expected), std::find_if(KE::begin(expected), 
KE::end(expected), not_equals_zero)); // pass iterators, returns iterator - EXPECT_EQ(KE::end(view), KE::find_if(exespace(), KE::begin(view), + ASSERT_EQ(KE::end(view), KE::find_if(exespace(), KE::begin(view), KE::end(view), not_equals_zero)); // pass view, returns iterator - EXPECT_EQ(KE::end(view), KE::find_if(exespace(), view, not_equals_zero)); + ASSERT_EQ(KE::end(view), KE::find_if(exespace(), view, not_equals_zero)); fill_views_inc(view, expected); @@ -86,11 +86,11 @@ void test_find_if(const ViewType view) { auto distance = std::distance(KE::begin(expected), std_result); // pass const iterators, returns const iterator - EXPECT_EQ( + ASSERT_EQ( KE::cbegin(view) + distance, KE::find_if(exespace(), KE::cbegin(view), KE::cend(view), equals_val)); // pass view, returns iterator - EXPECT_EQ(KE::begin(view) + distance, + ASSERT_EQ(KE::begin(view) + distance, KE::find_if(exespace(), view, equals_val)); } @@ -105,15 +105,15 @@ void test_find_if_not(const ViewType view) { const auto not_equals_zero = NotEqualsZeroFunctor(); // first value matches - EXPECT_EQ(KE::begin(expected), + ASSERT_EQ(KE::begin(expected), std::find_if_not(KE::begin(expected), KE::end(expected), not_equals_zero)); // pass iterators, returns iterator - EXPECT_EQ(KE::begin(view), KE::find_if_not(exespace(), KE::begin(view), + ASSERT_EQ(KE::begin(view), KE::find_if_not(exespace(), KE::begin(view), KE::end(view), not_equals_zero)); // pass view, returns iterator - EXPECT_EQ(KE::begin(view), + ASSERT_EQ(KE::begin(view), KE::find_if_not(exespace(), view, not_equals_zero)); fill_views_inc(view, expected); @@ -124,11 +124,11 @@ void test_find_if_not(const ViewType view) { auto distance = std::distance(KE::begin(expected), std_result); // pass const iterators, returns const iterator - EXPECT_EQ(KE::cbegin(view) + distance, + ASSERT_EQ(KE::cbegin(view) + distance, KE::find_if_not(exespace(), KE::cbegin(view), KE::cend(view), equals_zero)); // pass view, returns const iterator - EXPECT_EQ(KE::begin(view) + 
distance, + ASSERT_EQ(KE::begin(view) + distance, KE::find_if_not(exespace(), view, equals_zero)); } diff --git a/algorithms/unit_tests/TestStdAlgorithmsFindEnd.cpp b/algorithms/unit_tests/TestStdAlgorithmsFindEnd.cpp index c9e213962b..5a5359b0b2 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsFindEnd.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsFindEnd.cpp @@ -282,7 +282,7 @@ void run_single_scenario(const InfoType& scenario_info, std::size_t seq_ext, const auto mydiff = myrit - KE::cbegin(view); const auto stddiff = stdrit - KE::cbegin(view_h); // std::cout << "result : " << mydiff << " " << stddiff << std::endl; - EXPECT_EQ(mydiff, stddiff); + ASSERT_EQ(mydiff, stddiff); } { @@ -291,21 +291,21 @@ void run_single_scenario(const InfoType& scenario_info, std::size_t seq_ext, KE::cbegin(s_view), KE::cend(s_view), args...); const auto mydiff = myrit - KE::cbegin(view); const auto stddiff = stdrit - KE::cbegin(view_h); - EXPECT_EQ(mydiff, stddiff); + ASSERT_EQ(mydiff, stddiff); } { auto myrit = KE::find_end(exespace(), view, s_view, args...); const auto mydiff = myrit - KE::begin(view); const auto stddiff = stdrit - KE::cbegin(view_h); - EXPECT_EQ(mydiff, stddiff); + ASSERT_EQ(mydiff, stddiff); } { auto myrit = KE::find_end("label", exespace(), view, s_view, args...); const auto mydiff = myrit - KE::begin(view); const auto stddiff = stdrit - KE::cbegin(view_h); - EXPECT_EQ(mydiff, stddiff); + ASSERT_EQ(mydiff, stddiff); } Kokkos::fence(); diff --git a/algorithms/unit_tests/TestStdAlgorithmsFindFirstOf.cpp b/algorithms/unit_tests/TestStdAlgorithmsFindFirstOf.cpp index e9141bd27b..d77edb5fed 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsFindFirstOf.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsFindFirstOf.cpp @@ -201,7 +201,7 @@ void run_single_scenario(const InfoType& scenario_info, std::size_t seq_ext, KE::cbegin(s_view), KE::cend(s_view), args...); const auto mydiff = myrit - KE::cbegin(view); const auto stddiff = stdrit - KE::cbegin(view_h); - 
EXPECT_EQ(mydiff, stddiff); + ASSERT_EQ(mydiff, stddiff); } { @@ -210,21 +210,21 @@ void run_single_scenario(const InfoType& scenario_info, std::size_t seq_ext, KE::cbegin(s_view), KE::cend(s_view), args...); const auto mydiff = myrit - KE::cbegin(view); const auto stddiff = stdrit - KE::cbegin(view_h); - EXPECT_EQ(mydiff, stddiff); + ASSERT_EQ(mydiff, stddiff); } { auto myrit = KE::find_first_of(exespace(), view, s_view, args...); const auto mydiff = myrit - KE::begin(view); const auto stddiff = stdrit - KE::cbegin(view_h); - EXPECT_EQ(mydiff, stddiff); + ASSERT_EQ(mydiff, stddiff); } { auto myrit = KE::find_first_of("label", exespace(), view, s_view, args...); const auto mydiff = myrit - KE::begin(view); const auto stddiff = stdrit - KE::cbegin(view_h); - EXPECT_EQ(mydiff, stddiff); + ASSERT_EQ(mydiff, stddiff); } Kokkos::fence(); diff --git a/algorithms/unit_tests/TestStdAlgorithmsForEach.cpp b/algorithms/unit_tests/TestStdAlgorithmsForEach.cpp index 83b44f01aa..793b98a67f 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsForEach.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsForEach.cpp @@ -91,23 +91,23 @@ void test_for_each_n(const ViewType view) { const auto non_mod_functor = NoOpNonMutableFunctor(); // pass const iterators, functor takes const ref - EXPECT_EQ(KE::cbegin(view) + n, + ASSERT_EQ(KE::cbegin(view) + n, KE::for_each_n(exespace(), KE::cbegin(view), n, non_mod_functor)); verify_values(value_t{0}, view); // pass view, functor takes const ref - EXPECT_EQ(KE::begin(view) + n, + ASSERT_EQ(KE::begin(view) + n, KE::for_each_n(exespace(), view, n, non_mod_functor)); verify_values(value_t{0}, view); // pass iterators, functor takes non-const ref const auto mod_functor = IncrementElementWiseFunctor(); - EXPECT_EQ(KE::begin(view) + n, + ASSERT_EQ(KE::begin(view) + n, KE::for_each_n(exespace(), KE::begin(view), n, mod_functor)); verify_values(value_t{1}, view); // pass view, functor takes non-const ref - EXPECT_EQ(KE::begin(view) + n, + 
ASSERT_EQ(KE::begin(view) + n, KE::for_each_n("label", exespace(), view, n, mod_functor)); verify_values(value_t{2}, view); } diff --git a/algorithms/unit_tests/TestStdAlgorithmsInclusiveScan.cpp b/algorithms/unit_tests/TestStdAlgorithmsInclusiveScan.cpp index 7ddc142ad5..8e60a43e5f 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsInclusiveScan.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsInclusiveScan.cpp @@ -171,7 +171,7 @@ void verify_data(ViewType1 data_view, // contains data // << std::abs(gold_h(i) - test_view_h(i)) << std::endl; if (std::is_same::value) { - EXPECT_EQ(gold_h(i), test_view_h(i)); + ASSERT_EQ(gold_h(i), test_view_h(i)); } else { const auto error = std::abs(static_cast(gold_h(i) - test_view_h(i))); @@ -224,7 +224,7 @@ void run_single_scenario_default_op(const InfoType& scenario_info) { fill_zero(view_dest); auto r = KE::inclusive_scan(exespace(), KE::cbegin(view_from), KE::cend(view_from), KE::begin(view_dest)); - EXPECT_EQ(r, KE::end(view_dest)); + ASSERT_EQ(r, KE::end(view_dest)); verify_data(view_from, view_dest, default_op()); } @@ -232,21 +232,21 @@ void run_single_scenario_default_op(const InfoType& scenario_info) { fill_zero(view_dest); auto r = KE::inclusive_scan("label", exespace(), KE::cbegin(view_from), KE::cend(view_from), KE::begin(view_dest)); - EXPECT_EQ(r, KE::end(view_dest)); + ASSERT_EQ(r, KE::end(view_dest)); verify_data(view_from, view_dest, default_op()); } { fill_zero(view_dest); auto r = KE::inclusive_scan(exespace(), view_from, view_dest); - EXPECT_EQ(r, KE::end(view_dest)); + ASSERT_EQ(r, KE::end(view_dest)); verify_data(view_from, view_dest, default_op()); } { fill_zero(view_dest); auto r = KE::inclusive_scan("label", exespace(), view_from, view_dest); - EXPECT_EQ(r, KE::end(view_dest)); + ASSERT_EQ(r, KE::end(view_dest)); verify_data(view_from, view_dest, default_op()); } @@ -279,7 +279,7 @@ void run_single_scenario_custom_op(const InfoType& scenario_info, BinaryOp bop, auto r = KE::inclusive_scan(exespace(), 
KE::cbegin(view_from), KE::cend(view_from), KE::begin(view_dest), bop, args...); - EXPECT_EQ(r, KE::end(view_dest)); + ASSERT_EQ(r, KE::end(view_dest)); verify_data(view_from, view_dest, bop, args...); } @@ -288,14 +288,14 @@ void run_single_scenario_custom_op(const InfoType& scenario_info, BinaryOp bop, auto r = KE::inclusive_scan("label", exespace(), KE::cbegin(view_from), KE::cend(view_from), KE::begin(view_dest), bop, args...); - EXPECT_EQ(r, KE::end(view_dest)); + ASSERT_EQ(r, KE::end(view_dest)); verify_data(view_from, view_dest, bop, args...); } { fill_zero(view_dest); auto r = KE::inclusive_scan(exespace(), view_from, view_dest, bop, args...); - EXPECT_EQ(r, KE::end(view_dest)); + ASSERT_EQ(r, KE::end(view_dest)); verify_data(view_from, view_dest, bop, args...); } @@ -303,7 +303,7 @@ void run_single_scenario_custom_op(const InfoType& scenario_info, BinaryOp bop, fill_zero(view_dest); auto r = KE::inclusive_scan("label", exespace(), view_from, view_dest, bop, args...); - EXPECT_EQ(r, KE::end(view_dest)); + ASSERT_EQ(r, KE::end(view_dest)); verify_data(view_from, view_dest, bop, args...); } @@ -363,33 +363,33 @@ TEST(std_algorithms_numeric_ops_test, inclusive_scan_functor) { value_type value1; functor.init(value1); - EXPECT_EQ(value1.val, 0); - EXPECT_EQ(value1.is_initial, true); + ASSERT_EQ(value1.val, 0); + ASSERT_EQ(value1.is_initial, true); value_type value2; value2.val = 1; value2.is_initial = false; functor.join(value1, value2); - EXPECT_EQ(value1.val, 1); - EXPECT_EQ(value1.is_initial, false); + ASSERT_EQ(value1.val, 1); + ASSERT_EQ(value1.is_initial, false); functor.init(value1); functor.join(value2, value1); - EXPECT_EQ(value2.val, 1); - EXPECT_EQ(value2.is_initial, false); + ASSERT_EQ(value2.val, 1); + ASSERT_EQ(value2.is_initial, false); functor.init(value2); functor.join(value2, value1); - EXPECT_EQ(value2.val, 0); - EXPECT_EQ(value2.is_initial, true); + ASSERT_EQ(value2.val, 0); + ASSERT_EQ(value2.is_initial, true); value1.val = 1; 
value1.is_initial = false; value2.val = 2; value2.is_initial = false; functor.join(value2, value1); - EXPECT_EQ(value2.val, 3); - EXPECT_EQ(value2.is_initial, false); + ASSERT_EQ(value2.val, 3); + ASSERT_EQ(value2.is_initial, false); } } // namespace IncScan diff --git a/algorithms/unit_tests/TestStdAlgorithmsIsSortedUntil.cpp b/algorithms/unit_tests/TestStdAlgorithmsIsSortedUntil.cpp index 6053c6ca57..dcfe8ad67e 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsIsSortedUntil.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsIsSortedUntil.cpp @@ -145,10 +145,10 @@ void run_single_scenario(const InfoType& scenario_info) { KE::is_sorted_until("label", exespace(), KE::begin(view), KE::end(view)); auto r3 = KE::is_sorted_until(exespace(), view); auto r4 = KE::is_sorted_until("label", exespace(), view); - EXPECT_EQ(r1, gold); - EXPECT_EQ(r2, gold); - EXPECT_EQ(r3, gold); - EXPECT_EQ(r4, gold); + ASSERT_EQ(r1, gold); + ASSERT_EQ(r2, gold); + ASSERT_EQ(r3, gold); + ASSERT_EQ(r4, gold); #if !defined KOKKOS_ENABLE_OPENMPTARGET CustomLessThanComparator comp; @@ -160,10 +160,10 @@ void run_single_scenario(const InfoType& scenario_info) { auto r8 = KE::is_sorted_until("label", exespace(), view, comp); #endif - EXPECT_EQ(r1, gold); - EXPECT_EQ(r2, gold); - EXPECT_EQ(r3, gold); - EXPECT_EQ(r4, gold); + ASSERT_EQ(r1, gold); + ASSERT_EQ(r2, gold); + ASSERT_EQ(r3, gold); + ASSERT_EQ(r4, gold); Kokkos::fence(); } diff --git a/algorithms/unit_tests/TestStdAlgorithmsLexicographicalCompare.cpp b/algorithms/unit_tests/TestStdAlgorithmsLexicographicalCompare.cpp index 2d4f1afdd0..5d9e7db803 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsLexicographicalCompare.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsLexicographicalCompare.cpp @@ -44,16 +44,16 @@ void test_lexicographical_compare(const ViewType1 view_1, ViewType2 view_2) { std::lexicographical_compare(h_first_1, h_last_1, h_first_2, h_last_2); // pass iterators - EXPECT_EQ(std_result, KE::lexicographical_compare(exespace(), 
first_1, + ASSERT_EQ(std_result, KE::lexicographical_compare(exespace(), first_1, last_1, first_2, last_2)); - EXPECT_EQ(std_result, + ASSERT_EQ(std_result, KE::lexicographical_compare("label", exespace(), first_1, last_1, first_2, last_2)); // pass views - EXPECT_EQ(std_result, + ASSERT_EQ(std_result, KE::lexicographical_compare(exespace(), view_1, view_2)); - EXPECT_EQ(std_result, + ASSERT_EQ(std_result, KE::lexicographical_compare("label", exespace(), view_1, view_2)); } @@ -67,17 +67,17 @@ void test_lexicographical_compare(const ViewType1 view_1, ViewType2 view_2) { h_first_1, h_last_1, h_first_2, h_last_2, custom_comparator); // pass iterators - EXPECT_EQ(std_result, + ASSERT_EQ(std_result, KE::lexicographical_compare(exespace(), first_1, last_1, first_2, last_2, custom_comparator)); - EXPECT_EQ(std_result, + ASSERT_EQ(std_result, KE::lexicographical_compare("label", exespace(), first_1, last_1, first_2, last_2, custom_comparator)); // pass views - EXPECT_EQ(std_result, KE::lexicographical_compare( + ASSERT_EQ(std_result, KE::lexicographical_compare( exespace(), view_1, view_2, custom_comparator)); - EXPECT_EQ(std_result, + ASSERT_EQ(std_result, KE::lexicographical_compare("label", exespace(), view_1, view_2, custom_comparator)); } @@ -86,7 +86,7 @@ void test_lexicographical_compare(const ViewType1 view_1, ViewType2 view_2) { // empty vs non-empty auto std_result = std::lexicographical_compare(h_first_1, h_first_1, h_first_2, h_last_2); - EXPECT_EQ(std_result, KE::lexicographical_compare( + ASSERT_EQ(std_result, KE::lexicographical_compare( exespace(), first_1, first_1, first_2, last_2)); } @@ -95,7 +95,7 @@ void test_lexicographical_compare(const ViewType1 view_1, ViewType2 view_2) { if (view_1.extent(0) > 1) { auto std_result = std::lexicographical_compare(h_first_1, h_last_1 - 1, h_first_2, h_last_2); - EXPECT_EQ(std_result, + ASSERT_EQ(std_result, KE::lexicographical_compare(exespace(), first_1, last_1 - 1, first_2, last_2)); } diff --git 
a/algorithms/unit_tests/TestStdAlgorithmsMinMaxElementOps.cpp b/algorithms/unit_tests/TestStdAlgorithmsMinMaxElementOps.cpp index f8634ffafe..bc43231784 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsMinMaxElementOps.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsMinMaxElementOps.cpp @@ -173,7 +173,7 @@ void std_algo_min_max_test_verify(Kokkos::pair goldPair, const ItType result, TestedViewType testedView) { // check that iterator is pointing to right element - EXPECT_EQ(result - KE::begin(testedView), goldPair.first); + ASSERT_EQ(result - KE::begin(testedView), goldPair.first); // create a view for the result to copy into it the iterator's value using result_view_t = Kokkos::View; @@ -184,7 +184,7 @@ void std_algo_min_max_test_verify(Kokkos::pair goldPair, Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), resultView); // use the host mirror of the result view to check that the values match - EXPECT_EQ(result_v_h(), goldPair.second); + ASSERT_EQ(result_v_h(), goldPair.second); } template @@ -199,39 +199,39 @@ template void test_max_element_trivial_data(ViewType view) { /* if we pass empty range, should return last */ auto result = KE::max_element(exespace(), KE::cbegin(view), KE::cbegin(view)); - EXPECT_EQ(result, KE::cbegin(view)); + ASSERT_EQ(result, KE::cbegin(view)); /* if we pass empty range, should return last */ auto it0 = KE::cbegin(view) + 3; auto it1 = it0; auto result2 = KE::max_element(exespace(), it0, it1); - EXPECT_EQ(result2, it1); + ASSERT_EQ(result2, it1); } template void test_min_element_trivial_data(ViewType view) { /* if we pass empty range, should return last */ auto result = KE::min_element(exespace(), KE::cbegin(view), KE::cbegin(view)); - EXPECT_EQ(result, KE::cbegin(view)); + ASSERT_EQ(result, KE::cbegin(view)); /* if we pass empty range, should return last */ auto it0 = KE::cbegin(view) + 3; auto it1 = it0; auto result2 = KE::min_element(exespace(), it0, it1); - EXPECT_EQ(result2, it1); + ASSERT_EQ(result2, it1); } template 
void test_minmax_element_empty_range(ViewType view) { auto result = KE::minmax_element(exespace(), KE::cbegin(view), KE::cbegin(view)); - EXPECT_EQ(result.first, KE::cbegin(view)); - EXPECT_EQ(result.second, KE::cbegin(view)); + ASSERT_EQ(result.first, KE::cbegin(view)); + ASSERT_EQ(result.second, KE::cbegin(view)); auto it0 = KE::cbegin(view) + 3; auto it1 = it0; auto result2 = KE::minmax_element(exespace(), it0, it1); - EXPECT_EQ(result2.first, it1); - EXPECT_EQ(result2.second, it1); + ASSERT_EQ(result2.first, it1); + ASSERT_EQ(result2.second, it1); } template diff --git a/algorithms/unit_tests/TestStdAlgorithmsMismatch.cpp b/algorithms/unit_tests/TestStdAlgorithmsMismatch.cpp index 774329eef7..f3b3e269c4 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsMismatch.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsMismatch.cpp @@ -120,10 +120,10 @@ void run_single_scenario(ViewType view1, ViewType view2, const auto my_diff12 = my_res1.second - f2; const auto my_diff21 = my_res2.first - f1; const auto my_diff22 = my_res2.second - f2; - EXPECT_EQ(my_diff11, std_diff1); - EXPECT_EQ(my_diff12, std_diff2); - EXPECT_EQ(my_diff21, std_diff1); - EXPECT_EQ(my_diff22, std_diff2); + ASSERT_EQ(my_diff11, std_diff1); + ASSERT_EQ(my_diff12, std_diff2); + ASSERT_EQ(my_diff21, std_diff1); + ASSERT_EQ(my_diff22, std_diff2); } { @@ -134,10 +134,10 @@ void run_single_scenario(ViewType view1, ViewType view2, const auto my_diff12 = my_res1.second - KE::begin(view2); const auto my_diff21 = my_res2.first - KE::begin(view1); const auto my_diff22 = my_res2.second - KE::begin(view2); - EXPECT_EQ(my_diff11, std_diff1); - EXPECT_EQ(my_diff12, std_diff2); - EXPECT_EQ(my_diff21, std_diff1); - EXPECT_EQ(my_diff22, std_diff2); + ASSERT_EQ(my_diff11, std_diff1); + ASSERT_EQ(my_diff12, std_diff2); + ASSERT_EQ(my_diff21, std_diff1); + ASSERT_EQ(my_diff22, std_diff2); } } diff --git a/algorithms/unit_tests/TestStdAlgorithmsModOps.cpp b/algorithms/unit_tests/TestStdAlgorithmsModOps.cpp index 
4fce044bcf..4604764097 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsModOps.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsModOps.cpp @@ -52,14 +52,14 @@ TEST(std_algorithms_mod_ops_test, move) { // move constr MyMovableType b(std::move(a)); - EXPECT_EQ(b.m_value, 11); - EXPECT_EQ(a.m_value, -2); + ASSERT_EQ(b.m_value, 11); + ASSERT_EQ(a.m_value, -2); // move assign MyMovableType c; c = std::move(b); - EXPECT_EQ(c.m_value, 11); - EXPECT_EQ(b.m_value, -4); + ASSERT_EQ(c.m_value, 11); + ASSERT_EQ(b.m_value, -4); } template @@ -97,8 +97,8 @@ TEST(std_algorithms_mod_ops_test, swap) { int a = 1; int b = 2; KE::swap(a, b); - EXPECT_EQ(a, 2); - EXPECT_EQ(b, 1); + ASSERT_EQ(a, 2); + ASSERT_EQ(b, 1); } { @@ -151,17 +151,17 @@ void test_iter_swap(ViewType view) { using value_type = typename ViewType::value_type; auto a_dc = create_deep_copyable_compatible_clone(view); auto a_h = create_mirror_view_and_copy(Kokkos::HostSpace(), a_dc); - EXPECT_EQ(view.extent_int(0), 10); - EXPECT_EQ(a_h(0), value_type(3)); - EXPECT_EQ(a_h(1), value_type(1)); - EXPECT_EQ(a_h(2), value_type(2)); - EXPECT_EQ(a_h(3), value_type(0)); - EXPECT_EQ(a_h(4), value_type(6)); - EXPECT_EQ(a_h(5), value_type(5)); - EXPECT_EQ(a_h(6), value_type(4)); - EXPECT_EQ(a_h(7), value_type(7)); - EXPECT_EQ(a_h(8), value_type(8)); - EXPECT_EQ(a_h(9), value_type(9)); + ASSERT_EQ(view.extent_int(0), 10); + ASSERT_EQ(a_h(0), value_type(3)); + ASSERT_EQ(a_h(1), value_type(1)); + ASSERT_EQ(a_h(2), value_type(2)); + ASSERT_EQ(a_h(3), value_type(0)); + ASSERT_EQ(a_h(4), value_type(6)); + ASSERT_EQ(a_h(5), value_type(5)); + ASSERT_EQ(a_h(6), value_type(4)); + ASSERT_EQ(a_h(7), value_type(7)); + ASSERT_EQ(a_h(8), value_type(8)); + ASSERT_EQ(a_h(9), value_type(9)); } TEST(std_algorithms_mod_ops_test, iter_swap_static_view) { diff --git a/algorithms/unit_tests/TestStdAlgorithmsModSeqOps.cpp b/algorithms/unit_tests/TestStdAlgorithmsModSeqOps.cpp index 6b806d7bc5..f80f30797e 100644 --- 
a/algorithms/unit_tests/TestStdAlgorithmsModSeqOps.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsModSeqOps.cpp @@ -34,21 +34,21 @@ struct std_algorithms_mod_seq_ops_test : std_algorithms_test { TEST_F(std_algorithms_mod_seq_ops_test, copy) { auto result = KE::copy(exespace(), KE::begin(m_static_view), KE::end(m_static_view), KE::begin(m_strided_view)); - EXPECT_EQ(KE::end(m_strided_view), result); + ASSERT_EQ(KE::end(m_strided_view), result); compare_views(m_static_view, m_strided_view); auto result2 = KE::copy(exespace(), KE::begin(m_strided_view), KE::end(m_strided_view), KE::begin(m_dynamic_view)); - EXPECT_EQ(KE::end(m_dynamic_view), result2); + ASSERT_EQ(KE::end(m_dynamic_view), result2); compare_views(m_dynamic_view, m_strided_view); } TEST_F(std_algorithms_mod_seq_ops_test, copy_view) { - EXPECT_EQ(KE::end(m_dynamic_view), + ASSERT_EQ(KE::end(m_dynamic_view), KE::copy(exespace(), m_static_view, m_dynamic_view)); compare_views(m_static_view, m_dynamic_view); - EXPECT_EQ(KE::end(m_strided_view), + ASSERT_EQ(KE::end(m_strided_view), KE::copy(exespace(), m_dynamic_view, m_strided_view)); compare_views(m_dynamic_view, m_strided_view); } @@ -70,11 +70,11 @@ TEST_F(std_algorithms_mod_seq_ops_test, copy_n) { // pass iterators auto first = KE::begin(m_static_view); auto dest = KE::begin(m_dynamic_view); - EXPECT_EQ(dest + n, KE::copy_n(exespace(), first, n, dest)); + ASSERT_EQ(dest + n, KE::copy_n(exespace(), first, n, dest)); compare_views(expected, m_dynamic_view); // pass views - EXPECT_EQ(KE::begin(m_strided_view) + n, + ASSERT_EQ(KE::begin(m_strided_view) + n, KE::copy_n(exespace(), m_static_view, n, m_strided_view)); compare_views(expected, m_strided_view); } @@ -85,12 +85,12 @@ TEST_F(std_algorithms_mod_seq_ops_test, copy_backward) { auto dest = KE::end(m_dynamic_view); // pass iterators - EXPECT_EQ(KE::begin(m_dynamic_view), + ASSERT_EQ(KE::begin(m_dynamic_view), KE::copy_backward(exespace(), first, last, dest)); compare_views(m_static_view, 
m_dynamic_view); // pass views - EXPECT_EQ(KE::begin(m_strided_view), + ASSERT_EQ(KE::begin(m_strided_view), KE::copy_backward(exespace(), m_static_view, m_strided_view)); compare_views(m_static_view, m_strided_view); } @@ -112,11 +112,11 @@ TEST_F(std_algorithms_mod_seq_ops_test, reverse_copy) { auto last = KE::end(m_static_view); auto dest = KE::begin(m_dynamic_view); - EXPECT_EQ(KE::end(m_dynamic_view), + ASSERT_EQ(KE::end(m_dynamic_view), KE::reverse_copy(exespace(), first, last, dest)); compare_views(expected, m_dynamic_view); - EXPECT_EQ(KE::end(m_strided_view), + ASSERT_EQ(KE::end(m_strided_view), KE::reverse_copy(exespace(), m_static_view, m_strided_view)); compare_views(expected, m_strided_view); } @@ -151,25 +151,25 @@ TEST_F(std_algorithms_mod_seq_ops_test, fill_n) { // fill all elements // pass iterator - EXPECT_EQ(KE::end(m_static_view), + ASSERT_EQ(KE::end(m_static_view), KE::fill_n(exespace(), KE::begin(m_static_view), m_static_view.extent(0), fill_n_value)); verify_values(fill_n_value, m_static_view); // pass view - EXPECT_EQ(KE::end(m_strided_view), + ASSERT_EQ(KE::end(m_strided_view), KE::fill_n(exespace(), m_strided_view, m_strided_view.extent(0), fill_n_value)); verify_values(fill_n_value, m_strided_view); // fill zero elements // pass view - EXPECT_EQ(KE::begin(m_dynamic_view), + ASSERT_EQ(KE::begin(m_dynamic_view), KE::fill_n(exespace(), m_dynamic_view, 0, fill_n_new_value)); // fill single element // pass iterator - EXPECT_EQ( + ASSERT_EQ( KE::begin(m_static_view) + 1, KE::fill_n(exespace(), KE::begin(m_static_view), 1, fill_n_new_value)); @@ -212,21 +212,21 @@ TEST_F(std_algorithms_mod_seq_ops_test, transform_from_fixture_unary_op) { auto r1 = KE::transform(exespace(), KE::begin(m_static_view), KE::end(m_static_view), KE::begin(m_dynamic_view), TransformFunctor()); - EXPECT_EQ(r1, KE::end(m_dynamic_view)); + ASSERT_EQ(r1, KE::end(m_dynamic_view)); compare_views(gold_source, m_static_view); verify_values(-1., m_dynamic_view); // transform 
dynamic view, store results in strided view auto r2 = KE::transform(exespace(), m_dynamic_view, m_strided_view, TransformFunctor()); - EXPECT_EQ(r2, KE::end(m_strided_view)); + ASSERT_EQ(r2, KE::end(m_strided_view)); verify_values(-1., m_dynamic_view); verify_values(-1., m_strided_view); // transform strided view, store results in static view auto r3 = KE::transform(exespace(), m_strided_view, m_static_view, TransformFunctor()); - EXPECT_EQ(r3, KE::end(m_static_view)); + ASSERT_EQ(r3, KE::end(m_static_view)); verify_values(-1., m_static_view); verify_values(-1., m_strided_view); } @@ -254,7 +254,7 @@ TEST_F(std_algorithms_mod_seq_ops_test, transform_from_fixture_binary_op) { auto r1 = KE::transform(exespace(), KE::begin(m_static_view), KE::end(m_static_view), KE::begin(m_dynamic_view), KE::begin(m_strided_view), TransformBinaryFunctor()); - EXPECT_EQ(r1, KE::end(m_strided_view)); + ASSERT_EQ(r1, KE::end(m_strided_view)); compare_views(expected, m_strided_view); expected(0) = 0; @@ -269,7 +269,7 @@ TEST_F(std_algorithms_mod_seq_ops_test, transform_from_fixture_binary_op) { expected(9) = 18; auto r2 = KE::transform("label", exespace(), m_static_view, m_strided_view, m_dynamic_view, TransformBinaryFunctor()); - EXPECT_EQ(r2, KE::end(m_dynamic_view)); + ASSERT_EQ(r2, KE::end(m_dynamic_view)); compare_views(expected, m_dynamic_view); } @@ -296,19 +296,19 @@ TEST_F(std_algorithms_mod_seq_ops_test, generate) { TEST_F(std_algorithms_mod_seq_ops_test, generate_n) { // iterator + functor - EXPECT_EQ(KE::end(m_static_view), + ASSERT_EQ(KE::end(m_static_view), KE::generate_n(exespace(), KE::begin(m_static_view), m_static_view.extent(0), GenerateFunctor())); verify_values(generated_value, m_static_view); // view + functor - EXPECT_EQ(KE::end(m_dynamic_view), + ASSERT_EQ(KE::end(m_dynamic_view), KE::generate_n(exespace(), m_dynamic_view, m_dynamic_view.extent(0), GenerateFunctor())); verify_values(generated_value, m_dynamic_view); // view + functor, negative n - 
EXPECT_EQ(KE::begin(m_strided_view), + ASSERT_EQ(KE::begin(m_strided_view), KE::generate_n(exespace(), m_strided_view, -1, GenerateFunctor())); } @@ -352,7 +352,7 @@ void test_swap_ranges(ViewType view) { auto last1 = first1 + 4; auto first2 = KE::begin(viewB) + 1; auto r = KE::swap_ranges(exespace(), first1, last1, first2); - EXPECT_EQ(r, first2 + 4); + ASSERT_EQ(r, first2 + 4); /* check VIEW_A */ static_view_type checkViewA("tmp"); @@ -360,16 +360,16 @@ void test_swap_ranges(ViewType view) { parallel_for(ext, cp_func_a_t(view, checkViewA)); auto cvA_h = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), checkViewA); - EXPECT_EQ(cvA_h(0), 0); - EXPECT_EQ(cvA_h(1), 1); - EXPECT_EQ(cvA_h(2), 99); - EXPECT_EQ(cvA_h(3), 98); - EXPECT_EQ(cvA_h(4), 97); - EXPECT_EQ(cvA_h(5), 96); - EXPECT_EQ(cvA_h(6), 6); - EXPECT_EQ(cvA_h(7), 7); - EXPECT_EQ(cvA_h(8), 8); - EXPECT_EQ(cvA_h(9), 9); + ASSERT_EQ(cvA_h(0), 0); + ASSERT_EQ(cvA_h(1), 1); + ASSERT_EQ(cvA_h(2), 99); + ASSERT_EQ(cvA_h(3), 98); + ASSERT_EQ(cvA_h(4), 97); + ASSERT_EQ(cvA_h(5), 96); + ASSERT_EQ(cvA_h(6), 6); + ASSERT_EQ(cvA_h(7), 7); + ASSERT_EQ(cvA_h(8), 8); + ASSERT_EQ(cvA_h(9), 9); /* check viewB */ static_view_type checkViewB("tmpB"); @@ -377,16 +377,16 @@ void test_swap_ranges(ViewType view) { Kokkos::parallel_for(ext, cp_func_b_t(viewB, checkViewB)); auto cvB_h = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), checkViewB); - EXPECT_EQ(cvB_h(0), 100); - EXPECT_EQ(cvB_h(1), 2); - EXPECT_EQ(cvB_h(2), 3); - EXPECT_EQ(cvB_h(3), 4); - EXPECT_EQ(cvB_h(4), 5); - EXPECT_EQ(cvB_h(5), 95); - EXPECT_EQ(cvB_h(6), 94); - EXPECT_EQ(cvB_h(7), 93); - EXPECT_EQ(cvB_h(8), 92); - EXPECT_EQ(cvB_h(9), 91); + ASSERT_EQ(cvB_h(0), 100); + ASSERT_EQ(cvB_h(1), 2); + ASSERT_EQ(cvB_h(2), 3); + ASSERT_EQ(cvB_h(3), 4); + ASSERT_EQ(cvB_h(4), 5); + ASSERT_EQ(cvB_h(5), 95); + ASSERT_EQ(cvB_h(6), 94); + ASSERT_EQ(cvB_h(7), 93); + ASSERT_EQ(cvB_h(8), 92); + ASSERT_EQ(cvB_h(9), 91); } TEST_F(std_algorithms_mod_seq_ops_test, 
swap_ranges) { diff --git a/algorithms/unit_tests/TestStdAlgorithmsMoveBackward.cpp b/algorithms/unit_tests/TestStdAlgorithmsMoveBackward.cpp index 635714eb54..b201ab95c1 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsMoveBackward.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsMoveBackward.cpp @@ -53,20 +53,20 @@ void run_single_scenario(const InfoType& scenario_info, int apiId) { auto rit = KE::move_backward(exespace(), KE::begin(v), KE::end(v), KE::end(v2)); const int dist = KE::distance(KE::begin(v2), rit); - EXPECT_EQ(dist, 5); + ASSERT_EQ(dist, 5); } else if (apiId == 1) { auto rit = KE::move_backward("mylabel", exespace(), KE::begin(v), KE::end(v), KE::end(v2)); const int dist = KE::distance(KE::begin(v2), rit); - EXPECT_EQ(dist, 5); + ASSERT_EQ(dist, 5); } else if (apiId == 2) { auto rit = KE::move_backward(exespace(), v, v2); const int dist = KE::distance(KE::begin(v2), rit); - EXPECT_EQ(dist, 5); + ASSERT_EQ(dist, 5); } else if (apiId == 3) { auto rit = KE::move_backward("mylabel", exespace(), v, v2); const int dist = KE::distance(KE::begin(v2), rit); - EXPECT_EQ(dist, 5); + ASSERT_EQ(dist, 5); } // check diff --git a/algorithms/unit_tests/TestStdAlgorithmsNumerics.cpp b/algorithms/unit_tests/TestStdAlgorithmsNumerics.cpp index 288a67c369..0933c4e135 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsNumerics.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsNumerics.cpp @@ -151,8 +151,8 @@ void run_and_check_transform_reduce_default(ViewType1 first_view, const auto r2 = KE::transform_reduce( "MYLABEL", ExecutionSpace(), KE::cbegin(first_view), KE::cbegin(first_view), KE::cbegin(second_view), init_value); - EXPECT_EQ(r1, init_value); - EXPECT_EQ(r2, init_value); + ASSERT_EQ(r1, init_value); + ASSERT_EQ(r2, init_value); // non-trivial cases const auto r3 = KE::transform_reduce(ExecutionSpace(), KE::cbegin(first_view), @@ -168,10 +168,10 @@ void run_and_check_transform_reduce_default(ViewType1 first_view, const auto r6 = 
KE::transform_reduce("MYLABEL", ExecutionSpace(), first_view, second_view, init_value); - EXPECT_EQ(r3, result_value); - EXPECT_EQ(r4, result_value); - EXPECT_EQ(r5, result_value); - EXPECT_EQ(r6, result_value); + ASSERT_EQ(r3, result_value); + ASSERT_EQ(r4, result_value); + ASSERT_EQ(r5, result_value); + ASSERT_EQ(r6, result_value); } TEST_F(std_algorithms_numerics_test, @@ -254,8 +254,8 @@ void run_and_check_transform_reduce_overloadA(ViewType1 first_view, KE::cbegin(first_view), KE::cbegin(second_view), init_value, std::forward(args)...); - EXPECT_EQ(r1, init_value); - EXPECT_EQ(r2, init_value); + ASSERT_EQ(r1, init_value); + ASSERT_EQ(r2, init_value); // non trivial cases const auto r3 = KE::transform_reduce( @@ -273,10 +273,10 @@ void run_and_check_transform_reduce_overloadA(ViewType1 first_view, KE::transform_reduce("MYLABEL", ExecutionSpace(), first_view, second_view, init_value, std::forward(args)...); - EXPECT_EQ(r3, result_value); - EXPECT_EQ(r4, result_value); - EXPECT_EQ(r5, result_value); - EXPECT_EQ(r6, result_value); + ASSERT_EQ(r3, result_value); + ASSERT_EQ(r4, result_value); + ASSERT_EQ(r5, result_value); + ASSERT_EQ(r6, result_value); } TEST_F(std_algorithms_numerics_test, @@ -373,8 +373,8 @@ void run_and_check_transform_reduce_overloadB(ViewType view, KE::cbegin(view), KE::cbegin(view), init_value, std::forward(args)...); - EXPECT_EQ(r1, init_value); - EXPECT_EQ(r2, init_value); + ASSERT_EQ(r1, init_value); + ASSERT_EQ(r2, init_value); // non trivial const auto r3 = @@ -390,10 +390,10 @@ void run_and_check_transform_reduce_overloadB(ViewType view, const auto r6 = KE::transform_reduce("MYLABEL", ExecutionSpace(), view, init_value, std::forward(args)...); - EXPECT_EQ(r3, result_value); - EXPECT_EQ(r4, result_value); - EXPECT_EQ(r5, result_value); - EXPECT_EQ(r6, result_value); + ASSERT_EQ(r3, result_value); + ASSERT_EQ(r4, result_value); + ASSERT_EQ(r5, result_value); + ASSERT_EQ(r6, result_value); } TEST_F(std_algorithms_numerics_test, @@ -447,8 
+447,8 @@ void run_and_check_reduce_overloadA(ViewType view, ValueType non_trivial_result, KE::reduce(ExecutionSpace(), KE::cbegin(view), KE::cbegin(view)); const auto r2 = KE::reduce("MYLABEL", ExecutionSpace(), KE::cbegin(view), KE::cbegin(view)); - EXPECT_EQ(r1, trivial_result); - EXPECT_EQ(r2, trivial_result); + ASSERT_EQ(r1, trivial_result); + ASSERT_EQ(r2, trivial_result); // non trivial cases const auto r3 = @@ -458,10 +458,10 @@ void run_and_check_reduce_overloadA(ViewType view, ValueType non_trivial_result, const auto r5 = KE::reduce(ExecutionSpace(), view); const auto r6 = KE::reduce("MYLABEL", ExecutionSpace(), view); - EXPECT_EQ(r3, non_trivial_result); - EXPECT_EQ(r4, non_trivial_result); - EXPECT_EQ(r5, non_trivial_result); - EXPECT_EQ(r6, non_trivial_result); + ASSERT_EQ(r3, non_trivial_result); + ASSERT_EQ(r4, non_trivial_result); + ASSERT_EQ(r5, non_trivial_result); + ASSERT_EQ(r6, non_trivial_result); } TEST_F(std_algorithms_numerics_test, @@ -503,8 +503,8 @@ void run_and_check_reduce_overloadB(ViewType view, ValueType result_value, KE::cbegin(view), init_value); const auto r2 = KE::reduce("MYLABEL", ExecutionSpace(), KE::cbegin(view), KE::cbegin(view), init_value); - EXPECT_EQ(r1, init_value); - EXPECT_EQ(r2, init_value); + ASSERT_EQ(r1, init_value); + ASSERT_EQ(r2, init_value); // non trivial cases const auto r3 = KE::reduce(ExecutionSpace(), KE::cbegin(view), KE::cend(view), @@ -514,10 +514,10 @@ void run_and_check_reduce_overloadB(ViewType view, ValueType result_value, const auto r5 = KE::reduce(ExecutionSpace(), view, init_value); const auto r6 = KE::reduce("MYLABEL", ExecutionSpace(), view, init_value); - EXPECT_EQ(r3, result_value); - EXPECT_EQ(r4, result_value); - EXPECT_EQ(r5, result_value); - EXPECT_EQ(r6, result_value); + ASSERT_EQ(r3, result_value); + ASSERT_EQ(r4, result_value); + ASSERT_EQ(r5, result_value); + ASSERT_EQ(r6, result_value); } TEST_F(std_algorithms_numerics_test, @@ -553,8 +553,8 @@ void 
run_and_check_reduce_overloadC(ViewType view, ValueType result_value, KE::cbegin(view), init_value, joiner); const auto r2 = KE::reduce("MYLABEL", ExecutionSpace(), KE::cbegin(view), KE::cbegin(view), init_value, joiner); - EXPECT_EQ(r1, init_value); - EXPECT_EQ(r2, init_value); + ASSERT_EQ(r1, init_value); + ASSERT_EQ(r2, init_value); // non trivial cases const auto r3 = KE::reduce(ExecutionSpace(), KE::cbegin(view), KE::cend(view), @@ -565,10 +565,10 @@ void run_and_check_reduce_overloadC(ViewType view, ValueType result_value, const auto r6 = KE::reduce("MYLABEL", ExecutionSpace(), view, init_value, joiner); - EXPECT_EQ(r3, result_value); - EXPECT_EQ(r4, result_value); - EXPECT_EQ(r5, result_value); - EXPECT_EQ(r6, result_value); + ASSERT_EQ(r3, result_value); + ASSERT_EQ(r4, result_value); + ASSERT_EQ(r5, result_value); + ASSERT_EQ(r6, result_value); } TEST_F(std_algorithms_numerics_test, diff --git a/algorithms/unit_tests/TestStdAlgorithmsPartitionCopy.cpp b/algorithms/unit_tests/TestStdAlgorithmsPartitionCopy.cpp index 0399e9eee4..f169fd9ce8 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsPartitionCopy.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsPartitionCopy.cpp @@ -130,12 +130,12 @@ void verify_data(const std::string& name, ResultType my_result, const std::size_t my_diff_true = my_result.first - KE::begin(view_dest_true); const std::size_t my_diff_false = my_result.second - KE::begin(view_dest_false); - EXPECT_EQ(std_diff_true, my_diff_true); - EXPECT_EQ(std_diff_false, my_diff_false); + ASSERT_EQ(std_diff_true, my_diff_true); + ASSERT_EQ(std_diff_false, my_diff_false); auto view_dest_true_h = create_host_space_copy(view_dest_true); for (std::size_t i = 0; i < std_diff_true; ++i) { - EXPECT_EQ(std_vec_true[i], view_dest_true_h(i)); + ASSERT_EQ(std_vec_true[i], view_dest_true_h(i)); // std::cout << "i= " << i << " " // << " std_true = " << std_vec_true[i] << " " // << " mine = " << view_dest_true_h(i) << '\n'; @@ -143,45 +143,45 @@ void 
verify_data(const std::string& name, ResultType my_result, auto view_dest_false_h = create_host_space_copy(view_dest_false); for (std::size_t i = 0; i < std_diff_false; ++i) { - EXPECT_EQ(std_vec_false[i], view_dest_false_h(i)); + ASSERT_EQ(std_vec_false[i], view_dest_false_h(i)); // std::cout << "i= " << i << " " // << " std_false = " << std_vec_false[i] << " " // << " mine = " << view_dest_false_h(i) << '\n'; } if (name == "empty") { - EXPECT_EQ(my_diff_true, 0u); - EXPECT_EQ(my_diff_false, 0u); + ASSERT_EQ(my_diff_true, 0u); + ASSERT_EQ(my_diff_false, 0u); } else if (name == "one-element-a") { - EXPECT_EQ(my_diff_true, 0u); - EXPECT_EQ(my_diff_false, 1u); + ASSERT_EQ(my_diff_true, 0u); + ASSERT_EQ(my_diff_false, 1u); } else if (name == "one-element-b") { - EXPECT_EQ(my_diff_true, 1u); - EXPECT_EQ(my_diff_false, 0u); + ASSERT_EQ(my_diff_true, 1u); + ASSERT_EQ(my_diff_false, 0u); } else if (name == "two-elements-a") { - EXPECT_EQ(my_diff_true, 1u); - EXPECT_EQ(my_diff_false, 1u); + ASSERT_EQ(my_diff_true, 1u); + ASSERT_EQ(my_diff_false, 1u); } else if (name == "two-elements-b") { - EXPECT_EQ(my_diff_true, 1u); - EXPECT_EQ(my_diff_false, 1u); + ASSERT_EQ(my_diff_true, 1u); + ASSERT_EQ(my_diff_false, 1u); } else if (name == "small-b") { - EXPECT_EQ(my_diff_true, 13u); - EXPECT_EQ(my_diff_false, 0u); + ASSERT_EQ(my_diff_true, 13u); + ASSERT_EQ(my_diff_false, 0u); } else if (name == "small-c") { - EXPECT_EQ(my_diff_true, 0u); - EXPECT_EQ(my_diff_false, 15u); + ASSERT_EQ(my_diff_true, 0u); + ASSERT_EQ(my_diff_false, 15u); } } diff --git a/algorithms/unit_tests/TestStdAlgorithmsPartitioningOps.cpp b/algorithms/unit_tests/TestStdAlgorithmsPartitioningOps.cpp index 94ec278af1..33a1326c47 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsPartitioningOps.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsPartitioningOps.cpp @@ -171,15 +171,15 @@ TEST_F(std_algorithms_partitioning_test, is_partitioned_accepting_iterators) { goldSolutionIsPartitioned(static_cast(id)); const 
auto result1 = KE::is_partitioned( exespace(), KE::cbegin(m_static_view), KE::cend(m_static_view), p); - EXPECT_EQ(goldBool, result1); + ASSERT_EQ(goldBool, result1); const auto result2 = KE::is_partitioned( exespace(), KE::cbegin(m_dynamic_view), KE::cend(m_dynamic_view), p); - EXPECT_EQ(goldBool, result2); + ASSERT_EQ(goldBool, result2); const auto result3 = KE::is_partitioned( exespace(), KE::cbegin(m_strided_view), KE::cend(m_strided_view), p); - EXPECT_EQ(goldBool, result3); + ASSERT_EQ(goldBool, result3); } } @@ -191,13 +191,13 @@ TEST_F(std_algorithms_partitioning_test, is_partitioned_accepting_view) { const bool goldBool = goldSolutionIsPartitioned(static_cast(id)); const auto result1 = KE::is_partitioned(exespace(), m_static_view, p); - EXPECT_EQ(goldBool, result1); + ASSERT_EQ(goldBool, result1); const auto result2 = KE::is_partitioned(exespace(), m_dynamic_view, p); - EXPECT_EQ(goldBool, result2); + ASSERT_EQ(goldBool, result2); const auto result3 = KE::is_partitioned(exespace(), m_strided_view, p); - EXPECT_EQ(goldBool, result3); + ASSERT_EQ(goldBool, result3); } } @@ -211,17 +211,17 @@ TEST_F(std_algorithms_partitioning_test, partition_point) { auto first1 = KE::cbegin(m_static_view); auto last1 = KE::cend(m_static_view); const auto result1 = KE::partition_point(exespace(), first1, last1, p); - EXPECT_EQ(goldIndex, result1 - first1); + ASSERT_EQ(goldIndex, result1 - first1); auto first2 = KE::cbegin(m_dynamic_view); auto last2 = KE::cend(m_dynamic_view); const auto result2 = KE::partition_point(exespace(), first2, last2, p); - EXPECT_EQ(goldIndex, result2 - first2); + ASSERT_EQ(goldIndex, result2 - first2); auto first3 = KE::cbegin(m_strided_view); auto last3 = KE::cend(m_strided_view); const auto result3 = KE::partition_point(exespace(), first3, last3, p); - EXPECT_EQ(goldIndex, result3 - first3); + ASSERT_EQ(goldIndex, result3 - first3); } } diff --git a/algorithms/unit_tests/TestStdAlgorithmsRemove.cpp 
b/algorithms/unit_tests/TestStdAlgorithmsRemove.cpp index 8832d71f95..c35fc5c24b 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsRemove.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsRemove.cpp @@ -117,12 +117,12 @@ void verify_data(ViewTypeData view_data_h, ViewTypeTest view_test, // check that returned iterators are correct const std::size_t std_diff = std_result - KE::begin(view_data_h); const std::size_t my_diff = my_result - KE::begin(view_test); - EXPECT_EQ(std_diff, my_diff); + ASSERT_EQ(std_diff, my_diff); // check the actual data after algo has been applied auto view_test_h = create_host_space_copy(view_test); for (std::size_t i = 0; i < my_diff; ++i) { - EXPECT_EQ(view_test_h(i), view_data_h[i]); + ASSERT_EQ(view_test_h(i), view_data_h[i]); // std::cout << "i= " << i << " " // << "mine: " << view_test_h(i) << " " // << "std: " << view_data_h(i) diff --git a/algorithms/unit_tests/TestStdAlgorithmsRemoveCopy.cpp b/algorithms/unit_tests/TestStdAlgorithmsRemoveCopy.cpp index 949f8f60c9..3d7c52108b 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsRemoveCopy.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsRemoveCopy.cpp @@ -135,12 +135,12 @@ void verify_data(ViewFromType view_from, ViewDestType view_dest, // check that returned iterators are correct const std::size_t std_diff = std_result - gold_dest_std.begin(); const std::size_t my_diff = my_result - KE::begin(view_dest); - EXPECT_EQ(std_diff, my_diff); + ASSERT_EQ(std_diff, my_diff); // check the actual data after algo has been applied auto view_dest_h = create_host_space_copy(view_dest); for (std::size_t i = 0; i < my_diff; ++i) { - EXPECT_EQ(view_dest_h(i), gold_dest_std[i]); + ASSERT_EQ(view_dest_h(i), gold_dest_std[i]); // std::cout << "i= " << i << " " // << "mine: " << view_dest_h(i) << " " // << "std: " << gold_dest_std[i] diff --git a/algorithms/unit_tests/TestStdAlgorithmsRemoveCopyIf.cpp b/algorithms/unit_tests/TestStdAlgorithmsRemoveCopyIf.cpp index 9dc1e4a7e1..cb699aa923 100644 --- 
a/algorithms/unit_tests/TestStdAlgorithmsRemoveCopyIf.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsRemoveCopyIf.cpp @@ -119,12 +119,12 @@ void verify_data(ViewTypeFrom view_from, ViewTypeDest view_dest, // check that returned iterators are correct const std::size_t std_diff = std_result - gold_dest_std.begin(); const std::size_t my_diff = my_result - KE::begin(view_dest); - EXPECT_EQ(std_diff, my_diff); + ASSERT_EQ(std_diff, my_diff); // check the actual data after algo has been applied auto view_dest_h = create_host_space_copy(view_dest); for (std::size_t i = 0; i < my_diff; ++i) { - EXPECT_EQ(view_dest_h(i), gold_dest_std[i]); + ASSERT_EQ(view_dest_h(i), gold_dest_std[i]); // std::cout << "i= " << i << " " // << "mine: " << view_dest_h(i) << " " // << "std: " << gold_dest_std[i] diff --git a/algorithms/unit_tests/TestStdAlgorithmsRemoveIf.cpp b/algorithms/unit_tests/TestStdAlgorithmsRemoveIf.cpp index e9d15f29d8..f06f2234ee 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsRemoveIf.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsRemoveIf.cpp @@ -112,12 +112,12 @@ void verify_data(ViewTypeData view_data_h, ViewTypeTest view_test, // check that returned iterators are correct const std::size_t std_diff = std_result - KE::begin(view_data_h); const std::size_t my_diff = my_result - KE::begin(view_test); - EXPECT_EQ(std_diff, my_diff); + ASSERT_EQ(std_diff, my_diff); // check the actual data after algo has been applied auto view_test_h = create_host_space_copy(view_test); for (std::size_t i = 0; i < my_diff; ++i) { - EXPECT_EQ(view_test_h(i), view_data_h[i]); + ASSERT_EQ(view_test_h(i), view_data_h[i]); // std::cout << "i= " << i << " " // << "mine: " << view_test_h(i) << " " // << "std: " << view_data_h(i) diff --git a/algorithms/unit_tests/TestStdAlgorithmsReplace.cpp b/algorithms/unit_tests/TestStdAlgorithmsReplace.cpp index b226de5535..a22ab32d76 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsReplace.cpp +++ 
b/algorithms/unit_tests/TestStdAlgorithmsReplace.cpp @@ -104,30 +104,30 @@ void verify_data(const std::string& name, ViewType1 test_view, } else if (name == "one-element-a") { - EXPECT_EQ(view_h(0), ValueType{1}); + ASSERT_EQ(view_h(0), ValueType{1}); } else if (name == "one-element-b") { - EXPECT_EQ(view_h(0), new_value); + ASSERT_EQ(view_h(0), new_value); } else if (name == "two-elements-a") { - EXPECT_EQ(view_h(0), ValueType{1}); - EXPECT_EQ(view_h(1), new_value); + ASSERT_EQ(view_h(0), ValueType{1}); + ASSERT_EQ(view_h(1), new_value); } else if (name == "two-elements-b") { - EXPECT_EQ(view_h(0), new_value); - EXPECT_EQ(view_h(1), ValueType{-1}); + ASSERT_EQ(view_h(0), new_value); + ASSERT_EQ(view_h(1), ValueType{-1}); } else if (name == "small-a") { for (std::size_t i = 0; i < view_h.extent(0); ++i) { if (i == 0 || i == 3 || i == 5 || i == 6) { - EXPECT_EQ(view_h(i), new_value); + ASSERT_EQ(view_h(i), new_value); } else { const auto gold = ValueType{-5} + static_cast(i + 1); - EXPECT_EQ(view_h(i), gold); + ASSERT_EQ(view_h(i), gold); } } } @@ -135,9 +135,9 @@ void verify_data(const std::string& name, ViewType1 test_view, else if (name == "small-b") { for (std::size_t i = 0; i < view_h.extent(0); ++i) { if (i < 4) { - EXPECT_EQ(view_h(i), ValueType{-1}); + ASSERT_EQ(view_h(i), ValueType{-1}); } else { - EXPECT_EQ(view_h(i), new_value); + ASSERT_EQ(view_h(i), new_value); } } } @@ -145,9 +145,9 @@ void verify_data(const std::string& name, ViewType1 test_view, else if (name == "medium" || name == "large") { for (std::size_t i = 0; i < view_h.extent(0); ++i) { if (i % 2 == 0) { - EXPECT_EQ(view_h(i), ValueType{-1}); + ASSERT_EQ(view_h(i), ValueType{-1}); } else { - EXPECT_EQ(view_h(i), new_value); + ASSERT_EQ(view_h(i), new_value); } } } diff --git a/algorithms/unit_tests/TestStdAlgorithmsReplaceCopy.cpp b/algorithms/unit_tests/TestStdAlgorithmsReplaceCopy.cpp index 16b181fdd2..a964ec8e17 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsReplaceCopy.cpp +++ 
b/algorithms/unit_tests/TestStdAlgorithmsReplaceCopy.cpp @@ -112,40 +112,40 @@ void verify_data(const std::string& name, ViewTypeFrom view_from, } else if (name == "one-element-a") { - EXPECT_EQ(view_from_h(0), ValueType{1}); - EXPECT_EQ(view_test_h(0), view_from_h(0)); + ASSERT_EQ(view_from_h(0), ValueType{1}); + ASSERT_EQ(view_test_h(0), view_from_h(0)); } else if (name == "one-element-b") { - EXPECT_EQ(view_from_h(0), ValueType{2}); - EXPECT_EQ(view_test_h(0), new_value); + ASSERT_EQ(view_from_h(0), ValueType{2}); + ASSERT_EQ(view_test_h(0), new_value); } else if (name == "two-elements-a") { - EXPECT_EQ(view_from_h(0), ValueType{1}); - EXPECT_EQ(view_from_h(1), ValueType{2}); + ASSERT_EQ(view_from_h(0), ValueType{1}); + ASSERT_EQ(view_from_h(1), ValueType{2}); - EXPECT_EQ(view_test_h(0), view_from_h(0)); - EXPECT_EQ(view_test_h(1), new_value); + ASSERT_EQ(view_test_h(0), view_from_h(0)); + ASSERT_EQ(view_test_h(1), new_value); } else if (name == "two-elements-b") { - EXPECT_EQ(view_from_h(0), ValueType{2}); - EXPECT_EQ(view_from_h(1), ValueType{-1}); + ASSERT_EQ(view_from_h(0), ValueType{2}); + ASSERT_EQ(view_from_h(1), ValueType{-1}); - EXPECT_EQ(view_test_h(0), new_value); - EXPECT_EQ(view_test_h(1), view_from_h(1)); + ASSERT_EQ(view_test_h(0), new_value); + ASSERT_EQ(view_test_h(1), view_from_h(1)); } else if (name == "small-a") { for (std::size_t i = 0; i < view_test_h.extent(0); ++i) { if (i == 0 || i == 3 || i == 5 || i == 6) { - EXPECT_EQ(view_from_h(i), ValueType{2}); - EXPECT_EQ(view_test_h(i), new_value); + ASSERT_EQ(view_from_h(i), ValueType{2}); + ASSERT_EQ(view_test_h(i), new_value); } else { const auto gold = ValueType{-5} + static_cast(i + 1); - EXPECT_EQ(view_from_h(i), gold); - EXPECT_EQ(view_test_h(i), gold); + ASSERT_EQ(view_from_h(i), gold); + ASSERT_EQ(view_test_h(i), gold); } } } @@ -153,11 +153,11 @@ void verify_data(const std::string& name, ViewTypeFrom view_from, else if (name == "small-b") { for (std::size_t i = 0; i < 
view_test_h.extent(0); ++i) { if (i < 4) { - EXPECT_EQ(view_from_h(i), ValueType{-1}); - EXPECT_EQ(view_test_h(i), view_from_h(i)); + ASSERT_EQ(view_from_h(i), ValueType{-1}); + ASSERT_EQ(view_test_h(i), view_from_h(i)); } else { - EXPECT_EQ(view_from_h(i), ValueType{2}); - EXPECT_EQ(view_test_h(i), new_value); + ASSERT_EQ(view_from_h(i), ValueType{2}); + ASSERT_EQ(view_test_h(i), new_value); } } } @@ -165,11 +165,11 @@ void verify_data(const std::string& name, ViewTypeFrom view_from, else if (name == "medium" || name == "large") { for (std::size_t i = 0; i < view_test_h.extent(0); ++i) { if (i % 2 == 0) { - EXPECT_EQ(view_from_h(i), ValueType{-1}); - EXPECT_EQ(view_test_h(i), view_from_h(i)); + ASSERT_EQ(view_from_h(i), ValueType{-1}); + ASSERT_EQ(view_test_h(i), view_from_h(i)); } else { - EXPECT_EQ(view_from_h(i), ValueType{2}); - EXPECT_EQ(view_test_h(i), new_value); + ASSERT_EQ(view_from_h(i), ValueType{2}); + ASSERT_EQ(view_test_h(i), new_value); } } } @@ -202,7 +202,7 @@ void run_single_scenario(const InfoType& scenario_info) { KE::replace_copy(exespace(), KE::cbegin(view_from), KE::cend(view_from), KE::begin(view_dest), old_value, new_value); verify_data(name, view_from, view_dest, new_value); - EXPECT_EQ(rit, (KE::begin(view_dest) + view_ext)); + ASSERT_EQ(rit, (KE::begin(view_dest) + view_ext)); } { @@ -215,7 +215,7 @@ void run_single_scenario(const InfoType& scenario_info) { KE::cend(view_from), KE::begin(view_dest), old_value, new_value); verify_data(name, view_from, view_dest, new_value); - EXPECT_EQ(rit, (KE::begin(view_dest) + view_ext)); + ASSERT_EQ(rit, (KE::begin(view_dest) + view_ext)); } { @@ -227,7 +227,7 @@ void run_single_scenario(const InfoType& scenario_info) { auto rit = KE::replace_copy(exespace(), view_from, view_dest, old_value, new_value); verify_data(name, view_from, view_dest, new_value); - EXPECT_EQ(rit, (KE::begin(view_dest) + view_ext)); + ASSERT_EQ(rit, (KE::begin(view_dest) + view_ext)); } { @@ -239,7 +239,7 @@ void 
run_single_scenario(const InfoType& scenario_info) { auto rit = KE::replace_copy("label", exespace(), view_from, view_dest, old_value, new_value); verify_data(name, view_from, view_dest, new_value); - EXPECT_EQ(rit, (KE::begin(view_dest) + view_ext)); + ASSERT_EQ(rit, (KE::begin(view_dest) + view_ext)); } Kokkos::fence(); diff --git a/algorithms/unit_tests/TestStdAlgorithmsReplaceCopyIf.cpp b/algorithms/unit_tests/TestStdAlgorithmsReplaceCopyIf.cpp index a402e30ad9..ceeba88971 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsReplaceCopyIf.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsReplaceCopyIf.cpp @@ -112,40 +112,40 @@ void verify_data(const std::string& name, ViewTypeFrom view_from, } else if (name == "one-element-a") { - EXPECT_EQ(view_from_h(0), ValueType{1}); - EXPECT_EQ(view_test_h(0), view_from_h(0)); + ASSERT_EQ(view_from_h(0), ValueType{1}); + ASSERT_EQ(view_test_h(0), view_from_h(0)); } else if (name == "one-element-b") { - EXPECT_EQ(view_from_h(0), ValueType{2}); - EXPECT_EQ(view_test_h(0), new_value); + ASSERT_EQ(view_from_h(0), ValueType{2}); + ASSERT_EQ(view_test_h(0), new_value); } else if (name == "two-elements-a") { - EXPECT_EQ(view_from_h(0), ValueType{1}); - EXPECT_EQ(view_from_h(1), ValueType{2}); + ASSERT_EQ(view_from_h(0), ValueType{1}); + ASSERT_EQ(view_from_h(1), ValueType{2}); - EXPECT_EQ(view_test_h(0), view_from_h(0)); - EXPECT_EQ(view_test_h(1), new_value); + ASSERT_EQ(view_test_h(0), view_from_h(0)); + ASSERT_EQ(view_test_h(1), new_value); } else if (name == "two-elements-b") { - EXPECT_EQ(view_from_h(0), ValueType{2}); - EXPECT_EQ(view_from_h(1), ValueType{-1}); + ASSERT_EQ(view_from_h(0), ValueType{2}); + ASSERT_EQ(view_from_h(1), ValueType{-1}); - EXPECT_EQ(view_test_h(0), new_value); - EXPECT_EQ(view_test_h(1), view_from_h(1)); + ASSERT_EQ(view_test_h(0), new_value); + ASSERT_EQ(view_test_h(1), view_from_h(1)); } else if (name == "small-a") { for (std::size_t i = 0; i < view_test_h.extent(0); ++i) { if (i == 0 || i == 3 || 
i == 5 || i == 6) { - EXPECT_EQ(view_from_h(i), ValueType{2}); - EXPECT_EQ(view_test_h(i), new_value); + ASSERT_EQ(view_from_h(i), ValueType{2}); + ASSERT_EQ(view_test_h(i), new_value); } else { const auto gold = ValueType{-5} + static_cast(i + 1); - EXPECT_EQ(view_from_h(i), gold); - EXPECT_EQ(view_test_h(i), gold); + ASSERT_EQ(view_from_h(i), gold); + ASSERT_EQ(view_test_h(i), gold); } } } @@ -153,11 +153,11 @@ void verify_data(const std::string& name, ViewTypeFrom view_from, else if (name == "small-b") { for (std::size_t i = 0; i < view_test_h.extent(0); ++i) { if (i < 4) { - EXPECT_EQ(view_from_h(i), ValueType{-1}); - EXPECT_EQ(view_test_h(i), view_from_h(i)); + ASSERT_EQ(view_from_h(i), ValueType{-1}); + ASSERT_EQ(view_test_h(i), view_from_h(i)); } else { - EXPECT_EQ(view_from_h(i), ValueType{2}); - EXPECT_EQ(view_test_h(i), new_value); + ASSERT_EQ(view_from_h(i), ValueType{2}); + ASSERT_EQ(view_test_h(i), new_value); } } } @@ -165,11 +165,11 @@ void verify_data(const std::string& name, ViewTypeFrom view_from, else if (name == "medium" || name == "large") { for (std::size_t i = 0; i < view_test_h.extent(0); ++i) { if (i % 2 == 0) { - EXPECT_EQ(view_from_h(i), ValueType{-1}); - EXPECT_EQ(view_test_h(i), view_from_h(i)); + ASSERT_EQ(view_from_h(i), ValueType{-1}); + ASSERT_EQ(view_test_h(i), view_from_h(i)); } else { - EXPECT_EQ(view_from_h(i), ValueType{2}); - EXPECT_EQ(view_test_h(i), new_value); + ASSERT_EQ(view_from_h(i), ValueType{2}); + ASSERT_EQ(view_test_h(i), new_value); } } } @@ -209,7 +209,7 @@ void run_single_scenario(const InfoType& scenario_info) { KE::cend(view_from), KE::begin(view_dest), pred_type(), new_value); verify_data(name, view_from, view_dest, new_value); - EXPECT_EQ(rit, (KE::begin(view_dest) + view_ext)); + ASSERT_EQ(rit, (KE::begin(view_dest) + view_ext)); } { @@ -220,7 +220,7 @@ void run_single_scenario(const InfoType& scenario_info) { KE::cend(view_from), KE::begin(view_dest), pred_type(), new_value); verify_data(name, view_from, 
view_dest, new_value); - EXPECT_EQ(rit, (KE::begin(view_dest) + view_ext)); + ASSERT_EQ(rit, (KE::begin(view_dest) + view_ext)); } { @@ -230,7 +230,7 @@ void run_single_scenario(const InfoType& scenario_info) { auto rit = KE::replace_copy_if(exespace(), view_from, view_dest, pred_type(), new_value); verify_data(name, view_from, view_dest, new_value); - EXPECT_EQ(rit, (KE::begin(view_dest) + view_ext)); + ASSERT_EQ(rit, (KE::begin(view_dest) + view_ext)); } { @@ -240,7 +240,7 @@ void run_single_scenario(const InfoType& scenario_info) { auto rit = KE::replace_copy_if("label", exespace(), view_from, view_dest, pred_type(), new_value); verify_data(name, view_from, view_dest, new_value); - EXPECT_EQ(rit, (KE::begin(view_dest) + view_ext)); + ASSERT_EQ(rit, (KE::begin(view_dest) + view_ext)); } Kokkos::fence(); diff --git a/algorithms/unit_tests/TestStdAlgorithmsReplaceIf.cpp b/algorithms/unit_tests/TestStdAlgorithmsReplaceIf.cpp index f481144e1c..802c0093c5 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsReplaceIf.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsReplaceIf.cpp @@ -138,7 +138,7 @@ void verify_data(ViewType1 data_view, // contains data // << data_view_dc(i) << " " // << data_view_h(i) << " " // << test_view_h(i) << std::endl; - EXPECT_EQ(data_view_h(i), test_view_h(i)); + ASSERT_EQ(data_view_h(i), test_view_h(i)); } } } diff --git a/algorithms/unit_tests/TestStdAlgorithmsReverse.cpp b/algorithms/unit_tests/TestStdAlgorithmsReverse.cpp index 7d16e54029..6e6ca72783 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsReverse.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsReverse.cpp @@ -77,7 +77,7 @@ void verify_data(ViewType1 test_view, ViewType2 orig_view) { const std::size_t ext = test_view.extent(0); for (std::size_t i = 0; i < ext; ++i) { - EXPECT_EQ(tv_h(i), ov_h(ext - i - 1)); + ASSERT_EQ(tv_h(i), ov_h(ext - i - 1)); } } diff --git a/algorithms/unit_tests/TestStdAlgorithmsRotate.cpp b/algorithms/unit_tests/TestStdAlgorithmsRotate.cpp index 
a5a6f99bac..5638cbee4a 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsRotate.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsRotate.cpp @@ -136,13 +136,13 @@ void verify_data(ResultIt result_it, ViewType view, ViewHostType data_view_host, // make sure results match const auto my_diff = result_it - KE::begin(view); const auto std_diff = std_rit - KE::begin(data_view_host); - EXPECT_EQ(my_diff, std_diff); + ASSERT_EQ(my_diff, std_diff); // check views match auto view_h = create_host_space_copy(view); const std::size_t ext = view_h.extent(0); for (std::size_t i = 0; i < ext; ++i) { - EXPECT_EQ(view_h(i), data_view_host[i]); + ASSERT_EQ(view_h(i), data_view_host[i]); // std::cout << "i= " << i << " " // << "mine: " << view_h(i) << " " // << "std: " << data_view_host(i) diff --git a/algorithms/unit_tests/TestStdAlgorithmsRotateCopy.cpp b/algorithms/unit_tests/TestStdAlgorithmsRotateCopy.cpp index 27451a1d04..d0caca7cea 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsRotateCopy.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsRotateCopy.cpp @@ -139,7 +139,7 @@ void verify_data(ViewTypeFrom view_from, ViewTypeTest view_test, std_gold_h.begin()); for (std::size_t i = 0; i < ext; ++i) { - EXPECT_EQ(view_test_h(i), std_gold_h[i]); + ASSERT_EQ(view_test_h(i), std_gold_h[i]); // std::cout << "i= " << i << " " // << "from: " << view_from_h(i) << " " // << "mine: " << view_test_h(i) << " " @@ -177,7 +177,7 @@ void run_single_scenario(const InfoType& scenario_info, auto rit = KE::rotate_copy(exespace(), KE::cbegin(view_from), n_it, KE::cend(view_from), KE::begin(view_dest)); verify_data(view_from, view_dest, rotation_point); - EXPECT_EQ(rit, (KE::begin(view_dest) + view_ext)); + ASSERT_EQ(rit, (KE::begin(view_dest) + view_ext)); } { @@ -187,7 +187,7 @@ void run_single_scenario(const InfoType& scenario_info, auto rit = KE::rotate_copy("label", exespace(), KE::cbegin(view_from), n_it, KE::cend(view_from), KE::begin(view_dest)); verify_data(view_from, view_dest, 
rotation_point); - EXPECT_EQ(rit, (KE::begin(view_dest) + view_ext)); + ASSERT_EQ(rit, (KE::begin(view_dest) + view_ext)); } { @@ -196,7 +196,7 @@ void run_single_scenario(const InfoType& scenario_info, auto rit = KE::rotate_copy(exespace(), view_from, rotation_point, view_dest); verify_data(view_from, view_dest, rotation_point); - EXPECT_EQ(rit, (KE::begin(view_dest) + view_ext)); + ASSERT_EQ(rit, (KE::begin(view_dest) + view_ext)); } { @@ -205,7 +205,7 @@ void run_single_scenario(const InfoType& scenario_info, auto rit = KE::rotate_copy("label", exespace(), view_from, rotation_point, view_dest); verify_data(view_from, view_dest, rotation_point); - EXPECT_EQ(rit, (KE::begin(view_dest) + view_ext)); + ASSERT_EQ(rit, (KE::begin(view_dest) + view_ext)); } Kokkos::fence(); diff --git a/algorithms/unit_tests/TestStdAlgorithmsSearch.cpp b/algorithms/unit_tests/TestStdAlgorithmsSearch.cpp index c25b82a245..021609c444 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsSearch.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsSearch.cpp @@ -259,7 +259,7 @@ void run_single_scenario(const InfoType& scenario_info, std::size_t seq_ext, KE::cbegin(s_view), KE::cend(s_view), args...); const auto mydiff = myrit - KE::cbegin(view); const auto stddiff = stdrit - KE::cbegin(view_h); - EXPECT_EQ(mydiff, stddiff); + ASSERT_EQ(mydiff, stddiff); } { @@ -268,21 +268,21 @@ void run_single_scenario(const InfoType& scenario_info, std::size_t seq_ext, KE::cbegin(s_view), KE::cend(s_view), args...); const auto mydiff = myrit - KE::cbegin(view); const auto stddiff = stdrit - KE::cbegin(view_h); - EXPECT_EQ(mydiff, stddiff); + ASSERT_EQ(mydiff, stddiff); } { auto myrit = KE::search(exespace(), view, s_view, args...); const auto mydiff = myrit - KE::begin(view); const auto stddiff = stdrit - KE::cbegin(view_h); - EXPECT_EQ(mydiff, stddiff); + ASSERT_EQ(mydiff, stddiff); } { auto myrit = KE::search("label", exespace(), view, s_view, args...); const auto mydiff = myrit - KE::begin(view); const auto 
stddiff = stdrit - KE::cbegin(view_h); - EXPECT_EQ(mydiff, stddiff); + ASSERT_EQ(mydiff, stddiff); } Kokkos::fence(); diff --git a/algorithms/unit_tests/TestStdAlgorithmsSearch_n.cpp b/algorithms/unit_tests/TestStdAlgorithmsSearch_n.cpp index 68e2b1bf0f..53ad8daa2e 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsSearch_n.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsSearch_n.cpp @@ -203,26 +203,26 @@ void run_single_scenario(const InfoType& scenario_info, std::size_t count, auto myrit = KE::search_n(exespace(), KE::cbegin(view), KE::cend(view), count, value, args...); const auto mydiff = myrit - KE::cbegin(view); - EXPECT_EQ(mydiff, stddiff); + ASSERT_EQ(mydiff, stddiff); } { auto myrit = KE::search_n("label", exespace(), KE::cbegin(view), KE::cend(view), count, value, args...); const auto mydiff = myrit - KE::cbegin(view); - EXPECT_EQ(mydiff, stddiff); + ASSERT_EQ(mydiff, stddiff); } { auto myrit = KE::search_n("label", exespace(), view, count, value, args...); const auto mydiff = myrit - KE::begin(view); - EXPECT_EQ(mydiff, stddiff); + ASSERT_EQ(mydiff, stddiff); } { auto myrit = KE::search_n(exespace(), view, count, value, args...); const auto mydiff = myrit - KE::begin(view); - EXPECT_EQ(mydiff, stddiff); + ASSERT_EQ(mydiff, stddiff); } Kokkos::fence(); diff --git a/algorithms/unit_tests/TestStdAlgorithmsShiftLeft.cpp b/algorithms/unit_tests/TestStdAlgorithmsShiftLeft.cpp index 8e4ced9635..0b5fe9216e 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsShiftLeft.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsShiftLeft.cpp @@ -103,12 +103,12 @@ void verify_data(ResultIt result_it, ViewType view, ViewHostType data_view_host, // make sure results match const auto my_diff = result_it - KE::begin(view); const auto std_diff = std_rit - KE::begin(data_view_host); - EXPECT_EQ(my_diff, std_diff); + ASSERT_EQ(my_diff, std_diff); // check views match auto view_h = create_host_space_copy(view); for (std::size_t i = 0; i < (std::size_t)my_diff; ++i) { - 
EXPECT_EQ(view_h(i), data_view_host[i]); + ASSERT_EQ(view_h(i), data_view_host[i]); // std::cout << "i= " << i << " " // << "mine: " << view_h(i) << " " // << "std: " << data_view_host(i) diff --git a/algorithms/unit_tests/TestStdAlgorithmsShiftRight.cpp b/algorithms/unit_tests/TestStdAlgorithmsShiftRight.cpp index a1614be027..8e4ae94375 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsShiftRight.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsShiftRight.cpp @@ -101,14 +101,14 @@ void verify_data(ResultIt result_it, ViewType view, ViewHostType data_view_host, // make sure results match const auto my_diff = KE::end(view) - result_it; const auto std_diff = KE::end(data_view_host) - std_rit; - EXPECT_EQ(my_diff, std_diff); + ASSERT_EQ(my_diff, std_diff); // check views match auto view_h = create_host_space_copy(view); auto it1 = KE::cbegin(view_h); auto it2 = KE::cbegin(data_view_host); for (std::size_t i = 0; i < (std::size_t)my_diff; ++i) { - EXPECT_EQ(it1[i], it2[i]); + ASSERT_EQ(it1[i], it2[i]); // std::cout << "i= " << i << " " // << "mine: " << it1[i] << " " // << "std: " << it2[i] diff --git a/algorithms/unit_tests/TestStdAlgorithmsTransformExclusiveScan.cpp b/algorithms/unit_tests/TestStdAlgorithmsTransformExclusiveScan.cpp index fcbccc221c..75525b3b0f 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsTransformExclusiveScan.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsTransformExclusiveScan.cpp @@ -165,7 +165,7 @@ void verify_data(ViewType1 data_view, // contains data // << std::abs(gold_h(i) - test_view_h(i)) << std::endl; if (std::is_same::value) { - EXPECT_EQ(gold_h(i), test_view_h(i)); + ASSERT_EQ(gold_h(i), test_view_h(i)); } else { const auto error = std::abs(gold_h(i) - test_view_h(i)); if (error > 1e-10) { @@ -221,7 +221,7 @@ void run_single_scenario(const InfoType& scenario_info, ValueType init_value, auto r = KE::transform_exclusive_scan( exespace(), KE::cbegin(view_from), KE::cend(view_from), KE::begin(view_dest), init_value, bop, uop); - 
EXPECT_EQ(r, KE::end(view_dest)); + ASSERT_EQ(r, KE::end(view_dest)); verify_data(view_from, view_dest, init_value, bop, uop); } @@ -230,7 +230,7 @@ void run_single_scenario(const InfoType& scenario_info, ValueType init_value, auto r = KE::transform_exclusive_scan( "label", exespace(), KE::cbegin(view_from), KE::cend(view_from), KE::begin(view_dest), init_value, bop, uop); - EXPECT_EQ(r, KE::end(view_dest)); + ASSERT_EQ(r, KE::end(view_dest)); verify_data(view_from, view_dest, init_value, bop, uop); } @@ -238,7 +238,7 @@ void run_single_scenario(const InfoType& scenario_info, ValueType init_value, fill_zero(view_dest); auto r = KE::transform_exclusive_scan(exespace(), view_from, view_dest, init_value, bop, uop); - EXPECT_EQ(r, KE::end(view_dest)); + ASSERT_EQ(r, KE::end(view_dest)); verify_data(view_from, view_dest, init_value, bop, uop); } @@ -246,7 +246,7 @@ void run_single_scenario(const InfoType& scenario_info, ValueType init_value, fill_zero(view_dest); auto r = KE::transform_exclusive_scan("label", exespace(), view_from, view_dest, init_value, bop, uop); - EXPECT_EQ(r, KE::end(view_dest)); + ASSERT_EQ(r, KE::end(view_dest)); verify_data(view_from, view_dest, init_value, bop, uop); } @@ -303,33 +303,33 @@ TEST(std_algorithms_numeric_ops_test, transform_exclusive_scan_functor) { value_type value1; functor.init(value1); - EXPECT_EQ(value1.val, 0); - EXPECT_EQ(value1.is_initial, true); + ASSERT_EQ(value1.val, 0); + ASSERT_EQ(value1.is_initial, true); value_type value2; value2.val = 1; value2.is_initial = false; functor.join(value1, value2); - EXPECT_EQ(value1.val, 1); - EXPECT_EQ(value1.is_initial, false); + ASSERT_EQ(value1.val, 1); + ASSERT_EQ(value1.is_initial, false); functor.init(value1); functor.join(value2, value1); - EXPECT_EQ(value2.val, 1); - EXPECT_EQ(value2.is_initial, false); + ASSERT_EQ(value2.val, 1); + ASSERT_EQ(value2.is_initial, false); functor.init(value2); functor.join(value2, value1); - EXPECT_EQ(value2.val, 0); - EXPECT_EQ(value2.is_initial, 
true); + ASSERT_EQ(value2.val, 0); + ASSERT_EQ(value2.is_initial, true); value1.val = 3; value1.is_initial = false; value2.val = 2; value2.is_initial = false; functor.join(value2, value1); - EXPECT_EQ(value2.val, 6); - EXPECT_EQ(value2.is_initial, false); + ASSERT_EQ(value2.val, 6); + ASSERT_EQ(value2.is_initial, false); } } // namespace TransformEScan diff --git a/algorithms/unit_tests/TestStdAlgorithmsTransformInclusiveScan.cpp b/algorithms/unit_tests/TestStdAlgorithmsTransformInclusiveScan.cpp index 095e490aa5..5d122ac5e8 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsTransformInclusiveScan.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsTransformInclusiveScan.cpp @@ -177,7 +177,7 @@ void verify_data(ViewType1 data_view, // contains data // << std::abs(gold_h(i) - test_view_h(i)) << std::endl; if (std::is_same::value) { - EXPECT_EQ(gold_h(i), test_view_h(i)); + ASSERT_EQ(gold_h(i), test_view_h(i)); } else { const auto error = std::abs(gold_h(i) - test_view_h(i)); if (error > 1e-10) { @@ -246,7 +246,7 @@ void run_single_scenario(const InfoType& scenario_info, auto r = KE::transform_inclusive_scan(exespace(), KE::cbegin(view_from), KE::cend(view_from), KE::begin(view_dest), args...); - EXPECT_EQ(r, KE::end(view_dest)); + ASSERT_EQ(r, KE::end(view_dest)); verify_data(view_from, view_dest, args...); } @@ -255,7 +255,7 @@ void run_single_scenario(const InfoType& scenario_info, auto r = KE::transform_inclusive_scan( "label", exespace(), KE::cbegin(view_from), KE::cend(view_from), KE::begin(view_dest), args...); - EXPECT_EQ(r, KE::end(view_dest)); + ASSERT_EQ(r, KE::end(view_dest)); verify_data(view_from, view_dest, args...); } @@ -263,7 +263,7 @@ void run_single_scenario(const InfoType& scenario_info, fill_zero(view_dest); auto r = KE::transform_inclusive_scan(exespace(), view_from, view_dest, args...); - EXPECT_EQ(r, KE::end(view_dest)); + ASSERT_EQ(r, KE::end(view_dest)); verify_data(view_from, view_dest, args...); } @@ -271,7 +271,7 @@ void 
run_single_scenario(const InfoType& scenario_info, fill_zero(view_dest); auto r = KE::transform_inclusive_scan("label", exespace(), view_from, view_dest, args...); - EXPECT_EQ(r, KE::end(view_dest)); + ASSERT_EQ(r, KE::end(view_dest)); verify_data(view_from, view_dest, args...); } @@ -320,33 +320,33 @@ TEST(std_algorithms_numeric_ops_test, transform_inclusive_scan_functor) { auto test_lambda = [&](auto& functor) { value_type value1; functor.init(value1); - EXPECT_EQ(value1.val, 0); - EXPECT_EQ(value1.is_initial, true); + ASSERT_EQ(value1.val, 0); + ASSERT_EQ(value1.is_initial, true); value_type value2; value2.val = 1; value2.is_initial = false; functor.join(value1, value2); - EXPECT_EQ(value1.val, 1); - EXPECT_EQ(value1.is_initial, false); + ASSERT_EQ(value1.val, 1); + ASSERT_EQ(value1.is_initial, false); functor.init(value1); functor.join(value2, value1); - EXPECT_EQ(value2.val, 1); - EXPECT_EQ(value2.is_initial, false); + ASSERT_EQ(value2.val, 1); + ASSERT_EQ(value2.is_initial, false); functor.init(value2); functor.join(value2, value1); - EXPECT_EQ(value2.val, 0); - EXPECT_EQ(value2.is_initial, true); + ASSERT_EQ(value2.val, 0); + ASSERT_EQ(value2.is_initial, true); value1.val = 3; value1.is_initial = false; value2.val = 2; value2.is_initial = false; functor.join(value2, value1); - EXPECT_EQ(value2.val, 6); - EXPECT_EQ(value2.is_initial, false); + ASSERT_EQ(value2.val, 6); + ASSERT_EQ(value2.is_initial, false); }; int dummy = 0; diff --git a/algorithms/unit_tests/TestStdAlgorithmsTransformUnaryOp.cpp b/algorithms/unit_tests/TestStdAlgorithmsTransformUnaryOp.cpp index dab81b8f1e..6070c1a60d 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsTransformUnaryOp.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsTransformUnaryOp.cpp @@ -58,7 +58,7 @@ void verify_data(ViewTypeFrom view_from, ViewTypeTest view_test) { create_mirror_view_and_copy(Kokkos::HostSpace(), view_from_dc); for (std::size_t i = 0; i < view_test_h.extent(0); ++i) { - EXPECT_EQ(view_test_h(i), 
view_from_h(i) + value_type(1)); + ASSERT_EQ(view_test_h(i), view_from_h(i) + value_type(1)); } } @@ -89,7 +89,7 @@ void run_single_scenario(const InfoType& scenario_info) { auto r1 = KE::transform(exespace(), KE::begin(view_from), KE::end(view_from), KE::begin(view_dest), unOp); verify_data(view_from, view_dest); - EXPECT_EQ(r1, KE::end(view_dest)); + ASSERT_EQ(r1, KE::end(view_dest)); } { @@ -98,7 +98,7 @@ void run_single_scenario(const InfoType& scenario_info) { auto r1 = KE::transform("label", exespace(), KE::begin(view_from), KE::end(view_from), KE::begin(view_dest), unOp); verify_data(view_from, view_dest); - EXPECT_EQ(r1, KE::end(view_dest)); + ASSERT_EQ(r1, KE::end(view_dest)); } { @@ -106,7 +106,7 @@ void run_single_scenario(const InfoType& scenario_info) { create_view(Tag{}, view_ext, "transform_uop_dest"); auto r1 = KE::transform(exespace(), view_from, view_dest, unOp); verify_data(view_from, view_dest); - EXPECT_EQ(r1, KE::end(view_dest)); + ASSERT_EQ(r1, KE::end(view_dest)); } { @@ -114,7 +114,7 @@ void run_single_scenario(const InfoType& scenario_info) { create_view(Tag{}, view_ext, "transform_uop_dest"); auto r1 = KE::transform("label", exespace(), view_from, view_dest, unOp); verify_data(view_from, view_dest); - EXPECT_EQ(r1, KE::end(view_dest)); + ASSERT_EQ(r1, KE::end(view_dest)); } Kokkos::fence(); diff --git a/algorithms/unit_tests/TestStdAlgorithmsUnique.cpp b/algorithms/unit_tests/TestStdAlgorithmsUnique.cpp index a810d31d82..9c5ae0cf8a 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsUnique.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsUnique.cpp @@ -157,7 +157,7 @@ void verify_data(const std::string& name, ResultIt my_result_it, // const auto std_diff = (std::size_t)(std_r - KE::begin(data_v_h)); const auto my_diff = (std::size_t)(my_result_it - KE::begin(view_test)); - EXPECT_EQ(my_diff, std_diff); + ASSERT_EQ(my_diff, std_diff); // // check the data in the view @@ -170,14 +170,14 @@ void verify_data(const std::string& name, ResultIt 
my_result_it, // << " my = " << view_test_h(i) << " " // << " std = " << data_v_h(i) // << '\n'; - EXPECT_EQ(view_test_h(i), data_v_h(i)); + ASSERT_EQ(view_test_h(i), data_v_h(i)); } if (name == "medium-b") { using value_type = typename ViewType1::value_type; - EXPECT_EQ(my_diff, (std::size_t)2); - EXPECT_EQ(view_test_h(0), (value_type)22); - EXPECT_EQ(view_test_h(1), (value_type)44); + ASSERT_EQ(my_diff, (std::size_t)2); + ASSERT_EQ(view_test_h(0), (value_type)22); + ASSERT_EQ(view_test_h(1), (value_type)44); } } diff --git a/algorithms/unit_tests/TestStdAlgorithmsUniqueCopy.cpp b/algorithms/unit_tests/TestStdAlgorithmsUniqueCopy.cpp index f609d8517e..3cf43ad4db 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsUniqueCopy.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsUniqueCopy.cpp @@ -174,51 +174,51 @@ void verify_data(const std::string& name, ViewTypeFrom view_from, } else if (name == "one-element-a") { - EXPECT_EQ(view_test_h(0), static_cast(1)); + ASSERT_EQ(view_test_h(0), static_cast(1)); } else if (name == "one-element-b") { - EXPECT_EQ(view_test_h(0), static_cast(2)); + ASSERT_EQ(view_test_h(0), static_cast(2)); } else if (name == "two-elements-a") { - EXPECT_EQ(view_test_h(0), static_cast(1)); - EXPECT_EQ(view_test_h(1), static_cast(2)); + ASSERT_EQ(view_test_h(0), static_cast(1)); + ASSERT_EQ(view_test_h(1), static_cast(2)); } else if (name == "two-elements-b") { - EXPECT_EQ(view_test_h(0), static_cast(2)); - EXPECT_EQ(view_test_h(1), static_cast(-1)); + ASSERT_EQ(view_test_h(0), static_cast(2)); + ASSERT_EQ(view_test_h(1), static_cast(-1)); } else if (name == "small-a") { - EXPECT_EQ(view_test_h(0), static_cast(0)); - EXPECT_EQ(view_test_h(1), static_cast(1)); - EXPECT_EQ(view_test_h(2), static_cast(2)); - EXPECT_EQ(view_test_h(3), static_cast(3)); - EXPECT_EQ(view_test_h(4), static_cast(4)); - EXPECT_EQ(view_test_h(5), static_cast(5)); - EXPECT_EQ(view_test_h(6), static_cast(6)); - EXPECT_EQ(view_test_h(7), static_cast(0)); - 
EXPECT_EQ(view_test_h(8), static_cast(0)); - EXPECT_EQ(view_test_h(9), static_cast(0)); - EXPECT_EQ(view_test_h(10), static_cast(0)); + ASSERT_EQ(view_test_h(0), static_cast(0)); + ASSERT_EQ(view_test_h(1), static_cast(1)); + ASSERT_EQ(view_test_h(2), static_cast(2)); + ASSERT_EQ(view_test_h(3), static_cast(3)); + ASSERT_EQ(view_test_h(4), static_cast(4)); + ASSERT_EQ(view_test_h(5), static_cast(5)); + ASSERT_EQ(view_test_h(6), static_cast(6)); + ASSERT_EQ(view_test_h(7), static_cast(0)); + ASSERT_EQ(view_test_h(8), static_cast(0)); + ASSERT_EQ(view_test_h(9), static_cast(0)); + ASSERT_EQ(view_test_h(10), static_cast(0)); } else if (name == "small-b") { - EXPECT_EQ(view_test_h(0), static_cast(1)); - EXPECT_EQ(view_test_h(1), static_cast(2)); - EXPECT_EQ(view_test_h(2), static_cast(3)); - EXPECT_EQ(view_test_h(3), static_cast(4)); - EXPECT_EQ(view_test_h(4), static_cast(5)); - EXPECT_EQ(view_test_h(5), static_cast(6)); - EXPECT_EQ(view_test_h(6), static_cast(8)); - EXPECT_EQ(view_test_h(7), static_cast(9)); - EXPECT_EQ(view_test_h(8), static_cast(8)); - EXPECT_EQ(view_test_h(9), static_cast(0)); - EXPECT_EQ(view_test_h(10), static_cast(0)); - EXPECT_EQ(view_test_h(11), static_cast(0)); - EXPECT_EQ(view_test_h(12), static_cast(0)); + ASSERT_EQ(view_test_h(0), static_cast(1)); + ASSERT_EQ(view_test_h(1), static_cast(2)); + ASSERT_EQ(view_test_h(2), static_cast(3)); + ASSERT_EQ(view_test_h(3), static_cast(4)); + ASSERT_EQ(view_test_h(4), static_cast(5)); + ASSERT_EQ(view_test_h(5), static_cast(6)); + ASSERT_EQ(view_test_h(6), static_cast(8)); + ASSERT_EQ(view_test_h(7), static_cast(9)); + ASSERT_EQ(view_test_h(8), static_cast(8)); + ASSERT_EQ(view_test_h(9), static_cast(0)); + ASSERT_EQ(view_test_h(10), static_cast(0)); + ASSERT_EQ(view_test_h(11), static_cast(0)); + ASSERT_EQ(view_test_h(12), static_cast(0)); } else if (name == "medium" || name == "large") { @@ -230,7 +230,7 @@ void verify_data(const std::string& name, ViewTypeFrom view_from, (void)std_r; for 
(std::size_t i = 0; i < view_from_h.extent(0); ++i) { - EXPECT_EQ(view_test_h(i), tmp[i]); + ASSERT_EQ(view_test_h(i), tmp[i]); } } @@ -273,7 +273,7 @@ void run_single_scenario(const InfoType& scenario_info, Args... args) { KE::unique_copy(exespace(), KE::cbegin(view_from), KE::cend(view_from), KE::begin(view_dest), args...); verify_data(name, view_from, view_dest, args...); - EXPECT_EQ(rit, (KE::begin(view_dest) + n)); + ASSERT_EQ(rit, (KE::begin(view_dest) + n)); } { @@ -283,7 +283,7 @@ void run_single_scenario(const InfoType& scenario_info, Args... args) { KE::unique_copy("label", exespace(), KE::cbegin(view_from), KE::cend(view_from), KE::begin(view_dest), args...); verify_data(name, view_from, view_dest, args...); - EXPECT_EQ(rit, (KE::begin(view_dest) + n)); + ASSERT_EQ(rit, (KE::begin(view_dest) + n)); } { @@ -291,7 +291,7 @@ void run_single_scenario(const InfoType& scenario_info, Args... args) { create_view(Tag{}, view_ext, "unique_copy_dest"); auto rit = KE::unique_copy(exespace(), view_from, view_dest, args...); verify_data(name, view_from, view_dest, args...); - EXPECT_EQ(rit, (KE::begin(view_dest) + n)); + ASSERT_EQ(rit, (KE::begin(view_dest) + n)); } { @@ -300,7 +300,7 @@ void run_single_scenario(const InfoType& scenario_info, Args... 
args) { auto rit = KE::unique_copy("label", exespace(), view_from, view_dest, args...); verify_data(name, view_from, view_dest, args...); - EXPECT_EQ(rit, (KE::begin(view_dest) + n)); + ASSERT_EQ(rit, (KE::begin(view_dest) + n)); } Kokkos::fence(); From 41d9d06269e2115f6477a80ba07ed14e6fdd647e Mon Sep 17 00:00:00 2001 From: Christian Trott Date: Tue, 9 May 2023 10:12:48 -0600 Subject: [PATCH 431/496] Reintroduce test skip for nvhpc < 23.3 --- core/unit_test/TestJoinBackwardCompatibility.hpp | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/core/unit_test/TestJoinBackwardCompatibility.hpp b/core/unit_test/TestJoinBackwardCompatibility.hpp index 865b71d03f..3ca4421270 100644 --- a/core/unit_test/TestJoinBackwardCompatibility.hpp +++ b/core/unit_test/TestJoinBackwardCompatibility.hpp @@ -21,7 +21,7 @@ // unimplemented reduction features namespace { -// FIXME_NVHPC errors out when using enums here +// FIXME_NVHPC 23.3 errors out when using enums here // NVC++-F-0000-Internal compiler error. process_acc_put_dinit: unexpected // datatype 5339 #ifndef KOKKOS_COMPILER_NVHPC @@ -144,6 +144,11 @@ void test_join_backward_compatibility() { } TEST(TEST_CATEGORY, join_backward_compatibility) { +#if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOS_COMPILER_NVHPC) && \ + KOKKOS_COMPILER_NVHPC < \ + 230300 // FIXME_NVHPC test passes with workaround in 23.3 + GTEST_SKIP() << "FIXME wrong result"; +#endif test_join_backward_compatibility(); } From aa7ab5fd59ffed3625eff6e9a0eb11b49d4bc528 Mon Sep 17 00:00:00 2001 From: Richard Berger Date: Tue, 9 May 2023 16:41:18 -0600 Subject: [PATCH 432/496] hpcbind: check for correct Slurm variable --- bin/hpcbind | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/hpcbind b/bin/hpcbind index 32503c7efc..b6db270128 100755 --- a/bin/hpcbind +++ b/bin/hpcbind @@ -105,7 +105,7 @@ elif [[ ! 
-z "${MV2_COMM_WORLD_RANK}" ]]; then HPCBIND_QUEUE_NAME="mvapich2" HPCBIND_QUEUE_RANK=${MV2_COMM_WORLD_RANK} HPCBIND_QUEUE_SIZE=${MV2_COMM_WORLD_SIZE} -elif [[ ! -z "${SLURM_LOCAL_ID}" ]]; then +elif [[ ! -z "${SLURM_LOCALID}" ]]; then HPCBIND_QUEUE_MAPPING=1 HPCBIND_QUEUE_NAME="slurm" HPCBIND_QUEUE_RANK=${SLURM_PROCID} From 531b01dce1ca3ef235c1678f7d4875513ce465e5 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Tue, 9 May 2023 22:00:51 -0400 Subject: [PATCH 433/496] Fix macro guards in test for NVC++ as the CUDA compiler --- core/unit_test/TestNumericTraits.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/unit_test/TestNumericTraits.hpp b/core/unit_test/TestNumericTraits.hpp index fc6c6bba6c..f97fe7a699 100644 --- a/core/unit_test/TestNumericTraits.hpp +++ b/core/unit_test/TestNumericTraits.hpp @@ -42,7 +42,7 @@ struct extrema { DEFINE_EXTREMA(double, -DBL_MAX, DBL_MAX); // FIXME_NVHPC: with 23.3 using long double in KOKKOS_FUNCTION is hard error -#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_COMPILER_NVHPC) +#if !defined(KOKKOS_ENABLE_CUDA) || !defined(KOKKOS_COMPILER_NVHPC) DEFINE_EXTREMA(long double, -LDBL_MAX, LDBL_MAX); #else static long double min(long double) { return -LDBL_MAX; } From c62a42e1c6d83f0b6ab7a27928185a79445665da Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Wed, 10 May 2023 08:47:17 -0400 Subject: [PATCH 434/496] Allow templated functors in parallel_for, parallel_reduce and parallel_scan (#5976) * Allow templated functors in parallel_for, parallel_reduce and parallel_scan * Reorder template arguments for cuda_single_inter_block_reduce_scan_shmem * Add another test to TestFunctorAnalysis.hpp * Document OverrrideValueType some more * Document that reducer functor is templated on purpose * GenericScanFunctor->GenericExclusiveScanFunctor * SizeType->IndexType * Revert unnecessary changes in Test16_ParallelScan.hpp --- .../src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp | 15 +++--- 
core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp | 44 ++++++++------- core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp | 10 ++-- core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp | 13 ++--- core/src/Cuda/Kokkos_Cuda_Task.hpp | 6 ++- core/src/Cuda/Kokkos_Cuda_Team.hpp | 15 +++--- core/src/HIP/Kokkos_HIP_Parallel_MDRange.hpp | 8 +-- core/src/HIP/Kokkos_HIP_Parallel_Range.hpp | 38 +++++++------ core/src/HIP/Kokkos_HIP_Parallel_Team.hpp | 32 ++++++----- core/src/HIP/Kokkos_HIP_ReduceScan.hpp | 14 ++--- core/src/HIP/Kokkos_HIP_Team.hpp | 13 ++--- core/src/HPX/Kokkos_HPX.hpp | 22 ++++---- core/src/Kokkos_GraphNode.hpp | 3 +- core/src/Kokkos_Parallel.hpp | 3 +- core/src/Kokkos_Parallel_Reduce.hpp | 19 +++---- .../Kokkos_OpenACC_ParallelScan_Range.hpp | 47 +++++++++++----- core/src/OpenMP/Kokkos_OpenMP_Parallel.hpp | 6 +-- ...kkos_OpenMPTarget_ParallelReduce_Range.hpp | 6 +-- ...okkos_OpenMPTarget_ParallelReduce_Team.hpp | 6 +-- ...Kokkos_OpenMPTarget_ParallelScan_Range.hpp | 2 +- .../Kokkos_OpenMPTarget_ParallelScan_Team.hpp | 4 +- .../Kokkos_OpenMPTarget_Parallel_Common.hpp | 11 ++-- core/src/SYCL/Kokkos_SYCL_Parallel_Scan.hpp | 14 ++--- core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp | 21 ++++---- core/src/SYCL/Kokkos_SYCL_Team.hpp | 10 ++-- .../Serial/Kokkos_Serial_Parallel_Range.hpp | 6 +-- core/src/Threads/Kokkos_ThreadsTeam.hpp | 8 +-- .../Threads/Kokkos_Threads_Parallel_Range.hpp | 4 +- core/src/impl/Kokkos_FunctorAnalysis.hpp | 18 ++++--- core/src/impl/Kokkos_HostThreadTeam.hpp | 5 +- core/src/impl/Kokkos_Tools_Generic.hpp | 11 ++-- core/unit_test/TestFunctorAnalysis.hpp | 36 +++++++++++-- core/unit_test/hip/TestHIP_ScanUnit.cpp | 3 +- .../Test05_ParallelReduce_RangePolicy.hpp | 7 ++- .../incremental/Test16_ParallelScan.hpp | 53 +++++++++++++++++++ 35 files changed, 339 insertions(+), 194 deletions(-) diff --git a/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp b/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp index affa9c18a8..8aae27d091 100644 --- 
a/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp +++ b/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp @@ -321,7 +321,7 @@ class ParallelReduce( + cuda_single_inter_block_reduce_scan_shmem( f, n); using closure_type = Impl::ParallelReduce, @@ -339,8 +339,9 @@ class ParallelReduce>= 1; - shmem_size = cuda_single_inter_block_reduce_scan_shmem(f, n); + shmem_size = + cuda_single_inter_block_reduce_scan_shmem( + f, n); } return n; } @@ -381,8 +382,8 @@ class ParallelReduce( + : cuda_single_inter_block_reduce_scan_shmem( m_functor_reducer.get_functor(), block.y); CudaParallelLaunch( @@ -428,8 +429,8 @@ class ParallelReduce(m_policy, - m_functor_reducer.get_functor()); + check_reduced_view_shmem_size( + m_policy, m_functor_reducer.get_functor()); } }; } // namespace Impl diff --git a/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp b/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp index 740be29677..5226c48bd9 100644 --- a/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp +++ b/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp @@ -255,7 +255,7 @@ class ParallelReduce, inline unsigned local_block_size(const FunctorType& f) { unsigned n = CudaTraits::WarpSize * 8; int shmem_size = - cuda_single_inter_block_reduce_scan_shmem( + cuda_single_inter_block_reduce_scan_shmem( f, n); using closure_type = Impl::ParallelReduce, @@ -273,8 +273,9 @@ class ParallelReduce, m_policy.space().impl_internal_space_instance(), attr, f, 1, shmem_size, 0)))) { n >>= 1; - shmem_size = cuda_single_inter_block_reduce_scan_shmem(f, n); + shmem_size = + cuda_single_inter_block_reduce_scan_shmem( + f, n); } return n; } @@ -314,8 +315,8 @@ class ParallelReduce, const int shmem = UseShflReduction ? 
0 - : cuda_single_inter_block_reduce_scan_shmem( + : cuda_single_inter_block_reduce_scan_shmem( m_functor_reducer.get_functor(), block.y); if ((nwork == 0) @@ -373,8 +374,8 @@ class ParallelReduce, m_scratch_space(nullptr), m_scratch_flags(nullptr), m_unified_space(nullptr) { - check_reduced_view_shmem_size(m_policy, - m_functor_reducer.get_functor()); + check_reduced_view_shmem_size( + m_policy, m_functor_reducer.get_functor()); } }; @@ -390,7 +391,7 @@ class ParallelScan, Kokkos::Cuda> { using LaunchBounds = typename Policy::launch_bounds; using Analysis = Kokkos::Impl::FunctorAnalysis; + Policy, FunctorType, void>; public: using pointer_type = typename Analysis::pointer_type; @@ -609,11 +610,12 @@ class ParallelScan, Kokkos::Cuda> { // testing unsigned n = CudaTraits::WarpSize * 4; - while (n && unsigned(m_policy.space() - .impl_internal_space_instance() - ->m_maxShmemPerBlock) < - cuda_single_inter_block_reduce_scan_shmem(f, n)) { + while (n && + unsigned(m_policy.space() + .impl_internal_space_instance() + ->m_maxShmemPerBlock) < + cuda_single_inter_block_reduce_scan_shmem(f, n)) { n >>= 1; } return n; @@ -703,8 +705,9 @@ class ParallelScanWithTotal, using WorkRange = typename Policy::WorkRange; using LaunchBounds = typename Policy::launch_bounds; - using Analysis = Kokkos::Impl::FunctorAnalysis; + using Analysis = + Kokkos::Impl::FunctorAnalysis; public: using value_type = typename Analysis::value_type; @@ -931,11 +934,12 @@ class ParallelScanWithTotal, // testing unsigned n = CudaTraits::WarpSize * 4; - while (n && unsigned(m_policy.space() - .impl_internal_space_instance() - ->m_maxShmemPerBlock) < - cuda_single_inter_block_reduce_scan_shmem(f, n)) { + while (n && + unsigned(m_policy.space() + .impl_internal_space_instance() + ->m_maxShmemPerBlock) < + cuda_single_inter_block_reduce_scan_shmem(f, n)) { n >>= 1; } return n; diff --git a/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp b/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp index 74fe87a65a..c5a211a807 
100644 --- a/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp +++ b/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp @@ -114,7 +114,7 @@ class TeamPolicyInternal const ParallelReduceTag&) const { using functor_analysis_type = Impl::FunctorAnalysis; + TeamPolicyInternal, FunctorType, void>; using closure_type = Impl::ParallelReduce< CombinedFunctorReducer, @@ -153,7 +153,7 @@ class TeamPolicyInternal const ParallelReduceTag&) const { using functor_analysis_type = Impl::FunctorAnalysis; + TeamPolicyInternal, FunctorType, void>; using closure_type = Impl::ParallelReduce< CombinedFunctorReducer, @@ -365,7 +365,7 @@ class TeamPolicyInternal typename Impl::DeduceFunctorPatternInterface::type; using Analysis = Impl::FunctorAnalysis; + FunctorType, void>; cudaFuncAttributes attr = CudaParallelLaunch:: @@ -893,8 +893,8 @@ class ParallelReduce( + : cuda_single_inter_block_reduce_scan_shmem( arg_functor_reducer.get_functor(), m_team_size); m_shmem_begin = sizeof(double) * (m_team_size + 2); m_shmem_size = m_policy.scratch_size(0, m_team_size) + diff --git a/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp b/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp index 59fdd13513..2cdc291358 100644 --- a/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp +++ b/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp @@ -672,34 +672,35 @@ __device__ bool cuda_single_inter_block_reduce_scan( } // Size in bytes required for inter block reduce or scan -template +template inline std::enable_if_t cuda_single_inter_block_reduce_scan_shmem(const FunctorType& functor, const unsigned BlockSize) { using Analysis = Impl::FunctorAnalysis, FunctorType>; + RangePolicy, FunctorType, ValueType>; return (BlockSize + 2) * Analysis::value_size(functor); } -template +template inline std::enable_if_t cuda_single_inter_block_reduce_scan_shmem(const FunctorType& functor, const unsigned BlockSize) { using Analysis = Impl::FunctorAnalysis, FunctorType>; + RangePolicy, FunctorType, ValueType>; return (BlockSize + 2) * Analysis::value_size(functor); } -template 
+template inline void check_reduced_view_shmem_size(const Policy& policy, const FunctorType& functor) { size_t minBlockSize = CudaTraits::WarpSize * 1; unsigned reqShmemSize = - cuda_single_inter_block_reduce_scan_shmem( + cuda_single_inter_block_reduce_scan_shmem( functor, minBlockSize); size_t maxShmemPerBlock = policy.space().impl_internal_space_instance()->m_maxShmemPerBlock; diff --git a/core/src/Cuda/Kokkos_Cuda_Task.hpp b/core/src/Cuda/Kokkos_Cuda_Task.hpp index 76e4122af2..d579214cd4 100644 --- a/core/src/Cuda/Kokkos_Cuda_Task.hpp +++ b/core/src/Cuda/Kokkos_Cuda_Task.hpp @@ -1042,7 +1042,8 @@ KOKKOS_INLINE_FUNCTION void parallel_scan( // Extract value_type from closure using value_type = typename Kokkos::Impl::FunctorAnalysis< - Kokkos::Impl::FunctorPatternInterface::SCAN, void, Closure>::value_type; + Kokkos::Impl::FunctorPatternInterface::SCAN, void, Closure, + void>::value_type; if (1 < loop_boundaries.thread.team_size()) { // make sure all threads perform all loop iterations @@ -1107,7 +1108,8 @@ KOKKOS_INLINE_FUNCTION void parallel_scan( // Extract value_type from closure using value_type = typename Kokkos::Impl::FunctorAnalysis< - Kokkos::Impl::FunctorPatternInterface::SCAN, void, Closure>::value_type; + Kokkos::Impl::FunctorPatternInterface::SCAN, void, Closure, + void>::value_type; if (1 < loop_boundaries.thread.team_size()) { // make sure all threads perform all loop iterations diff --git a/core/src/Cuda/Kokkos_Cuda_Team.hpp b/core/src/Cuda/Kokkos_Cuda_Team.hpp index af47dfff92..1d82704e65 100644 --- a/core/src/Cuda/Kokkos_Cuda_Team.hpp +++ b/core/src/Cuda/Kokkos_Cuda_Team.hpp @@ -196,8 +196,9 @@ class CudaTeamMember { (void)reducer; (void)value; KOKKOS_IF_ON_DEVICE( - (typename Impl::FunctorAnalysis, ReducerType>::Reducer + (typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, TeamPolicy, + ReducerType, typename ReducerType::value_type>::Reducer wrapped_reducer(reducer); cuda_intra_block_reduction(value, wrapped_reducer, 
blockDim.y); reducer.reference() = value;)) @@ -228,7 +229,8 @@ class CudaTeamMember { Impl::CudaJoinFunctor cuda_join_functor; typename Impl::FunctorAnalysis< Impl::FunctorPatternInterface::SCAN, TeamPolicy, - Impl::CudaJoinFunctor>::Reducer reducer(cuda_join_functor); + Impl::CudaJoinFunctor, Type>::Reducer + reducer(cuda_join_functor); Impl::cuda_intra_block_reduce_scan(reducer, base_data + 1); if (global_accum) { @@ -688,8 +690,8 @@ KOKKOS_INLINE_FUNCTION void parallel_scan( const FunctorType& lambda) { // Extract value_type from lambda using value_type = typename Kokkos::Impl::FunctorAnalysis< - Kokkos::Impl::FunctorPatternInterface::SCAN, void, - FunctorType>::value_type; + Kokkos::Impl::FunctorPatternInterface::SCAN, void, FunctorType, + void>::value_type; const auto start = loop_bounds.start; const auto end = loop_bounds.end; @@ -825,7 +827,8 @@ KOKKOS_INLINE_FUNCTION void parallel_scan( loop_boundaries, const Closure& closure) { using value_type = typename Kokkos::Impl::FunctorAnalysis< - Kokkos::Impl::FunctorPatternInterface::SCAN, void, Closure>::value_type; + Kokkos::Impl::FunctorPatternInterface::SCAN, void, Closure, + void>::value_type; value_type dummy; parallel_scan(loop_boundaries, closure, Kokkos::Sum(dummy)); } diff --git a/core/src/HIP/Kokkos_HIP_Parallel_MDRange.hpp b/core/src/HIP/Kokkos_HIP_Parallel_MDRange.hpp index aed177b128..10da8a051b 100644 --- a/core/src/HIP/Kokkos_HIP_Parallel_MDRange.hpp +++ b/core/src/HIP/Kokkos_HIP_Parallel_MDRange.hpp @@ -281,8 +281,8 @@ class ParallelReduce(f, n); + return hip_single_inter_block_reduce_scan_shmem(f, n); }; unsigned block_size = @@ -331,8 +331,8 @@ class ParallelReduce(m_functor_reducer.get_functor(), - block.y); + false, WorkTag, value_type>(m_functor_reducer.get_functor(), + block.y); hip_parallel_launch( *this, grid, block, shmem, diff --git a/core/src/HIP/Kokkos_HIP_Parallel_Range.hpp b/core/src/HIP/Kokkos_HIP_Parallel_Range.hpp index 442e794aa9..8835e813d1 100644 --- 
a/core/src/HIP/Kokkos_HIP_Parallel_Range.hpp +++ b/core/src/HIP/Kokkos_HIP_Parallel_Range.hpp @@ -270,8 +270,8 @@ class ParallelReduce, inline unsigned local_block_size(const FunctorType& f) { const auto& instance = m_policy.space().impl_internal_space_instance(); auto shmem_functor = [&f](unsigned n) { - return hip_single_inter_block_reduce_scan_shmem(f, n); + return hip_single_inter_block_reduce_scan_shmem(f, n); }; return Kokkos::Impl::hip_get_preferred_blocksize( @@ -314,8 +314,8 @@ class ParallelReduce, const int shmem = UseShflReduction ? 0 - : hip_single_inter_block_reduce_scan_shmem( + : hip_single_inter_block_reduce_scan_shmem( m_functor_reducer.get_functor(), block.y); Kokkos::Impl::hip_parallel_launch( @@ -349,7 +349,7 @@ class ParallelReduce, typename ViewType::memory_space>::accessible) {} }; -template +template class ParallelScanHIPBase { public: using Policy = Kokkos::RangePolicy; @@ -360,8 +360,9 @@ class ParallelScanHIPBase { using WorkRange = typename Policy::WorkRange; using LaunchBounds = typename Policy::launch_bounds; - using Analysis = Kokkos::Impl::FunctorAnalysis; + using Analysis = + Kokkos::Impl::FunctorAnalysis; public: using value_type = typename Analysis::value_type; @@ -589,14 +590,13 @@ class ParallelScanHIPBase { m_final = false; // these ones are OK to be just the base because the specializations // do not modify the kernel at all - using DriverType = ParallelScanHIPBase; - Impl::hip_parallel_launch( + Impl::hip_parallel_launch( *this, grid, block, shmem, m_policy.space().impl_internal_space_instance(), false); // copy to device and execute m_final = true; - Impl::hip_parallel_launch( + Impl::hip_parallel_launch( *this, grid, block, shmem, m_policy.space().impl_internal_space_instance(), false); // copy to device and execute @@ -614,9 +614,9 @@ class ParallelScanHIPBase { template class ParallelScan, HIP> - : public ParallelScanHIPBase { + : public ParallelScanHIPBase { public: - using Base = ParallelScanHIPBase; + using Base = 
ParallelScanHIPBase; using Base::operator(); inline void execute() { @@ -642,9 +642,8 @@ class ParallelScan, HIP> const auto& instance = Base::m_policy.space().impl_internal_space_instance(); auto shmem_functor = [&f](unsigned n) { - return hip_single_inter_block_reduce_scan_shmem( - f, n); + return hip_single_inter_block_reduce_scan_shmem< + true, typename Base::WorkTag, void>(f, n); }; using DriverType = ParallelScan; return Impl::hip_get_preferred_blocksize, HIP> template class ParallelScanWithTotal, ReturnType, HIP> - : public ParallelScanHIPBase { + : public ParallelScanHIPBase { public: - using Base = ParallelScanHIPBase; + using Base = ParallelScanHIPBase; using Base::operator(); inline void execute() { @@ -701,9 +700,8 @@ class ParallelScanWithTotal, const auto& instance = Base::m_policy.space().impl_internal_space_instance(); auto shmem_functor = [&f](unsigned n) { - return hip_single_inter_block_reduce_scan_shmem( - f, n); + return hip_single_inter_block_reduce_scan_shmem< + true, typename Base::WorkTag, ReturnType>(f, n); }; using DriverType = ParallelScanWithTotal; diff --git a/core/src/HIP/Kokkos_HIP_Parallel_Team.hpp b/core/src/HIP/Kokkos_HIP_Parallel_Team.hpp index 9f725d28d2..0d75c3ca86 100644 --- a/core/src/HIP/Kokkos_HIP_Parallel_Team.hpp +++ b/core/src/HIP/Kokkos_HIP_Parallel_Team.hpp @@ -74,7 +74,7 @@ class TeamPolicyInternal using closure_type = Impl::ParallelFor>; - return internal_team_size_common(f); + return internal_team_size_common(f); } template @@ -82,12 +82,14 @@ class TeamPolicyInternal const ParallelReduceTag&) const { using functor_analysis_type = Impl::FunctorAnalysis; + TeamPolicyInternal, FunctorType, void>; using closure_type = Impl::ParallelReduce< CombinedFunctorReducer, TeamPolicy, Kokkos::HIP>; - return internal_team_size_common(f); + return internal_team_size_common< + BlockType::Max, closure_type, + typename functor_analysis_type::value_type>(f); } template @@ -96,7 +98,8 @@ class TeamPolicyInternal using closure_type = 
Impl::ParallelReduce, TeamPolicy, Kokkos::HIP>; - return internal_team_size_common(f); + return internal_team_size_common(f); } template @@ -104,7 +107,8 @@ class TeamPolicyInternal using closure_type = Impl::ParallelFor>; - return internal_team_size_common(f); + return internal_team_size_common( + f); } template @@ -112,12 +116,14 @@ class TeamPolicyInternal ParallelReduceTag const&) const { using functor_analysis_type = Impl::FunctorAnalysis; + TeamPolicyInternal, FunctorType, void>; using closure_type = Impl::ParallelReduce< CombinedFunctorReducer, TeamPolicy, Kokkos::HIP>; - return internal_team_size_common(f); + return internal_team_size_common< + BlockType::Preferred, closure_type, + typename functor_analysis_type::value_type>(f); } template @@ -126,7 +132,8 @@ class TeamPolicyInternal using closure_type = Impl::ParallelReduce, TeamPolicy, Kokkos::HIP>; - return internal_team_size_common(f); + return internal_team_size_common(f); } inline bool impl_auto_vector_length() const { return m_tune_vector_length; } @@ -325,7 +332,8 @@ class TeamPolicyInternal using member_type = Kokkos::Impl::HIPTeamMember; protected: - template + template int internal_team_size_common(FunctorType const& f) const { const unsigned shmem_block = team_scratch_size(0) + 2 * sizeof(double); unsigned shmem_thread = thread_scratch_size(0) + sizeof(double); @@ -335,7 +343,7 @@ class TeamPolicyInternal typename Impl::DeduceFunctorPatternInterface::type; using Analysis = Impl::FunctorAnalysis; + FunctorType, ValueType>; shmem_thread += ((Analysis::StaticValueSize != 0) ? 
0 : Analysis::value_size(f)); } @@ -813,8 +821,8 @@ class ParallelReduce( + : hip_single_inter_block_reduce_scan_shmem( arg_functor_reducer.get_functor(), m_team_size); m_shmem_begin = sizeof(double) * (m_team_size + 2); m_shmem_size = m_policy.scratch_size(0, m_team_size) + diff --git a/core/src/HIP/Kokkos_HIP_ReduceScan.hpp b/core/src/HIP/Kokkos_HIP_ReduceScan.hpp index 9de26b63a7..a533d41537 100644 --- a/core/src/HIP/Kokkos_HIP_ReduceScan.hpp +++ b/core/src/HIP/Kokkos_HIP_ReduceScan.hpp @@ -477,22 +477,24 @@ __device__ bool hip_single_inter_block_reduce_scan( } // Size in bytes required for inter block reduce or scan -template +template inline std::enable_if_t hip_single_inter_block_reduce_scan_shmem(const FunctorType& functor, const unsigned BlockSize) { - using Analysis = Impl::FunctorAnalysis, FunctorType>; + using Analysis = + Impl::FunctorAnalysis, FunctorType, ValueType>; return (BlockSize + 2) * Analysis::value_size(functor); } -template +template inline std::enable_if_t hip_single_inter_block_reduce_scan_shmem(const FunctorType& functor, const unsigned BlockSize) { - using Analysis = Impl::FunctorAnalysis, FunctorType>; + using Analysis = + Impl::FunctorAnalysis, FunctorType, ValueType>; return (BlockSize + 2) * Analysis::value_size(functor); } diff --git a/core/src/HIP/Kokkos_HIP_Team.hpp b/core/src/HIP/Kokkos_HIP_Team.hpp index 584d728d94..2fed2b7719 100644 --- a/core/src/HIP/Kokkos_HIP_Team.hpp +++ b/core/src/HIP/Kokkos_HIP_Team.hpp @@ -181,8 +181,8 @@ class HIPTeamMember { typename ReducerType::value_type& value) const noexcept { #ifdef __HIP_DEVICE_COMPILE__ typename Kokkos::Impl::FunctorAnalysis< - FunctorPatternInterface::REDUCE, TeamPolicy, ReducerType>::Reducer - wrapped_reducer(reducer); + FunctorPatternInterface::REDUCE, TeamPolicy, ReducerType, + typename ReducerType::value_type>::Reducer wrapped_reducer(reducer); hip_intra_block_shuffle_reduction(value, wrapped_reducer, blockDim.y); reducer.reference() = value; #else @@ -219,7 +219,7 @@ 
class HIPTeamMember { Impl::HIPJoinFunctor hip_join_functor; typename Kokkos::Impl::FunctorAnalysis< FunctorPatternInterface::REDUCE, TeamPolicy, - Impl::HIPJoinFunctor>::Reducer reducer(hip_join_functor); + Impl::HIPJoinFunctor, Type>::Reducer reducer(hip_join_functor); Impl::hip_intra_block_reduce_scan(reducer, base_data + 1); if (global_accum) { @@ -544,8 +544,8 @@ KOKKOS_INLINE_FUNCTION void parallel_scan( const FunctorType& lambda) { // Extract value_type from lambda using value_type = typename Kokkos::Impl::FunctorAnalysis< - Kokkos::Impl::FunctorPatternInterface::SCAN, void, - FunctorType>::value_type; + Kokkos::Impl::FunctorPatternInterface::SCAN, void, FunctorType, + void>::value_type; const auto start = loop_bounds.start; const auto end = loop_bounds.end; @@ -824,7 +824,8 @@ KOKKOS_INLINE_FUNCTION void parallel_scan( loop_boundaries, const Closure& closure) { using value_type = typename Kokkos::Impl::FunctorAnalysis< - Kokkos::Impl::FunctorPatternInterface::SCAN, void, Closure>::value_type; + Kokkos::Impl::FunctorPatternInterface::SCAN, void, Closure, + void>::value_type; value_type dummy; parallel_scan(loop_boundaries, closure, Kokkos::Sum(dummy)); } diff --git a/core/src/HPX/Kokkos_HPX.hpp b/core/src/HPX/Kokkos_HPX.hpp index 06eed664df..613129809e 100644 --- a/core/src/HPX/Kokkos_HPX.hpp +++ b/core/src/HPX/Kokkos_HPX.hpp @@ -1213,7 +1213,7 @@ class ParallelScan, using WorkRange = typename Policy::WorkRange; using Member = typename Policy::member_type; using Analysis = - FunctorAnalysis; + FunctorAnalysis; using pointer_type = typename Analysis::pointer_type; using reference_type = typename Analysis::reference_type; using value_type = typename Analysis::value_type; @@ -1310,12 +1310,12 @@ template class ParallelScanWithTotal, ReturnType, Kokkos::Experimental::HPX> { private: - using Policy = Kokkos::RangePolicy; - using WorkTag = typename Policy::work_tag; - using WorkRange = typename Policy::WorkRange; - using Member = typename Policy::member_type; - 
using Analysis = - FunctorAnalysis; + using Policy = Kokkos::RangePolicy; + using WorkTag = typename Policy::work_tag; + using WorkRange = typename Policy::WorkRange; + using Member = typename Policy::member_type; + using Analysis = FunctorAnalysis; using pointer_type = typename Analysis::pointer_type; using reference_type = typename Analysis::reference_type; using value_type = typename Analysis::value_type; @@ -1777,8 +1777,8 @@ KOKKOS_INLINE_FUNCTION void parallel_scan( &loop_boundaries, const FunctorType &lambda) { using value_type = typename Kokkos::Impl::FunctorAnalysis< - Kokkos::Impl::FunctorPatternInterface::SCAN, void, - FunctorType>::value_type; + Kokkos::Impl::FunctorPatternInterface::SCAN, void, FunctorType, + void>::value_type; value_type scan_val = value_type(); @@ -1815,8 +1815,8 @@ KOKKOS_INLINE_FUNCTION void parallel_scan( const FunctorType &lambda) { using value_type = typename Impl::FunctorAnalysis, - FunctorType>::value_type; + TeamPolicy, FunctorType, + void>::value_type; value_type scan_val = value_type(); diff --git a/core/src/Kokkos_GraphNode.hpp b/core/src/Kokkos_GraphNode.hpp index c35fe30e76..2a4e2cf641 100644 --- a/core/src/Kokkos_GraphNode.hpp +++ b/core/src/Kokkos_GraphNode.hpp @@ -383,7 +383,8 @@ class GraphNodeRef { passed_reducer_type>; using analysis = Kokkos::Impl::FunctorAnalysis< Kokkos::Impl::FunctorPatternInterface::REDUCE, Policy, - typename reducer_selector::type>; + typename reducer_selector::type, + typename return_value_adapter::value_type>; typename analysis::Reducer final_reducer( reducer_selector::select(functor, return_value)); Kokkos::Impl::CombinedFunctorReducer::value) { Kokkos::Impl::shared_allocation_tracking_disable(); - Impl::ParallelScanWithTotal + Impl::ParallelScanWithTotal closure(functor, inner_policy, return_value); Kokkos::Impl::shared_allocation_tracking_enable(); closure.execute(); diff --git a/core/src/Kokkos_Parallel_Reduce.hpp b/core/src/Kokkos_Parallel_Reduce.hpp index fa4b401ce0..17f8bf1817 100644 
--- a/core/src/Kokkos_Parallel_Reduce.hpp +++ b/core/src/Kokkos_Parallel_Reduce.hpp @@ -1613,9 +1613,9 @@ struct ParallelReduceAdaptor { using ReducerSelector = Kokkos::Impl::if_c::value, FunctorType, PassedReducerType>; - using Analysis = - FunctorAnalysis; + using Analysis = FunctorAnalysis; Kokkos::Impl::shared_allocation_tracking_disable(); CombinedFunctorReducer functor_reducer( functor, typename Analysis::Reducer( @@ -1635,8 +1635,9 @@ struct ParallelReduceAdaptor { } static constexpr bool is_array_reduction = - Impl::FunctorAnalysis::StaticValueSize == 0; + Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, PolicyType, FunctorType, + typename return_value_adapter::value_type>::StaticValueSize == 0; template static inline std::enable_if_t; + FunctorType, void>; using value_type = std::conditional_t<(FunctorAnalysis::StaticValueSize != 0), typename FunctorAnalysis::value_type, typename FunctorAnalysis::pointer_type>; @@ -1961,7 +1962,7 @@ inline void parallel_reduce( nullptr) { using FunctorAnalysis = Impl::FunctorAnalysis; + FunctorType, void>; using value_type = std::conditional_t<(FunctorAnalysis::StaticValueSize != 0), typename FunctorAnalysis::value_type, typename FunctorAnalysis::pointer_type>; @@ -1986,7 +1987,7 @@ inline void parallel_reduce(const size_t& policy, const FunctorType& functor) { FunctorType>::policy_type; using FunctorAnalysis = Impl::FunctorAnalysis; + FunctorType, void>; using value_type = std::conditional_t<(FunctorAnalysis::StaticValueSize != 0), typename FunctorAnalysis::value_type, typename FunctorAnalysis::pointer_type>; @@ -2013,7 +2014,7 @@ inline void parallel_reduce(const std::string& label, const size_t& policy, FunctorType>::policy_type; using FunctorAnalysis = Impl::FunctorAnalysis; + FunctorType, void>; using value_type = std::conditional_t<(FunctorAnalysis::StaticValueSize != 0), typename FunctorAnalysis::value_type, typename FunctorAnalysis::pointer_type>; diff --git 
a/core/src/OpenACC/Kokkos_OpenACC_ParallelScan_Range.hpp b/core/src/OpenACC/Kokkos_OpenACC_ParallelScan_Range.hpp index 56f9db0db8..00b0a5d25c 100644 --- a/core/src/OpenACC/Kokkos_OpenACC_ParallelScan_Range.hpp +++ b/core/src/OpenACC/Kokkos_OpenACC_ParallelScan_Range.hpp @@ -21,14 +21,15 @@ #include #include -template -class Kokkos::Impl::ParallelScan, - Kokkos::Experimental::OpenACC> { +namespace Kokkos::Impl { + +template +class ParallelScanOpenACCBase { protected: using Policy = Kokkos::RangePolicy; using Analysis = Kokkos::Impl::FunctorAnalysis; + Policy, Functor, GivenValueType>; using PointerType = typename Analysis::pointer_type; using ValueType = typename Analysis::value_type; using MemberType = typename Policy::member_type; @@ -40,9 +41,9 @@ class Kokkos::Impl::ParallelScan, static constexpr MemberType default_scan_chunk_size = 128; public: - ParallelScan(Functor const& arg_functor, Policy const& arg_policy, - ValueType* arg_result_ptr = nullptr, - bool arg_result_ptr_device_accessible = false) + ParallelScanOpenACCBase(Functor const& arg_functor, Policy const& arg_policy, + ValueType* arg_result_ptr, + bool arg_result_ptr_device_accessible) : m_functor(arg_functor), m_policy(arg_policy), m_result_ptr(arg_result_ptr), @@ -219,18 +220,40 @@ class Kokkos::Impl::ParallelScan, } }; +} // namespace Kokkos::Impl + //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- +template +class Kokkos::Impl::ParallelScan, + Kokkos::Experimental::OpenACC> + : public ParallelScanOpenACCBase { + using base_t = ParallelScanOpenACCBase; + using IndexType = typename base_t::IndexType; + + public: + void execute() const { + const IndexType begin = base_t::m_policy.begin(); + const IndexType end = base_t::m_policy.end(); + IndexType chunk_size = base_t::m_policy.chunk_size(); + + int const async_arg = base_t::m_policy.space().acc_async_queue(); + + 
OpenACCParallelScanRangePolicy(begin, end, chunk_size, async_arg); + } + + ParallelScan(const Functor& arg_functor, + const typename base_t::Policy& arg_policy) + : base_t(arg_functor, arg_policy, nullptr, false) {} +}; + template class Kokkos::Impl::ParallelScanWithTotal< FunctorType, Kokkos::RangePolicy, ReturnType, Kokkos::Experimental::OpenACC> - : public ParallelScan, - Kokkos::Experimental::OpenACC> { - using base_t = ParallelScan, - Kokkos::Experimental::OpenACC>; - using ValueType = typename base_t::ValueType; + : public ParallelScanOpenACCBase { + using base_t = ParallelScanOpenACCBase; using IndexType = typename base_t::IndexType; public: diff --git a/core/src/OpenMP/Kokkos_OpenMP_Parallel.hpp b/core/src/OpenMP/Kokkos_OpenMP_Parallel.hpp index c3dfb69f59..3bfd6dea50 100644 --- a/core/src/OpenMP/Kokkos_OpenMP_Parallel.hpp +++ b/core/src/OpenMP/Kokkos_OpenMP_Parallel.hpp @@ -622,7 +622,7 @@ class ParallelScan, using Policy = Kokkos::RangePolicy; using Analysis = - FunctorAnalysis; + FunctorAnalysis; using WorkTag = typename Policy::work_tag; using WorkRange = typename Policy::WorkRange; @@ -749,8 +749,8 @@ class ParallelScanWithTotal, private: using Policy = Kokkos::RangePolicy; - using Analysis = - FunctorAnalysis; + using Analysis = FunctorAnalysis; using WorkTag = typename Policy::work_tag; using WorkRange = typename Policy::WorkRange; diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Range.hpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Range.hpp index dbdb2826c9..4452af3846 100644 --- a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Range.hpp +++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Range.hpp @@ -38,9 +38,9 @@ class ParallelReduce, using pointer_type = typename ReducerType::pointer_type; using reference_type = typename ReducerType::reference_type; - static constexpr bool FunctorHasJoin = - Impl::FunctorAnalysis::Reducer::has_join_member_function(); + static constexpr bool FunctorHasJoin 
= Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, Policy, FunctorType, + typename ReducerType::value_type>::Reducer::has_join_member_function(); static constexpr bool UseReducer = !std::is_same_v; static constexpr bool IsArray = std::is_pointer_v; diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Team.hpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Team.hpp index 82d5607cd9..a302fa7151 100644 --- a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Team.hpp +++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Team.hpp @@ -453,9 +453,9 @@ class ParallelReduce::Reducer::has_join_member_function(); + static constexpr bool FunctorHasJoin = Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, Policy, FunctorType, + typename ReducerType::value_type>::Reducer::has_join_member_function(); static constexpr bool UseReducer = !std::is_same_v; static constexpr bool IsArray = std::is_pointer_v; diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Range.hpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Range.hpp index f95a4610d9..22acb7e3c5 100644 --- a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Range.hpp +++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Range.hpp @@ -35,7 +35,7 @@ class ParallelScan, using idx_type = typename Policy::index_type; using Analysis = Impl::FunctorAnalysis; + Policy, FunctorType, void>; using value_type = typename Analysis::value_type; using pointer_type = typename Analysis::pointer_type; diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Team.hpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Team.hpp index 65002c1830..a9e24994e0 100644 --- a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Team.hpp +++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Team.hpp @@ -39,7 +39,7 @@ KOKKOS_INLINE_FUNCTION void parallel_scan( const FunctorType& lambda) { using Analysis = 
Impl::FunctorAnalysis, - FunctorType>; + FunctorType, void>; using value_type = typename Analysis::value_type; const auto start = loop_bounds.start; @@ -107,7 +107,7 @@ KOKKOS_INLINE_FUNCTION void parallel_scan( const FunctorType& lambda) { using Analysis = Impl::FunctorAnalysis, - FunctorType>; + FunctorType, void>; using value_type = typename Analysis::value_type; value_type scan_val = value_type(); diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_Common.hpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_Common.hpp index ceb1337c58..944592d125 100644 --- a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_Common.hpp +++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_Common.hpp @@ -51,7 +51,8 @@ struct ParallelReduceSpecialize { PointerType /*result_ptr*/) { constexpr int FunctorHasJoin = Impl::FunctorAnalysis::Reducer::has_join_member_function(); + FunctorType, + ValueType>::Reducer::has_join_member_function(); constexpr int UseReducerType = is_reducer_v; std::stringstream error_message; @@ -72,7 +73,7 @@ struct ParallelReduceSpecialize, std::conditional_t::value, FunctorType, ReducerType>; using Analysis = Impl::FunctorAnalysis; + PolicyType, ReducerTypeFwd, ValueType>; using ReferenceType = typename Analysis::reference_type; using ParReduceCopy = ParallelReduceCopy; @@ -198,7 +199,7 @@ struct ParallelReduceSpecialize, using FunctorAnalysis = Impl::FunctorAnalysis; + FunctorType, ValueType>; // Initialize the result pointer. 
@@ -330,7 +331,7 @@ struct ParallelReduceSpecialize, std::conditional_t::value, FunctorType, ReducerType>; using Analysis = Impl::FunctorAnalysis; + PolicyType, ReducerTypeFwd, ValueType>; using ReferenceType = typename Analysis::reference_type; @@ -540,7 +541,7 @@ struct ParallelReduceSpecialize, PointerType ptr, const bool ptr_on_device) { using FunctorAnalysis = Impl::FunctorAnalysis; + FunctorType, ValueType>; const int league_size = p.league_size(); const int team_size = p.team_size(); diff --git a/core/src/SYCL/Kokkos_SYCL_Parallel_Scan.hpp b/core/src/SYCL/Kokkos_SYCL_Parallel_Scan.hpp index 237ba47be3..f22c8ada02 100644 --- a/core/src/SYCL/Kokkos_SYCL_Parallel_Scan.hpp +++ b/core/src/SYCL/Kokkos_SYCL_Parallel_Scan.hpp @@ -89,7 +89,7 @@ void workgroup_scan(sycl::nd_item item, const FunctorType& final_reducer, final_reducer.join(&local_value, &local_mem[sg_group_id - 1]); } -template +template class ParallelScanSYCLBase { public: using Policy = Kokkos::RangePolicy; @@ -100,8 +100,8 @@ class ParallelScanSYCLBase { using LaunchBounds = typename Policy::launch_bounds; public: - using Analysis = - FunctorAnalysis; + using Analysis = FunctorAnalysis; using pointer_type = typename Analysis::pointer_type; using value_type = typename Analysis::value_type; using reference_type = typename Analysis::reference_type; @@ -353,9 +353,9 @@ class ParallelScanSYCLBase { template class ParallelScan, Kokkos::Experimental::SYCL> - : private ParallelScanSYCLBase { + : private ParallelScanSYCLBase { public: - using Base = ParallelScanSYCLBase; + using Base = ParallelScanSYCLBase; inline void execute() { Base::impl_execute([]() {}); @@ -371,9 +371,9 @@ class ParallelScan, template class ParallelScanWithTotal, ReturnType, Kokkos::Experimental::SYCL> - : public ParallelScanSYCLBase { + : public ParallelScanSYCLBase { public: - using Base = ParallelScanSYCLBase; + using Base = ParallelScanSYCLBase; const Kokkos::Experimental::SYCL& m_exec; diff --git 
a/core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp b/core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp index aad95a1546..f03dd564c6 100644 --- a/core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp +++ b/core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp @@ -74,13 +74,13 @@ class TeamPolicyInternal template inline int team_size_max(const FunctorType& f, const ParallelReduceTag&) const { - return internal_team_size_max_reduce(f); + return internal_team_size_max_reduce(f); } template inline int team_size_max(const FunctorType& f, const ReducerType& /*r*/, const ParallelReduceTag&) const { - return internal_team_size_max_reduce(f); + return internal_team_size_max_reduce(f); } template @@ -91,13 +91,14 @@ class TeamPolicyInternal template inline int team_size_recommended(FunctorType const& f, ParallelReduceTag const&) const { - return internal_team_size_recommended_reduce(f); + return internal_team_size_recommended_reduce(f); } template int team_size_recommended(FunctorType const& f, ReducerType const&, ParallelReduceTag const&) const { - return internal_team_size_recommended_reduce(f); + return internal_team_size_recommended_reduce< + typename ReducerType::value_type>(f); } inline bool impl_auto_vector_length() const { return m_tune_vector_length; } inline bool impl_auto_team_size() const { return m_tune_team_size; } @@ -312,10 +313,11 @@ class TeamPolicyInternal impl_vector_length(); } - template + template int internal_team_size_max_reduce(const FunctorType& f) const { - using Analysis = FunctorAnalysis; + using Analysis = + FunctorAnalysis; using value_type = typename Analysis::value_type; const int value_count = Analysis::value_count(f); @@ -348,10 +350,11 @@ class TeamPolicyInternal return 1 << Kokkos::Impl::int_log2(internal_team_size_max_for(f)); } - template + template int internal_team_size_recommended_reduce(const FunctorType& f) const { // FIXME_SYCL improve - return 1 << Kokkos::Impl::int_log2(internal_team_size_max_reduce(f)); + return 1 << Kokkos::Impl::int_log2( + 
internal_team_size_max_reduce(f)); } }; diff --git a/core/src/SYCL/Kokkos_SYCL_Team.hpp b/core/src/SYCL/Kokkos_SYCL_Team.hpp index 674037ed95..797f3c752f 100644 --- a/core/src/SYCL/Kokkos_SYCL_Team.hpp +++ b/core/src/SYCL/Kokkos_SYCL_Team.hpp @@ -579,8 +579,8 @@ KOKKOS_INLINE_FUNCTION void parallel_scan( const FunctorType& lambda) { // Extract value_type from lambda using value_type = typename Kokkos::Impl::FunctorAnalysis< - Kokkos::Impl::FunctorPatternInterface::SCAN, void, - FunctorType>::value_type; + Kokkos::Impl::FunctorPatternInterface::SCAN, void, FunctorType, + void>::value_type; const auto start = loop_bounds.start; const auto end = loop_bounds.end; @@ -775,7 +775,8 @@ parallel_scan(const Impl::ThreadVectorRangeBoundariesStruct< iType, Impl::SYCLTeamMember>& loop_boundaries, const Closure& closure, const ReducerType& reducer) { using value_type = typename Kokkos::Impl::FunctorAnalysis< - Kokkos::Impl::FunctorPatternInterface::SCAN, void, Closure>::value_type; + Kokkos::Impl::FunctorPatternInterface::SCAN, void, Closure, + void>::value_type; value_type accum; reducer.init(accum); @@ -844,7 +845,8 @@ KOKKOS_INLINE_FUNCTION void parallel_scan( loop_boundaries, const Closure& closure) { using value_type = typename Kokkos::Impl::FunctorAnalysis< - Kokkos::Impl::FunctorPatternInterface::SCAN, void, Closure>::value_type; + Kokkos::Impl::FunctorPatternInterface::SCAN, void, Closure, + void>::value_type; value_type dummy; parallel_scan(loop_boundaries, closure, Kokkos::Sum{dummy}); } diff --git a/core/src/Serial/Kokkos_Serial_Parallel_Range.hpp b/core/src/Serial/Kokkos_Serial_Parallel_Range.hpp index a8dfae109f..56894716db 100644 --- a/core/src/Serial/Kokkos_Serial_Parallel_Range.hpp +++ b/core/src/Serial/Kokkos_Serial_Parallel_Range.hpp @@ -150,7 +150,7 @@ class ParallelScan, using WorkTag = typename Policy::work_tag; using Analysis = - FunctorAnalysis; + FunctorAnalysis; using pointer_type = typename Analysis::pointer_type; using reference_type = typename 
Analysis::reference_type; @@ -214,8 +214,8 @@ class ParallelScanWithTotal, using Policy = Kokkos::RangePolicy; using WorkTag = typename Policy::work_tag; - using Analysis = - FunctorAnalysis; + using Analysis = FunctorAnalysis; using value_type = typename Analysis::value_type; using pointer_type = typename Analysis::pointer_type; diff --git a/core/src/Threads/Kokkos_ThreadsTeam.hpp b/core/src/Threads/Kokkos_ThreadsTeam.hpp index 3f734f08d4..788b6366db 100644 --- a/core/src/Threads/Kokkos_ThreadsTeam.hpp +++ b/core/src/Threads/Kokkos_ThreadsTeam.hpp @@ -982,8 +982,8 @@ KOKKOS_INLINE_FUNCTION void parallel_scan( iType, Impl::ThreadsExecTeamMember>& loop_bounds, const FunctorType& lambda) { using value_type = typename Kokkos::Impl::FunctorAnalysis< - Kokkos::Impl::FunctorPatternInterface::SCAN, void, - FunctorType>::value_type; + Kokkos::Impl::FunctorPatternInterface::SCAN, void, FunctorType, + void>::value_type; auto scan_val = value_type{}; @@ -1027,8 +1027,8 @@ KOKKOS_INLINE_FUNCTION void parallel_scan( const FunctorType& lambda) { using value_type = typename Impl::FunctorAnalysis, - FunctorType>::value_type; + TeamPolicy, FunctorType, + void>::value_type; value_type scan_val = value_type(); diff --git a/core/src/Threads/Kokkos_Threads_Parallel_Range.hpp b/core/src/Threads/Kokkos_Threads_Parallel_Range.hpp index 7aefe4f13f..f8c2867739 100644 --- a/core/src/Threads/Kokkos_Threads_Parallel_Range.hpp +++ b/core/src/Threads/Kokkos_Threads_Parallel_Range.hpp @@ -268,7 +268,7 @@ class ParallelScan, using WorkTag = typename Policy::work_tag; using Member = typename Policy::member_type; using Analysis = Impl::FunctorAnalysis; + Policy, FunctorType, void>; using pointer_type = typename Analysis::pointer_type; using reference_type = typename Analysis::reference_type; @@ -345,7 +345,7 @@ class ParallelScanWithTotal, using Member = typename Policy::member_type; using Analysis = Impl::FunctorAnalysis; + Policy, FunctorType, ReturnType>; using value_type = typename 
Analysis::value_type; using pointer_type = typename Analysis::pointer_type; diff --git a/core/src/impl/Kokkos_FunctorAnalysis.hpp b/core/src/impl/Kokkos_FunctorAnalysis.hpp index 4bd6c79c82..eeccd48f6b 100644 --- a/core/src/impl/Kokkos_FunctorAnalysis.hpp +++ b/core/src/impl/Kokkos_FunctorAnalysis.hpp @@ -64,14 +64,15 @@ struct DeduceFunctorPatternInterface +template struct FunctorAnalysis { private: using FOR = FunctorPatternInterface::FOR; @@ -124,9 +125,10 @@ struct FunctorAnalysis { //---------------------------------------- // Check for Functor::value_type, which is either a simple type T or T[] + // If the functor doesn't have a value_type alias, use OverrideValueType. template struct has_value_type { - using type = void; + using type = OverrideValueType; }; template @@ -141,9 +143,9 @@ struct FunctorAnalysis { }; //---------------------------------------- - // If Functor::value_type does not exist then evaluate operator(), - // depending upon the pattern and whether the policy has a work tag, - // to determine the reduction or scan value_type. + // If Functor::value_type does not exist and OverrideValueType is void, then + // evaluate operator(), depending upon the pattern and whether the policy has + // a work tag, to determine the reduction or scan value_type. 
template ::type, diff --git a/core/src/impl/Kokkos_HostThreadTeam.hpp b/core/src/impl/Kokkos_HostThreadTeam.hpp index 35ced1b56c..94db1f49bb 100644 --- a/core/src/impl/Kokkos_HostThreadTeam.hpp +++ b/core/src/impl/Kokkos_HostThreadTeam.hpp @@ -873,7 +873,8 @@ KOKKOS_INLINE_FUNCTION // Extract ValueType from the closure using value_type = typename Kokkos::Impl::FunctorAnalysis< - Kokkos::Impl::FunctorPatternInterface::SCAN, void, Closure>::value_type; + Kokkos::Impl::FunctorPatternInterface::SCAN, void, Closure, + void>::value_type; value_type accum = 0; @@ -899,7 +900,7 @@ KOKKOS_INLINE_FUNCTION loop_boundaries, ClosureType const& closure) { using value_type = typename Kokkos::Impl::FunctorAnalysis< - Impl::FunctorPatternInterface::SCAN, void, ClosureType>::value_type; + Impl::FunctorPatternInterface::SCAN, void, ClosureType, void>::value_type; value_type scan_val = value_type(); diff --git a/core/src/impl/Kokkos_Tools_Generic.hpp b/core/src/impl/Kokkos_Tools_Generic.hpp index 3d88da8f02..b6643e064c 100644 --- a/core/src/impl/Kokkos_Tools_Generic.hpp +++ b/core/src/impl/Kokkos_Tools_Generic.hpp @@ -101,7 +101,7 @@ struct SimpleTeamSizeCalculator { const Kokkos::ParallelReduceTag&) { using exec_space = typename Policy::execution_space; using analysis = Kokkos::Impl::FunctorAnalysis< - Kokkos::Impl::FunctorPatternInterface::REDUCE, Policy, Functor>; + Kokkos::Impl::FunctorPatternInterface::REDUCE, Policy, Functor, void>; using driver = typename Kokkos::Impl::ParallelReduceWrapper< Kokkos::Impl::CombinedFunctorReducer, @@ -126,7 +126,8 @@ struct ComplexReducerSizeCalculator { ReducerType reducer_example = ReducerType(value); using Analysis = Kokkos::Impl::FunctorAnalysis< - Kokkos::Impl::FunctorPatternInterface::REDUCE, Policy, ReducerType>; + Kokkos::Impl::FunctorPatternInterface::REDUCE, Policy, ReducerType, + value_type>; typename Analysis::Reducer final_reducer(reducer_example); return policy.team_size_max(functor, final_reducer, tag); @@ -139,7 +140,8 @@ struct 
ComplexReducerSizeCalculator { ReducerType reducer_example = ReducerType(value); using Analysis = Kokkos::Impl::FunctorAnalysis< - Kokkos::Impl::FunctorPatternInterface::REDUCE, Policy, ReducerType>; + Kokkos::Impl::FunctorPatternInterface::REDUCE, Policy, ReducerType, + value_type>; typename Analysis::Reducer final_reducer(reducer_example); return policy.team_size_recommended(functor, final_reducer, tag); @@ -150,7 +152,8 @@ struct ComplexReducerSizeCalculator { const Kokkos::ParallelReduceTag&) { using exec_space = typename Policy::execution_space; using Analysis = Kokkos::Impl::FunctorAnalysis< - Kokkos::Impl::FunctorPatternInterface::REDUCE, Policy, ReducerType>; + Kokkos::Impl::FunctorPatternInterface::REDUCE, Policy, ReducerType, + void>; using driver = typename Kokkos::Impl::ParallelReduceWrapper< Kokkos::Impl::CombinedFunctorReducer, diff --git a/core/unit_test/TestFunctorAnalysis.hpp b/core/unit_test/TestFunctorAnalysis.hpp index 414f1e5d37..c024526111 100644 --- a/core/unit_test/TestFunctorAnalysis.hpp +++ b/core/unit_test/TestFunctorAnalysis.hpp @@ -38,6 +38,16 @@ struct TestFunctorAnalysis_03 { KOKKOS_INLINE_FUNCTION static void init(value_type&) {} }; +struct TestFunctorAnalysis_04 { + KOKKOS_INLINE_FUNCTION + void operator()(int, float&) const {} + + KOKKOS_INLINE_FUNCTION + void join(float&, float const&) const {} + + KOKKOS_INLINE_FUNCTION static void init(float&) {} +}; + template void test_functor_analysis() { //------------------------------ @@ -45,7 +55,7 @@ void test_functor_analysis() { using A01 = Kokkos::Impl::FunctorAnalysis, - decltype(c01)>; + decltype(c01), void>; using R01 = typename A01::Reducer; @@ -65,7 +75,7 @@ void test_functor_analysis() { auto c02 = KOKKOS_LAMBDA(int, double&){}; using A02 = Kokkos::Impl::FunctorAnalysis< Kokkos::Impl::FunctorPatternInterface::REDUCE, - Kokkos::RangePolicy, decltype(c02)>; + Kokkos::RangePolicy, decltype(c02), void>; using R02 = typename A02::Reducer; static_assert(std::is_same::value, ""); @@ 
-85,7 +95,7 @@ void test_functor_analysis() { TestFunctorAnalysis_03 c03; using A03 = Kokkos::Impl::FunctorAnalysis< Kokkos::Impl::FunctorPatternInterface::REDUCE, - Kokkos::RangePolicy, TestFunctorAnalysis_03>; + Kokkos::RangePolicy, TestFunctorAnalysis_03, void>; using R03 = typename A03::Reducer; static_assert(std::is_same, TestFunctorAnalysis_04, float>; + using R04 = typename A04::Reducer; + + static_assert(std::is_same_v); + static_assert( + std::is_same_v); + static_assert( + std::is_same_v); + static_assert( + std::is_same_v); + + static_assert(A04::has_join_member_function); + static_assert(A04::has_init_member_function); + static_assert(!A04::has_final_member_function); + static_assert(A04::StaticValueSize == sizeof(typename A04::value_type)); + ASSERT_EQ(R04(c04).length(), 1); } TEST(TEST_CATEGORY, functor_analysis) { diff --git a/core/unit_test/hip/TestHIP_ScanUnit.cpp b/core/unit_test/hip/TestHIP_ScanUnit.cpp index 23c287635d..fe3a14d2b8 100644 --- a/core/unit_test/hip/TestHIP_ScanUnit.cpp +++ b/core/unit_test/hip/TestHIP_ScanUnit.cpp @@ -33,7 +33,8 @@ __global__ void start_intra_block_scan() DummyFunctor f; typename Kokkos::Impl::FunctorAnalysis< Kokkos::Impl::FunctorPatternInterface::SCAN, - Kokkos::RangePolicy, DummyFunctor>::Reducer reducer(f); + Kokkos::RangePolicy, DummyFunctor, + DummyFunctor::value_type>::Reducer reducer(f); Kokkos::Impl::hip_intra_block_reduce_scan(reducer, values); __syncthreads(); diff --git a/core/unit_test/incremental/Test05_ParallelReduce_RangePolicy.hpp b/core/unit_test/incremental/Test05_ParallelReduce_RangePolicy.hpp index 0f56c395a9..ed22c22d70 100644 --- a/core/unit_test/incremental/Test05_ParallelReduce_RangePolicy.hpp +++ b/core/unit_test/incremental/Test05_ParallelReduce_RangePolicy.hpp @@ -28,8 +28,11 @@ using value_type = double; constexpr double value = 0.5; struct ReduceFunctor { - KOKKOS_INLINE_FUNCTION - void operator()(const int i, double &UpdateSum) const { + // The functor is templated on purpose to check 
that the value_type deduction + // in parallel_reduce even works in this case. + template + KOKKOS_INLINE_FUNCTION void operator()(const IndexType i, + ValueType &UpdateSum) const { UpdateSum += (i + 1) * value; } }; diff --git a/core/unit_test/incremental/Test16_ParallelScan.hpp b/core/unit_test/incremental/Test16_ParallelScan.hpp index 7fc74f9ce7..efcb19a5c6 100644 --- a/core/unit_test/incremental/Test16_ParallelScan.hpp +++ b/core/unit_test/incremental/Test16_ParallelScan.hpp @@ -61,6 +61,19 @@ struct NonTrivialScanFunctor { ~NonTrivialScanFunctor() {} }; +template +struct GenericExclusiveScanFunctor { + Kokkos::View d_data; + + template + KOKKOS_FUNCTION void operator()(const IndexType i, ValueType &update_value, + const bool final) const { + const ValueType val_i = d_data(i); + if (final) d_data(i) = update_value; + update_value += val_i; + } +}; + template struct TestScan { // 1D View of double @@ -93,10 +106,50 @@ struct TestScan { } }; +template +struct TestScanWithTotal { + // 1D View of double + using View_1D = typename Kokkos::View; + View_1D d_data = View_1D("data", N); + + template + KOKKOS_FUNCTION void operator()(IndexType i) const { + d_data(i) = i * 0.5; + } + + template + void parallel_scan() { + // Initialize data. + Kokkos::parallel_for(Kokkos::RangePolicy(0, N), *this); + + value_type total; + // Exclusive parallel_scan call + Kokkos::parallel_scan(Kokkos::RangePolicy(0, N), + FunctorType{d_data}, total); + + // Copy back the data. 
+ auto h_data = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), d_data); + + // Check Correctness + ASSERT_EQ(h_data(0), 0.0); + value_type upd = h_data(0); + for (int i = 1; i < N; ++i) { + upd += (i - 1) * 0.5; + ASSERT_EQ(h_data(i), upd); + } + ASSERT_EQ(total, N * (N - 1) * 0.25); + } +}; + TEST(TEST_CATEGORY, IncrTest_16_parallelscan) { TestScan test; test.parallel_scan>(); test.parallel_scan>(); + TestScanWithTotal test_total; + test_total.parallel_scan>(); + test_total.parallel_scan>(); + test_total.parallel_scan>(); } } // namespace Test From 1bc1a51947b2efaabd5f8f7b0a8f2c5fff5ff0e2 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Wed, 10 May 2023 11:36:35 -0400 Subject: [PATCH 435/496] Import sycl::bit_cast into the Kokkos namespace --- core/src/Kokkos_BitManipulation.hpp | 6 ++++++ core/unit_test/TestBitManipulation.cpp | 3 +++ 2 files changed, 9 insertions(+) diff --git a/core/src/Kokkos_BitManipulation.hpp b/core/src/Kokkos_BitManipulation.hpp index f1dd4c12e2..28755ed1a3 100644 --- a/core/src/Kokkos_BitManipulation.hpp +++ b/core/src/Kokkos_BitManipulation.hpp @@ -100,6 +100,11 @@ inline constexpr bool is_standard_unsigned_integer_type_v = namespace Kokkos { // +// FIXME_SYCL intel/llvm has unqualified calls to bit_cast which are ambiguous +// if we declare our own bit_cast function +#ifdef KOKKOS_ENABLE_SYCL +using sycl::bit_cast; +#else template KOKKOS_FUNCTION std::enable_if_t && @@ -110,6 +115,7 @@ bit_cast(From const& from) noexcept { memcpy(&to, &from, sizeof(To)); return to; } +#endif // // diff --git a/core/unit_test/TestBitManipulation.cpp b/core/unit_test/TestBitManipulation.cpp index 560c407b16..dccabed3db 100644 --- a/core/unit_test/TestBitManipulation.cpp +++ b/core/unit_test/TestBitManipulation.cpp @@ -500,6 +500,8 @@ constexpr X test_bit_cast(...) 
{ return {}; } +// FIXME_SYCL The SYCL implementation is unconstrained +#ifndef KOKKOS_ENABLE_SYCL namespace TypesNotTheSameSize { struct To { char a; @@ -532,6 +534,7 @@ struct From { }; static_assert(test_bit_cast().did_not_match()); } // namespace FromNotTriviallyCopyable +#endif namespace ReturnTypeIllFormed { struct From { From 5f45c3086b9e33539d15e36971895b5df67fe52d Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Wed, 10 May 2023 11:40:38 -0400 Subject: [PATCH 436/496] Qualify calls possibly ambiguous calls to bit_cast --- core/src/Kokkos_BitManipulation.hpp | 3 ++- simd/src/Kokkos_SIMD_AVX2.hpp | 4 ++-- simd/src/Kokkos_SIMD_AVX512.hpp | 6 ++++-- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/core/src/Kokkos_BitManipulation.hpp b/core/src/Kokkos_BitManipulation.hpp index f1dd4c12e2..a72af02413 100644 --- a/core/src/Kokkos_BitManipulation.hpp +++ b/core/src/Kokkos_BitManipulation.hpp @@ -398,7 +398,8 @@ KOKKOS_FUNCTION std::enable_if_t, To> bit_cast_builtin(From const& from) noexcept { - return bit_cast(from); // no benefit to call the _builtin variant + // qualify the call to avoid ADL + return Kokkos::bit_cast(from); // no benefit to call the _builtin variant } template diff --git a/simd/src/Kokkos_SIMD_AVX2.hpp b/simd/src/Kokkos_SIMD_AVX2.hpp index 8577ebede8..b499a8c6ef 100644 --- a/simd/src/Kokkos_SIMD_AVX2.hpp +++ b/simd/src/Kokkos_SIMD_AVX2.hpp @@ -814,8 +814,8 @@ class simd> { template , bool> = false> KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(U&& value) - : m_value(_mm256_set1_epi64x(bit_cast(value_type(value)))) { - } + : m_value(_mm256_set1_epi64x( + Kokkos::bit_cast(value_type(value)))) {} KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr simd(__m256i const& value_in) : m_value(value_in) {} KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION explicit simd( diff --git a/simd/src/Kokkos_SIMD_AVX512.hpp b/simd/src/Kokkos_SIMD_AVX512.hpp index 8da7120640..9ad6858f88 100644 --- a/simd/src/Kokkos_SIMD_AVX512.hpp +++ b/simd/src/Kokkos_SIMD_AVX512.hpp 
@@ -262,7 +262,8 @@ class simd> { template , bool> = false> KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(U&& value) - : m_value(_mm256_set1_epi32(bit_cast(value_type(value)))) {} + : m_value(_mm256_set1_epi32( + Kokkos::bit_cast(value_type(value)))) {} KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit simd( __m256i const& value_in) : m_value(value_in) {} @@ -486,7 +487,8 @@ class simd> { template , bool> = false> KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(U&& value) - : m_value(_mm512_set1_epi64(bit_cast(value_type(value)))) {} + : m_value(_mm512_set1_epi64( + Kokkos::bit_cast(value_type(value)))) {} KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr simd(__m512i const& value_in) : m_value(value_in) {} KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION explicit simd( From 6b2459ce68d257b6dbdb6964b93e39de68d39223 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Wed, 10 May 2023 14:15:00 -0400 Subject: [PATCH 437/496] Fix nightlies -- workaround compiler bug in GCC 9.1 and 9.2 (#6118) * Workaround compiler bug in GCC 9.1 and 9.2 * KOKKOS_COMPILER_GCC -> KOKKOS_COMPILER_GNU Co-authored-by: Daniel Arndt * Update guards to use the workaround for GCC 9.0.1 as well Co-authored-by: Nathan Ellingwood * Update comment to reflect more accurately what was done --------- Co-authored-by: Daniel Arndt Co-authored-by: Nathan Ellingwood --- core/unit_test/TestBitManipulationBuiltins.hpp | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/core/unit_test/TestBitManipulationBuiltins.hpp b/core/unit_test/TestBitManipulationBuiltins.hpp index 9adf22774a..9e353de9e7 100644 --- a/core/unit_test/TestBitManipulationBuiltins.hpp +++ b/core/unit_test/TestBitManipulationBuiltins.hpp @@ -789,7 +789,14 @@ struct TestBitCastFunction { ASSERT_EQ(errors, 0) << "Failed check no error for bit_cast()"; } template - static KOKKOS_FUNCTION bool check(const From& from) { +#if defined(KOKKOS_COMPILER_GNU) && (900 <= KOKKOS_COMPILER_GNU) && \ + (KOKKOS_COMPILER_GNU < 930) + // workaround compiler 
bug seen in GCC 9.0.1 and GCC 9.2.0 + KOKKOS_FUNCTION bool check(const From& from) const +#else + static KOKKOS_FUNCTION bool check(const From& from) +#endif + { using Kokkos::Experimental::bit_cast_builtin; return bit_cast_builtin(bit_cast_builtin(from)) == from; } From e94b5dd363977e77f1d7d151eedd83dc7bc3512a Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Wed, 10 May 2023 16:22:51 -0400 Subject: [PATCH 438/496] Kokkos_BitManipulation: KOKKOS_COMPILER_GCC->KOKKOS_COMPILER_GNU (#6119) * Kokkos_BitManipulation: KOKKOS_COMPILER_GCC->KOKKOS_COMPILER_GNU * Check for existence of _has_builtin --- core/src/Kokkos_BitManipulation.hpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/core/src/Kokkos_BitManipulation.hpp b/core/src/Kokkos_BitManipulation.hpp index a72af02413..c376f08988 100644 --- a/core/src/Kokkos_BitManipulation.hpp +++ b/core/src/Kokkos_BitManipulation.hpp @@ -238,7 +238,7 @@ rotr(T x, int s) noexcept { namespace Kokkos::Impl { #if defined(KOKKOS_COMPILER_CLANG) || defined(KOKKOS_COMPILER_INTEL_LLVM) || \ - defined(KOKKOS_COMPILER_GCC) + defined(KOKKOS_COMPILER_GNU) #define KOKKOS_IMPL_USE_GCC_BUILT_IN_FUNCTIONS #endif @@ -259,12 +259,13 @@ KOKKOS_IMPL_HOST_FUNCTION T byteswap_builtin_host(T x) noexcept { } else if constexpr (sizeof(T) == 8) { return __builtin_bswap64(x); } else if constexpr (sizeof(T) == 16) { +#if defined(__has_builtin) #if __has_builtin(__builtin_bswap128) return __builtin_bswap128(x); -#else +#endif +#endif return (__builtin_bswap64(x >> 64) | (static_cast(__builtin_bswap64(x)) << 64)); -#endif } #endif From 0018848c662434d30fceb39799328d605f2de4d9 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Thu, 11 May 2023 16:33:57 -0400 Subject: [PATCH 439/496] Cuda: Remove unused attach_texture_object --- core/src/Cuda/Kokkos_CudaSpace.cpp | 46 ------------------------ core/src/Cuda/Kokkos_CudaSpace.hpp | 57 ------------------------------ 2 files changed, 103 deletions(-) diff --git 
a/core/src/Cuda/Kokkos_CudaSpace.cpp b/core/src/Cuda/Kokkos_CudaSpace.cpp index 45c5ddaf2a..334c93d17e 100644 --- a/core/src/Cuda/Kokkos_CudaSpace.cpp +++ b/core/src/Cuda/Kokkos_CudaSpace.cpp @@ -423,49 +423,6 @@ SharedAllocationRecord SharedAllocationRecord::s_root_record; #endif -::cudaTextureObject_t -SharedAllocationRecord::attach_texture_object( - const unsigned sizeof_alias, void *const alloc_ptr, - size_t const alloc_size) { - enum { TEXTURE_BOUND_1D = 1u << 27 }; - - if ((alloc_ptr == nullptr) || - (sizeof_alias * TEXTURE_BOUND_1D <= alloc_size)) { - std::ostringstream msg; - msg << "Kokkos::CudaSpace ERROR: Cannot attach texture object to" - << " alloc_ptr(" << alloc_ptr << ")" - << " alloc_size(" << alloc_size << ")" - << " max_size(" << (sizeof_alias * TEXTURE_BOUND_1D) << ")"; - std::cerr << msg.str() << std::endl; - std::cerr.flush(); - Kokkos::Impl::throw_runtime_exception(msg.str()); - } - - ::cudaTextureObject_t tex_obj; - - struct cudaResourceDesc resDesc; - struct cudaTextureDesc texDesc; - - memset(&resDesc, 0, sizeof(resDesc)); - memset(&texDesc, 0, sizeof(texDesc)); - - resDesc.resType = cudaResourceTypeLinear; - resDesc.res.linear.desc = - (sizeof_alias == 4 - ? cudaCreateChannelDesc() - : (sizeof_alias == 8 - ? 
cudaCreateChannelDesc< ::int2>() - : - /* sizeof_alias == 16 */ cudaCreateChannelDesc< ::int4>())); - resDesc.res.linear.sizeInBytes = alloc_size; - resDesc.res.linear.devPtr = alloc_ptr; - - KOKKOS_IMPL_CUDA_SAFE_CALL( - cudaCreateTextureObject(&tex_obj, &resDesc, &texDesc, nullptr)); - - return tex_obj; -} - //============================================================================== // {{{1 @@ -524,7 +481,6 @@ SharedAllocationRecord::SharedAllocationRecord( arg_alloc_size), sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc, arg_label), - m_tex_obj(0), m_space(arg_space) { SharedAllocationHeader header; @@ -555,7 +511,6 @@ SharedAllocationRecord::SharedAllocationRecord( arg_label, arg_alloc_size), sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc, arg_label), - m_tex_obj(0), m_space(arg_space) { SharedAllocationHeader header; @@ -582,7 +537,6 @@ SharedAllocationRecord::SharedAllocationRecord( arg_alloc_size), sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc, arg_label), - m_tex_obj(0), m_space(arg_space) { this->base_t::_fill_host_accessible_header_info(*base_t::m_alloc_ptr, arg_label); diff --git a/core/src/Cuda/Kokkos_CudaSpace.hpp b/core/src/Cuda/Kokkos_CudaSpace.hpp index eec9999f61..b8fa335cd3 100644 --- a/core/src/Cuda/Kokkos_CudaSpace.hpp +++ b/core/src/Cuda/Kokkos_CudaSpace.hpp @@ -534,15 +534,10 @@ class SharedAllocationRecord SharedAllocationRecord(const SharedAllocationRecord&) = delete; SharedAllocationRecord& operator=(const SharedAllocationRecord&) = delete; - static ::cudaTextureObject_t attach_texture_object( - const unsigned sizeof_alias, void* const alloc_ptr, - const size_t alloc_size); - #ifdef KOKKOS_ENABLE_DEBUG static RecordBase s_root_record; #endif - ::cudaTextureObject_t m_tex_obj = 0; const Kokkos::CudaSpace m_space; protected: @@ -566,7 +561,6 @@ class SharedAllocationRecord arg_alloc_size), sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc, arg_label), - m_tex_obj(0), m_space(arg_space) 
{ SharedAllocationHeader header; @@ -592,30 +586,6 @@ class SharedAllocationRecord // helper function to work around MSVC+NVCC issue // https://github.com/kokkos/kokkos/issues/5258 static void deep_copy_header_no_exec(void*, const void*); - - public: - template - inline ::cudaTextureObject_t attach_texture_object() { - static_assert((std::is_same::value || - std::is_same::value || - std::is_same::value), - "Cuda texture fetch only supported for alias types of int, " - "::int2, or ::int4"); - - if (m_tex_obj == 0) { - m_tex_obj = attach_texture_object(sizeof(AliasType), - (void*)RecordBase::m_alloc_ptr, - RecordBase::m_alloc_size); - } - - return m_tex_obj; - } - - template - inline int attach_texture_object_offset(const AliasType* const ptr) { - // Texture object is attached to the entire allocation range - return ptr - reinterpret_cast(RecordBase::m_alloc_ptr); - } }; template <> @@ -632,7 +602,6 @@ class SharedAllocationRecord static RecordBase s_root_record; - ::cudaTextureObject_t m_tex_obj = 0; const Kokkos::CudaUVMSpace m_space; protected: @@ -657,7 +626,6 @@ class SharedAllocationRecord arg_alloc_size), sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc, arg_label), - m_tex_obj(0), m_space(arg_space) { this->base_t::_fill_host_accessible_header_info(*base_t::m_alloc_ptr, arg_label); @@ -667,31 +635,6 @@ class SharedAllocationRecord const Kokkos::CudaUVMSpace& arg_space, const std::string& arg_label, const size_t arg_alloc_size, const RecordBase::function_type arg_dealloc = &base_t::deallocate); - - public: - template - inline ::cudaTextureObject_t attach_texture_object() { - static_assert((std::is_same::value || - std::is_same::value || - std::is_same::value), - "Cuda texture fetch only supported for alias types of int, " - "::int2, or ::int4"); - - if (m_tex_obj == 0) { - m_tex_obj = SharedAllocationRecord:: - attach_texture_object(sizeof(AliasType), - (void*)RecordBase::m_alloc_ptr, - RecordBase::m_alloc_size); - } - - return m_tex_obj; - } - - 
template - inline int attach_texture_object_offset(const AliasType* const ptr) { - // Texture object is attached to the entire allocation range - return ptr - reinterpret_cast(RecordBase::m_alloc_ptr); - } }; template <> From 8a541b50bc016d73a6ebfb3409e458137449af77 Mon Sep 17 00:00:00 2001 From: Evan Harvey <57234914+e10harvey@users.noreply.github.com> Date: Sun, 14 May 2023 08:43:50 -0600 Subject: [PATCH 440/496] Move half traits to private header and add half/bhalf infinity trait (#6055) * core/src: Move half traits to private header * core/src: Add half_t and bhalf_t infinity trait * Update core/src/impl/Kokkos_Half_NumericTraits.hpp Co-authored-by: Damien L-G --------- Co-authored-by: Damien L-G --- core/src/Kokkos_Half.hpp | 313 +----------------- core/src/impl/Kokkos_Half_NumericTraits.hpp | 349 ++++++++++++++++++++ 2 files changed, 350 insertions(+), 312 deletions(-) create mode 100644 core/src/impl/Kokkos_Half_NumericTraits.hpp diff --git a/core/src/Kokkos_Half.hpp b/core/src/Kokkos_Half.hpp index 48d1e647e6..179141220f 100644 --- a/core/src/Kokkos_Half.hpp +++ b/core/src/Kokkos_Half.hpp @@ -22,7 +22,6 @@ #endif #include -#include #include #include // istream & ostream for extraction and insertion ops @@ -1017,318 +1016,8 @@ cast_from_bhalf(bhalf_t val) { #else #define KOKKOS_BHALF_T_IS_FLOAT false #endif // KOKKOS_IMPL_BHALF_TYPE_DEFINED - ////////////// BEGIN HALF_T (binary16) limits ////////////// - // clang-format off -// '\brief:' below are from the libc definitions for float and double: -// https://www.gnu.org/software/libc/manual/html_node/Floating-Point-Parameters.html -// -// The arithmetic encoding and equations below are derived from: -// Ref1: https://en.wikipedia.org/wiki/Single-precision_floating-point_format -// Ref2: https://en.wikipedia.org/wiki/Exponent_bias -// Ref3; https://docs.oracle.com/cd/E19957-01/806-3568/ncg_goldberg.html -// -// Some background on the magic numbers 2**10=1024 and 2**15=32768 used below: -// -// IMPORTANT: For 
IEEE754 encodings, see Ref1. -// -// For binary16, we have B = 2 and p = 16 with 2**16 possible significands. -// The binary16 format is: [s e e e e e f f f f f f f f f f] -// bit index: 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 -// s: signed bit (1 bit) -// e: exponent bits (5 bits) -// f: fractional bits (10 bits) -// -// E_bias = 2**(n_exponent_bits - 1) - 1 = 2**(5 - 1) - 1 = 15 -// E_subnormal = 00000 (base2) -// E_infinity = 11111 (base2) -// E_min = 1 - E_bias = 1 - 15 -// E_max = 2**5 - 1 - E_bias = 2**5 - 1 - 15 = 16 -// -// 2**10=1024 is the smallest denominator that is representable in binary16: -// [s e e e e e f f f f f f f f f f] -// [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1] -// which is: 1 / 2**-10 -// -// -// 2**15 is the largest exponent factor representable in binary16, for example the -// largest integer value representable in binary16 is: -// [s e e e e e f f f f f f f f f f] -// [0 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1] -// which is: 2**(2**4 + 2**3 + 2**2 + 2**1 - 15) * (1 + 2**-10 + 2**-9 + 2**-8 + 2**-7 + 2**-6 + 2**-5 + 2**-4 + 2**-3 + 2**-2 + 2**-1)) = -// 2**15 * (1 + 0.9990234375) = -// 65504.0 -// - -/// \brief: Infinity. -/// -/// base2 encoding: bits [10,14] set -/// #define KOKKOS_IMPL_HALF_T_HUGE_VALH 0x7c00 -/// Binary16 encoding: -/// [s e e e e e f f f f f f f f f f] -/// [0 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0] -/// bit index: 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 -#if defined(KOKKOS_HALF_T_IS_FLOAT) && !KOKKOS_HALF_T_IS_FLOAT - -/// \brief: Minimum normalized number -/// -/// Stdc defines this as the smallest number (representable in binary16). 
-/// -/// Binary16 encoding: -/// [s e e e e e f f f f f f f f f f] -/// [1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1] -/// bit index: 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 -/// -/// and in base10: -1 * 2**(2**4 + 2**3 + 2**2 + 2**1 - 15) * (1 + 2**-10 + 2**-9 + 2**-8 + 2**-7 + 2**-6 + 2**-5 + 2**-4 + 2**-3 + 2**-2 + 2**-1) -/// = -2**15 * (1 + (2**10 - 1) / 2**10) -template <> -struct Kokkos::Experimental::Impl::finite_min_helper< - Kokkos::Experimental::half_t> { - static constexpr float value = -65504.0F; -}; - -/// \brief: Maximum normalized number -/// -/// Stdc defines this as the maximum number (representable in binary16). -/// -/// Binary16 encoding: -/// [s e e e e e f f f f f f f f f f] -/// [0 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1] -/// bit index: 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 -/// -/// and in base10: 1 * 2**(2**4 + 2**3 + 2**2 + 2**1 - 15) * (1 + 2**-10 + 2**-9 + 2**-8 + 2**-7 + 2**-6 + 2**-5 + 2**-4 + 2**-3 + 2**-2 + 2**-1) -/// = 2**15 * (1 + (2**10 - 1) / 2**10) -template <> -struct Kokkos::Experimental::Impl::finite_max_helper< - Kokkos::Experimental::half_t> { - static constexpr float value = 65504.0F; -}; - -/// \brief: This is the difference between 1 and the smallest floating point -/// number of type binary16 that is greater than 1 -/// -/// Smallest number in binary16 that is greater than 1 encoding: -/// [s e e e e e f f f f f f f f f f] -/// [0 0 1 1 1 1 0 0 0 0 0 0 0 0 0 1] -/// bit index: 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 -/// -/// and in base10: 1 * 2**(2**3 + 2**2 + 2**1 + 2**0 - 15) * (1 + 2**-10) -/// = 2**0 * (1 + 2**-10) -/// = 1.0009765625 -/// -/// Lastly, 1 - 1.0009765625 = 0.0009765625. -template <> -struct Kokkos::Experimental::Impl::epsilon_helper< - Kokkos::Experimental::half_t> { - static constexpr float value = 0.0009765625F; -}; - -/// @brief: The largest possible rounding error in ULPs -/// -/// This simply uses the maximum rounding error. 
-/// -/// Reference: https://docs.oracle.com/cd/E19957-01/806-3568/ncg_goldberg.html#689 -template <> -struct Kokkos::Experimental::Impl::round_error_helper< - Kokkos::Experimental::half_t> { - static constexpr float value = 0.5F; -}; - -/// \brief: Minimum normalized positive half precision number -/// -/// Stdc defines this as the minimum normalized positive floating -/// point number that is representable in type binary16 -/// -/// Smallest number in binary16 that is greater than 1 encoding: -/// [s e e e e e f f f f f f f f f f] -/// [0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0] -/// bit index: 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 -/// -/// and in base10: 1 * 2**(2**0 - 15) * (1) -/// = 2**-14 -template <> -struct Kokkos::Experimental::Impl::norm_min_helper< - Kokkos::Experimental::half_t> { - static constexpr float value = 0.00006103515625F; -}; - -/// \brief: Quiet not a half precision number -/// -/// IEEE 754 defines this as all exponent bits high. -/// -/// Quiet NaN in binary16: -/// [s e e e e e f f f f f f f f f f] -/// [1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0] -/// bit index: 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 -template <> -struct Kokkos::Experimental::Impl::quiet_NaN_helper< - Kokkos::Experimental::half_t> { - static constexpr float value = 0xfc000; -}; - -/// \brief: Signaling not a half precision number -/// -/// IEEE 754 defines this as all exponent bits and the first fraction bit high. -/// -/// Quiet NaN in binary16: -/// [s e e e e e f f f f f f f f f f] -/// [1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0] -/// bit index: 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 -template <> -struct Kokkos::Experimental::Impl::signaling_NaN_helper< - Kokkos::Experimental::half_t> { - static constexpr float value = 0xfe000; -}; -/// \brief: Number of digits in the matissa that can be represented -/// without losing precision. -/// -/// Stdc defines this as the number of base-RADIX digits in the floating point mantissa for the binary16 data type. 
-/// -/// In binary16, we have 10 fractional bits plus the implicit leading 1. -template <> -struct Kokkos::Experimental::Impl::digits_helper { - static constexpr int value = 11; -}; - -/// \brief: "The number of base-10 digits that can be represented by the type T without change" -/// Reference: https://en.cppreference.com/w/cpp/types/numeric_limits/digits10. -/// -/// "For base-radix types, it is the value of digits() (digits - 1 for floating-point types) multiplied by log10(radix) and rounded down." -/// Reference: https://en.cppreference.com/w/cpp/types/numeric_limits/digits10. -/// -/// This is: floor(11 - 1 * log10(2)) -template <> -struct Kokkos::Experimental::Impl::digits10_helper< - Kokkos::Experimental::half_t> { - static constexpr int value = 3; -}; - -/// \brief: Value of the base of the exponent representation. -/// -/// Stdc defined this as the value of the base, or radix, of the exponent representation. -template <> -struct Kokkos::Experimental::Impl::radix_helper { - static constexpr int value = 2; -}; - -/// \brief: This is the smallest possible exponent value -/// -/// Stdc defines this as the smallest possible exponent value for type binary16. -/// More precisely, it is the minimum negative integer such that the value min_exponent_helper -/// raised to this power minus 1 can be represented as a normalized floating point number of type float. 
-/// -/// In binary16: -/// [s e e e e e f f f f f f f f f f] -/// [0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0] -/// bit index: 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 -/// -/// and in base10: 1 * 2**(2**0 - 15) * (1 + 0) -/// = 2**-14 -/// -/// with a bias of one from (C11 5.2.4.2.2), gives -13; -template <> -struct Kokkos::Experimental::Impl::min_exponent_helper< - Kokkos::Experimental::half_t> { - static constexpr int value = -13; -}; - -/// \brief: This is the largest possible exponent value -/// -/// In binary16: -/// [s e e e e e f f f f f f f f f f] -/// [0 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0] -/// bit index: 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 -/// -/// and in base10: 1 * 2**(2**4 + 2**3 + 2**2 + 2**1 - 15) * (1 + 0) -/// = 2**(30 - 15) -/// = 2**15 -/// -/// with a bias of one from (C11 5.2.4.2.2), gives 16; -template <> -struct Kokkos::Experimental::Impl::max_exponent_helper< - Kokkos::Experimental::half_t> { - static constexpr int value = 16; -}; -#endif -////////////// END HALF_T (binary16) limits ////////////// - -////////////// BEGIN BHALF_T (bfloat16) limits ////////////// -#if defined(KOKKOS_BHALF_T_IS_FLOAT) && !KOKKOS_BHALF_T_IS_FLOAT -// Minimum normalized number -template <> -struct Kokkos::Experimental::Impl::finite_min_helper< - Kokkos::Experimental::bhalf_t> { - static constexpr float value = -3.38953139e38; -}; -// Maximum normalized number -template <> -struct Kokkos::Experimental::Impl::finite_max_helper< - Kokkos::Experimental::bhalf_t> { - static constexpr float value = 3.38953139e38; -}; -// 1/2^7 -template <> -struct Kokkos::Experimental::Impl::epsilon_helper< - Kokkos::Experimental::bhalf_t> { - static constexpr float value = 0.0078125F; -}; -template <> -struct Kokkos::Experimental::Impl::round_error_helper< - Kokkos::Experimental::bhalf_t> { - static constexpr float value = 0.5F; -}; -// Minimum normalized positive bhalf number -template <> -struct Kokkos::Experimental::Impl::norm_min_helper< - Kokkos::Experimental::bhalf_t> { - static constexpr 
float value = 1.1754494351e-38; -}; -// Quiet not a bhalf number -template <> -struct Kokkos::Experimental::Impl::quiet_NaN_helper< - Kokkos::Experimental::bhalf_t> { - static constexpr float value = 0x7fc000; -}; -// Signaling not a bhalf number -template <> -struct Kokkos::Experimental::Impl::signaling_NaN_helper< - Kokkos::Experimental::bhalf_t> { - static constexpr float value = 0x7fe000; -}; -// Number of digits in the matissa that can be represented -// without losing precision. -template <> -struct Kokkos::Experimental::Impl::digits_helper< - Kokkos::Experimental::bhalf_t> { - static constexpr int value = 2; -}; -// 7 - 1 * log10(2) -template <> -struct Kokkos::Experimental::Impl::digits10_helper< - Kokkos::Experimental::bhalf_t> { - static constexpr int value = 1; -}; -// Value of the base of the exponent representation. -template <> -struct Kokkos::Experimental::Impl::radix_helper { - static constexpr int value = 2; -}; -// This is the smallest possible exponent value -// with a bias of one (C11 5.2.4.2.2). -template <> -struct Kokkos::Experimental::Impl::min_exponent_helper< - Kokkos::Experimental::bhalf_t> { - static constexpr int value = -125; -}; -// This is the largest possible exponent value -// with a bias of one (C11 5.2.4.2.2). -template <> -struct Kokkos::Experimental::Impl::max_exponent_helper< - Kokkos::Experimental::bhalf_t> { - static constexpr int value = 128; -}; -#endif -////////////// END BHALF_T (bfloat16) limits ////////////// +#include #ifdef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_HALF #undef KOKKOS_IMPL_PUBLIC_INCLUDE diff --git a/core/src/impl/Kokkos_Half_NumericTraits.hpp b/core/src/impl/Kokkos_Half_NumericTraits.hpp new file mode 100644 index 0000000000..b5cbf22194 --- /dev/null +++ b/core/src/impl/Kokkos_Half_NumericTraits.hpp @@ -0,0 +1,349 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 
4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_HALF_NUMERIC_TRAITS_HPP_ +#define KOKKOS_HALF_NUMERIC_TRAITS_HPP_ + +#include + +////////////// BEGIN HALF_T (binary16) limits ////////////// +// clang-format off +// '\brief:' below are from the libc definitions for float and double: +// https://www.gnu.org/software/libc/manual/html_node/Floating-Point-Parameters.html +// +// The arithmetic encoding and equations below are derived from: +// Ref1: https://en.wikipedia.org/wiki/Single-precision_floating-point_format +// Ref2: https://en.wikipedia.org/wiki/Exponent_bias +// Ref3; https://docs.oracle.com/cd/E19957-01/806-3568/ncg_goldberg.html +// +// Some background on the magic numbers 2**10=1024 and 2**15=32768 used below: +// +// IMPORTANT: For IEEE754 encodings, see Ref1. +// +// For binary16, we have B = 2 and p = 16 with 2**16 possible significands. 
+// The binary16 format is: [s e e e e e f f f f f f f f f f] +// bit index: 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 +// s: signed bit (1 bit) +// e: exponent bits (5 bits) +// f: fractional bits (10 bits) +// +// E_bias = 2**(n_exponent_bits - 1) - 1 = 2**(5 - 1) - 1 = 15 +// E_subnormal = 00000 (base2) +// E_infinity = 11111 (base2) +// E_min = 1 - E_bias = 1 - 15 +// E_max = 2**5 - 1 - E_bias = 2**5 - 1 - 15 = 16 +// +// 2**10=1024 is the smallest denominator that is representable in binary16: +// [s e e e e e f f f f f f f f f f] +// [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1] +// which is: 1 / 2**-10 +// +// +// 2**15 is the largest exponent factor representable in binary16, for example the +// largest integer value representable in binary16 is: +// [s e e e e e f f f f f f f f f f] +// [0 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1] +// which is: 2**(2**4 + 2**3 + 2**2 + 2**1 - 15) * (1 + 2**-10 + 2**-9 + 2**-8 + 2**-7 + 2**-6 + 2**-5 + 2**-4 + 2**-3 + 2**-2 + 2**-1)) = +// 2**15 * (1 + 0.9990234375) = +// 65504.0 +// +#if defined(KOKKOS_HALF_T_IS_FLOAT) && !KOKKOS_HALF_T_IS_FLOAT +/// \brief: Infinity +/// +/// Binary16 encoding: +/// [s e e e e e f f f f f f f f f f] +/// [0 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0] +/// bit index: 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 +/// +template <> +struct Kokkos::Experimental::Impl::infinity_helper { + static constexpr int value = 0x7C00; +}; + +/// \brief: Minimum normalized number +/// +/// Stdc defines this as the smallest number (representable in binary16). 
+/// +/// Binary16 encoding: +/// [s e e e e e f f f f f f f f f f] +/// [1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1] +/// bit index: 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 +/// +/// and in base10: -1 * 2**(2**4 + 2**3 + 2**2 + 2**1 - 15) * (1 + 2**-10 + 2**-9 + 2**-8 + 2**-7 + 2**-6 + 2**-5 + 2**-4 + 2**-3 + 2**-2 + 2**-1) +/// = -2**15 * (1 + (2**10 - 1) / 2**10) +template <> +struct Kokkos::Experimental::Impl::finite_min_helper< + Kokkos::Experimental::half_t> { + static constexpr float value = -65504.0F; +}; + +/// \brief: Maximum normalized number +/// +/// Stdc defines this as the maximum number (representable in binary16). +/// +/// Binary16 encoding: +/// [s e e e e e f f f f f f f f f f] +/// [0 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1] +/// bit index: 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 +/// +/// and in base10: 1 * 2**(2**4 + 2**3 + 2**2 + 2**1 - 15) * (1 + 2**-10 + 2**-9 + 2**-8 + 2**-7 + 2**-6 + 2**-5 + 2**-4 + 2**-3 + 2**-2 + 2**-1) +/// = 2**15 * (1 + (2**10 - 1) / 2**10) +template <> +struct Kokkos::Experimental::Impl::finite_max_helper< + Kokkos::Experimental::half_t> { + static constexpr float value = 65504.0F; +}; + +/// \brief: This is the difference between 1 and the smallest floating point +/// number of type binary16 that is greater than 1 +/// +/// Smallest number in binary16 that is greater than 1 encoding: +/// [s e e e e e f f f f f f f f f f] +/// [0 0 1 1 1 1 0 0 0 0 0 0 0 0 0 1] +/// bit index: 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 +/// +/// and in base10: 1 * 2**(2**3 + 2**2 + 2**1 + 2**0 - 15) * (1 + 2**-10) +/// = 2**0 * (1 + 2**-10) +/// = 1.0009765625 +/// +/// Lastly, 1 - 1.0009765625 = 0.0009765625. +template <> +struct Kokkos::Experimental::Impl::epsilon_helper< + Kokkos::Experimental::half_t> { + static constexpr float value = 0.0009765625F; +}; + +/// @brief: The largest possible rounding error in ULPs +/// +/// This simply uses the maximum rounding error. 
+/// +/// Reference: https://docs.oracle.com/cd/E19957-01/806-3568/ncg_goldberg.html#689 +template <> +struct Kokkos::Experimental::Impl::round_error_helper< + Kokkos::Experimental::half_t> { + static constexpr float value = 0.5F; +}; + +/// \brief: Minimum normalized positive half precision number +/// +/// Stdc defines this as the minimum normalized positive floating +/// point number that is representable in type binary16 +/// +/// Smallest number in binary16 that is greater than 1 encoding: +/// [s e e e e e f f f f f f f f f f] +/// [0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0] +/// bit index: 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 +/// +/// and in base10: 1 * 2**(2**0 - 15) * (1) +/// = 2**-14 +template <> +struct Kokkos::Experimental::Impl::norm_min_helper< + Kokkos::Experimental::half_t> { + static constexpr float value = 0.00006103515625F; +}; + +/// \brief: Quiet not a half precision number +/// +/// IEEE 754 defines this as all exponent bits high. +/// +/// Quiet NaN in binary16: +/// [s e e e e e f f f f f f f f f f] +/// [1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0] +/// bit index: 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 +template <> +struct Kokkos::Experimental::Impl::quiet_NaN_helper< + Kokkos::Experimental::half_t> { + static constexpr float value = 0xfc000; +}; + +/// \brief: Signaling not a half precision number +/// +/// IEEE 754 defines this as all exponent bits and the first fraction bit high. +/// +/// Quiet NaN in binary16: +/// [s e e e e e f f f f f f f f f f] +/// [1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0] +/// bit index: 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 +template <> +struct Kokkos::Experimental::Impl::signaling_NaN_helper< + Kokkos::Experimental::half_t> { + static constexpr float value = 0xfe000; +}; + +/// \brief: Number of digits in the matissa that can be represented +/// without losing precision. +/// +/// Stdc defines this as the number of base-RADIX digits in the floating point mantissa for the binary16 data type. 
+/// +/// In binary16, we have 10 fractional bits plus the implicit leading 1. +template <> +struct Kokkos::Experimental::Impl::digits_helper { + static constexpr int value = 11; +}; + +/// \brief: "The number of base-10 digits that can be represented by the type T without change" +/// Reference: https://en.cppreference.com/w/cpp/types/numeric_limits/digits10. +/// +/// "For base-radix types, it is the value of digits() (digits - 1 for floating-point types) multiplied by log10(radix) and rounded down." +/// Reference: https://en.cppreference.com/w/cpp/types/numeric_limits/digits10. +/// +/// This is: floor(11 - 1 * log10(2)) +template <> +struct Kokkos::Experimental::Impl::digits10_helper< + Kokkos::Experimental::half_t> { + static constexpr int value = 3; +}; + +/// \brief: Value of the base of the exponent representation. +/// +/// Stdc defined this as the value of the base, or radix, of the exponent representation. +template <> +struct Kokkos::Experimental::Impl::radix_helper { + static constexpr int value = 2; +}; + +/// \brief: This is the smallest possible exponent value +/// +/// Stdc defines this as the smallest possible exponent value for type binary16. +/// More precisely, it is the minimum negative integer such that the value min_exponent_helper +/// raised to this power minus 1 can be represented as a normalized floating point number of type float. 
+/// +/// In binary16: +/// [s e e e e e f f f f f f f f f f] +/// [0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0] +/// bit index: 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 +/// +/// and in base10: 1 * 2**(2**0 - 15) * (1 + 0) +/// = 2**-14 +/// +/// with a bias of one from (C11 5.2.4.2.2), gives -13; +template <> +struct Kokkos::Experimental::Impl::min_exponent_helper< + Kokkos::Experimental::half_t> { + static constexpr int value = -13; +}; + +/// \brief: This is the largest possible exponent value +/// +/// In binary16: +/// [s e e e e e f f f f f f f f f f] +/// [0 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0] +/// bit index: 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 +/// +/// and in base10: 1 * 2**(2**4 + 2**3 + 2**2 + 2**1 - 15) * (1 + 0) +/// = 2**(30 - 15) +/// = 2**15 +/// +/// with a bias of one from (C11 5.2.4.2.2), gives 16; +template <> +struct Kokkos::Experimental::Impl::max_exponent_helper< + Kokkos::Experimental::half_t> { + static constexpr int value = 16; +}; +#endif +////////////// END HALF_T (binary16) limits ////////////// + +////////////// BEGIN BHALF_T (bfloat16) limits ////////////// +#if defined(KOKKOS_BHALF_T_IS_FLOAT) && !KOKKOS_BHALF_T_IS_FLOAT +/// \brief: Infinity +/// +/// Bfloat16 encoding: +/// [s e e e e e e e e f f f f f f f] +/// [0 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0] +/// bit index: 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 +/// +template <> +struct Kokkos::Experimental::Impl::infinity_helper { + static constexpr int value = 0x7F80; +}; + +// Minimum normalized number +template <> +struct Kokkos::Experimental::Impl::finite_min_helper< + Kokkos::Experimental::bhalf_t> { + static constexpr float value = -3.38953139e38; +}; +// Maximum normalized number +template <> +struct Kokkos::Experimental::Impl::finite_max_helper< + Kokkos::Experimental::bhalf_t> { + static constexpr float value = 3.38953139e38; +}; +// 1/2^7 +template <> +struct Kokkos::Experimental::Impl::epsilon_helper< + Kokkos::Experimental::bhalf_t> { + static constexpr float value = 0.0078125F; +}; +template <> 
+struct Kokkos::Experimental::Impl::round_error_helper< + Kokkos::Experimental::bhalf_t> { + static constexpr float value = 0.5F; +}; +// Minimum normalized positive bhalf number +template <> +struct Kokkos::Experimental::Impl::norm_min_helper< + Kokkos::Experimental::bhalf_t> { + static constexpr float value = 1.1754494351e-38; +}; +// Quiet not a bhalf number +template <> +struct Kokkos::Experimental::Impl::quiet_NaN_helper< + Kokkos::Experimental::bhalf_t> { + static constexpr float value = 0x7fc000; +}; +// Signaling not a bhalf number +template <> +struct Kokkos::Experimental::Impl::signaling_NaN_helper< + Kokkos::Experimental::bhalf_t> { + static constexpr float value = 0x7fe000; +}; +// Number of digits in the matissa that can be represented +// without losing precision. +template <> +struct Kokkos::Experimental::Impl::digits_helper< + Kokkos::Experimental::bhalf_t> { + static constexpr int value = 2; +}; +// 7 - 1 * log10(2) +template <> +struct Kokkos::Experimental::Impl::digits10_helper< + Kokkos::Experimental::bhalf_t> { + static constexpr int value = 1; +}; +// Value of the base of the exponent representation. +template <> +struct Kokkos::Experimental::Impl::radix_helper { + static constexpr int value = 2; +}; +// This is the smallest possible exponent value +// with a bias of one (C11 5.2.4.2.2). +template <> +struct Kokkos::Experimental::Impl::min_exponent_helper< + Kokkos::Experimental::bhalf_t> { + static constexpr int value = -125; +}; +// This is the largest possible exponent value +// with a bias of one (C11 5.2.4.2.2). 
+template <> +struct Kokkos::Experimental::Impl::max_exponent_helper< + Kokkos::Experimental::bhalf_t> { + static constexpr int value = 128; +}; +#endif +////////////// END BHALF_T (bfloat16) limits ////////// + +#endif // KOKKOS_HALF_NUMERIC_TRAITS_HPP_ From 12b0c802198fcc8bc83ff63fafc8a2a72699d10e Mon Sep 17 00:00:00 2001 From: Mikael Simberg Date: Mon, 15 May 2023 12:42:32 +0200 Subject: [PATCH 441/496] Increase minimum required HPX version to 1.8.0 --- cmake/Modules/FindTPLHPX.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/Modules/FindTPLHPX.cmake b/cmake/Modules/FindTPLHPX.cmake index 5636a9bb66..d7b54fb9c9 100644 --- a/cmake/Modules/FindTPLHPX.cmake +++ b/cmake/Modules/FindTPLHPX.cmake @@ -1,5 +1,5 @@ -FIND_PACKAGE(HPX REQUIRED 1.7.0) +FIND_PACKAGE(HPX REQUIRED 1.8.0) #as of right now, HPX doesn't export correctly #so let's convert it to an interface target KOKKOS_CREATE_IMPORTED_TPL(HPX INTERFACE From d13cc09eea0bdd4b9143d00850171ee0074f059c Mon Sep 17 00:00:00 2001 From: Mikael Simberg Date: Mon, 15 May 2023 12:49:29 +0200 Subject: [PATCH 442/496] Conditionally use hpx::post instead of hpx::apply based on HPX version --- core/src/HPX/Kokkos_HPX.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/core/src/HPX/Kokkos_HPX.cpp b/core/src/HPX/Kokkos_HPX.cpp index 19978c7cf3..4298127e13 100644 --- a/core/src/HPX/Kokkos_HPX.cpp +++ b/core/src/HPX/Kokkos_HPX.cpp @@ -210,7 +210,11 @@ void HPX::impl_finalize() { if (m_hpx_initialized) { hpx::runtime *rt = hpx::get_runtime_ptr(); if (rt != nullptr) { +#if HPX_VERSION_FULL >= 0x010900 + hpx::post([]() { hpx::local::finalize(); }); +#else hpx::apply([]() { hpx::local::finalize(); }); +#endif hpx::local::stop(); } else { Kokkos::abort( From df5681d1928d96acc8b0265532e1c3e0de628031 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Mon, 15 May 2023 15:36:17 -0400 Subject: [PATCH 443/496] Don't restrict index type in builtin reducers --- core/src/Kokkos_Parallel_Reduce.hpp | 3 - 
core/unit_test/TestReducers.hpp | 212 ++++++++++++++++++++++++++++ 2 files changed, 212 insertions(+), 3 deletions(-) diff --git a/core/src/Kokkos_Parallel_Reduce.hpp b/core/src/Kokkos_Parallel_Reduce.hpp index 17f8bf1817..0024973829 100644 --- a/core/src/Kokkos_Parallel_Reduce.hpp +++ b/core/src/Kokkos_Parallel_Reduce.hpp @@ -416,7 +416,6 @@ struct MinLoc { using index_type = std::remove_cv_t; static_assert(!std::is_pointer_v && !std::is_array_v); - static_assert(std::is_integral_v); public: // Required @@ -472,7 +471,6 @@ struct MaxLoc { using index_type = std::remove_cv_t; static_assert(!std::is_pointer_v && !std::is_array_v); - static_assert(std::is_integral_v); public: // Required @@ -597,7 +595,6 @@ struct MinMaxLoc { using index_type = std::remove_cv_t; static_assert(!std::is_pointer_v && !std::is_array_v); - static_assert(std::is_integral_v); public: // Required diff --git a/core/unit_test/TestReducers.hpp b/core/unit_test/TestReducers.hpp index 621cb28c9e..b7727e1c09 100644 --- a/core/unit_test/TestReducers.hpp +++ b/core/unit_test/TestReducers.hpp @@ -22,6 +22,17 @@ //-------------------------------------------------------------------------- +namespace Test { +struct MyPair : Kokkos::pair {}; +} // namespace Test + +template <> +struct Kokkos::reduction_identity { + KOKKOS_FUNCTION static Test::MyPair min() { + return Test::MyPair{{INT_MAX, INT_MAX}}; + } +}; + namespace Test { struct ReducerTag {}; @@ -74,6 +85,20 @@ struct TestReducers { } }; + struct MinLocFunctor2D { + Kokkos::View values; + + KOKKOS_INLINE_FUNCTION + void operator()( + const int& i, const int& j, + typename Kokkos::MinLoc::value_type& value) const { + if (values(i, j) < value.val) { + value.val = values(i, j); + value.loc = {{i, j}}; + } + } + }; + struct MaxLocFunctor { Kokkos::View values; @@ -88,6 +113,20 @@ struct TestReducers { } }; + struct MaxLocFunctor2D { + Kokkos::View values; + + KOKKOS_INLINE_FUNCTION + void operator()( + const int& i, const int& j, + typename 
Kokkos::MaxLoc::value_type& value) const { + if (values(i, j) > value.val) { + value.val = values(i, j); + value.loc = {{i, j}}; + } + } + }; + struct MinMaxLocFunctor { Kokkos::View values; @@ -107,6 +146,25 @@ struct TestReducers { } }; + struct MinMaxLocFunctor2D { + Kokkos::View values; + + KOKKOS_INLINE_FUNCTION + void operator()( + const int& i, const int& j, + typename Kokkos::MinMaxLoc::value_type& value) const { + if (values(i, j) > value.max_val) { + value.max_val = values(i, j); + value.max_loc = {{i, j}}; + } + + if (values(i, j) < value.min_val) { + value.min_val = values(i, j); + value.min_loc = {{i, j}}; + } + } + }; + struct BAndFunctor { Kokkos::View values; @@ -598,6 +656,44 @@ struct TestReducers { } } + static void test_minloc_2d(int N) { + using reducer_type = Kokkos::MinLoc; + using value_type = typename reducer_type::value_type; + + Kokkos::View values("Values", N, N); + auto h_values = Kokkos::create_mirror_view(values); + Scalar reference_min = std::numeric_limits::max(); + MyPair reference_loc = {{-1, -1}}; + + for (int i = 0; i < N; i++) + for (int j = 0; j < N; j++) { + h_values(i, j) = (Scalar)(rand() % 100000 + 2); + + if (h_values(i, j) < reference_min) { + reference_min = h_values(i, j); + reference_loc = {{i, j}}; + } else if (h_values(i, j) == reference_min) { + // Make min unique. 
+ h_values(i, j) += Scalar(1); + } + } + Kokkos::deep_copy(values, h_values); + + MinLocFunctor2D f; + f.values = values; + + { + value_type min_scalar; + reducer_type reducer_scalar(min_scalar); + + Kokkos::parallel_reduce( + Kokkos::MDRangePolicy, ExecSpace>({0, 0}, {N, N}), f, + reducer_scalar); + ASSERT_EQ(min_scalar.val, reference_min); + ASSERT_EQ(min_scalar.loc, reference_loc); + } + } + static void test_maxloc(int N) { using value_type = typename Kokkos::MaxLoc::value_type; @@ -661,6 +757,44 @@ struct TestReducers { } } + static void test_maxloc_2d(int N) { + using reducer_type = Kokkos::MaxLoc; + using value_type = typename reducer_type::value_type; + + Kokkos::View values("Values", N, N); + auto h_values = Kokkos::create_mirror_view(values); + Scalar reference_max = std::numeric_limits::min(); + MyPair reference_loc = {{-1, -1}}; + + for (int i = 0; i < N; ++i) + for (int j = 0; j < N; ++j) { + h_values(i, j) = (Scalar)(rand() % 100000 + 2); + + if (h_values(i, j) > reference_max) { + reference_max = h_values(i, j); + reference_loc = {{i, j}}; + } else if (h_values(i, j) == reference_max) { + // Make max unique. 
+ h_values(i, j) -= Scalar(1); + } + } + Kokkos::deep_copy(values, h_values); + + MaxLocFunctor2D f; + f.values = values; + + { + value_type max_scalar; + reducer_type reducer_scalar(max_scalar); + + Kokkos::parallel_reduce( + Kokkos::MDRangePolicy, ExecSpace>({0, 0}, {N, N}), f, + reducer_scalar); + ASSERT_EQ(max_scalar.val, reference_max); + ASSERT_EQ(max_scalar.loc, reference_loc); + } + } + static void test_minmaxloc(int N) { using value_type = typename Kokkos::MinMaxLoc::value_type; @@ -777,6 +911,78 @@ struct TestReducers { } } + static void test_minmaxloc_2d(int N) { + using reducer_type = Kokkos::MinMaxLoc; + using value_type = typename reducer_type::value_type; + + Kokkos::View values("Values", N, N); + auto h_values = Kokkos::create_mirror_view(values); + Scalar reference_max = std::numeric_limits::min(); + Scalar reference_min = std::numeric_limits::max(); + MyPair reference_minloc = {{-1, -1}}; + MyPair reference_maxloc = {{-1, -1}}; + + for (int i = 0; i < N; i++) + for (int j = 0; j < N; j++) { + h_values(i, j) = (Scalar)(rand() % 100000 + 2); + } + + for (int i = 0; i < N; i++) + for (int j = 0; j < N; j++) { + if (h_values(i, j) > reference_max) { + reference_max = h_values(i, j); + reference_maxloc = {{i, j}}; + } else if (h_values(i, j) == reference_max) { + // Make max unique. + h_values(i, j) -= Scalar(1); + } + } + + for (int i = 0; i < N; i++) + for (int j = 0; j < N; j++) { + if (h_values(i, j) < reference_min) { + reference_min = h_values(i, j); + reference_minloc = {{i, j}}; + } else if (h_values(i, j) == reference_min) { + // Make min unique. 
+ h_values(i, j) += Scalar(1); + } + } + + Kokkos::deep_copy(values, h_values); + + MinMaxLocFunctor2D f; + f.values = values; + { + value_type minmax_scalar; + reducer_type reducer_scalar(minmax_scalar); + + Kokkos::parallel_reduce( + Kokkos::MDRangePolicy, ExecSpace>({0, 0}, {N, N}), f, + reducer_scalar); + + ASSERT_EQ(minmax_scalar.min_val, reference_min); + for (int i = 0; i < N; i++) + for (int j = 0; j < N; j++) { + if ((minmax_scalar.min_loc == MyPair{{i, j}}) && + (h_values(i, j) == reference_min)) { + reference_minloc = {{i, j}}; + } + } + ASSERT_EQ(minmax_scalar.min_loc, reference_minloc); + + ASSERT_EQ(minmax_scalar.max_val, reference_max); + for (int i = 0; i < N; i++) + for (int j = 0; j < N; j++) { + if ((minmax_scalar.max_loc == MyPair{{i, j}}) && + (h_values(i, j) == reference_max)) { + reference_maxloc = {{i, j}}; + } + } + ASSERT_EQ(minmax_scalar.max_loc, reference_maxloc); + } + } + static void test_BAnd(int N) { Kokkos::View values("Values", N); auto h_values = Kokkos::create_mirror_view(values); @@ -985,11 +1191,13 @@ struct TestReducers { #if !defined(KOKKOS_ENABLE_OPENACC) // FIXME_OPENACC - OpenACC (V3.3) does not support custom reductions. test_minloc(10003); + test_minloc_2d(100); #endif test_max(10007); #if !defined(KOKKOS_ENABLE_OPENACC) // FIXME_OPENACC - OpenACC (V3.3) does not support custom reductions. test_maxloc(10007); + test_maxloc_2d(100); #endif #if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_COMPILER_CLANG) && \ (KOKKOS_COMPILER_CLANG < 1300) @@ -998,6 +1206,7 @@ struct TestReducers { #if !defined(KOKKOS_ENABLE_OPENACC) // FIXME_OPENACC - OpenACC (V3.3) does not support custom reductions. test_minmaxloc(10007); + test_minmaxloc_2d(100); #endif #endif } @@ -1012,11 +1221,13 @@ struct TestReducers { #if !defined(KOKKOS_ENABLE_OPENACC) // FIXME_OPENACC - OpenACC (V3.3) does not support custom reductions. 
test_minloc(10003); + test_minloc_2d(100); #endif test_max(10007); #if !defined(KOKKOS_ENABLE_OPENACC) // FIXME_OPENACC - OpenACC (V3.3) does not support custom reductions. test_maxloc(10007); + test_maxloc_2d(100); #endif #if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_COMPILER_CLANG) && \ (KOKKOS_COMPILER_CLANG < 1300) @@ -1025,6 +1236,7 @@ struct TestReducers { #if !defined(KOKKOS_ENABLE_OPENACC) // FIXME_OPENACC - OpenACC (V3.3) does not support custom reductions. test_minmaxloc(10007); + test_minmaxloc_2d(100); #endif #endif test_BAnd(35); From 1767bfe3e584377f1470e08b40f62ff7f5cf85cc Mon Sep 17 00:00:00 2001 From: "romin.tomasetti" Date: Wed, 10 May 2023 13:18:58 +0000 Subject: [PATCH 444/496] dual view: update template types (#6085) --- containers/src/Kokkos_DualView.hpp | 81 +++++++++++++++++------------- 1 file changed, 45 insertions(+), 36 deletions(-) diff --git a/containers/src/Kokkos_DualView.hpp b/containers/src/Kokkos_DualView.hpp index bef2149f4c..d2a7ad66b7 100644 --- a/containers/src/Kokkos_DualView.hpp +++ b/containers/src/Kokkos_DualView.hpp @@ -86,22 +86,37 @@ inline const Kokkos::Cuda& get_cuda_space(const NonCudaExecSpace&) { #endif // KOKKOS_ENABLE_CUDA } // namespace Impl -template -class DualView : public ViewTraits { - template + +template +class DualView; + +template +struct is_dual_view : public std::false_type {}; + +template +struct is_dual_view> : public std::true_type {}; + +template +struct is_dual_view> : public std::true_type {}; + +template +inline constexpr bool is_dual_view_v = is_dual_view::value; + +template +class DualView : public ViewTraits { + template friend class DualView; public: //! \name Typedefs for device types and various Kokkos::View specializations. //@{ - using traits = ViewTraits; + using traits = ViewTraits; //! The Kokkos Host Device type; using host_mirror_space = typename traits::host_mirror_space; //! The type of a Kokkos::View on the device. 
- using t_dev = View; + using t_dev = View; /// \typedef t_host /// \brief The type of a Kokkos::View host mirror of \c t_dev. @@ -109,8 +124,7 @@ class DualView : public ViewTraits { //! The type of a const View on the device. //! The type of a Kokkos::View on the device. - using t_dev_const = - View; + using t_dev_const = View; /// \typedef t_host_const /// \brief The type of a const View host mirror of \c t_dev_const. @@ -247,15 +261,15 @@ class DualView : public ViewTraits { } //! Copy constructor (shallow copy) - template - DualView(const DualView& src) + template + DualView(const DualView& src) : modified_flags(src.modified_flags), d_view(src.d_view), h_view(src.h_view) {} //! Subview constructor - template - DualView(const DualView& src, const Arg0& arg0, Args... args) + template + DualView(const DualView& src, const Arg0& arg0, Args... args) : modified_flags(src.modified_flags), d_view(Kokkos::subview(src.d_view, arg0, args...)), h_view(Kokkos::subview(src.h_view, arg0, args...)) {} @@ -1142,23 +1156,24 @@ class DualView : public ViewTraits { namespace Kokkos { namespace Impl { -template -struct DualViewSubview { - using dst_traits = typename Kokkos::Impl::ViewMapping< - void, Kokkos::ViewTraits, Args...>::traits_type; +template +struct V2DV; - using type = Kokkos::DualView< - typename dst_traits::data_type, typename dst_traits::array_layout, - typename dst_traits::device_type, typename dst_traits::memory_traits>; +template +struct V2DV> { + using type = DualView; }; - } /* namespace Impl */ -template -typename Impl::DualViewSubview::type subview( - const DualView& src, Args... args) { - return typename Impl::DualViewSubview::type(src, - args...); +template +auto subview(const DualView& src, Args&&... 
args) { + // leverage Kokkos::View facilities to deduce the properties of the subview + using deduce_subview_type = + decltype(subview(std::declval>(), + std::forward(args)...)); + // map it back to dual view + return typename Impl::V2DV::type( + src, std::forward(args)...); } } /* namespace Kokkos */ @@ -1172,11 +1187,8 @@ namespace Kokkos { // Partial specialization of Kokkos::deep_copy() for DualView objects. // -template -void deep_copy( - DualView dst, // trust me, this must not be a reference - const DualView& src) { +template +void deep_copy(DualView& dst, const DualView& src) { if (src.need_sync_device()) { deep_copy(dst.h_view, src.h_view); dst.modify_host(); @@ -1186,12 +1198,9 @@ void deep_copy( } } -template -void deep_copy( - const ExecutionSpace& exec, - DualView dst, // trust me, this must not be a reference - const DualView& src) { +template +void deep_copy(const ExecutionSpace& exec, DualView& dst, + const DualView& src) { if (src.need_sync_device()) { deep_copy(exec, dst.h_view, src.h_view); dst.modify_host(); From b86d73a2b84c3856043539842d5c0f1169663c47 Mon Sep 17 00:00:00 2001 From: Francesco Rizzi Date: Tue, 16 May 2023 07:34:06 -0400 Subject: [PATCH 445/496] sorting an empty view should exit early and not fail (#6130) * sorting an empty view should not fail * fix binsort too * fix unused * fix test * add comment * address review comments * add fences * address comments * address comments * remove unused --- algorithms/src/Kokkos_Sort.hpp | 33 ++++++++++++++++- algorithms/unit_tests/TestSort.hpp | 45 ++++++++++++++++++++++++ algorithms/unit_tests/TestSortCommon.hpp | 12 +++++++ 3 files changed, 89 insertions(+), 1 deletion(-) diff --git a/algorithms/src/Kokkos_Sort.hpp b/algorithms/src/Kokkos_Sort.hpp index fcfe5d95c3..1486df6f66 100644 --- a/algorithms/src/Kokkos_Sort.hpp +++ b/algorithms/src/Kokkos_Sort.hpp @@ -329,6 +329,10 @@ class BinSort { template void sort(const ExecutionSpace& exec, ValuesViewType const& values, int 
values_range_begin, int values_range_end) const { + if (values.extent(0) == 0) { + return; + } + static_assert( Kokkos::SpaceAccessibility::accessible, @@ -606,6 +610,10 @@ std::enable_if_t<(Kokkos::is_execution_space::value) && memory_space>::accessible)> sort(const ExecutionSpace& exec, const Kokkos::View& view) { + if (view.extent(0) == 0) { + return; + } + using ViewType = Kokkos::View; using CompType = BinOp1D; @@ -648,8 +656,11 @@ sort(const ExecutionSpace& exec, template void sort(const Experimental::SYCL& space, const Kokkos::View& view) { - using ViewType = Kokkos::View; + if (view.extent(0) == 0) { + return; + } + using ViewType = Kokkos::View; static_assert(SpaceAccessibility::accessible, "SYCL execution space is not able to access the memory space " @@ -676,6 +687,9 @@ std::enable_if_t<(Kokkos::is_execution_space::value) && HostSpace, typename Kokkos::View:: memory_space>::accessible)> sort(const ExecutionSpace&, const Kokkos::View& view) { + if (view.extent(0) == 0) { + return; + } auto first = Experimental::begin(view); auto last = Experimental::end(view); std::sort(first, last); @@ -685,6 +699,9 @@ sort(const ExecutionSpace&, const Kokkos::View& view) { template void sort(const Cuda& space, const Kokkos::View& view) { + if (view.extent(0) == 0) { + return; + } const auto exec = thrust::cuda::par.on(space.cuda_stream()); auto first = Experimental::begin(view); auto last = Experimental::end(view); @@ -695,6 +712,11 @@ void sort(const Cuda& space, template void sort(ViewType const& view) { Kokkos::fence("Kokkos::sort: before"); + + if (view.extent(0) == 0) { + return; + } + typename ViewType::execution_space exec; sort(exec, view); exec.fence("Kokkos::sort: fence after sorting"); @@ -704,6 +726,10 @@ template std::enable_if_t::value> sort( const ExecutionSpace& exec, ViewType view, size_t const begin, size_t const end) { + if (view.extent(0) == 0) { + return; + } + using range_policy = Kokkos::RangePolicy; using CompType = BinOp1D; @@ -726,6 +752,11 @@ 
std::enable_if_t::value> sort( template void sort(ViewType view, size_t const begin, size_t const end) { Kokkos::fence("Kokkos::sort: before"); + + if (view.extent(0) == 0) { + return; + } + typename ViewType::execution_space exec; sort(exec, view, begin, end); exec.fence("Kokkos::Sort: fence after sorting"); diff --git a/algorithms/unit_tests/TestSort.hpp b/algorithms/unit_tests/TestSort.hpp index d903888878..9ac606c535 100644 --- a/algorithms/unit_tests/TestSort.hpp +++ b/algorithms/unit_tests/TestSort.hpp @@ -425,6 +425,51 @@ void test_sort(unsigned int N) { test_sort_integer_overflow(); test_sort_integer_overflow(); } + +template +void test_sort_empty_view() { + // does not matter if we use int or something else + Kokkos::View v("v", 0); + + // TODO check the synchronous behavior of the calls below + ASSERT_NO_THROW(Kokkos::sort(ExecutionSpace(), v)); + ASSERT_NO_THROW(Kokkos::sort(v)); +} + +template +void test_binsort_empty_view() { + // the bounds and extents used below are totally arbitrary + // and, in theory, should have no impact + + using KeyViewType = Kokkos::View; + KeyViewType kv("kv", 20); + + using BinOp_t = Kokkos::BinOp1D; + BinOp_t binOp(5, 0, 10); + Kokkos::BinSort Sorter(ExecutionSpace{}, kv, binOp); + + // does not matter if we use int or something else + Kokkos::View v("v", 0); + + // test all exposed public sort methods + ASSERT_NO_THROW(Sorter.sort(ExecutionSpace(), v, 0, 0)); + ASSERT_NO_THROW(Sorter.sort(v, 0, 0)); + ASSERT_NO_THROW(Sorter.sort(ExecutionSpace(), v)); + ASSERT_NO_THROW(Sorter.sort(v)); +} + +template +void test_binsort_empty_keys() { + using KeyViewType = Kokkos::View; + KeyViewType kv("kv", 0); + + using BinOp_t = Kokkos::BinOp1D; + BinOp_t binOp(5, 0, 10); + Kokkos::BinSort Sorter(ExecutionSpace{}, kv, binOp); + + ASSERT_NO_THROW(Sorter.create_permute_vector(ExecutionSpace{})); +} + } // namespace Impl } // namespace Test #endif /* KOKKOS_ALGORITHMS_UNITTESTS_TESTSORT_HPP */ diff --git 
a/algorithms/unit_tests/TestSortCommon.hpp b/algorithms/unit_tests/TestSortCommon.hpp index b8e2e17e4f..628ce905ac 100644 --- a/algorithms/unit_tests/TestSortCommon.hpp +++ b/algorithms/unit_tests/TestSortCommon.hpp @@ -23,5 +23,17 @@ namespace Test { TEST(TEST_CATEGORY, SortUnsigned) { Impl::test_sort(171); } + +TEST(TEST_CATEGORY, SortEmptyView) { + Impl::test_sort_empty_view(); +} + +TEST(TEST_CATEGORY, BinSortEmptyView) { + Impl::test_binsort_empty_view(); +} + +TEST(TEST_CATEGORY, BinSortEmptyKeys) { + Impl::test_binsort_empty_keys(); +} } // namespace Test #endif From 02fb8d423d1d0eb5a05f99ec04fb085a21facf92 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Wed, 10 May 2023 13:07:02 -0600 Subject: [PATCH 446/496] core/src: Move floating_point_wrapper to private header --- core/src/Kokkos_Half.hpp | 997 +--------------- .../impl/Kokkos_Half_FloatingPointWrapper.hpp | 1016 +++++++++++++++++ 2 files changed, 1017 insertions(+), 996 deletions(-) create mode 100644 core/src/impl/Kokkos_Half_FloatingPointWrapper.hpp diff --git a/core/src/Kokkos_Half.hpp b/core/src/Kokkos_Half.hpp index 179141220f..91b94b4cfa 100644 --- a/core/src/Kokkos_Half.hpp +++ b/core/src/Kokkos_Half.hpp @@ -21,1002 +21,7 @@ #define KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_HALF #endif -#include - -#include -#include // istream & ostream for extraction and insertion ops -#include - -#ifdef KOKKOS_IMPL_HALF_TYPE_DEFINED - -// KOKKOS_HALF_IS_FULL_TYPE_ON_ARCH: A macro to select which -// floating_pointer_wrapper operator paths should be used. 
For CUDA, let the -// compiler conditionally select when device ops are used For SYCL, we have a -// full half type on both host and device -#if defined(__CUDA_ARCH__) || defined(KOKKOS_ENABLE_SYCL) -#define KOKKOS_HALF_IS_FULL_TYPE_ON_ARCH -#endif - -/************************* BEGIN forward declarations *************************/ -namespace Kokkos { -namespace Experimental { -namespace Impl { -template -class floating_point_wrapper; -} - -// Declare half_t (binary16) -using half_t = Kokkos::Experimental::Impl::floating_point_wrapper< - Kokkos::Impl::half_impl_t ::type>; -KOKKOS_INLINE_FUNCTION -half_t cast_to_half(float val); -KOKKOS_INLINE_FUNCTION -half_t cast_to_half(bool val); -KOKKOS_INLINE_FUNCTION -half_t cast_to_half(double val); -KOKKOS_INLINE_FUNCTION -half_t cast_to_half(short val); -KOKKOS_INLINE_FUNCTION -half_t cast_to_half(int val); -KOKKOS_INLINE_FUNCTION -half_t cast_to_half(long val); -KOKKOS_INLINE_FUNCTION -half_t cast_to_half(long long val); -KOKKOS_INLINE_FUNCTION -half_t cast_to_half(unsigned short val); -KOKKOS_INLINE_FUNCTION -half_t cast_to_half(unsigned int val); -KOKKOS_INLINE_FUNCTION -half_t cast_to_half(unsigned long val); -KOKKOS_INLINE_FUNCTION -half_t cast_to_half(unsigned long long val); -KOKKOS_INLINE_FUNCTION -half_t cast_to_half(half_t); - -template -KOKKOS_INLINE_FUNCTION std::enable_if_t::value, T> - cast_from_half(half_t); -template -KOKKOS_INLINE_FUNCTION std::enable_if_t::value, T> - cast_from_half(half_t); -template -KOKKOS_INLINE_FUNCTION std::enable_if_t::value, T> - cast_from_half(half_t); -template -KOKKOS_INLINE_FUNCTION std::enable_if_t::value, T> - cast_from_half(half_t); -template -KOKKOS_INLINE_FUNCTION std::enable_if_t::value, T> - cast_from_half(half_t); -template -KOKKOS_INLINE_FUNCTION std::enable_if_t::value, T> - cast_from_half(half_t); -template -KOKKOS_INLINE_FUNCTION std::enable_if_t::value, T> - cast_from_half(half_t); -template -KOKKOS_INLINE_FUNCTION - std::enable_if_t::value, T> - 
cast_from_half(half_t); -template -KOKKOS_INLINE_FUNCTION std::enable_if_t::value, T> - cast_from_half(half_t); -template -KOKKOS_INLINE_FUNCTION - std::enable_if_t::value, T> - cast_from_half(half_t); -template -KOKKOS_INLINE_FUNCTION - std::enable_if_t::value, T> - cast_from_half(half_t); - -// declare bhalf_t -#ifdef KOKKOS_IMPL_BHALF_TYPE_DEFINED -using bhalf_t = Kokkos::Experimental::Impl::floating_point_wrapper< - Kokkos::Impl ::bhalf_impl_t ::type>; - -KOKKOS_INLINE_FUNCTION -bhalf_t cast_to_bhalf(float val); -KOKKOS_INLINE_FUNCTION -bhalf_t cast_to_bhalf(bool val); -KOKKOS_INLINE_FUNCTION -bhalf_t cast_to_bhalf(double val); -KOKKOS_INLINE_FUNCTION -bhalf_t cast_to_bhalf(short val); -KOKKOS_INLINE_FUNCTION -bhalf_t cast_to_bhalf(int val); -KOKKOS_INLINE_FUNCTION -bhalf_t cast_to_bhalf(long val); -KOKKOS_INLINE_FUNCTION -bhalf_t cast_to_bhalf(long long val); -KOKKOS_INLINE_FUNCTION -bhalf_t cast_to_bhalf(unsigned short val); -KOKKOS_INLINE_FUNCTION -bhalf_t cast_to_bhalf(unsigned int val); -KOKKOS_INLINE_FUNCTION -bhalf_t cast_to_bhalf(unsigned long val); -KOKKOS_INLINE_FUNCTION -bhalf_t cast_to_bhalf(unsigned long long val); -KOKKOS_INLINE_FUNCTION -bhalf_t cast_to_bhalf(bhalf_t val); - -template -KOKKOS_INLINE_FUNCTION std::enable_if_t::value, T> - cast_from_bhalf(bhalf_t); -template -KOKKOS_INLINE_FUNCTION std::enable_if_t::value, T> - cast_from_bhalf(bhalf_t); -template -KOKKOS_INLINE_FUNCTION std::enable_if_t::value, T> - cast_from_bhalf(bhalf_t); -template -KOKKOS_INLINE_FUNCTION std::enable_if_t::value, T> - cast_from_bhalf(bhalf_t); -template -KOKKOS_INLINE_FUNCTION std::enable_if_t::value, T> - cast_from_bhalf(bhalf_t); -template -KOKKOS_INLINE_FUNCTION std::enable_if_t::value, T> - cast_from_bhalf(bhalf_t); -template -KOKKOS_INLINE_FUNCTION std::enable_if_t::value, T> - cast_from_bhalf(bhalf_t); -template -KOKKOS_INLINE_FUNCTION - std::enable_if_t::value, T> - cast_from_bhalf(bhalf_t); -template -KOKKOS_INLINE_FUNCTION std::enable_if_t::value, T> - 
cast_from_bhalf(bhalf_t); -template -KOKKOS_INLINE_FUNCTION - std::enable_if_t::value, T> - cast_from_bhalf(bhalf_t); -template -KOKKOS_INLINE_FUNCTION - std::enable_if_t::value, T> - cast_from_bhalf(bhalf_t); -#endif // KOKKOS_IMPL_BHALF_TYPE_DEFINED - -template -static KOKKOS_INLINE_FUNCTION Kokkos::Experimental::half_t cast_to_wrapper( - T x, const volatile Kokkos::Impl::half_impl_t::type&); - -#ifdef KOKKOS_IMPL_BHALF_TYPE_DEFINED -template -static KOKKOS_INLINE_FUNCTION Kokkos::Experimental::bhalf_t cast_to_wrapper( - T x, const volatile Kokkos::Impl::bhalf_impl_t::type&); -#endif // KOKKOS_IMPL_BHALF_TYPE_DEFINED - -template -static KOKKOS_INLINE_FUNCTION T -cast_from_wrapper(const Kokkos::Experimental::half_t& x); - -#ifdef KOKKOS_IMPL_BHALF_TYPE_DEFINED -template -static KOKKOS_INLINE_FUNCTION T -cast_from_wrapper(const Kokkos::Experimental::bhalf_t& x); -#endif // KOKKOS_IMPL_BHALF_TYPE_DEFINED -/************************** END forward declarations **************************/ - -namespace Impl { -template -class alignas(FloatType) floating_point_wrapper { - public: - using impl_type = FloatType; - - private: - impl_type val; - using fixed_width_integer_type = std::conditional_t< - sizeof(impl_type) == 2, uint16_t, - std::conditional_t< - sizeof(impl_type) == 4, uint32_t, - std::conditional_t>>; - static_assert(!std::is_void::value, - "Invalid impl_type"); - - public: - // In-class initialization and defaulted default constructors not used - // since Cuda supports half precision initialization via the below constructor - KOKKOS_FUNCTION - floating_point_wrapper() : val(0.0F) {} - -// Copy constructors -// Getting "C2580: multiple versions of a defaulted special -// member function are not allowed" with VS 16.11.3 and CUDA 11.4.2 -#if defined(_WIN32) && defined(KOKKOS_ENABLE_CUDA) - KOKKOS_FUNCTION - floating_point_wrapper(const floating_point_wrapper& rhs) : val(rhs.val) {} - - KOKKOS_FUNCTION - floating_point_wrapper& operator=(const floating_point_wrapper& 
rhs) { - val = rhs.val; - return *this; - } -#else - KOKKOS_DEFAULTED_FUNCTION - floating_point_wrapper(const floating_point_wrapper&) noexcept = default; - - KOKKOS_DEFAULTED_FUNCTION - floating_point_wrapper& operator=(const floating_point_wrapper&) noexcept = - default; -#endif - - KOKKOS_INLINE_FUNCTION - floating_point_wrapper(const volatile floating_point_wrapper& rhs) { -#if defined(KOKKOS_HALF_IS_FULL_TYPE_ON_ARCH) && !defined(KOKKOS_ENABLE_SYCL) - val = rhs.val; -#else - const volatile fixed_width_integer_type* rv_ptr = - reinterpret_cast(&rhs.val); - const fixed_width_integer_type rv_val = *rv_ptr; - val = reinterpret_cast(rv_val); -#endif // KOKKOS_HALF_IS_FULL_TYPE_ON_ARCH - } - - // Don't support implicit conversion back to impl_type. - // impl_type is a storage only type on host. - KOKKOS_FUNCTION - explicit operator impl_type() const { return val; } - KOKKOS_FUNCTION - explicit operator float() const { return cast_from_wrapper(*this); } - KOKKOS_FUNCTION - explicit operator bool() const { return cast_from_wrapper(*this); } - KOKKOS_FUNCTION - explicit operator double() const { return cast_from_wrapper(*this); } - KOKKOS_FUNCTION - explicit operator short() const { return cast_from_wrapper(*this); } - KOKKOS_FUNCTION - explicit operator int() const { return cast_from_wrapper(*this); } - KOKKOS_FUNCTION - explicit operator long() const { return cast_from_wrapper(*this); } - KOKKOS_FUNCTION - explicit operator long long() const { - return cast_from_wrapper(*this); - } - KOKKOS_FUNCTION - explicit operator unsigned short() const { - return cast_from_wrapper(*this); - } - KOKKOS_FUNCTION - explicit operator unsigned int() const { - return cast_from_wrapper(*this); - } - KOKKOS_FUNCTION - explicit operator unsigned long() const { - return cast_from_wrapper(*this); - } - KOKKOS_FUNCTION - explicit operator unsigned long long() const { - return cast_from_wrapper(*this); - } - - /** - * Conversion constructors. 
- * - * Support implicit conversions from impl_type, float, double -> - * floating_point_wrapper. Mixed precision expressions require upcasting which - * is done in the - * "// Binary Arithmetic" operator overloads below. - * - * Support implicit conversions from integral types -> floating_point_wrapper. - * Expressions involving floating_point_wrapper with integral types require - * downcasting the integral types to floating_point_wrapper. Existing operator - * overloads can handle this with the addition of the below implicit - * conversion constructors. - */ - KOKKOS_FUNCTION - constexpr floating_point_wrapper(impl_type rhs) : val(rhs) {} - KOKKOS_FUNCTION - floating_point_wrapper(float rhs) : val(cast_to_wrapper(rhs, val).val) {} - KOKKOS_FUNCTION - floating_point_wrapper(double rhs) : val(cast_to_wrapper(rhs, val).val) {} - KOKKOS_FUNCTION - explicit floating_point_wrapper(bool rhs) - : val(cast_to_wrapper(rhs, val).val) {} - KOKKOS_FUNCTION - floating_point_wrapper(short rhs) : val(cast_to_wrapper(rhs, val).val) {} - KOKKOS_FUNCTION - floating_point_wrapper(int rhs) : val(cast_to_wrapper(rhs, val).val) {} - KOKKOS_FUNCTION - floating_point_wrapper(long rhs) : val(cast_to_wrapper(rhs, val).val) {} - KOKKOS_FUNCTION - floating_point_wrapper(long long rhs) : val(cast_to_wrapper(rhs, val).val) {} - KOKKOS_FUNCTION - floating_point_wrapper(unsigned short rhs) - : val(cast_to_wrapper(rhs, val).val) {} - KOKKOS_FUNCTION - floating_point_wrapper(unsigned int rhs) - : val(cast_to_wrapper(rhs, val).val) {} - KOKKOS_FUNCTION - floating_point_wrapper(unsigned long rhs) - : val(cast_to_wrapper(rhs, val).val) {} - KOKKOS_FUNCTION - floating_point_wrapper(unsigned long long rhs) - : val(cast_to_wrapper(rhs, val).val) {} - - // Unary operators - KOKKOS_FUNCTION - floating_point_wrapper operator+() const { - floating_point_wrapper tmp = *this; -#ifdef KOKKOS_HALF_IS_FULL_TYPE_ON_ARCH - tmp.val = +tmp.val; -#else - tmp.val = cast_to_wrapper(+cast_from_wrapper(tmp), val).val; 
-#endif - return tmp; - } - - KOKKOS_FUNCTION - floating_point_wrapper operator-() const { - floating_point_wrapper tmp = *this; -#ifdef KOKKOS_HALF_IS_FULL_TYPE_ON_ARCH - tmp.val = -tmp.val; -#else - tmp.val = cast_to_wrapper(-cast_from_wrapper(tmp), val).val; -#endif - return tmp; - } - - // Prefix operators - KOKKOS_FUNCTION - floating_point_wrapper& operator++() { -#ifdef KOKKOS_HALF_IS_FULL_TYPE_ON_ARCH - val = val + impl_type(1.0F); // cuda has no operator++ for __nv_bfloat -#else - float tmp = cast_from_wrapper(*this); - ++tmp; - val = cast_to_wrapper(tmp, val).val; -#endif - return *this; - } - - KOKKOS_FUNCTION - floating_point_wrapper& operator--() { -#ifdef KOKKOS_HALF_IS_FULL_TYPE_ON_ARCH - val = val - impl_type(1.0F); // cuda has no operator-- for __nv_bfloat -#else - float tmp = cast_from_wrapper(*this); - --tmp; - val = cast_to_wrapper(tmp, val).val; -#endif - return *this; - } - - // Postfix operators - KOKKOS_FUNCTION - floating_point_wrapper operator++(int) { - floating_point_wrapper tmp = *this; - operator++(); - return tmp; - } - - KOKKOS_FUNCTION - floating_point_wrapper operator--(int) { - floating_point_wrapper tmp = *this; - operator--(); - return tmp; - } - - // Binary operators - KOKKOS_FUNCTION - floating_point_wrapper& operator=(impl_type rhs) { - val = rhs; - return *this; - } - - template - KOKKOS_FUNCTION floating_point_wrapper& operator=(T rhs) { - val = cast_to_wrapper(rhs, val).val; - return *this; - } - - template - KOKKOS_FUNCTION void operator=(T rhs) volatile { - impl_type new_val = cast_to_wrapper(rhs, val).val; - volatile fixed_width_integer_type* val_ptr = - reinterpret_cast( - const_cast(&val)); - *val_ptr = reinterpret_cast(new_val); - } - - // Compound operators - KOKKOS_FUNCTION - floating_point_wrapper& operator+=(floating_point_wrapper rhs) { -#ifdef KOKKOS_HALF_IS_FULL_TYPE_ON_ARCH - val = val + rhs.val; // cuda has no operator+= for __nv_bfloat -#else - val = cast_to_wrapper( - cast_from_wrapper(*this) + 
cast_from_wrapper(rhs), - val) - .val; -#endif - return *this; - } - - KOKKOS_FUNCTION - void operator+=(const volatile floating_point_wrapper& rhs) volatile { - floating_point_wrapper tmp_rhs = rhs; - floating_point_wrapper tmp_lhs = *this; - - tmp_lhs += tmp_rhs; - *this = tmp_lhs; - } - - // Compound operators: upcast overloads for += - template - KOKKOS_FUNCTION friend std::enable_if_t< - std::is_same::value || std::is_same::value, T> - operator+=(T& lhs, floating_point_wrapper rhs) { - lhs += static_cast(rhs); - return lhs; - } - - KOKKOS_FUNCTION - floating_point_wrapper& operator+=(float rhs) { - float result = static_cast(val) + rhs; - val = static_cast(result); - return *this; - } - - KOKKOS_FUNCTION - floating_point_wrapper& operator+=(double rhs) { - double result = static_cast(val) + rhs; - val = static_cast(result); - return *this; - } - - KOKKOS_FUNCTION - floating_point_wrapper& operator-=(floating_point_wrapper rhs) { -#ifdef KOKKOS_HALF_IS_FULL_TYPE_ON_ARCH - val = val - rhs.val; // cuda has no operator-= for __nv_bfloat -#else - val = cast_to_wrapper( - cast_from_wrapper(*this) - cast_from_wrapper(rhs), - val) - .val; -#endif - return *this; - } - - KOKKOS_FUNCTION - void operator-=(const volatile floating_point_wrapper& rhs) volatile { - floating_point_wrapper tmp_rhs = rhs; - floating_point_wrapper tmp_lhs = *this; - - tmp_lhs -= tmp_rhs; - *this = tmp_lhs; - } - - // Compund operators: upcast overloads for -= - template - KOKKOS_FUNCTION friend std::enable_if_t< - std::is_same::value || std::is_same::value, T> - operator-=(T& lhs, floating_point_wrapper rhs) { - lhs -= static_cast(rhs); - return lhs; - } - - KOKKOS_FUNCTION - floating_point_wrapper& operator-=(float rhs) { - float result = static_cast(val) - rhs; - val = static_cast(result); - return *this; - } - - KOKKOS_FUNCTION - floating_point_wrapper& operator-=(double rhs) { - double result = static_cast(val) - rhs; - val = static_cast(result); - return *this; - } - - KOKKOS_FUNCTION - 
floating_point_wrapper& operator*=(floating_point_wrapper rhs) { -#ifdef KOKKOS_HALF_IS_FULL_TYPE_ON_ARCH - val = val * rhs.val; // cuda has no operator*= for __nv_bfloat -#else - val = cast_to_wrapper( - cast_from_wrapper(*this) * cast_from_wrapper(rhs), - val) - .val; -#endif - return *this; - } - - KOKKOS_FUNCTION - void operator*=(const volatile floating_point_wrapper& rhs) volatile { - floating_point_wrapper tmp_rhs = rhs; - floating_point_wrapper tmp_lhs = *this; - - tmp_lhs *= tmp_rhs; - *this = tmp_lhs; - } - - // Compund operators: upcast overloads for *= - template - KOKKOS_FUNCTION friend std::enable_if_t< - std::is_same::value || std::is_same::value, T> - operator*=(T& lhs, floating_point_wrapper rhs) { - lhs *= static_cast(rhs); - return lhs; - } - - KOKKOS_FUNCTION - floating_point_wrapper& operator*=(float rhs) { - float result = static_cast(val) * rhs; - val = static_cast(result); - return *this; - } - - KOKKOS_FUNCTION - floating_point_wrapper& operator*=(double rhs) { - double result = static_cast(val) * rhs; - val = static_cast(result); - return *this; - } - - KOKKOS_FUNCTION - floating_point_wrapper& operator/=(floating_point_wrapper rhs) { -#ifdef KOKKOS_HALF_IS_FULL_TYPE_ON_ARCH - val = val / rhs.val; // cuda has no operator/= for __nv_bfloat -#else - val = cast_to_wrapper( - cast_from_wrapper(*this) / cast_from_wrapper(rhs), - val) - .val; -#endif - return *this; - } - - KOKKOS_FUNCTION - void operator/=(const volatile floating_point_wrapper& rhs) volatile { - floating_point_wrapper tmp_rhs = rhs; - floating_point_wrapper tmp_lhs = *this; - - tmp_lhs /= tmp_rhs; - *this = tmp_lhs; - } - - // Compund operators: upcast overloads for /= - template - KOKKOS_FUNCTION friend std::enable_if_t< - std::is_same::value || std::is_same::value, T> - operator/=(T& lhs, floating_point_wrapper rhs) { - lhs /= static_cast(rhs); - return lhs; - } - - KOKKOS_FUNCTION - floating_point_wrapper& operator/=(float rhs) { - float result = static_cast(val) / rhs; - 
val = static_cast(result); - return *this; - } - - KOKKOS_FUNCTION - floating_point_wrapper& operator/=(double rhs) { - double result = static_cast(val) / rhs; - val = static_cast(result); - return *this; - } - - // Binary Arithmetic - KOKKOS_FUNCTION - friend floating_point_wrapper operator+(floating_point_wrapper lhs, - floating_point_wrapper rhs) { -#ifdef KOKKOS_HALF_IS_FULL_TYPE_ON_ARCH - lhs += rhs; -#else - lhs.val = cast_to_wrapper( - cast_from_wrapper(lhs) + cast_from_wrapper(rhs), - lhs.val) - .val; -#endif - return lhs; - } - - // Binary Arithmetic upcast operators for + - template - KOKKOS_FUNCTION friend std::enable_if_t< - std::is_same::value || std::is_same::value, T> - operator+(floating_point_wrapper lhs, T rhs) { - return T(lhs) + rhs; - } - - template - KOKKOS_FUNCTION friend std::enable_if_t< - std::is_same::value || std::is_same::value, T> - operator+(T lhs, floating_point_wrapper rhs) { - return lhs + T(rhs); - } - - KOKKOS_FUNCTION - friend floating_point_wrapper operator-(floating_point_wrapper lhs, - floating_point_wrapper rhs) { -#ifdef KOKKOS_HALF_IS_FULL_TYPE_ON_ARCH - lhs -= rhs; -#else - lhs.val = cast_to_wrapper( - cast_from_wrapper(lhs) - cast_from_wrapper(rhs), - lhs.val) - .val; -#endif - return lhs; - } - - // Binary Arithmetic upcast operators for - - template - KOKKOS_FUNCTION friend std::enable_if_t< - std::is_same::value || std::is_same::value, T> - operator-(floating_point_wrapper lhs, T rhs) { - return T(lhs) - rhs; - } - - template - KOKKOS_FUNCTION friend std::enable_if_t< - std::is_same::value || std::is_same::value, T> - operator-(T lhs, floating_point_wrapper rhs) { - return lhs - T(rhs); - } - - KOKKOS_FUNCTION - friend floating_point_wrapper operator*(floating_point_wrapper lhs, - floating_point_wrapper rhs) { -#ifdef KOKKOS_HALF_IS_FULL_TYPE_ON_ARCH - lhs *= rhs; -#else - lhs.val = cast_to_wrapper( - cast_from_wrapper(lhs) * cast_from_wrapper(rhs), - lhs.val) - .val; -#endif - return lhs; - } - - // Binary Arithmetic 
upcast operators for * - template - KOKKOS_FUNCTION friend std::enable_if_t< - std::is_same::value || std::is_same::value, T> - operator*(floating_point_wrapper lhs, T rhs) { - return T(lhs) * rhs; - } - - template - KOKKOS_FUNCTION friend std::enable_if_t< - std::is_same::value || std::is_same::value, T> - operator*(T lhs, floating_point_wrapper rhs) { - return lhs * T(rhs); - } - - KOKKOS_FUNCTION - friend floating_point_wrapper operator/(floating_point_wrapper lhs, - floating_point_wrapper rhs) { -#ifdef KOKKOS_HALF_IS_FULL_TYPE_ON_ARCH - lhs /= rhs; -#else - lhs.val = cast_to_wrapper( - cast_from_wrapper(lhs) / cast_from_wrapper(rhs), - lhs.val) - .val; -#endif - return lhs; - } - - // Binary Arithmetic upcast operators for / - template - KOKKOS_FUNCTION friend std::enable_if_t< - std::is_same::value || std::is_same::value, T> - operator/(floating_point_wrapper lhs, T rhs) { - return T(lhs) / rhs; - } - - template - KOKKOS_FUNCTION friend std::enable_if_t< - std::is_same::value || std::is_same::value, T> - operator/(T lhs, floating_point_wrapper rhs) { - return lhs / T(rhs); - } - - // Logical operators - KOKKOS_FUNCTION - bool operator!() const { -#ifdef KOKKOS_HALF_IS_FULL_TYPE_ON_ARCH - return static_cast(!val); -#else - return !cast_from_wrapper(*this); -#endif - } - - // NOTE: Loses short-circuit evaluation - KOKKOS_FUNCTION - bool operator&&(floating_point_wrapper rhs) const { -#ifdef KOKKOS_HALF_IS_FULL_TYPE_ON_ARCH - return static_cast(val && rhs.val); -#else - return cast_from_wrapper(*this) && cast_from_wrapper(rhs); -#endif - } - - // NOTE: Loses short-circuit evaluation - KOKKOS_FUNCTION - bool operator||(floating_point_wrapper rhs) const { -#ifdef KOKKOS_HALF_IS_FULL_TYPE_ON_ARCH - return static_cast(val || rhs.val); -#else - return cast_from_wrapper(*this) || cast_from_wrapper(rhs); -#endif - } - - // Comparison operators - KOKKOS_FUNCTION - bool operator==(floating_point_wrapper rhs) const { -#ifdef KOKKOS_HALF_IS_FULL_TYPE_ON_ARCH - return 
static_cast(val == rhs.val); -#else - return cast_from_wrapper(*this) == cast_from_wrapper(rhs); -#endif - } - - KOKKOS_FUNCTION - bool operator!=(floating_point_wrapper rhs) const { -#ifdef KOKKOS_HALF_IS_FULL_TYPE_ON_ARCH - return static_cast(val != rhs.val); -#else - return cast_from_wrapper(*this) != cast_from_wrapper(rhs); -#endif - } - - KOKKOS_FUNCTION - bool operator<(floating_point_wrapper rhs) const { -#ifdef KOKKOS_HALF_IS_FULL_TYPE_ON_ARCH - return static_cast(val < rhs.val); -#else - return cast_from_wrapper(*this) < cast_from_wrapper(rhs); -#endif - } - - KOKKOS_FUNCTION - bool operator>(floating_point_wrapper rhs) const { -#ifdef KOKKOS_HALF_IS_FULL_TYPE_ON_ARCH - return static_cast(val > rhs.val); -#else - return cast_from_wrapper(*this) > cast_from_wrapper(rhs); -#endif - } - - KOKKOS_FUNCTION - bool operator<=(floating_point_wrapper rhs) const { -#ifdef KOKKOS_HALF_IS_FULL_TYPE_ON_ARCH - return static_cast(val <= rhs.val); -#else - return cast_from_wrapper(*this) <= cast_from_wrapper(rhs); -#endif - } - - KOKKOS_FUNCTION - bool operator>=(floating_point_wrapper rhs) const { -#ifdef KOKKOS_HALF_IS_FULL_TYPE_ON_ARCH - return static_cast(val >= rhs.val); -#else - return cast_from_wrapper(*this) >= cast_from_wrapper(rhs); -#endif - } - - KOKKOS_FUNCTION - friend bool operator==(const volatile floating_point_wrapper& lhs, - const volatile floating_point_wrapper& rhs) { - floating_point_wrapper tmp_lhs = lhs, tmp_rhs = rhs; - return tmp_lhs == tmp_rhs; - } - - KOKKOS_FUNCTION - friend bool operator!=(const volatile floating_point_wrapper& lhs, - const volatile floating_point_wrapper& rhs) { - floating_point_wrapper tmp_lhs = lhs, tmp_rhs = rhs; - return tmp_lhs != tmp_rhs; - } - - KOKKOS_FUNCTION - friend bool operator<(const volatile floating_point_wrapper& lhs, - const volatile floating_point_wrapper& rhs) { - floating_point_wrapper tmp_lhs = lhs, tmp_rhs = rhs; - return tmp_lhs < tmp_rhs; - } - - KOKKOS_FUNCTION - friend bool operator>(const volatile 
floating_point_wrapper& lhs, - const volatile floating_point_wrapper& rhs) { - floating_point_wrapper tmp_lhs = lhs, tmp_rhs = rhs; - return tmp_lhs > tmp_rhs; - } - - KOKKOS_FUNCTION - friend bool operator<=(const volatile floating_point_wrapper& lhs, - const volatile floating_point_wrapper& rhs) { - floating_point_wrapper tmp_lhs = lhs, tmp_rhs = rhs; - return tmp_lhs <= tmp_rhs; - } - - KOKKOS_FUNCTION - friend bool operator>=(const volatile floating_point_wrapper& lhs, - const volatile floating_point_wrapper& rhs) { - floating_point_wrapper tmp_lhs = lhs, tmp_rhs = rhs; - return tmp_lhs >= tmp_rhs; - } - - // Insertion and extraction operators - friend std::ostream& operator<<(std::ostream& os, - const floating_point_wrapper& x) { - const std::string out = std::to_string(static_cast(x)); - os << out; - return os; - } - - friend std::istream& operator>>(std::istream& is, floating_point_wrapper& x) { - std::string in; - is >> in; - x = std::stod(in); - return is; - } -}; -} // namespace Impl - -// Declare wrapper overloads now that floating_point_wrapper is declared -template -static KOKKOS_INLINE_FUNCTION Kokkos::Experimental::half_t cast_to_wrapper( - T x, const volatile Kokkos::Impl::half_impl_t::type&) { - return Kokkos::Experimental::cast_to_half(x); -} - -#ifdef KOKKOS_IMPL_BHALF_TYPE_DEFINED -template -static KOKKOS_INLINE_FUNCTION Kokkos::Experimental::bhalf_t cast_to_wrapper( - T x, const volatile Kokkos::Impl::bhalf_impl_t::type&) { - return Kokkos::Experimental::cast_to_bhalf(x); -} -#endif // KOKKOS_IMPL_BHALF_TYPE_DEFINED - -template -static KOKKOS_INLINE_FUNCTION T -cast_from_wrapper(const Kokkos::Experimental::half_t& x) { - return Kokkos::Experimental::cast_from_half(x); -} - -#ifdef KOKKOS_IMPL_BHALF_TYPE_DEFINED -template -static KOKKOS_INLINE_FUNCTION T -cast_from_wrapper(const Kokkos::Experimental::bhalf_t& x) { - return Kokkos::Experimental::cast_from_bhalf(x); -} -#endif // KOKKOS_IMPL_BHALF_TYPE_DEFINED - -} // namespace Experimental -} // 
namespace Kokkos - -#endif // KOKKOS_IMPL_HALF_TYPE_DEFINED - -// If none of the above actually did anything and defined a half precision type -// define a fallback implementation here using float -#ifndef KOKKOS_IMPL_HALF_TYPE_DEFINED -#define KOKKOS_IMPL_HALF_TYPE_DEFINED -#define KOKKOS_HALF_T_IS_FLOAT true -namespace Kokkos { -namespace Impl { -struct half_impl_t { - using type = float; -}; -} // namespace Impl -namespace Experimental { - -using half_t = Kokkos::Impl::half_impl_t::type; - -// cast_to_half -KOKKOS_INLINE_FUNCTION -half_t cast_to_half(float val) { return half_t(val); } -KOKKOS_INLINE_FUNCTION -half_t cast_to_half(bool val) { return half_t(val); } -KOKKOS_INLINE_FUNCTION -half_t cast_to_half(double val) { return half_t(val); } -KOKKOS_INLINE_FUNCTION -half_t cast_to_half(short val) { return half_t(val); } -KOKKOS_INLINE_FUNCTION -half_t cast_to_half(unsigned short val) { return half_t(val); } -KOKKOS_INLINE_FUNCTION -half_t cast_to_half(int val) { return half_t(val); } -KOKKOS_INLINE_FUNCTION -half_t cast_to_half(unsigned int val) { return half_t(val); } -KOKKOS_INLINE_FUNCTION -half_t cast_to_half(long val) { return half_t(val); } -KOKKOS_INLINE_FUNCTION -half_t cast_to_half(unsigned long val) { return half_t(val); } -KOKKOS_INLINE_FUNCTION -half_t cast_to_half(long long val) { return half_t(val); } -KOKKOS_INLINE_FUNCTION -half_t cast_to_half(unsigned long long val) { return half_t(val); } - -// cast_from_half -// Using an explicit list here too, since the other ones are explicit and for -// example don't include char -template -KOKKOS_INLINE_FUNCTION std::enable_if_t< - std::is_same::value || std::is_same::value || - std::is_same::value || std::is_same::value || - std::is_same::value || std::is_same::value || - std::is_same::value || std::is_same::value || - std::is_same::value || - std::is_same::value || - std::is_same::value, - T> -cast_from_half(half_t val) { - return T(val); -} - -} // namespace Experimental -} // namespace Kokkos - -#else 
-#define KOKKOS_HALF_T_IS_FLOAT false -#endif // KOKKOS_IMPL_HALF_TYPE_DEFINED - -#ifndef KOKKOS_IMPL_BHALF_TYPE_DEFINED -#define KOKKOS_IMPL_BHALF_TYPE_DEFINED -#define KOKKOS_BHALF_T_IS_FLOAT true -namespace Kokkos { -namespace Impl { -struct bhalf_impl_t { - using type = float; -}; -} // namespace Impl - -namespace Experimental { - -using bhalf_t = Kokkos::Impl::bhalf_impl_t::type; - -// cast_to_bhalf -KOKKOS_INLINE_FUNCTION -bhalf_t cast_to_bhalf(float val) { return bhalf_t(val); } -KOKKOS_INLINE_FUNCTION -bhalf_t cast_to_bhalf(bool val) { return bhalf_t(val); } -KOKKOS_INLINE_FUNCTION -bhalf_t cast_to_bhalf(double val) { return bhalf_t(val); } -KOKKOS_INLINE_FUNCTION -bhalf_t cast_to_bhalf(short val) { return bhalf_t(val); } -KOKKOS_INLINE_FUNCTION -bhalf_t cast_to_bhalf(unsigned short val) { return bhalf_t(val); } -KOKKOS_INLINE_FUNCTION -bhalf_t cast_to_bhalf(int val) { return bhalf_t(val); } -KOKKOS_INLINE_FUNCTION -bhalf_t cast_to_bhalf(unsigned int val) { return bhalf_t(val); } -KOKKOS_INLINE_FUNCTION -bhalf_t cast_to_bhalf(long val) { return bhalf_t(val); } -KOKKOS_INLINE_FUNCTION -bhalf_t cast_to_bhalf(unsigned long val) { return bhalf_t(val); } -KOKKOS_INLINE_FUNCTION -bhalf_t cast_to_bhalf(long long val) { return bhalf_t(val); } -KOKKOS_INLINE_FUNCTION -bhalf_t cast_to_bhalf(unsigned long long val) { return bhalf_t(val); } - -// cast_from_bhalf -template -KOKKOS_INLINE_FUNCTION std::enable_if_t< - std::is_same::value || std::is_same::value || - std::is_same::value || std::is_same::value || - std::is_same::value || std::is_same::value || - std::is_same::value || std::is_same::value || - std::is_same::value || - std::is_same::value || - std::is_same::value, - T> -cast_from_bhalf(bhalf_t val) { - return T(val); -} -} // namespace Experimental -} // namespace Kokkos -#else -#define KOKKOS_BHALF_T_IS_FLOAT false -#endif // KOKKOS_IMPL_BHALF_TYPE_DEFINED - +#include #include #ifdef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_HALF diff --git 
a/core/src/impl/Kokkos_Half_FloatingPointWrapper.hpp b/core/src/impl/Kokkos_Half_FloatingPointWrapper.hpp new file mode 100644 index 0000000000..7bf315de17 --- /dev/null +++ b/core/src/impl/Kokkos_Half_FloatingPointWrapper.hpp @@ -0,0 +1,1016 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_HALF_FLOATING_POINT_WRAPPER_HPP_ +#define KOKKOS_HALF_FLOATING_POINT_WRAPPER_HPP_ + +#include + +#include +#include // istream & ostream for extraction and insertion ops +#include + +#ifdef KOKKOS_IMPL_HALF_TYPE_DEFINED + +// KOKKOS_HALF_IS_FULL_TYPE_ON_ARCH: A macro to select which +// floating_pointer_wrapper operator paths should be used. 
For CUDA, let the +// compiler conditionally select when device ops are used For SYCL, we have a +// full half type on both host and device +#if defined(__CUDA_ARCH__) || defined(KOKKOS_ENABLE_SYCL) +#define KOKKOS_HALF_IS_FULL_TYPE_ON_ARCH +#endif + +/************************* BEGIN forward declarations *************************/ +namespace Kokkos { +namespace Experimental { +namespace Impl { +template +class floating_point_wrapper; +} + +// Declare half_t (binary16) +using half_t = Kokkos::Experimental::Impl::floating_point_wrapper< + Kokkos::Impl::half_impl_t ::type>; +KOKKOS_INLINE_FUNCTION +half_t cast_to_half(float val); +KOKKOS_INLINE_FUNCTION +half_t cast_to_half(bool val); +KOKKOS_INLINE_FUNCTION +half_t cast_to_half(double val); +KOKKOS_INLINE_FUNCTION +half_t cast_to_half(short val); +KOKKOS_INLINE_FUNCTION +half_t cast_to_half(int val); +KOKKOS_INLINE_FUNCTION +half_t cast_to_half(long val); +KOKKOS_INLINE_FUNCTION +half_t cast_to_half(long long val); +KOKKOS_INLINE_FUNCTION +half_t cast_to_half(unsigned short val); +KOKKOS_INLINE_FUNCTION +half_t cast_to_half(unsigned int val); +KOKKOS_INLINE_FUNCTION +half_t cast_to_half(unsigned long val); +KOKKOS_INLINE_FUNCTION +half_t cast_to_half(unsigned long long val); +KOKKOS_INLINE_FUNCTION +half_t cast_to_half(half_t); + +template +KOKKOS_INLINE_FUNCTION std::enable_if_t::value, T> + cast_from_half(half_t); +template +KOKKOS_INLINE_FUNCTION std::enable_if_t::value, T> + cast_from_half(half_t); +template +KOKKOS_INLINE_FUNCTION std::enable_if_t::value, T> + cast_from_half(half_t); +template +KOKKOS_INLINE_FUNCTION std::enable_if_t::value, T> + cast_from_half(half_t); +template +KOKKOS_INLINE_FUNCTION std::enable_if_t::value, T> + cast_from_half(half_t); +template +KOKKOS_INLINE_FUNCTION std::enable_if_t::value, T> + cast_from_half(half_t); +template +KOKKOS_INLINE_FUNCTION std::enable_if_t::value, T> + cast_from_half(half_t); +template +KOKKOS_INLINE_FUNCTION + std::enable_if_t::value, T> + 
cast_from_half(half_t); +template +KOKKOS_INLINE_FUNCTION std::enable_if_t::value, T> + cast_from_half(half_t); +template +KOKKOS_INLINE_FUNCTION + std::enable_if_t::value, T> + cast_from_half(half_t); +template +KOKKOS_INLINE_FUNCTION + std::enable_if_t::value, T> + cast_from_half(half_t); + +// declare bhalf_t +#ifdef KOKKOS_IMPL_BHALF_TYPE_DEFINED +using bhalf_t = Kokkos::Experimental::Impl::floating_point_wrapper< + Kokkos::Impl ::bhalf_impl_t ::type>; + +KOKKOS_INLINE_FUNCTION +bhalf_t cast_to_bhalf(float val); +KOKKOS_INLINE_FUNCTION +bhalf_t cast_to_bhalf(bool val); +KOKKOS_INLINE_FUNCTION +bhalf_t cast_to_bhalf(double val); +KOKKOS_INLINE_FUNCTION +bhalf_t cast_to_bhalf(short val); +KOKKOS_INLINE_FUNCTION +bhalf_t cast_to_bhalf(int val); +KOKKOS_INLINE_FUNCTION +bhalf_t cast_to_bhalf(long val); +KOKKOS_INLINE_FUNCTION +bhalf_t cast_to_bhalf(long long val); +KOKKOS_INLINE_FUNCTION +bhalf_t cast_to_bhalf(unsigned short val); +KOKKOS_INLINE_FUNCTION +bhalf_t cast_to_bhalf(unsigned int val); +KOKKOS_INLINE_FUNCTION +bhalf_t cast_to_bhalf(unsigned long val); +KOKKOS_INLINE_FUNCTION +bhalf_t cast_to_bhalf(unsigned long long val); +KOKKOS_INLINE_FUNCTION +bhalf_t cast_to_bhalf(bhalf_t val); + +template +KOKKOS_INLINE_FUNCTION std::enable_if_t::value, T> + cast_from_bhalf(bhalf_t); +template +KOKKOS_INLINE_FUNCTION std::enable_if_t::value, T> + cast_from_bhalf(bhalf_t); +template +KOKKOS_INLINE_FUNCTION std::enable_if_t::value, T> + cast_from_bhalf(bhalf_t); +template +KOKKOS_INLINE_FUNCTION std::enable_if_t::value, T> + cast_from_bhalf(bhalf_t); +template +KOKKOS_INLINE_FUNCTION std::enable_if_t::value, T> + cast_from_bhalf(bhalf_t); +template +KOKKOS_INLINE_FUNCTION std::enable_if_t::value, T> + cast_from_bhalf(bhalf_t); +template +KOKKOS_INLINE_FUNCTION std::enable_if_t::value, T> + cast_from_bhalf(bhalf_t); +template +KOKKOS_INLINE_FUNCTION + std::enable_if_t::value, T> + cast_from_bhalf(bhalf_t); +template +KOKKOS_INLINE_FUNCTION std::enable_if_t::value, T> + 
cast_from_bhalf(bhalf_t); +template +KOKKOS_INLINE_FUNCTION + std::enable_if_t::value, T> + cast_from_bhalf(bhalf_t); +template +KOKKOS_INLINE_FUNCTION + std::enable_if_t::value, T> + cast_from_bhalf(bhalf_t); +#endif // KOKKOS_IMPL_BHALF_TYPE_DEFINED + +template +static KOKKOS_INLINE_FUNCTION Kokkos::Experimental::half_t cast_to_wrapper( + T x, const volatile Kokkos::Impl::half_impl_t::type&); + +#ifdef KOKKOS_IMPL_BHALF_TYPE_DEFINED +template +static KOKKOS_INLINE_FUNCTION Kokkos::Experimental::bhalf_t cast_to_wrapper( + T x, const volatile Kokkos::Impl::bhalf_impl_t::type&); +#endif // KOKKOS_IMPL_BHALF_TYPE_DEFINED + +template +static KOKKOS_INLINE_FUNCTION T +cast_from_wrapper(const Kokkos::Experimental::half_t& x); + +#ifdef KOKKOS_IMPL_BHALF_TYPE_DEFINED +template +static KOKKOS_INLINE_FUNCTION T +cast_from_wrapper(const Kokkos::Experimental::bhalf_t& x); +#endif // KOKKOS_IMPL_BHALF_TYPE_DEFINED +/************************** END forward declarations **************************/ + +namespace Impl { +template +class alignas(FloatType) floating_point_wrapper { + public: + using impl_type = FloatType; + + private: + impl_type val; + using fixed_width_integer_type = std::conditional_t< + sizeof(impl_type) == 2, uint16_t, + std::conditional_t< + sizeof(impl_type) == 4, uint32_t, + std::conditional_t>>; + static_assert(!std::is_void::value, + "Invalid impl_type"); + + public: + // In-class initialization and defaulted default constructors not used + // since Cuda supports half precision initialization via the below constructor + KOKKOS_FUNCTION + floating_point_wrapper() : val(0.0F) {} + +// Copy constructors +// Getting "C2580: multiple versions of a defaulted special +// member function are not allowed" with VS 16.11.3 and CUDA 11.4.2 +#if defined(_WIN32) && defined(KOKKOS_ENABLE_CUDA) + KOKKOS_FUNCTION + floating_point_wrapper(const floating_point_wrapper& rhs) : val(rhs.val) {} + + KOKKOS_FUNCTION + floating_point_wrapper& operator=(const floating_point_wrapper& 
rhs) { + val = rhs.val; + return *this; + } +#else + KOKKOS_DEFAULTED_FUNCTION + floating_point_wrapper(const floating_point_wrapper&) noexcept = default; + + KOKKOS_DEFAULTED_FUNCTION + floating_point_wrapper& operator=(const floating_point_wrapper&) noexcept = + default; +#endif + + KOKKOS_INLINE_FUNCTION + floating_point_wrapper(const volatile floating_point_wrapper& rhs) { +#if defined(KOKKOS_HALF_IS_FULL_TYPE_ON_ARCH) && !defined(KOKKOS_ENABLE_SYCL) + val = rhs.val; +#else + const volatile fixed_width_integer_type* rv_ptr = + reinterpret_cast(&rhs.val); + const fixed_width_integer_type rv_val = *rv_ptr; + val = reinterpret_cast(rv_val); +#endif // KOKKOS_HALF_IS_FULL_TYPE_ON_ARCH + } + + // Don't support implicit conversion back to impl_type. + // impl_type is a storage only type on host. + KOKKOS_FUNCTION + explicit operator impl_type() const { return val; } + KOKKOS_FUNCTION + explicit operator float() const { return cast_from_wrapper(*this); } + KOKKOS_FUNCTION + explicit operator bool() const { return cast_from_wrapper(*this); } + KOKKOS_FUNCTION + explicit operator double() const { return cast_from_wrapper(*this); } + KOKKOS_FUNCTION + explicit operator short() const { return cast_from_wrapper(*this); } + KOKKOS_FUNCTION + explicit operator int() const { return cast_from_wrapper(*this); } + KOKKOS_FUNCTION + explicit operator long() const { return cast_from_wrapper(*this); } + KOKKOS_FUNCTION + explicit operator long long() const { + return cast_from_wrapper(*this); + } + KOKKOS_FUNCTION + explicit operator unsigned short() const { + return cast_from_wrapper(*this); + } + KOKKOS_FUNCTION + explicit operator unsigned int() const { + return cast_from_wrapper(*this); + } + KOKKOS_FUNCTION + explicit operator unsigned long() const { + return cast_from_wrapper(*this); + } + KOKKOS_FUNCTION + explicit operator unsigned long long() const { + return cast_from_wrapper(*this); + } + + /** + * Conversion constructors. 
+ * + * Support implicit conversions from impl_type, float, double -> + * floating_point_wrapper. Mixed precision expressions require upcasting which + * is done in the + * "// Binary Arithmetic" operator overloads below. + * + * Support implicit conversions from integral types -> floating_point_wrapper. + * Expressions involving floating_point_wrapper with integral types require + * downcasting the integral types to floating_point_wrapper. Existing operator + * overloads can handle this with the addition of the below implicit + * conversion constructors. + */ + KOKKOS_FUNCTION + constexpr floating_point_wrapper(impl_type rhs) : val(rhs) {} + KOKKOS_FUNCTION + floating_point_wrapper(float rhs) : val(cast_to_wrapper(rhs, val).val) {} + KOKKOS_FUNCTION + floating_point_wrapper(double rhs) : val(cast_to_wrapper(rhs, val).val) {} + KOKKOS_FUNCTION + explicit floating_point_wrapper(bool rhs) + : val(cast_to_wrapper(rhs, val).val) {} + KOKKOS_FUNCTION + floating_point_wrapper(short rhs) : val(cast_to_wrapper(rhs, val).val) {} + KOKKOS_FUNCTION + floating_point_wrapper(int rhs) : val(cast_to_wrapper(rhs, val).val) {} + KOKKOS_FUNCTION + floating_point_wrapper(long rhs) : val(cast_to_wrapper(rhs, val).val) {} + KOKKOS_FUNCTION + floating_point_wrapper(long long rhs) : val(cast_to_wrapper(rhs, val).val) {} + KOKKOS_FUNCTION + floating_point_wrapper(unsigned short rhs) + : val(cast_to_wrapper(rhs, val).val) {} + KOKKOS_FUNCTION + floating_point_wrapper(unsigned int rhs) + : val(cast_to_wrapper(rhs, val).val) {} + KOKKOS_FUNCTION + floating_point_wrapper(unsigned long rhs) + : val(cast_to_wrapper(rhs, val).val) {} + KOKKOS_FUNCTION + floating_point_wrapper(unsigned long long rhs) + : val(cast_to_wrapper(rhs, val).val) {} + + // Unary operators + KOKKOS_FUNCTION + floating_point_wrapper operator+() const { + floating_point_wrapper tmp = *this; +#ifdef KOKKOS_HALF_IS_FULL_TYPE_ON_ARCH + tmp.val = +tmp.val; +#else + tmp.val = cast_to_wrapper(+cast_from_wrapper(tmp), val).val; 
+#endif + return tmp; + } + + KOKKOS_FUNCTION + floating_point_wrapper operator-() const { + floating_point_wrapper tmp = *this; +#ifdef KOKKOS_HALF_IS_FULL_TYPE_ON_ARCH + tmp.val = -tmp.val; +#else + tmp.val = cast_to_wrapper(-cast_from_wrapper(tmp), val).val; +#endif + return tmp; + } + + // Prefix operators + KOKKOS_FUNCTION + floating_point_wrapper& operator++() { +#ifdef KOKKOS_HALF_IS_FULL_TYPE_ON_ARCH + val = val + impl_type(1.0F); // cuda has no operator++ for __nv_bfloat +#else + float tmp = cast_from_wrapper(*this); + ++tmp; + val = cast_to_wrapper(tmp, val).val; +#endif + return *this; + } + + KOKKOS_FUNCTION + floating_point_wrapper& operator--() { +#ifdef KOKKOS_HALF_IS_FULL_TYPE_ON_ARCH + val = val - impl_type(1.0F); // cuda has no operator-- for __nv_bfloat +#else + float tmp = cast_from_wrapper(*this); + --tmp; + val = cast_to_wrapper(tmp, val).val; +#endif + return *this; + } + + // Postfix operators + KOKKOS_FUNCTION + floating_point_wrapper operator++(int) { + floating_point_wrapper tmp = *this; + operator++(); + return tmp; + } + + KOKKOS_FUNCTION + floating_point_wrapper operator--(int) { + floating_point_wrapper tmp = *this; + operator--(); + return tmp; + } + + // Binary operators + KOKKOS_FUNCTION + floating_point_wrapper& operator=(impl_type rhs) { + val = rhs; + return *this; + } + + template + KOKKOS_FUNCTION floating_point_wrapper& operator=(T rhs) { + val = cast_to_wrapper(rhs, val).val; + return *this; + } + + template + KOKKOS_FUNCTION void operator=(T rhs) volatile { + impl_type new_val = cast_to_wrapper(rhs, val).val; + volatile fixed_width_integer_type* val_ptr = + reinterpret_cast( + const_cast(&val)); + *val_ptr = reinterpret_cast(new_val); + } + + // Compound operators + KOKKOS_FUNCTION + floating_point_wrapper& operator+=(floating_point_wrapper rhs) { +#ifdef KOKKOS_HALF_IS_FULL_TYPE_ON_ARCH + val = val + rhs.val; // cuda has no operator+= for __nv_bfloat +#else + val = cast_to_wrapper( + cast_from_wrapper(*this) + 
cast_from_wrapper(rhs), + val) + .val; +#endif + return *this; + } + + KOKKOS_FUNCTION + void operator+=(const volatile floating_point_wrapper& rhs) volatile { + floating_point_wrapper tmp_rhs = rhs; + floating_point_wrapper tmp_lhs = *this; + + tmp_lhs += tmp_rhs; + *this = tmp_lhs; + } + + // Compound operators: upcast overloads for += + template + KOKKOS_FUNCTION friend std::enable_if_t< + std::is_same::value || std::is_same::value, T> + operator+=(T& lhs, floating_point_wrapper rhs) { + lhs += static_cast(rhs); + return lhs; + } + + KOKKOS_FUNCTION + floating_point_wrapper& operator+=(float rhs) { + float result = static_cast(val) + rhs; + val = static_cast(result); + return *this; + } + + KOKKOS_FUNCTION + floating_point_wrapper& operator+=(double rhs) { + double result = static_cast(val) + rhs; + val = static_cast(result); + return *this; + } + + KOKKOS_FUNCTION + floating_point_wrapper& operator-=(floating_point_wrapper rhs) { +#ifdef KOKKOS_HALF_IS_FULL_TYPE_ON_ARCH + val = val - rhs.val; // cuda has no operator-= for __nv_bfloat +#else + val = cast_to_wrapper( + cast_from_wrapper(*this) - cast_from_wrapper(rhs), + val) + .val; +#endif + return *this; + } + + KOKKOS_FUNCTION + void operator-=(const volatile floating_point_wrapper& rhs) volatile { + floating_point_wrapper tmp_rhs = rhs; + floating_point_wrapper tmp_lhs = *this; + + tmp_lhs -= tmp_rhs; + *this = tmp_lhs; + } + + // Compund operators: upcast overloads for -= + template + KOKKOS_FUNCTION friend std::enable_if_t< + std::is_same::value || std::is_same::value, T> + operator-=(T& lhs, floating_point_wrapper rhs) { + lhs -= static_cast(rhs); + return lhs; + } + + KOKKOS_FUNCTION + floating_point_wrapper& operator-=(float rhs) { + float result = static_cast(val) - rhs; + val = static_cast(result); + return *this; + } + + KOKKOS_FUNCTION + floating_point_wrapper& operator-=(double rhs) { + double result = static_cast(val) - rhs; + val = static_cast(result); + return *this; + } + + KOKKOS_FUNCTION + 
floating_point_wrapper& operator*=(floating_point_wrapper rhs) { +#ifdef KOKKOS_HALF_IS_FULL_TYPE_ON_ARCH + val = val * rhs.val; // cuda has no operator*= for __nv_bfloat +#else + val = cast_to_wrapper( + cast_from_wrapper(*this) * cast_from_wrapper(rhs), + val) + .val; +#endif + return *this; + } + + KOKKOS_FUNCTION + void operator*=(const volatile floating_point_wrapper& rhs) volatile { + floating_point_wrapper tmp_rhs = rhs; + floating_point_wrapper tmp_lhs = *this; + + tmp_lhs *= tmp_rhs; + *this = tmp_lhs; + } + + // Compund operators: upcast overloads for *= + template + KOKKOS_FUNCTION friend std::enable_if_t< + std::is_same::value || std::is_same::value, T> + operator*=(T& lhs, floating_point_wrapper rhs) { + lhs *= static_cast(rhs); + return lhs; + } + + KOKKOS_FUNCTION + floating_point_wrapper& operator*=(float rhs) { + float result = static_cast(val) * rhs; + val = static_cast(result); + return *this; + } + + KOKKOS_FUNCTION + floating_point_wrapper& operator*=(double rhs) { + double result = static_cast(val) * rhs; + val = static_cast(result); + return *this; + } + + KOKKOS_FUNCTION + floating_point_wrapper& operator/=(floating_point_wrapper rhs) { +#ifdef KOKKOS_HALF_IS_FULL_TYPE_ON_ARCH + val = val / rhs.val; // cuda has no operator/= for __nv_bfloat +#else + val = cast_to_wrapper( + cast_from_wrapper(*this) / cast_from_wrapper(rhs), + val) + .val; +#endif + return *this; + } + + KOKKOS_FUNCTION + void operator/=(const volatile floating_point_wrapper& rhs) volatile { + floating_point_wrapper tmp_rhs = rhs; + floating_point_wrapper tmp_lhs = *this; + + tmp_lhs /= tmp_rhs; + *this = tmp_lhs; + } + + // Compund operators: upcast overloads for /= + template + KOKKOS_FUNCTION friend std::enable_if_t< + std::is_same::value || std::is_same::value, T> + operator/=(T& lhs, floating_point_wrapper rhs) { + lhs /= static_cast(rhs); + return lhs; + } + + KOKKOS_FUNCTION + floating_point_wrapper& operator/=(float rhs) { + float result = static_cast(val) / rhs; + 
val = static_cast(result); + return *this; + } + + KOKKOS_FUNCTION + floating_point_wrapper& operator/=(double rhs) { + double result = static_cast(val) / rhs; + val = static_cast(result); + return *this; + } + + // Binary Arithmetic + KOKKOS_FUNCTION + friend floating_point_wrapper operator+(floating_point_wrapper lhs, + floating_point_wrapper rhs) { +#ifdef KOKKOS_HALF_IS_FULL_TYPE_ON_ARCH + lhs += rhs; +#else + lhs.val = cast_to_wrapper( + cast_from_wrapper(lhs) + cast_from_wrapper(rhs), + lhs.val) + .val; +#endif + return lhs; + } + + // Binary Arithmetic upcast operators for + + template + KOKKOS_FUNCTION friend std::enable_if_t< + std::is_same::value || std::is_same::value, T> + operator+(floating_point_wrapper lhs, T rhs) { + return T(lhs) + rhs; + } + + template + KOKKOS_FUNCTION friend std::enable_if_t< + std::is_same::value || std::is_same::value, T> + operator+(T lhs, floating_point_wrapper rhs) { + return lhs + T(rhs); + } + + KOKKOS_FUNCTION + friend floating_point_wrapper operator-(floating_point_wrapper lhs, + floating_point_wrapper rhs) { +#ifdef KOKKOS_HALF_IS_FULL_TYPE_ON_ARCH + lhs -= rhs; +#else + lhs.val = cast_to_wrapper( + cast_from_wrapper(lhs) - cast_from_wrapper(rhs), + lhs.val) + .val; +#endif + return lhs; + } + + // Binary Arithmetic upcast operators for - + template + KOKKOS_FUNCTION friend std::enable_if_t< + std::is_same::value || std::is_same::value, T> + operator-(floating_point_wrapper lhs, T rhs) { + return T(lhs) - rhs; + } + + template + KOKKOS_FUNCTION friend std::enable_if_t< + std::is_same::value || std::is_same::value, T> + operator-(T lhs, floating_point_wrapper rhs) { + return lhs - T(rhs); + } + + KOKKOS_FUNCTION + friend floating_point_wrapper operator*(floating_point_wrapper lhs, + floating_point_wrapper rhs) { +#ifdef KOKKOS_HALF_IS_FULL_TYPE_ON_ARCH + lhs *= rhs; +#else + lhs.val = cast_to_wrapper( + cast_from_wrapper(lhs) * cast_from_wrapper(rhs), + lhs.val) + .val; +#endif + return lhs; + } + + // Binary Arithmetic 
upcast operators for * + template + KOKKOS_FUNCTION friend std::enable_if_t< + std::is_same::value || std::is_same::value, T> + operator*(floating_point_wrapper lhs, T rhs) { + return T(lhs) * rhs; + } + + template + KOKKOS_FUNCTION friend std::enable_if_t< + std::is_same::value || std::is_same::value, T> + operator*(T lhs, floating_point_wrapper rhs) { + return lhs * T(rhs); + } + + KOKKOS_FUNCTION + friend floating_point_wrapper operator/(floating_point_wrapper lhs, + floating_point_wrapper rhs) { +#ifdef KOKKOS_HALF_IS_FULL_TYPE_ON_ARCH + lhs /= rhs; +#else + lhs.val = cast_to_wrapper( + cast_from_wrapper(lhs) / cast_from_wrapper(rhs), + lhs.val) + .val; +#endif + return lhs; + } + + // Binary Arithmetic upcast operators for / + template + KOKKOS_FUNCTION friend std::enable_if_t< + std::is_same::value || std::is_same::value, T> + operator/(floating_point_wrapper lhs, T rhs) { + return T(lhs) / rhs; + } + + template + KOKKOS_FUNCTION friend std::enable_if_t< + std::is_same::value || std::is_same::value, T> + operator/(T lhs, floating_point_wrapper rhs) { + return lhs / T(rhs); + } + + // Logical operators + KOKKOS_FUNCTION + bool operator!() const { +#ifdef KOKKOS_HALF_IS_FULL_TYPE_ON_ARCH + return static_cast(!val); +#else + return !cast_from_wrapper(*this); +#endif + } + + // NOTE: Loses short-circuit evaluation + KOKKOS_FUNCTION + bool operator&&(floating_point_wrapper rhs) const { +#ifdef KOKKOS_HALF_IS_FULL_TYPE_ON_ARCH + return static_cast(val && rhs.val); +#else + return cast_from_wrapper(*this) && cast_from_wrapper(rhs); +#endif + } + + // NOTE: Loses short-circuit evaluation + KOKKOS_FUNCTION + bool operator||(floating_point_wrapper rhs) const { +#ifdef KOKKOS_HALF_IS_FULL_TYPE_ON_ARCH + return static_cast(val || rhs.val); +#else + return cast_from_wrapper(*this) || cast_from_wrapper(rhs); +#endif + } + + // Comparison operators + KOKKOS_FUNCTION + bool operator==(floating_point_wrapper rhs) const { +#ifdef KOKKOS_HALF_IS_FULL_TYPE_ON_ARCH + return 
static_cast(val == rhs.val); +#else + return cast_from_wrapper(*this) == cast_from_wrapper(rhs); +#endif + } + + KOKKOS_FUNCTION + bool operator!=(floating_point_wrapper rhs) const { +#ifdef KOKKOS_HALF_IS_FULL_TYPE_ON_ARCH + return static_cast(val != rhs.val); +#else + return cast_from_wrapper(*this) != cast_from_wrapper(rhs); +#endif + } + + KOKKOS_FUNCTION + bool operator<(floating_point_wrapper rhs) const { +#ifdef KOKKOS_HALF_IS_FULL_TYPE_ON_ARCH + return static_cast(val < rhs.val); +#else + return cast_from_wrapper(*this) < cast_from_wrapper(rhs); +#endif + } + + KOKKOS_FUNCTION + bool operator>(floating_point_wrapper rhs) const { +#ifdef KOKKOS_HALF_IS_FULL_TYPE_ON_ARCH + return static_cast(val > rhs.val); +#else + return cast_from_wrapper(*this) > cast_from_wrapper(rhs); +#endif + } + + KOKKOS_FUNCTION + bool operator<=(floating_point_wrapper rhs) const { +#ifdef KOKKOS_HALF_IS_FULL_TYPE_ON_ARCH + return static_cast(val <= rhs.val); +#else + return cast_from_wrapper(*this) <= cast_from_wrapper(rhs); +#endif + } + + KOKKOS_FUNCTION + bool operator>=(floating_point_wrapper rhs) const { +#ifdef KOKKOS_HALF_IS_FULL_TYPE_ON_ARCH + return static_cast(val >= rhs.val); +#else + return cast_from_wrapper(*this) >= cast_from_wrapper(rhs); +#endif + } + + KOKKOS_FUNCTION + friend bool operator==(const volatile floating_point_wrapper& lhs, + const volatile floating_point_wrapper& rhs) { + floating_point_wrapper tmp_lhs = lhs, tmp_rhs = rhs; + return tmp_lhs == tmp_rhs; + } + + KOKKOS_FUNCTION + friend bool operator!=(const volatile floating_point_wrapper& lhs, + const volatile floating_point_wrapper& rhs) { + floating_point_wrapper tmp_lhs = lhs, tmp_rhs = rhs; + return tmp_lhs != tmp_rhs; + } + + KOKKOS_FUNCTION + friend bool operator<(const volatile floating_point_wrapper& lhs, + const volatile floating_point_wrapper& rhs) { + floating_point_wrapper tmp_lhs = lhs, tmp_rhs = rhs; + return tmp_lhs < tmp_rhs; + } + + KOKKOS_FUNCTION + friend bool operator>(const volatile 
floating_point_wrapper& lhs, + const volatile floating_point_wrapper& rhs) { + floating_point_wrapper tmp_lhs = lhs, tmp_rhs = rhs; + return tmp_lhs > tmp_rhs; + } + + KOKKOS_FUNCTION + friend bool operator<=(const volatile floating_point_wrapper& lhs, + const volatile floating_point_wrapper& rhs) { + floating_point_wrapper tmp_lhs = lhs, tmp_rhs = rhs; + return tmp_lhs <= tmp_rhs; + } + + KOKKOS_FUNCTION + friend bool operator>=(const volatile floating_point_wrapper& lhs, + const volatile floating_point_wrapper& rhs) { + floating_point_wrapper tmp_lhs = lhs, tmp_rhs = rhs; + return tmp_lhs >= tmp_rhs; + } + + // Insertion and extraction operators + friend std::ostream& operator<<(std::ostream& os, + const floating_point_wrapper& x) { + const std::string out = std::to_string(static_cast(x)); + os << out; + return os; + } + + friend std::istream& operator>>(std::istream& is, floating_point_wrapper& x) { + std::string in; + is >> in; + x = std::stod(in); + return is; + } +}; +} // namespace Impl + +// Declare wrapper overloads now that floating_point_wrapper is declared +template +static KOKKOS_INLINE_FUNCTION Kokkos::Experimental::half_t cast_to_wrapper( + T x, const volatile Kokkos::Impl::half_impl_t::type&) { + return Kokkos::Experimental::cast_to_half(x); +} + +#ifdef KOKKOS_IMPL_BHALF_TYPE_DEFINED +template +static KOKKOS_INLINE_FUNCTION Kokkos::Experimental::bhalf_t cast_to_wrapper( + T x, const volatile Kokkos::Impl::bhalf_impl_t::type&) { + return Kokkos::Experimental::cast_to_bhalf(x); +} +#endif // KOKKOS_IMPL_BHALF_TYPE_DEFINED + +template +static KOKKOS_INLINE_FUNCTION T +cast_from_wrapper(const Kokkos::Experimental::half_t& x) { + return Kokkos::Experimental::cast_from_half(x); +} + +#ifdef KOKKOS_IMPL_BHALF_TYPE_DEFINED +template +static KOKKOS_INLINE_FUNCTION T +cast_from_wrapper(const Kokkos::Experimental::bhalf_t& x) { + return Kokkos::Experimental::cast_from_bhalf(x); +} +#endif // KOKKOS_IMPL_BHALF_TYPE_DEFINED + +} // namespace Experimental +} // 
namespace Kokkos + +#endif // KOKKOS_IMPL_HALF_TYPE_DEFINED + +// If none of the above actually did anything and defined a half precision type +// define a fallback implementation here using float +#ifndef KOKKOS_IMPL_HALF_TYPE_DEFINED +#define KOKKOS_IMPL_HALF_TYPE_DEFINED +#define KOKKOS_HALF_T_IS_FLOAT true +namespace Kokkos { +namespace Impl { +struct half_impl_t { + using type = float; +}; +} // namespace Impl +namespace Experimental { + +using half_t = Kokkos::Impl::half_impl_t::type; + +// cast_to_half +KOKKOS_INLINE_FUNCTION +half_t cast_to_half(float val) { return half_t(val); } +KOKKOS_INLINE_FUNCTION +half_t cast_to_half(bool val) { return half_t(val); } +KOKKOS_INLINE_FUNCTION +half_t cast_to_half(double val) { return half_t(val); } +KOKKOS_INLINE_FUNCTION +half_t cast_to_half(short val) { return half_t(val); } +KOKKOS_INLINE_FUNCTION +half_t cast_to_half(unsigned short val) { return half_t(val); } +KOKKOS_INLINE_FUNCTION +half_t cast_to_half(int val) { return half_t(val); } +KOKKOS_INLINE_FUNCTION +half_t cast_to_half(unsigned int val) { return half_t(val); } +KOKKOS_INLINE_FUNCTION +half_t cast_to_half(long val) { return half_t(val); } +KOKKOS_INLINE_FUNCTION +half_t cast_to_half(unsigned long val) { return half_t(val); } +KOKKOS_INLINE_FUNCTION +half_t cast_to_half(long long val) { return half_t(val); } +KOKKOS_INLINE_FUNCTION +half_t cast_to_half(unsigned long long val) { return half_t(val); } + +// cast_from_half +// Using an explicit list here too, since the other ones are explicit and for +// example don't include char +template +KOKKOS_INLINE_FUNCTION std::enable_if_t< + std::is_same::value || std::is_same::value || + std::is_same::value || std::is_same::value || + std::is_same::value || std::is_same::value || + std::is_same::value || std::is_same::value || + std::is_same::value || + std::is_same::value || + std::is_same::value, + T> +cast_from_half(half_t val) { + return T(val); +} + +} // namespace Experimental +} // namespace Kokkos + +#else 
+#define KOKKOS_HALF_T_IS_FLOAT false +#endif // KOKKOS_IMPL_HALF_TYPE_DEFINED + +#ifndef KOKKOS_IMPL_BHALF_TYPE_DEFINED +#define KOKKOS_IMPL_BHALF_TYPE_DEFINED +#define KOKKOS_BHALF_T_IS_FLOAT true +namespace Kokkos { +namespace Impl { +struct bhalf_impl_t { + using type = float; +}; +} // namespace Impl + +namespace Experimental { + +using bhalf_t = Kokkos::Impl::bhalf_impl_t::type; + +// cast_to_bhalf +KOKKOS_INLINE_FUNCTION +bhalf_t cast_to_bhalf(float val) { return bhalf_t(val); } +KOKKOS_INLINE_FUNCTION +bhalf_t cast_to_bhalf(bool val) { return bhalf_t(val); } +KOKKOS_INLINE_FUNCTION +bhalf_t cast_to_bhalf(double val) { return bhalf_t(val); } +KOKKOS_INLINE_FUNCTION +bhalf_t cast_to_bhalf(short val) { return bhalf_t(val); } +KOKKOS_INLINE_FUNCTION +bhalf_t cast_to_bhalf(unsigned short val) { return bhalf_t(val); } +KOKKOS_INLINE_FUNCTION +bhalf_t cast_to_bhalf(int val) { return bhalf_t(val); } +KOKKOS_INLINE_FUNCTION +bhalf_t cast_to_bhalf(unsigned int val) { return bhalf_t(val); } +KOKKOS_INLINE_FUNCTION +bhalf_t cast_to_bhalf(long val) { return bhalf_t(val); } +KOKKOS_INLINE_FUNCTION +bhalf_t cast_to_bhalf(unsigned long val) { return bhalf_t(val); } +KOKKOS_INLINE_FUNCTION +bhalf_t cast_to_bhalf(long long val) { return bhalf_t(val); } +KOKKOS_INLINE_FUNCTION +bhalf_t cast_to_bhalf(unsigned long long val) { return bhalf_t(val); } + +// cast_from_bhalf +template +KOKKOS_INLINE_FUNCTION std::enable_if_t< + std::is_same::value || std::is_same::value || + std::is_same::value || std::is_same::value || + std::is_same::value || std::is_same::value || + std::is_same::value || std::is_same::value || + std::is_same::value || + std::is_same::value || + std::is_same::value, + T> +cast_from_bhalf(bhalf_t val) { + return T(val); +} +} // namespace Experimental +} // namespace Kokkos +#else +#define KOKKOS_BHALF_T_IS_FLOAT false +#endif // KOKKOS_IMPL_BHALF_TYPE_DEFINED + +#endif // KOKKOS_HALF_FLOATING_POINT_WRAPPER_HPP_ From f8ed850dd042301ff41cbbfe913eb593f9b6803b Mon 
Sep 17 00:00:00 2001 From: Daniel Arndt Date: Tue, 16 May 2023 13:36:00 +0000 Subject: [PATCH 447/496] Disable tests failing with NVHPC --- core/unit_test/TestReducers.hpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/core/unit_test/TestReducers.hpp b/core/unit_test/TestReducers.hpp index b7727e1c09..dd48952c39 100644 --- a/core/unit_test/TestReducers.hpp +++ b/core/unit_test/TestReducers.hpp @@ -1221,14 +1221,20 @@ struct TestReducers { #if !defined(KOKKOS_ENABLE_OPENACC) // FIXME_OPENACC - OpenACC (V3.3) does not support custom reductions. test_minloc(10003); +// FIXME_NVHPC misaligned memory +#if !defined(KOKKOS_COMPILER_NVHPC) test_minloc_2d(100); +#endif #endif test_max(10007); #if !defined(KOKKOS_ENABLE_OPENACC) // FIXME_OPENACC - OpenACC (V3.3) does not support custom reductions. test_maxloc(10007); +// FIXME_NVHPC misaligned memory +#if !defined(KOKKOS_COMPILER_NVHPC) test_maxloc_2d(100); #endif +#endif #if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_COMPILER_CLANG) && \ (KOKKOS_COMPILER_CLANG < 1300) // FIXME_OPENMPTARGET - The minmaxloc test fails llvm <= 13 version. 
From cbc7e88789d6e72a828fd28b0acff1ad7d3e797b Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Tue, 16 May 2023 15:37:07 +0000 Subject: [PATCH 448/496] Fix bit_cast for SYCL again --- core/src/Kokkos_BitManipulation.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/src/Kokkos_BitManipulation.hpp b/core/src/Kokkos_BitManipulation.hpp index 5fd2c1c6f5..e541f179c8 100644 --- a/core/src/Kokkos_BitManipulation.hpp +++ b/core/src/Kokkos_BitManipulation.hpp @@ -103,7 +103,7 @@ namespace Kokkos { // FIXME_SYCL intel/llvm has unqualified calls to bit_cast which are ambiguous // if we declare our own bit_cast function #ifdef KOKKOS_ENABLE_SYCL -using sycl::bit_cast; +using sycl::detail::bit_cast; #else template KOKKOS_FUNCTION std::enable_if_t Date: Tue, 16 May 2023 16:25:20 +0000 Subject: [PATCH 449/496] Disable tests for OpenMPTarget --- core/unit_test/TestReducers.hpp | 58 +++++++++++++++++++++------------ 1 file changed, 38 insertions(+), 20 deletions(-) diff --git a/core/unit_test/TestReducers.hpp b/core/unit_test/TestReducers.hpp index dd48952c39..cb9a4fe280 100644 --- a/core/unit_test/TestReducers.hpp +++ b/core/unit_test/TestReducers.hpp @@ -1188,23 +1188,33 @@ struct TestReducers { test_sum(10001); test_prod(35); test_min(10003); -#if !defined(KOKKOS_ENABLE_OPENACC) - // FIXME_OPENACC - OpenACC (V3.3) does not support custom reductions. +#if !defined(KOKKOS_ENABLE_OPENACC) // FIXME_OPENACC - OpenACC (V3.3) does not + // support custom reductions. test_minloc(10003); +#if !defined(KOKKOS_ENABLE_OPENMPTARGET) // FIXME_OPENMPTARGET requires custom + // reductions. test_minloc_2d(100); +#endif #endif test_max(10007); -#if !defined(KOKKOS_ENABLE_OPENACC) - // FIXME_OPENACC - OpenACC (V3.3) does not support custom reductions. +#if !defined(KOKKOS_ENABLE_OPENACC) // FIXME_OPENACC - OpenACC (V3.3) does not + // support custom reductions. 
test_maxloc(10007); +#if !defined(KOKKOS_ENABLE_OPENMPTARGET) // FIXME_OPENMPTARGET requires custom + // reductions. test_maxloc_2d(100); #endif -#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_COMPILER_CLANG) && \ - (KOKKOS_COMPILER_CLANG < 1300) - // FIXME_OPENMPTARGET - The minmaxloc test fails llvm <= 13 version. +#endif +#if !defined(KOKKOS_ENABLE_OPENACC) // FIXME_OPENACC - OpenACC (V3.3) does not + // support custom reductions. +#if defined(KOKKOS_ENABLE_OPENMPTARGET) +#if defined(KOKKOS_COMPILER_CLANG) && \ + (KOKKOS_COMPILER_CLANG >= \ + 1300) // FIXME_OPENMPTARGET - The minmaxloc test fails llvm < 13 version, + // test_minmaxloc_2d requires custom reductions + test_minmaxloc(10007); +#endif #else -#if !defined(KOKKOS_ENABLE_OPENACC) - // FIXME_OPENACC - OpenACC (V3.3) does not support custom reductions. test_minmaxloc(10007); test_minmaxloc_2d(100); #endif @@ -1218,29 +1228,37 @@ struct TestReducers { test_sum(10001); test_prod(sizeof(Scalar) > 4 ? 35 : 19); // avoid int overflow (see above) test_min(10003); -#if !defined(KOKKOS_ENABLE_OPENACC) - // FIXME_OPENACC - OpenACC (V3.3) does not support custom reductions. +#if !defined(KOKKOS_ENABLE_OPENACC) // FIXME_OPENACC - OpenACC (V3.3) does not + // support custom reductions. test_minloc(10003); -// FIXME_NVHPC misaligned memory -#if !defined(KOKKOS_COMPILER_NVHPC) +#if !defined(KOKKOS_COMPILER_NVHPC) // FIXME_NVHPC misaligned memory +#if !defined(KOKKOS_ENABLE_OPENMPTARGET) // FIXME_OPENMPTARGET requires custom + // reductions. test_minloc_2d(100); #endif +#endif #endif test_max(10007); #if !defined(KOKKOS_ENABLE_OPENACC) // FIXME_OPENACC - OpenACC (V3.3) does not support custom reductions. test_maxloc(10007); -// FIXME_NVHPC misaligned memory -#if !defined(KOKKOS_COMPILER_NVHPC) +#if !defined(KOKKOS_COMPILER_NVHPC) // FIXME_NVHPC misaligned memory +#if !defined(KOKKOS_ENABLE_OPENMPTARGET) // FIXME_OPENMPTARGET requires custom + // reductions. 
test_maxloc_2d(100); #endif #endif -#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_COMPILER_CLANG) && \ - (KOKKOS_COMPILER_CLANG < 1300) - // FIXME_OPENMPTARGET - The minmaxloc test fails llvm <= 13 version. +#endif +#if !defined(KOKKOS_ENABLE_OPENACC) // FIXME_OPENACC - OpenACC (V3.3) does not + // support custom reductions. +#if defined( \ + KOKKOS_ENABLE_OPENMPTARGET) // FIXME_OPENMPTARGET - The minmaxloc test + // fails llvm <= 13 version, the minmaxloc_2d + // test requires custom reductions. +#if defined(KOKKOS_COMPILER_CLANG) && (KOKKOS_COMPILER_CLANG > 1300) + test_minmaxloc(10007); +#endif #else -#if !defined(KOKKOS_ENABLE_OPENACC) - // FIXME_OPENACC - OpenACC (V3.3) does not support custom reductions. test_minmaxloc(10007); test_minmaxloc_2d(100); #endif From e8dba15a41cc0b41e4877dc9ddae91ce60d48655 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Tue, 16 May 2023 14:45:37 -0400 Subject: [PATCH 450/496] Improve indentation of comments --- core/unit_test/TestReducers.hpp | 56 ++++++++++++++++----------------- 1 file changed, 27 insertions(+), 29 deletions(-) diff --git a/core/unit_test/TestReducers.hpp b/core/unit_test/TestReducers.hpp index cb9a4fe280..851043d960 100644 --- a/core/unit_test/TestReducers.hpp +++ b/core/unit_test/TestReducers.hpp @@ -1188,30 +1188,29 @@ struct TestReducers { test_sum(10001); test_prod(35); test_min(10003); -#if !defined(KOKKOS_ENABLE_OPENACC) // FIXME_OPENACC - OpenACC (V3.3) does not - // support custom reductions. +#if !defined(KOKKOS_ENABLE_OPENACC) + // FIXME_OPENACC - OpenACC (V3.3) does not support custom reductions. test_minloc(10003); -#if !defined(KOKKOS_ENABLE_OPENMPTARGET) // FIXME_OPENMPTARGET requires custom - // reductions. +// FIXME_OPENMPTARGET requires custom reductions. +#if !defined(KOKKOS_ENABLE_OPENMPTARGET) test_minloc_2d(100); #endif #endif test_max(10007); -#if !defined(KOKKOS_ENABLE_OPENACC) // FIXME_OPENACC - OpenACC (V3.3) does not - // support custom reductions. 
+#if !defined(KOKKOS_ENABLE_OPENACC) + // FIXME_OPENACC - OpenACC (V3.3) does not support custom reductions. test_maxloc(10007); -#if !defined(KOKKOS_ENABLE_OPENMPTARGET) // FIXME_OPENMPTARGET requires custom - // reductions. +// FIXME_OPENMPTARGET requires custom reductions. +#if !defined(KOKKOS_ENABLE_OPENMPTARGET) test_maxloc_2d(100); #endif #endif -#if !defined(KOKKOS_ENABLE_OPENACC) // FIXME_OPENACC - OpenACC (V3.3) does not - // support custom reductions. +// FIXME_OPENACC - OpenACC (V3.3) does not support custom reductions. +#if !defined(KOKKOS_ENABLE_OPENACC) +// FIXME_OPENMPTARGET - The minmaxloc test fails llvm < 13 version, +// test_minmaxloc_2d requires custom reductions #if defined(KOKKOS_ENABLE_OPENMPTARGET) -#if defined(KOKKOS_COMPILER_CLANG) && \ - (KOKKOS_COMPILER_CLANG >= \ - 1300) // FIXME_OPENMPTARGET - The minmaxloc test fails llvm < 13 version, - // test_minmaxloc_2d requires custom reductions +#if defined(KOKKOS_COMPILER_CLANG) && (KOKKOS_COMPILER_CLANG >= 1300) test_minmaxloc(10007); #endif #else @@ -1228,12 +1227,12 @@ struct TestReducers { test_sum(10001); test_prod(sizeof(Scalar) > 4 ? 35 : 19); // avoid int overflow (see above) test_min(10003); -#if !defined(KOKKOS_ENABLE_OPENACC) // FIXME_OPENACC - OpenACC (V3.3) does not - // support custom reductions. +#if !defined(KOKKOS_ENABLE_OPENACC) + // FIXME_OPENACC - OpenACC (V3.3) does not support custom reductions. test_minloc(10003); -#if !defined(KOKKOS_COMPILER_NVHPC) // FIXME_NVHPC misaligned memory -#if !defined(KOKKOS_ENABLE_OPENMPTARGET) // FIXME_OPENMPTARGET requires custom - // reductions. +#if !defined(KOKKOS_COMPILER_NVHPC) // FIXME_NVHPC misaligned memory + // FIXME_OPENMPTARGET requires custom reductions. +#if !defined(KOKKOS_ENABLE_OPENMPTARGET) test_minloc_2d(100); #endif #endif @@ -1242,20 +1241,19 @@ struct TestReducers { #if !defined(KOKKOS_ENABLE_OPENACC) // FIXME_OPENACC - OpenACC (V3.3) does not support custom reductions. 
test_maxloc(10007); -#if !defined(KOKKOS_COMPILER_NVHPC) // FIXME_NVHPC misaligned memory -#if !defined(KOKKOS_ENABLE_OPENMPTARGET) // FIXME_OPENMPTARGET requires custom - // reductions. +#if !defined(KOKKOS_COMPILER_NVHPC) // FIXME_NVHPC misaligned memory +// FIXME_OPENMPTARGET requires custom reductions. +#if !defined(KOKKOS_ENABLE_OPENMPTARGET) test_maxloc_2d(100); #endif #endif #endif -#if !defined(KOKKOS_ENABLE_OPENACC) // FIXME_OPENACC - OpenACC (V3.3) does not - // support custom reductions. -#if defined( \ - KOKKOS_ENABLE_OPENMPTARGET) // FIXME_OPENMPTARGET - The minmaxloc test - // fails llvm <= 13 version, the minmaxloc_2d - // test requires custom reductions. -#if defined(KOKKOS_COMPILER_CLANG) && (KOKKOS_COMPILER_CLANG > 1300) +// FIXME_OPENACC - OpenACC (V3.3) does not support custom reductions. +#if !defined(KOKKOS_ENABLE_OPENACC) +// FIXME_OPENMPTARGET - The minmaxloc test fails llvm < 13 version, +// the minmaxloc_2d test requires custom reductions. +#if defined(KOKKOS_ENABLE_OPENMPTARGET) +#if defined(KOKKOS_COMPILER_CLANG) && (KOKKOS_COMPILER_CLANG >= 1300) test_minmaxloc(10007); #endif #else From bf9c242ed14bc7d8d483d0a824cb18a36e3688bc Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Tue, 16 May 2023 14:59:11 -0400 Subject: [PATCH 451/496] Allow deprecated declarations in SYCL+Cuda CI --- .jenkins | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.jenkins b/.jenkins index fcbcf56073..c7d8ce533d 100644 --- a/.jenkins +++ b/.jenkins @@ -107,7 +107,7 @@ pipeline { -DCMAKE_BUILD_TYPE=Release \ -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \ -DCMAKE_CXX_COMPILER=/opt/intel/oneapi/compiler/2023.0.0/linux/bin-llvm/clang++ \ - -DCMAKE_CXX_FLAGS="-fsycl-device-code-split=per_kernel -Werror -Wno-gnu-zero-variadic-macro-arguments -Wno-unknown-cuda-version -Wno-sycl-target" \ + -DCMAKE_CXX_FLAGS="-fsycl-device-code-split=per_kernel -Wno-deprecated-declarations -Werror -Wno-gnu-zero-variadic-macro-arguments -Wno-unknown-cuda-version -Wno-sycl-target" 
\ -DKOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED=0 \ -DKokkos_ARCH_NATIVE=ON \ -DKokkos_ARCH_VOLTA70=ON \ From c67ddea4042d3f3582e59587e9a099883701968c Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Tue, 16 May 2023 15:10:28 -0400 Subject: [PATCH 452/496] Try running for other execution spaces --- core/unit_test/TestReducers.hpp | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/core/unit_test/TestReducers.hpp b/core/unit_test/TestReducers.hpp index 851043d960..710d002539 100644 --- a/core/unit_test/TestReducers.hpp +++ b/core/unit_test/TestReducers.hpp @@ -1212,6 +1212,9 @@ struct TestReducers { #if defined(KOKKOS_ENABLE_OPENMPTARGET) #if defined(KOKKOS_COMPILER_CLANG) && (KOKKOS_COMPILER_CLANG >= 1300) test_minmaxloc(10007); +#else + if (!std::is_same_v) + test_minmaxloc(10007); #endif #else test_minmaxloc(10007); @@ -1230,10 +1233,11 @@ struct TestReducers { #if !defined(KOKKOS_ENABLE_OPENACC) // FIXME_OPENACC - OpenACC (V3.3) does not support custom reductions. test_minloc(10003); -#if !defined(KOKKOS_COMPILER_NVHPC) // FIXME_NVHPC misaligned memory +#if defined(KOKKOS_COMPILER_NVHPC) // FIXME_NVHPC misaligned memory + if (!std::is_same_v) // FIXME_OPENMPTARGET requires custom reductions. #if !defined(KOKKOS_ENABLE_OPENMPTARGET) - test_minloc_2d(100); + test_minloc_2d(100); #endif #endif #endif @@ -1242,9 +1246,10 @@ struct TestReducers { // FIXME_OPENACC - OpenACC (V3.3) does not support custom reductions. test_maxloc(10007); #if !defined(KOKKOS_COMPILER_NVHPC) // FIXME_NVHPC misaligned memory + if (!std::is_same_v) // FIXME_OPENMPTARGET requires custom reductions. 
#if !defined(KOKKOS_ENABLE_OPENMPTARGET) - test_maxloc_2d(100); + test_maxloc_2d(100); #endif #endif #endif @@ -1255,6 +1260,9 @@ struct TestReducers { #if defined(KOKKOS_ENABLE_OPENMPTARGET) #if defined(KOKKOS_COMPILER_CLANG) && (KOKKOS_COMPILER_CLANG >= 1300) test_minmaxloc(10007); +#else + if (!std::is_same_v) + test_minmaxloc(10007); #endif #else test_minmaxloc(10007); From ee43d2a7d76500a99909a82e138c4d4ed6ff1c43 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Tue, 16 May 2023 16:32:49 -0400 Subject: [PATCH 453/496] Add guards for Cuda --- core/unit_test/TestReducers.hpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/core/unit_test/TestReducers.hpp b/core/unit_test/TestReducers.hpp index 710d002539..ce71f334e0 100644 --- a/core/unit_test/TestReducers.hpp +++ b/core/unit_test/TestReducers.hpp @@ -1234,7 +1234,9 @@ struct TestReducers { // FIXME_OPENACC - OpenACC (V3.3) does not support custom reductions. test_minloc(10003); #if defined(KOKKOS_COMPILER_NVHPC) // FIXME_NVHPC misaligned memory +#if defined(KOKKOS_ENABLE_CUDA) if (!std::is_same_v) +#endif // FIXME_OPENMPTARGET requires custom reductions. #if !defined(KOKKOS_ENABLE_OPENMPTARGET) test_minloc_2d(100); @@ -1246,7 +1248,9 @@ struct TestReducers { // FIXME_OPENACC - OpenACC (V3.3) does not support custom reductions. test_maxloc(10007); #if !defined(KOKKOS_COMPILER_NVHPC) // FIXME_NVHPC misaligned memory +#if defined(KOKKOS_ENABLE_CUDA) if (!std::is_same_v) +#endif // FIXME_OPENMPTARGET requires custom reductions. 
#if !defined(KOKKOS_ENABLE_OPENMPTARGET) test_maxloc_2d(100); @@ -1261,8 +1265,8 @@ struct TestReducers { #if defined(KOKKOS_COMPILER_CLANG) && (KOKKOS_COMPILER_CLANG >= 1300) test_minmaxloc(10007); #else - if (!std::is_same_v) - test_minmaxloc(10007); + if (!std::is_same_v) + test_minmaxloc(10007); #endif #else test_minmaxloc(10007); From 2bcfa51770ba1feefc8b8573d0a4ee4e3a5bd928 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Wed, 17 May 2023 09:55:28 -0400 Subject: [PATCH 454/496] Expand list of kokkos options not to export with cmake --- cmake/kokkos_functions.cmake | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/cmake/kokkos_functions.cmake b/cmake/kokkos_functions.cmake index 55b1ebbf81..9dab1ca00e 100644 --- a/cmake/kokkos_functions.cmake +++ b/cmake/kokkos_functions.cmake @@ -6,7 +6,12 @@ # upper-case version for use within set(Kokkos_OPTIONS_NOT_TO_EXPORT - Kokkos_ENABLE_TESTS Kokkos_ENABLE_EXAMPLES) + Kokkos_ENABLE_BENCHMARKS + Kokkos_ENABLE_EXAMPLES + Kokkos_ENABLE_TESTS + Kokkos_ENABLE_HEADER_SELF_CONTAINMENT_TESTS + Kokkos_ENABLE_COMPILER_WARNINGS +) # # From c935774359fb04eecf39671d8882633aef477121 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Wed, 17 May 2023 09:56:11 -0400 Subject: [PATCH 455/496] Do not append to Kokkos_OPTIONS variables those in the do not export list --- cmake/kokkos_enable_options.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/kokkos_enable_options.cmake b/cmake/kokkos_enable_options.cmake index 96b9413999..bf7762e1dc 100644 --- a/cmake/kokkos_enable_options.cmake +++ b/cmake/kokkos_enable_options.cmake @@ -11,7 +11,7 @@ FUNCTION(KOKKOS_ENABLE_OPTION SUFFIX DEFAULT DOCSTRING) KOKKOS_OPTION(ENABLE_${SUFFIX} ${DEFAULT} BOOL ${DOCSTRING}) STRING(TOUPPER ${SUFFIX} UC_NAME) - IF (KOKKOS_ENABLE_${UC_NAME}) + IF (KOKKOS_ENABLE_${UC_NAME} AND NOT "Kokkos_ENABLE_${UC_NAME}" IN_LIST Kokkos_OPTIONS_NOT_TO_EXPORT) LIST(APPEND KOKKOS_ENABLED_OPTIONS ${UC_NAME}) #I hate that CMake makes 
me do this SET(KOKKOS_ENABLED_OPTIONS ${KOKKOS_ENABLED_OPTIONS} PARENT_SCOPE) From 6b18c2ad55d6f785396a38e08eb93119832d3948 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Thu, 18 May 2023 16:48:42 -0400 Subject: [PATCH 456/496] Drop Kokkos_ENABLE_LAUNCH_COMPILER option --- cmake/kokkos_enable_options.cmake | 1 - 1 file changed, 1 deletion(-) diff --git a/cmake/kokkos_enable_options.cmake b/cmake/kokkos_enable_options.cmake index 96b9413999..fa86742534 100644 --- a/cmake/kokkos_enable_options.cmake +++ b/cmake/kokkos_enable_options.cmake @@ -53,7 +53,6 @@ KOKKOS_ENABLE_OPTION(COMPILER_WARNINGS OFF "Whether to print all compiler war KOKKOS_ENABLE_OPTION(PROFILING_LOAD_PRINT OFF "Whether to print information about which profiling tools got loaded") KOKKOS_ENABLE_OPTION(TUNING OFF "Whether to create bindings for tuning tools") KOKKOS_ENABLE_OPTION(AGGRESSIVE_VECTORIZATION OFF "Whether to aggressively vectorize loops") -KOKKOS_ENABLE_OPTION(LAUNCH_COMPILER ON "Whether to potentially use the launch compiler") KOKKOS_ENABLE_OPTION(COMPILE_AS_CMAKE_LANGUAGE OFF "Whether to use native cmake language support") KOKKOS_ENABLE_OPTION(HIP_MULTIPLE_KERNEL_INSTANTIATIONS OFF "Whether multiple kernels are instantiated at compile time - improve performance but increase compile time") From 3f565bbb5ebb6d88846e3ad9b9a2aa5248478b92 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Wed, 17 May 2023 17:01:28 -0400 Subject: [PATCH 457/496] Export Kokkos_ENABLE_