From a465233ffbdf9ee2d9d10acb189ffa16eaef1ca8 Mon Sep 17 00:00:00 2001 From: Eric Phipps Date: Wed, 3 Jun 2020 15:13:41 -0600 Subject: [PATCH 1/5] Sacado: Clean up matvec, advection, advection_const_basis tests. This add some functionality to these tests so they can fully replace the old tests. --- .../test/performance/advection/advection.cpp | 1 + .../advection/advection_hierarchical.cpp | 17 +++++------- .../advection/advection_hierarchical.hpp | 4 +-- .../test/performance/advection/common.hpp | 1 + .../test/performance/advection/driver.cpp | 10 ++++--- .../advection_const_basis/advection.cpp | 1 + .../advection_hierarchical.cpp | 17 +++++------- .../advection_hierarchical.hpp | 4 +-- .../advection_const_basis/common.hpp | 1 + .../advection_const_basis/driver.cpp | 10 ++++--- .../test/performance/mat_vec/common.hpp | 2 +- .../test/performance/mat_vec/mat_vec.cpp | 27 ++++++++++++------- 12 files changed, 55 insertions(+), 40 deletions(-) diff --git a/packages/sacado/test/performance/advection/advection.cpp b/packages/sacado/test/performance/advection/advection.cpp index 3446b4254722..7d9b7c9492d6 100644 --- a/packages/sacado/test/performance/advection/advection.cpp +++ b/packages/sacado/test/performance/advection/advection.cpp @@ -399,6 +399,7 @@ double time_analytic_team(int ncells, int num_basis, int num_points, int ndim, #define INST_FUNC_N_DEV(N,DEV) \ INST_FUNC_FAD_N_DEV(SFadType,N,DEV) \ + INST_FUNC_FAD_N_DEV(SLFadType,N,DEV) \ INST_FUNC_FAD_N_DEV(DFadType,N,DEV) \ template double time_analytic_flat< N, DEV >(int ncells, int num_basis, int num_points, int ndim, int ntrial, bool check); \ template double time_analytic_const< N, DEV >(int ncells, int num_basis, int num_points, int ndim, int ntrial, bool check); \ diff --git a/packages/sacado/test/performance/advection/advection_hierarchical.cpp b/packages/sacado/test/performance/advection/advection_hierarchical.cpp index c5858b5299b9..680287ebc11f 100644 --- a/packages/sacado/test/performance/advection/advection_hierarchical.cpp +++ b/packages/sacado/test/performance/advection/advection_hierarchical.cpp @@ -114,12 +114,10 @@ void run_fad_hierarchical_team(const FluxView& flux, const WgbView& wgb, }); } -template +template double time_fad_hierarchical_flat(int ncells, int num_basis, int num_points, int ndim, int ntrial, bool check) { - typedef Sacado::Fad::SFad FadType; - static const int FadStride = is_cuda_space::value ? 32 : 1; #if defined(SACADO_ALIGN_SFAD) static const int Nalign = ((N+FadStride-1)/FadStride)*FadStride; @@ -159,12 +157,10 @@ double time_fad_hierarchical_flat(int ncells, int num_basis, int num_points, return time; } -template +template double time_fad_hierarchical_team(int ncells, int num_basis, int num_points, int ndim, int ntrial, bool check) { - typedef Sacado::Fad::SFad FadType; - static const int FadStride = is_cuda_space::value ? 32 : 1; #if defined(SACADO_ALIGN_SFAD) static const int Nalign = ((N+FadStride-1)/FadStride)*FadStride; @@ -204,12 +200,13 @@ double time_fad_hierarchical_team(int ncells, int num_basis, int num_points, return time; } -#define INST_FUNC_N_DEV(N,DEV) \ - template double time_fad_hierarchical_flat< N, DEV >(int ncells, int num_basis, int num_points, int ndim, int ntrial, bool check); \ - template double time_fad_hierarchical_team< N, DEV >(int ncells, int num_basis, int num_points, int ndim, int ntrial, bool check); +#define INST_FUNC_FAD_N_DEV(FAD,N,DEV) \ + template double time_fad_hierarchical_flat< FAD, N, DEV >(int ncells, int num_basis, int num_points, int ndim, int ntrial, bool check); \ + template double time_fad_hierarchical_team< FAD, N, DEV >(int ncells, int num_basis, int num_points, int ndim, int ntrial, bool check); #define INST_FUNC_DEV(DEV) \ - INST_FUNC_N_DEV( fad_dim, DEV ) + INST_FUNC_FAD_N_DEV( SFadType, fad_dim, DEV ) \ + INST_FUNC_FAD_N_DEV( SLFadType, fad_dim, DEV ) #ifdef KOKKOS_ENABLE_SERIAL INST_FUNC_DEV(Kokkos::Serial) diff --git a/packages/sacado/test/performance/advection/advection_hierarchical.hpp b/packages/sacado/test/performance/advection/advection_hierarchical.hpp index f78f4316897b..4d9eb17c556d 100644 --- a/packages/sacado/test/performance/advection/advection_hierarchical.hpp +++ b/packages/sacado/test/performance/advection/advection_hierarchical.hpp @@ -29,10 +29,10 @@ #pragma once -template +template double time_fad_hierarchical_flat(int ncells, int num_basis, int num_points, int ndim, int ntrial, bool check); -template +template double time_fad_hierarchical_team(int ncells, int num_basis, int num_points, int ndim, int ntrial, bool check); diff --git a/packages/sacado/test/performance/advection/common.hpp b/packages/sacado/test/performance/advection/common.hpp index 2a518a0b646d..557b4856db0d 100644 --- a/packages/sacado/test/performance/advection/common.hpp +++ b/packages/sacado/test/performance/advection/common.hpp @@ -31,6 +31,7 @@ const int fad_dim = 50; typedef Sacado::Fad::SFad SFadType; +typedef Sacado::Fad::SLFad SLFadType; typedef Sacado::Fad::DFad DFadType; template diff --git a/packages/sacado/test/performance/advection/driver.cpp b/packages/sacado/test/performance/advection/driver.cpp index 1812cc80ed0a..5918770d81cf 100644 --- a/packages/sacado/test/performance/advection/driver.cpp +++ b/packages/sacado/test/performance/advection/driver.cpp @@ -45,10 +45,12 @@ void run(const int cell_begin, const int cell_end, const int cell_step, const int nbasis, const int npoint, const int ntrial, const bool check) { const int ndim = 3; - printf("ncell %12s %12s %12s %12s %12s %12s %12s %12s %12s\n", "flat sfad", "flat dfad", "dfad sc", "analytic", "const", "team", "hier sfad", "hier dfad", "h dfad sc"); + printf("ncell %12s %12s %12s %12s %12s %12s %12s %12s %12s %12s %12s\n", "flat sfad", "flat slfad", "flat dfad", "dfad sc", "analytic", "const", "team", "hier sfad", "hier slfad", "hier dfad", "h dfad sc"); for(int i=cell_begin; i<=cell_end; i+=cell_step) { double sfad_flat = time_fad_flat( i,nbasis,npoint,ndim,ntrial,check); + double slfad_flat = time_fad_flat( + i,nbasis,npoint,ndim,ntrial,check); double dfad_flat = time_fad_flat( i,nbasis,npoint,ndim,ntrial,check); double dfad_scratch = time_fad_scratch( @@ -59,14 +61,16 @@ void run(const int cell_begin, const int cell_end, const int cell_step, i,nbasis,npoint,ndim,ntrial,check); double analytic_team = time_analytic_team( i,nbasis,npoint,ndim,ntrial,check); - double sfad_hierarchical = time_fad_hierarchical_team( + double sfad_hierarchical = time_fad_hierarchical_team( + i,nbasis,npoint,ndim,ntrial,check); + double slfad_hierarchical = time_fad_hierarchical_team( i,nbasis,npoint,ndim,ntrial,check); double dfad_hierarchical = time_dfad_hierarchical_team( i,nbasis,npoint,ndim,ntrial,check); double dfad_hierarchical_scratch = time_dfad_hierarchical_team_scratch( i,nbasis,npoint,ndim,ntrial,check); - printf("%5d %12.3e %12.3e %12.3e %12.3e %12.3e %12.3e %12.3e %12.3e %12.3e\n",i,sfad_flat,dfad_flat,dfad_scratch,analytic,analytic_const,analytic_team,sfad_hierarchical,dfad_hierarchical,dfad_hierarchical_scratch); + printf("%5d %12.3e %12.3e %12.3e %12.3e %12.3e %12.3e %12.3e %12.3e %12.3e %12.3e %12.3e\n",i,sfad_flat,slfad_flat,dfad_flat,dfad_scratch,analytic,analytic_const,analytic_team,sfad_hierarchical,slfad_hierarchical,dfad_hierarchical,dfad_hierarchical_scratch); } } diff --git a/packages/sacado/test/performance/advection_const_basis/advection.cpp b/packages/sacado/test/performance/advection_const_basis/advection.cpp index fb95df285c5c..caf421105bb3 100644 --- a/packages/sacado/test/performance/advection_const_basis/advection.cpp +++ b/packages/sacado/test/performance/advection_const_basis/advection.cpp @@ -402,6 +402,7 @@ double time_analytic_team(int ncells, int num_basis, int num_points, int ndim, #define INST_FUNC_N_DEV(N,DEV) \ INST_FUNC_FAD_N_DEV(SFadType,N,DEV) \ + INST_FUNC_FAD_N_DEV(SLFadType,N,DEV) \ INST_FUNC_FAD_N_DEV(DFadType,N,DEV) \ template double time_analytic_flat< N, DEV >(int ncells, int num_basis, int num_points, int ndim, int ntrial, bool check); \ template double time_analytic_const< N, DEV >(int ncells, int num_basis, int num_points, int ndim, int ntrial, bool check); \ diff --git a/packages/sacado/test/performance/advection_const_basis/advection_hierarchical.cpp b/packages/sacado/test/performance/advection_const_basis/advection_hierarchical.cpp index e66de586f231..c22cb121bc6f 100644 --- a/packages/sacado/test/performance/advection_const_basis/advection_hierarchical.cpp +++ b/packages/sacado/test/performance/advection_const_basis/advection_hierarchical.cpp @@ -114,12 +114,10 @@ void run_fad_hierarchical_team(const FluxView& flux, const WgbView& wgb, }); } -template +template double time_fad_hierarchical_flat(int ncells, int num_basis, int num_points, int ndim, int ntrial, bool check) { - typedef Sacado::Fad::SFad FadType; - static const int FadStride = is_cuda_space::value ? 32 : 1; #if defined(SACADO_ALIGN_SFAD) static const int Nalign = ((N+FadStride-1)/FadStride)*FadStride; @@ -160,12 +158,10 @@ double time_fad_hierarchical_flat(int ncells, int num_basis, int num_points, return time; } -template +template double time_fad_hierarchical_team(int ncells, int num_basis, int num_points, int ndim, int ntrial, bool check) { - typedef Sacado::Fad::SFad FadType; - static const int FadStride = is_cuda_space::value ? 32 : 1; #if defined(SACADO_ALIGN_SFAD) static const int Nalign = ((N+FadStride-1)/FadStride)*FadStride; @@ -206,12 +202,13 @@ double time_fad_hierarchical_team(int ncells, int num_basis, int num_points, return time; } -#define INST_FUNC_N_DEV(N,DEV) \ - template double time_fad_hierarchical_flat< N, DEV >(int ncells, int num_basis, int num_points, int ndim, int ntrial, bool check); \ - template double time_fad_hierarchical_team< N, DEV >(int ncells, int num_basis, int num_points, int ndim, int ntrial, bool check); +#define INST_FUNC_FAD_N_DEV(FAD,N,DEV) \ + template double time_fad_hierarchical_flat< FAD, N, DEV >(int ncells, int num_basis, int num_points, int ndim, int ntrial, bool check); \ + template double time_fad_hierarchical_team< FAD, N, DEV >(int ncells, int num_basis, int num_points, int ndim, int ntrial, bool check); #define INST_FUNC_DEV(DEV) \ - INST_FUNC_N_DEV( fad_dim, DEV ) + INST_FUNC_FAD_N_DEV( SFadType, fad_dim, DEV ) \ + INST_FUNC_FAD_N_DEV( SLFadType, fad_dim, DEV ) #ifdef KOKKOS_ENABLE_SERIAL INST_FUNC_DEV(Kokkos::Serial) diff --git a/packages/sacado/test/performance/advection_const_basis/advection_hierarchical.hpp b/packages/sacado/test/performance/advection_const_basis/advection_hierarchical.hpp index f78f4316897b..4d9eb17c556d 100644 --- a/packages/sacado/test/performance/advection_const_basis/advection_hierarchical.hpp +++ b/packages/sacado/test/performance/advection_const_basis/advection_hierarchical.hpp @@ -29,10 +29,10 @@ #pragma once -template +template double time_fad_hierarchical_flat(int ncells, int num_basis, int num_points, int ndim, int ntrial, bool check); -template +template double time_fad_hierarchical_team(int ncells, int num_basis, int num_points, int ndim, int ntrial, bool check); diff --git a/packages/sacado/test/performance/advection_const_basis/common.hpp b/packages/sacado/test/performance/advection_const_basis/common.hpp index a7616179f733..14daf2e24553 100644 --- a/packages/sacado/test/performance/advection_const_basis/common.hpp +++ b/packages/sacado/test/performance/advection_const_basis/common.hpp @@ -31,6 +31,7 @@ const int fad_dim = 50; typedef Sacado::Fad::SFad SFadType; +typedef Sacado::Fad::SLFad SLFadType; typedef Sacado::Fad::DFad DFadType; template diff --git a/packages/sacado/test/performance/advection_const_basis/driver.cpp b/packages/sacado/test/performance/advection_const_basis/driver.cpp index 1812cc80ed0a..5918770d81cf 100644 --- a/packages/sacado/test/performance/advection_const_basis/driver.cpp +++ b/packages/sacado/test/performance/advection_const_basis/driver.cpp @@ -45,10 +45,12 @@ void run(const int cell_begin, const int cell_end, const int cell_step, const int nbasis, const int npoint, const int ntrial, const bool check) { const int ndim = 3; - printf("ncell %12s %12s %12s %12s %12s %12s %12s %12s %12s\n", "flat sfad", "flat dfad", "dfad sc", "analytic", "const", "team", "hier sfad", "hier dfad", "h dfad sc"); + printf("ncell %12s %12s %12s %12s %12s %12s %12s %12s %12s %12s %12s\n", "flat sfad", "flat slfad", "flat dfad", "dfad sc", "analytic", "const", "team", "hier sfad", "hier slfad", "hier dfad", "h dfad sc"); for(int i=cell_begin; i<=cell_end; i+=cell_step) { double sfad_flat = time_fad_flat( i,nbasis,npoint,ndim,ntrial,check); + double slfad_flat = time_fad_flat( + i,nbasis,npoint,ndim,ntrial,check); double dfad_flat = time_fad_flat( i,nbasis,npoint,ndim,ntrial,check); double dfad_scratch = time_fad_scratch( @@ -59,14 +61,16 @@ void run(const int cell_begin, const int cell_end, const int cell_step, i,nbasis,npoint,ndim,ntrial,check); double analytic_team = time_analytic_team( i,nbasis,npoint,ndim,ntrial,check); - double sfad_hierarchical = time_fad_hierarchical_team( + double sfad_hierarchical = time_fad_hierarchical_team( + i,nbasis,npoint,ndim,ntrial,check); + double slfad_hierarchical = time_fad_hierarchical_team( i,nbasis,npoint,ndim,ntrial,check); double dfad_hierarchical = time_dfad_hierarchical_team( i,nbasis,npoint,ndim,ntrial,check); double dfad_hierarchical_scratch = time_dfad_hierarchical_team_scratch( i,nbasis,npoint,ndim,ntrial,check); - printf("%5d %12.3e %12.3e %12.3e %12.3e %12.3e %12.3e %12.3e %12.3e %12.3e\n",i,sfad_flat,dfad_flat,dfad_scratch,analytic,analytic_const,analytic_team,sfad_hierarchical,dfad_hierarchical,dfad_hierarchical_scratch); + printf("%5d %12.3e %12.3e %12.3e %12.3e %12.3e %12.3e %12.3e %12.3e %12.3e %12.3e %12.3e\n",i,sfad_flat,slfad_flat,dfad_flat,dfad_scratch,analytic,analytic_const,analytic_team,sfad_hierarchical,slfad_hierarchical,dfad_hierarchical,dfad_hierarchical_scratch); } } diff --git a/packages/sacado/test/performance/mat_vec/common.hpp b/packages/sacado/test/performance/mat_vec/common.hpp index 246efd24e5d1..4e157717f8d5 100644 --- a/packages/sacado/test/performance/mat_vec/common.hpp +++ b/packages/sacado/test/performance/mat_vec/common.hpp @@ -39,7 +39,7 @@ struct Perf { double throughput; }; -const int SFadSize = 32; +const int SFadSize = 8; const int SLFadSize = SFadSize; const int HierSFadSize = 32; const int HierSLFadSize = HierSFadSize; diff --git a/packages/sacado/test/performance/mat_vec/mat_vec.cpp b/packages/sacado/test/performance/mat_vec/mat_vec.cpp index 57153a0a9765..f17740c46ef3 100644 --- a/packages/sacado/test/performance/mat_vec/mat_vec.cpp +++ b/packages/sacado/test/performance/mat_vec/mat_vec.cpp @@ -27,6 +27,8 @@ // *********************************************************************** // @HEADER +//#define SACADO_DISABLE_FAD_VIEW_SPEC + #include "Sacado.hpp" #include "mat_vec.hpp" @@ -292,9 +294,15 @@ do_time_fad(const size_t m, const size_t n, const size_t p, const size_t nloop, } #endif +#ifndef SACADO_DISABLE_FAD_VIEW_SPEC ViewTypeA A("A",m,n,p+1); ViewTypeB b("B",n,p+1); ViewTypeC c("c",m,p+1); +#else + ViewTypeA A("A",m,n); + ViewTypeB b("B",n); + ViewTypeC c("c",m); +#endif // FadType a(p, 1.0); // for (size_t k=0; k ViewTypeC; typedef typename ViewTypeA::execution_space execution_space; +#ifndef SACADO_DISABLE_FAD_VIEW_SPEC ViewTypeA A("A",m,n,p+1); ViewTypeB b("B",n,p+1); ViewTypeC c("c",m,p+1); +#else + ViewTypeA A("A",m,n); + ViewTypeB b("B",n); + ViewTypeC c("c",m); +#endif // FadType a(p, 1.0); // for (size_t k=0; k( A, b, c ); execution_space().fence(); - Teuchos::Time timer("mult", false); - timer.start(true); for (size_t l=0; l( A, b, c ); } execution_space().fence(); - timer.stop(); perf.time = wall_clock.seconds() / nloop; perf.flops = m*n*(2+4*p); @@ -484,13 +496,10 @@ do_time_analytic_s(const size_t m, const size_t n, run_mat_vec_deriv_s

( A, b, c ); execution_space().fence(); - Teuchos::Time timer("mult", false); - timer.start(true); for (size_t l=0; l( A, b, c ); } execution_space().fence(); - timer.stop(); perf.time = wall_clock.seconds() / nloop; perf.flops = m*n*(2+4*p); From e25b55b044cd78732e99d833c8f759eeb7559037 Mon Sep 17 00:00:00 2001 From: Eric Phipps Date: Wed, 3 Jun 2020 15:16:47 -0600 Subject: [PATCH 2/5] Remove old performance tests. --- .../sacado/test/performance/CMakeLists.txt | 28 - .../performance/fad_kokkos_hierarchical.cpp | 1024 ----------------- .../performance/fad_kokkos_mat_vec_perf.cpp | 689 ----------- .../test/performance/fad_kokkos_view.cpp | 804 ------------- 4 files changed, 2545 deletions(-) delete mode 100644 packages/sacado/test/performance/fad_kokkos_hierarchical.cpp delete mode 100644 packages/sacado/test/performance/fad_kokkos_mat_vec_perf.cpp delete mode 100644 packages/sacado/test/performance/fad_kokkos_view.cpp diff --git a/packages/sacado/test/performance/CMakeLists.txt b/packages/sacado/test/performance/CMakeLists.txt index f5611b27e290..e9512c980080 100644 --- a/packages/sacado/test/performance/CMakeLists.txt +++ b/packages/sacado/test/performance/CMakeLists.txt @@ -74,34 +74,6 @@ IF (${PACKAGE_NAME}_ENABLE_TeuchosCore AND NOT Kokkos_ENABLE_CUDA) ENDIF() -IF (Sacado_ENABLE_TeuchosCore AND Sacado_ENABLE_KokkosCore) - - # Disable these tests as they have been replaced by mat_vec, advection* below - - # TRIBITS_ADD_EXECUTABLE( - # fad_kokkos_view - # SOURCES fad_kokkos_view.cpp - # COMM serial mpi - # ) - - # # These tests do not compile with gcc 4.7.x because it doesn't properly - # # support lambdas. See github issue #854 - # IF(NOT ((CMAKE_CXX_COMPILER_ID STREQUAL "GNU") AND (CMAKE_CXX_COMPILER_VERSION VERSION_LESS "4.8"))) - # TRIBITS_ADD_EXECUTABLE( - # fad_kokkos_hierarchical - # SOURCES fad_kokkos_hierarchical.cpp - # COMM serial mpi - # ) - - # TRIBITS_ADD_EXECUTABLE( - # fad_kokkos_mat_vec_perf - # SOURCES fad_kokkos_mat_vec_perf.cpp - # COMM serial mpi - # ) - # ENDIF() - -ENDIF() - ADD_SUBDIRECTORY(fenl_assembly) ADD_SUBDIRECTORY(fenl_assembly_view) ADD_SUBDIRECTORY(mat_vec) diff --git a/packages/sacado/test/performance/fad_kokkos_hierarchical.cpp b/packages/sacado/test/performance/fad_kokkos_hierarchical.cpp deleted file mode 100644 index 0b28e0111bdf..000000000000 --- a/packages/sacado/test/performance/fad_kokkos_hierarchical.cpp +++ /dev/null @@ -1,1024 +0,0 @@ -//#define SACADO_VIEW_CUDA_HIERARCHICAL 1 -//#define SACADO_VIEW_CUDA_HIERARCHICAL_DFAD_STRIDED 1 -#define SACADO_VIEW_CUDA_HIERARCHICAL_DFAD 1 -#define SACADO_KOKKOS_USE_MEMORY_POOL 1 -#define SACADO_ALIGN_SFAD 1 - -#include "Sacado.hpp" - -#include "Teuchos_CommandLineProcessor.hpp" -#include "Teuchos_StandardCatchMacros.hpp" - -#include "Kokkos_Core.hpp" -#include "Kokkos_MemoryPool.hpp" -#include "impl/Kokkos_Timer.hpp" -#include -#include - -// Advection kernel. - -template -struct AdvectionKernel; - -template -KOKKOS_INLINE_FUNCTION -AdvectionKernel -create_advection_kernel(const FluxView& flux, const WgbView& bg, - const SrcView& src, const WbsView& bs, - const ResidualView& residual, - const typename FluxView::non_const_value_type& coeff); - -#if defined(SACADO_VIEW_CUDA_HIERARCHICAL_DFAD) - -// Version of advection kernel that supports flat and hierarchical parallelism -// when using the hierarchical_dfad approach. This requires no changes to the -// kernel beyond supporting a team policy. -template -struct AdvectionKernel { - typedef typename FluxView::non_const_value_type scalar_type; - typedef typename FluxView::execution_space execution_space; - typedef typename Kokkos::TeamPolicy::member_type team_handle; - - const FluxView flux_m_i; - const WgbView wgb; - const SrcView src_m_i; - const WbsView wbs; - const ResidualView residual_m_i; - const scalar_type coeff; - const size_t ncells; - const int num_basis, num_points, num_dim; - - // VS isn't used in this kernel - template struct HierarchicalFlatTag {}; - template struct HierarchicalTeamTag {}; - - KOKKOS_INLINE_FUNCTION - AdvectionKernel(const FluxView& flux, const WgbView& gb, - const SrcView& src, const WbsView& bs, - const ResidualView& residual, const scalar_type& c) : - flux_m_i(flux), - wgb(gb), - src_m_i(src), - wbs(bs), - residual_m_i(residual), - coeff(c), - ncells(flux_m_i.extent(0)), - num_basis(wgb.extent(1)), - num_points(wgb.extent(2)), - num_dim((wgb.extent(3))) - { - } - - KOKKOS_INLINE_FUNCTION - size_t num_cells() const { return ncells; } - - KOKKOS_INLINE_FUNCTION - void operator() (const size_t cell, const int basis) const { - scalar_type value(0),value2(0); - for (int qp=0; qp - KOKKOS_INLINE_FUNCTION - void operator() (const HierarchicalFlatTag, const team_handle& team) const { - const size_t cell = team.league_rank()*team.team_size() + team.team_rank(); - if (cell < ncells) - (*this)(cell); - } - - template - KOKKOS_INLINE_FUNCTION - void operator() (const HierarchicalTeamTag, const team_handle& team) const { - const size_t cell = team.league_rank(); - const int team_size = team.team_size(); - for (int basis=team.team_rank(); basis -struct AdvectionKernel { - typedef typename FluxView::non_const_value_type scalar_type; - typedef typename Kokkos::ThreadLocalScalarType::type local_scalar_type; - typedef typename FluxView::execution_space execution_space; - typedef typename Kokkos::TeamPolicy::member_type team_handle; - enum { stride = Kokkos::ViewScalarStride::stride }; - - const FluxView flux_m_i; - const WgbView wgb; - const SrcView src_m_i; - const WbsView wbs; - const ResidualView residual_m_i; - const scalar_type coeff; - const size_t ncells; - const int num_basis, num_points, num_dim; - - // VS isn't used in this kernel - template struct HierarchicalFlatTag {}; - template struct HierarchicalTeamTag {}; - - KOKKOS_INLINE_FUNCTION - AdvectionKernel(const FluxView& flux, const WgbView& gb, - const SrcView& src, const WbsView& bs, - const ResidualView& residual, const scalar_type& c) : - flux_m_i(flux), - wgb(gb), - src_m_i(src), - wbs(bs), - residual_m_i(residual), - coeff(c), - ncells(flux_m_i.extent(0)), - num_basis(wgb.extent(1)), - num_points(wgb.extent(2)), - num_dim((wgb.extent(3))) - { - } - - KOKKOS_INLINE_FUNCTION - size_t num_cells() const { return ncells; } - - KOKKOS_INLINE_FUNCTION - void operator() (const size_t cell, const int basis) const { - local_scalar_type value(0),value2(0); - local_scalar_type c = Sacado::partition_scalar(coeff); - for (int qp=0; qp - KOKKOS_INLINE_FUNCTION - void operator() (const HierarchicalFlatTag, const team_handle& team) const { - const size_t cell = team.league_rank()*team.team_size() + team.team_rank(); - if (cell < ncells) - (*this)(cell); - } - - template - KOKKOS_INLINE_FUNCTION - void operator() (const HierarchicalTeamTag, const team_handle& team) const { - const size_t cell = team.league_rank(); - const int team_size = team.team_size(); - for (int basis=team.team_rank(); basis -struct AdvectionKernel { - typedef typename FluxView::non_const_value_type scalar_type; - typedef typename FluxView::execution_space execution_space; - typedef typename Kokkos::TeamPolicy::member_type team_handle; - - const FluxView flux_m_i; - const WgbView wgb; - const SrcView src_m_i; - const WbsView wbs; - const ResidualView residual_m_i; - const scalar_type coeff; - const size_t ncells; - const int num_basis, num_points, num_dim; - - template struct HierarchicalFlatTag {}; - template struct HierarchicalTeamTag {}; - template struct PartitionedTag {}; - - KOKKOS_INLINE_FUNCTION - AdvectionKernel(const FluxView& flux, const WgbView& gb, - const SrcView& src, const WbsView& bs, - const ResidualView& residual, const scalar_type& c) : - flux_m_i(flux), - wgb(gb), - src_m_i(src), - wbs(bs), - residual_m_i(residual), - coeff(c), - ncells(flux_m_i.extent(0)), - num_basis(wgb.extent(1)), - num_points(wgb.extent(2)), - num_dim((wgb.extent(3))) - { - } - - KOKKOS_INLINE_FUNCTION - size_t num_cells() const { return ncells; } - - KOKKOS_INLINE_FUNCTION - void operator() (const size_t cell, const int basis) const { - scalar_type value(0),value2(0); - for (int qp=0; qp - KOKKOS_INLINE_FUNCTION - void operator() (const PartitionedTag, const size_t cell, const int basis) const { - // VS is the "vector" size == blockDim.x for CUDA. This is also set in - // the team execution policy, but we don't seem to have a way to access it. - // We also don't have a way to access the vector lane index from the team - // handle. -#ifdef __CUDA_ARCH__ - const unsigned k = threadIdx.x; -#else - const unsigned k = 0; -#endif - - // Partition each view based on Cuda thread (vector) index - auto flux_part = Kokkos::partition(flux_m_i, k, VS); - auto wgb_part = Kokkos::partition(wgb, k, VS); - auto src_part = Kokkos::partition(src_m_i, k, VS); - auto wbs_part = Kokkos::partition(wbs, k, VS); - auto resid_part = Kokkos::partition(residual_m_i, k, VS); - auto coeff_part = Sacado::partition_scalar(coeff); - - // Now run the kernel with thread-local view's - auto kernel_part = create_advection_kernel(flux_part, wgb_part, src_part, - wbs_part, resid_part, - coeff_part); - kernel_part(cell, basis); - } - - template - KOKKOS_INLINE_FUNCTION - void operator() (const HierarchicalFlatTag, const team_handle& team) const { - const size_t cell = team.league_rank()*team.team_size() + team.team_rank(); - if (cell < ncells) - for (int basis=0; basis(), cell, basis); - } - - template - KOKKOS_INLINE_FUNCTION - void operator() (const HierarchicalTeamTag, const team_handle& team) const { - const size_t cell = team.league_rank(); - const int team_size = team.team_size(); - for (int basis=team.team_rank(); basis(), cell, basis); - } -}; - -#endif - -template -KOKKOS_INLINE_FUNCTION -AdvectionKernel -create_advection_kernel(const FluxView& flux, const WgbView& bg, - const SrcView& src, const WbsView& bs, - const ResidualView& residual, - const typename FluxView::non_const_value_type& coeff) -{ - typedef AdvectionKernel kernel_type; - return kernel_type(flux,bg,src,bs,residual,coeff); -} - -template -void run_flat(const KernelType& kernel) { - typedef typename KernelType::execution_space execution_space; - Kokkos::RangePolicy policy(0,kernel.num_cells()); - Kokkos::parallel_for(policy, kernel); -} - -template -void run_hierarchical_flat(const KernelType& kernel) { - typedef typename KernelType::execution_space execution_space; -#if defined (KOKKOS_ENABLE_CUDA) - const bool is_cuda = std::is_same::value; -#else - const bool is_cuda = false; -#endif - const unsigned vector_size = is_cuda ? 32 : 1; - if (is_cuda) { - const unsigned team_size = 256 / vector_size; - typedef typename KernelType::template HierarchicalFlatTag tag_type; - typedef Kokkos::TeamPolicy policy_type; - const size_t range = (kernel.num_cells()+team_size-1)/team_size; - policy_type policy(range,team_size,vector_size); - Kokkos::parallel_for(policy, kernel); - } - else { - run_flat(kernel); - } -} - -template -void run_hierarchical_team(const KernelType& kernel) { - typedef typename KernelType::execution_space execution_space; -#if defined (KOKKOS_ENABLE_CUDA) - const bool is_cuda = std::is_same::value; -#else - const bool is_cuda = false; -#endif - const unsigned vector_size = is_cuda ? 32 : 1; - if (is_cuda) { - const unsigned team_size = 256 / vector_size; - typedef typename KernelType::template HierarchicalTeamTag tag_type; - typedef Kokkos::TeamPolicy policy_type; - policy_type policy(kernel.num_cells(),team_size,vector_size); - Kokkos::parallel_for(policy, kernel); - } - else { - run_flat(kernel); - } -} - -template struct FadTypeName; -template struct FadTypeName< Sacado::Fad::DFad > { - static std::string eval() { - return std::string("dfad") -#if defined(SACADO_KOKKOS_USE_MEMORY_POOL) - + std::string(", mempool") -#endif -#if defined(SACADO_VIEW_CUDA_HIERARCHICAL_DFAD_STRIDED) - + std::string(", strided") -#endif - ; - } -}; -template struct FadTypeName< Sacado::Fad::SFad > { - static std::string eval() { -#if defined(SACADO_ALIGN_SFAD) - return "sfad, aligned"; -#else - return "sfad"; -#endif - } -}; -template struct FadTypeName< Sacado::Fad::SLFad > { - static std::string eval() { return "slfad"; } -}; - -template -struct DrekarTest { - - int ncells; - int num_basis; - int num_points; - int ndim; - - struct MomFluxTag {}; - struct MomFluxTagConst {}; - struct MomFluxTagConstTeam {}; - - typedef Kokkos::View t_4DView; - typedef Kokkos::View t_3DView; - typedef Kokkos::View t_2DView; - - typedef Kokkos::View > t_3DView_const; - -#if defined(SACADO_VIEW_CUDA_HIERARCHICAL_DFAD) - typedef Sacado::Fad::DFad FadType; // Must be DFad in this case -#else - typedef Sacado::Fad::SFad FadType; -#endif - typedef Kokkos::View t_4DViewFad; - typedef Kokkos::View t_3DViewFad; - typedef Kokkos::View t_2DViewFad; - - typedef typename ExecSpace::array_layout DefaultLayout; -#if defined(KOKKOS_ENABLE_CUDA) - static const int FadStride = - std::is_same< ExecSpace, Kokkos::Cuda >::value ? 32 : 1; -#if defined(SACADO_ALIGN_SFAD) - static const int Nalign = ((N+FadStride-1)/FadStride)*FadStride; - typedef typename FadType::template apply_N::type AlignedFadType; -#else - typedef FadType AlignedFadType; -#endif -#else - static const int FadStride = 1; - typedef FadType AlignedFadType; -#endif - typedef Kokkos::LayoutContiguous ContLayout; - typedef Kokkos::View t_4DViewFadCont; - typedef Kokkos::View t_3DViewFadCont; - typedef Kokkos::View t_2DViewFadCont; - - typedef typename Kokkos::TeamPolicy::member_type team_handle; - - typedef Kokkos::View t_4DView_team; - typedef Kokkos::View t_3DView_team; - typedef Kokkos::View t_2DView_team; - typedef Kokkos::View > t_3DView_const_team; - - - typedef Kokkos::View > t_shared_scalar; - - t_4DViewFad wgb_fad; - t_3DViewFad flux_m_i_fad,wbs_fad; - t_2DViewFad src_m_i_fad,residual_m_i_fad; - - t_4DViewFadCont wgb_fad_cont; - t_3DViewFadCont flux_m_i_fad_cont,wbs_fad_cont; - t_2DViewFadCont src_m_i_fad_cont,residual_m_i_fad_cont; - - t_4DView wgb; - t_3DView flux_m_i,wbs; - t_3DView_const flux_m_i_const; - t_2DView src_m_i,residual_m_i,residual_m_i_const; - - t_4DView_team twgb; - t_3DView_team tflux_m_i,twbs; - t_3DView_const_team tflux_m_i_const; - t_2DView_team tsrc_m_i,tresidual_m_i; - - AlignedFadType coeff; - - DrekarTest(int ncells_, int num_basis_, int num_points_): - ncells(ncells_) , - num_basis(num_basis_) , - num_points(num_points_) , - ndim(DIM), - coeff(N, 0.0) - { - wgb_fad = t_4DViewFad("",ncells,num_basis,num_points,ndim,N+1); - wbs_fad = t_3DViewFad("",ncells,num_basis,num_points,N+1); - flux_m_i_fad = t_3DViewFad("",ncells,num_points,ndim,N+1); - src_m_i_fad = t_2DViewFad("",ncells,num_points,N+1); - residual_m_i_fad = t_2DViewFad("",ncells,num_basis,N+1); - init_fad(wgb_fad, wbs_fad, flux_m_i_fad, src_m_i_fad, residual_m_i_fad); - - wgb_fad_cont = t_4DViewFadCont("",ncells,num_basis,num_points,ndim,N+1); - wbs_fad_cont = t_3DViewFadCont("",ncells,num_basis,num_points,N+1); - flux_m_i_fad_cont = t_3DViewFadCont("",ncells,num_points,ndim,N+1); - src_m_i_fad_cont = t_2DViewFadCont("",ncells,num_points,N+1); - residual_m_i_fad_cont = t_2DViewFadCont("",ncells,num_basis,N+1); - init_fad(wgb_fad_cont, wbs_fad_cont, flux_m_i_fad_cont, src_m_i_fad_cont, residual_m_i_fad_cont); - - wgb = t_4DView("",ncells,num_basis,num_points,ndim); - wbs = t_3DView("",ncells,num_basis,num_points); - flux_m_i_const = flux_m_i = t_3DView("",ncells,num_points,ndim); - src_m_i = t_2DView("",ncells,num_points); - residual_m_i = t_2DView("",ncells,num_basis); - init_array(wgb, wbs, flux_m_i, src_m_i, residual_m_i); - - residual_m_i_const = t_2DView("",ncells,num_basis); - Kokkos::deep_copy( residual_m_i_const, 0.0 ); - - twgb = t_4DView_team("",ncells,num_basis,num_points,ndim); - twbs = t_3DView_team("",ncells,num_basis,num_points); - tflux_m_i_const = tflux_m_i = t_3DView_team("",ncells,num_points,ndim); - tsrc_m_i = t_2DView_team("",ncells,num_points); - tresidual_m_i = t_2DView_team("",ncells,num_basis); - init_array(twgb, twbs, tflux_m_i, tsrc_m_i, tresidual_m_i); - - for (int i=0; i - void init_fad(const V1& v1, const V2& v2, const V3& v3, const V4& v4, const V5& v5) - { - // Kokkos::deep_copy(typename V1::array_type(v1), 1.0); - // Kokkos::deep_copy(typename V2::array_type(v2), 2.0); - // Kokkos::deep_copy(typename V3::array_type(v3), 3.0); - // Kokkos::deep_copy(typename V4::array_type(v4), 4.0); - - auto v1_h = Kokkos::create_mirror_view(v1); - auto v2_h = Kokkos::create_mirror_view(v2); - auto v3_h = Kokkos::create_mirror_view(v3); - auto v4_h = Kokkos::create_mirror_view(v4); - for (int cell=0; cell - void init_array(const V1& v1, const V2& v2, const V3& v3, const V4& v4, const V5& v5) - { - // Kokkos::deep_copy(typename V1::array_type(v1), 1.0); - // Kokkos::deep_copy(typename V2::array_type(v2), 2.0); - // Kokkos::deep_copy(typename V3::array_type(v3), 3.0); - // Kokkos::deep_copy(typename V4::array_type(v4), 4.0); - - auto v1_h = Kokkos::create_mirror_view(v1); - auto v2_h = Kokkos::create_mirror_view(v2); - auto v3_h = Kokkos::create_mirror_view(v3); - auto v4_h = Kokkos::create_mirror_view(v4); - for (int cell=0; cell - typename std::enable_if< !Kokkos::is_view_fad::value, bool>::type - check(const View1& v_gold, const View2& v, const double tol) - { - // Copy to host - typename View1::HostMirror v_gold_h = Kokkos::create_mirror_view(v_gold); - typename View2::HostMirror v_h = Kokkos::create_mirror_view(v); - Kokkos::deep_copy(v_gold_h, v_gold); - Kokkos::deep_copy(v_h, v); - - typedef typename View1::value_type value_type; - - const size_t n0 = v_gold_h.extent(0); - const size_t n1 = v_gold_h.extent(1); - const size_t n2 = v_gold_h.extent(2); - - bool success = true; - for ( size_t i0 = 0 ; i0 < n0 ; ++i0 ) { - for ( size_t i1 = 0 ; i1 < n1 ; ++i1 ) { - for ( size_t i2 = 0 ; i2 < n2 ; ++i2 ) { - value_type x_gold = v_gold_h(i0,i1,i2); - value_type x = v_h(i0,i1,i2); - if (std::abs(x_gold-x) > tol*std::abs(x_gold)) { - std::cout << "Comparison failed! x_gold(" - << i0 << "," << i1 << "," << i2 << ") = " - << x_gold << " , x = " << x - << std::endl; - success = false; - } - } - } - } - - return success; - } - - template - typename std::enable_if< Kokkos::is_view_fad::value, bool>::type - check(const View1& v_gold, const View2& v, const double tol) - { - // Copy to host - typename View1::HostMirror v_gold_h = Kokkos::create_mirror_view(v_gold); - typename View2::HostMirror v_h = Kokkos::create_mirror_view(v); - Kokkos::deep_copy(v_gold_h, v_gold); - Kokkos::deep_copy(v_h, v); - - typedef typename View1::value_type value_type; - - const size_t n0 = v_gold_h.extent(0); - const size_t n1 = v_gold_h.extent(1); - const size_t n2 = v_gold_h.extent(2); - - bool success = true; - for ( size_t i0 = 0 ; i0 < n0 ; ++i0 ) { - for ( size_t i1 = 0 ; i1 < n1 ; ++i1 ) { - for ( size_t i2 = 0 ; i2 < n2 ; ++i2 ) { - value_type x_gold = v_gold_h(i0,i1,i2); - value_type x = (i2 == n2-1) ? v_h(i0,i1).val() : v_h(i0,i1).dx(i2); - if (std::abs(x_gold-x) > tol*std::abs(x_gold)) { - std::cout << "Comparison failed! x_gold(" - << i0 << "," << i1 << "," << i2 << ") = " - << x_gold << " , x = " << x - << std::endl; - success = false; - } - } - } - } - - return success; - } - - KOKKOS_INLINE_FUNCTION - void operator() (const MomFluxTag, const std::size_t &cell) const { - for (int basis=0; basis(0,ncells), *this); - Kokkos::fence(); - double time = timer.seconds() / ntrial / ncells; - - timer.reset(); - for (int i=0; i(0,ncells), *this); - Kokkos::fence(); - double time_const = timer.seconds() / ntrial / ncells; - - timer.reset(); - for (int i=0; i(ncells,num_basis,32).set_scratch_size(0,Kokkos::PerThread(64*8*2)), *this); - Kokkos::fence(); - double time_team = timer.seconds() / ntrial / ncells; - - printf("%5d %9.3e %9.3e %9.3e %9.3e %9.3e\n",ncells,time_fad,time_fad_cont,time,time_const,time_team); - - if (do_check) { - const double tol = 1e-14; - check(residual_m_i, residual_m_i_fad, tol); - check(residual_m_i, residual_m_i_fad_cont, tol); - check(residual_m_i, residual_m_i_const, tol); - check(residual_m_i, tresidual_m_i, tol); - } - } -}; - -template -void run(const int cell_begin, const int cell_end, const int cell_step, - const int nbasis, const int npoint, const int ntrial, const bool check) -{ - const int fad_dim = 50; - const int dim = 3; - typedef DrekarTest test_type; - - // The kernel allocates 2*N double's per warp on Cuda. Approximate - // the maximum number of warps as the maximum concurrency / 32. - // Include a fudge factor of 1.2 since memory pool treats a block as full - // once it reaches 80% capacity - std::cout << "concurrency = " << ExecSpace::concurrency() << std::endl; - const size_t block_size = fad_dim*sizeof(double); - size_t nkernels = ExecSpace::concurrency()*2; -#if defined(KOKKOS_ENABLE_CUDA) - if (std::is_same::value) - nkernels /= 32; -#endif - size_t mem_pool_size = - static_cast(1.2*nkernels*block_size); - const size_t superblock_size = std::max(nkernels / 100, 1) * block_size; - std::cout << "Memory pool size = " << mem_pool_size / (1024.0 * 1024.0) - << " MB" << std::endl; - ExecSpace exec_space; - Sacado::createGlobalMemoryPool(exec_space, mem_pool_size, - block_size, block_size, superblock_size); - -#if defined(SACADO_VIEW_CUDA_HIERARCHICAL) - std::cout << "hierarchical"; -#elif defined(SACADO_VIEW_CUDA_HIERARCHICAL_DFAD) - std::cout << "hierarchical_dfad"; -#else - std::cout << "partitioned"; -#endif - std::cout << ", " << FadTypeName::eval() - << ":" << std::endl; - - printf("ncell flat hier analytic const team\n"); - for(int i=cell_begin; i<=cell_end; i+=cell_step) { - test_type test(i,nbasis,npoint); - test.compute(ntrial, check); - } - - Sacado::destroyGlobalMemoryPool(exec_space); -} - -int main(int argc, char* argv[]) { - bool success = true; - try { - - // Set up command line options - Teuchos::CommandLineProcessor clp(false); - clp.setDocString("This program tests the speed of various forward mode AD implementations for simple Kokkos kernel"); -#ifdef KOKKOS_ENABLE_SERIAL - bool serial = 0; - clp.setOption("serial", "no-serial", &serial, "Whether to run Serial"); -#endif -#ifdef KOKKOS_ENABLE_OPENMP - int openmp = 0; - clp.setOption("openmp", &openmp, "Number of OpenMP threads"); -#endif -#ifdef KOKKOS_ENABLE_THREADS - int threads = 0; - clp.setOption("threads", &threads, "Number of pThreads threads"); -#endif -#ifdef KOKKOS_ENABLE_CUDA - bool cuda = 0; - clp.setOption("cuda", "no-cuda", &cuda, "Whether to run CUDA"); -#endif - int numa = 0; - clp.setOption("numa", &numa, - "Number of NUMA domains to use (set to 0 to use all NUMAs"); - int cores_per_numa = 0; - clp.setOption("cores-per-numa", &cores_per_numa, - "Number of CPU cores per NUMA to use (set to 0 to use all cores)"); - bool print_config = false; - clp.setOption("print-config", "no-print-config", &print_config, - "Whether to print Kokkos device configuration"); - int cell_begin = 100; - clp.setOption("begin", &cell_begin, "Starting number of cells"); - int cell_end = 8000; - clp.setOption("end", &cell_end, "Ending number of cells"); - int cell_step = 100; - clp.setOption("step", &cell_step, "Cell increment"); - int nbasis = 8; - clp.setOption("basis", &nbasis, "Number of basis functions"); - int npoint = 8; - clp.setOption("point", &npoint, "Number of integration points"); - int ntrial = 5; - clp.setOption("trial", &ntrial, "Number of trials"); - bool check = false; - clp.setOption("check", "no-check", &check, - "Check correctness of results"); - - // Parse options - switch (clp.parse(argc, argv)) { - case Teuchos::CommandLineProcessor::PARSE_HELP_PRINTED: - return 0; - case Teuchos::CommandLineProcessor::PARSE_ERROR: - case Teuchos::CommandLineProcessor::PARSE_UNRECOGNIZED_OPTION: - return 1; - case Teuchos::CommandLineProcessor::PARSE_SUCCESSFUL: - break; - } - - Kokkos::InitArguments init_args; - init_args.num_threads = -1; - #ifdef KOKKOS_ENABLE_OPENMP - if(openmp) init_args.num_threads = openmp; - #endif - #ifdef KOKKOS_ENABLE_THREADS - if(threads) init_args.num_threads = threads; - #endif - - Kokkos::initialize(init_args); - if (print_config) - Kokkos::print_configuration(std::cout, true); - -#ifdef KOKKOS_ENABLE_SERIAL - if (serial) { - using Kokkos::Serial; - run(cell_begin, cell_end, cell_step, nbasis, npoint, ntrial, check); - } -#endif - -#ifdef KOKKOS_ENABLE_OPENMP - if (openmp) { - using Kokkos::OpenMP; - run(cell_begin, cell_end, cell_step, nbasis, npoint, ntrial, check); - } -#endif - -#ifdef KOKKOS_ENABLE_THREADS - if (threads) { - using Kokkos::Threads; - run(cell_begin, cell_end, cell_step, nbasis, npoint, ntrial, check); - } -#endif - -#ifdef KOKKOS_ENABLE_CUDA - if (cuda) { - using Kokkos::Cuda; - run(cell_begin, cell_end, cell_step, nbasis, npoint, ntrial, check); - } -#endif - Kokkos::finalize(); - } - TEUCHOS_STANDARD_CATCH_STATEMENTS(true, std::cerr, success); - - return !success; -} diff --git a/packages/sacado/test/performance/fad_kokkos_mat_vec_perf.cpp b/packages/sacado/test/performance/fad_kokkos_mat_vec_perf.cpp deleted file mode 100644 index 7e413cf17f5c..000000000000 --- a/packages/sacado/test/performance/fad_kokkos_mat_vec_perf.cpp +++ /dev/null @@ -1,689 +0,0 @@ -// @HEADER -// *********************************************************************** -// -// Sacado Package -// Copyright (2006) Sandia Corporation -// -// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, -// the U.S. Government retains certain rights in this software. -// -// This library is free software; you can redistribute it and/or modify -// it under the terms of the GNU Lesser General Public License as -// published by the Free Software Foundation; either version 2.1 of the -// License, or (at your option) any later version. -// -// This library is distributed in the hope that it will be useful, but -// WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -// Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public -// License along with this library; if not, write to the Free Software -// Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 -// USA -// Questions? Contact David M. Gay (dmgay@sandia.gov) or Eric T. Phipps -// (etphipp@sandia.gov). -// -// *********************************************************************** -// @HEADER - -//#define SACADO_VIEW_CUDA_HIERARCHICAL 1 -#define SACADO_VIEW_CUDA_HIERARCHICAL_DFAD 1 -#define SACADO_KOKKOS_USE_MEMORY_POOL 1 -//#define SACADO_ALIGN_SFAD 1 - -//#define SACADO_DISABLE_FAD_VIEW_SPEC -#include "Sacado.hpp" - -#include "Teuchos_CommandLineProcessor.hpp" -#include "Teuchos_StandardCatchMacros.hpp" -#include "Teuchos_Time.hpp" - -#include "impl/Kokkos_Timer.hpp" - -// For vtune -#include -#include -#include - -// A performance test that computes the derivative of a simple Kokkos kernel -// using various Fad classes - -template -void run_mat_vec(const ViewTypeA& A, const ViewTypeB& b, const ViewTypeC& c) { - typedef typename ViewTypeC::value_type scalar_type; - typedef typename ViewTypeC::execution_space execution_space; - - const int m = A.extent(0); - const int n = A.extent(1); - Kokkos::parallel_for( - Kokkos::RangePolicy( 0,m ), - KOKKOS_LAMBDA (const int i) { - scalar_type t = 0.0; - for (int j=0; j -void run_mat_vec_hierarchical(const ViewTypeA& A, const ViewTypeB& b, - const ViewTypeC& c) { - typedef typename ViewTypeC::value_type scalar_type; - typedef typename ViewTypeC::execution_space execution_space; - -#if defined (KOKKOS_ENABLE_CUDA) - const bool is_cuda = std::is_same::value; -#else - const bool is_cuda = false; -#endif - const unsigned vector_size = is_cuda ? 32 : 1; - const unsigned team_size = is_cuda ? 128 / vector_size : 1; - - const int m = A.extent(0); - const int n = A.extent(1); - const int range = (m+team_size-1)/team_size; - - typedef Kokkos::TeamPolicy Policy; - Kokkos::parallel_for( - Policy( range,team_size,vector_size ), - KOKKOS_LAMBDA (const typename Policy::member_type& team) { - const int i = team.league_rank()*team.team_size() + team.team_rank(); - if (i >= m) - return; - - scalar_type t = 0.0; - for (int j=0; j -void run_mat_vec_hierarchical(const ViewTypeA& A, const ViewTypeB& b, - const ViewTypeC& c) { - typedef typename Kokkos::ThreadLocalScalarType::type scalar_type; - typedef typename ViewTypeC::execution_space execution_space; - -#if defined (KOKKOS_ENABLE_CUDA) - const bool is_cuda = std::is_same::value; -#else - const bool is_cuda = false; -#endif - const unsigned vector_size = is_cuda ? 32 : 1; - const unsigned team_size = is_cuda ? 128 / vector_size : 1; - - const int m = A.extent(0); - const int n = A.extent(1); - const int range = (m+team_size-1)/team_size; - - typedef Kokkos::TeamPolicy Policy; - Kokkos::parallel_for( - Policy( range,team_size,vector_size ), - KOKKOS_LAMBDA (const typename Policy::member_type& team) { - const int i = team.league_rank()*team.team_size() + team.team_rank(); - if (i >= m) - return; - - scalar_type t = 0.0; - for (int j=0; j -void run_mat_vec_hierarchical(const ViewTypeA& A, const ViewTypeB& b, - const ViewTypeC& c) { - typedef typename ViewTypeC::value_type scalar_type; - typedef typename ViewTypeC::execution_space execution_space; - -#if defined (KOKKOS_ENABLE_CUDA) - const bool is_cuda = std::is_same::value; -#else - const bool is_cuda = false; -#endif - const unsigned vector_size = 1; - const unsigned team_size = is_cuda ? 128 / vector_size : 1; - - const int m = A.extent(0); - const int n = A.extent(1); - const int range = (m+team_size-1)/team_size; - - typedef Kokkos::TeamPolicy Policy; - Kokkos::parallel_for( - Policy( range,team_size,vector_size ), - KOKKOS_LAMBDA (const typename Policy::member_type& team) { - const int i = team.league_rank()*team.team_size() + team.team_rank(); - if (i >= m) - return; - - scalar_type t = 0.0; - for (int j=0; j -void -check_val(const ViewTypeA& A, const ViewTypeB& b, const ViewTypeC& c) -{ - const double tol = 1.0e-14; - typedef typename ViewTypeC::value_type value_type; - typename ViewTypeC::HostMirror h_c = Kokkos::create_mirror_view(c); - Kokkos::deep_copy(h_c, c); - const size_t m = A.extent(0); - const size_t n = A.extent(1); - for (size_t i=0; i tol) { - std::cout << "Comparison failed! " << i << " : " << h_c(i) << " , " << t - << std::endl; - } - } -} - -template -void -check_deriv(const ViewTypeA& A, const ViewTypeB& b, const ViewTypeC& c) -{ - const double tol = 1.0e-14; - typedef typename ViewTypeC::value_type value_type; - typename ViewTypeC::HostMirror h_c = Kokkos::create_mirror_view(c); - Kokkos::deep_copy(h_c, c); - const size_t m = A.extent(0); - const size_t n = A.extent(1); - const size_t p = Kokkos::dimension_scalar(A); - for (size_t i=0; i tol) { - std::cout << "Comparison failed! " << i << "," << j << " : " - << h_c(i).fastAccessDx(j) << " , " << t << std::endl; - } - } - } -} - -struct Perf { - double time; - double flops; - double throughput; -}; - -template -Perf -do_time_fad(const size_t m, const size_t n, const size_t p, const size_t nloop, - const bool check) -{ - typedef Kokkos::View ViewTypeA; - typedef Kokkos::View ViewTypeB; - typedef Kokkos::View ViewTypeC; - typedef typename ViewTypeA::execution_space execution_space; - - ViewTypeA A("A",m,n,p+1); - ViewTypeB b("B",n,p+1); - ViewTypeC c("c",m,p+1); - - FadType a(p, 1.0); - for (size_t k=0; k -Perf -do_time_fad_hierarchical(const size_t m, const size_t n, const size_t p, - const size_t nloop, const bool check) -{ - typedef Kokkos::View ViewTypeA; - typedef Kokkos::View ViewTypeB; - typedef Kokkos::View ViewTypeC; - typedef typename ViewTypeA::execution_space execution_space; - -#if defined (KOKKOS_ENABLE_CUDA) - const bool is_cuda = std::is_same::value; -#else - const bool is_cuda = false; -#endif - -#if defined(SACADO_VIEW_CUDA_HIERARCHICAL) - const int FadStride = is_cuda ? 32 : 1; -#if defined(SACADO_ALIGN_SFAD) - const int N = Sacado::StaticSize::value; - const int Nalign = ((N+FadStride-1)/FadStride)*FadStride; - const size_t pa = N > 0 ? ((p+FadStride-1)/FadStride)*FadStride : p; - typedef typename FadType::template apply_N::type AlignedFadType; -#else - typedef FadType AlignedFadType; - const size_t pa = p; -#endif -#else - const int FadStride = 1; - typedef FadType AlignedFadType; - const size_t pa = p; -#endif - -#if defined(SACADO_VIEW_CUDA_HIERARCHICAL) || defined(SACADO_VIEW_CUDA_HIERARCHICAL_DFAD) - typedef Kokkos::LayoutContiguous ConLayoutA; - typedef Kokkos::LayoutContiguous ConLayoutB; - typedef Kokkos::LayoutContiguous ConLayoutC; -#else - typedef typename ViewTypeA::array_layout ConLayoutA; - typedef typename ViewTypeB::array_layout ConLayoutB; - typedef typename ViewTypeC::array_layout ConLayoutC; - (void) FadStride; -#endif - - - typedef Kokkos::View ConViewTypeA; - typedef Kokkos::View ConViewTypeB; - typedef Kokkos::View ConViewTypeC; - - ConViewTypeA A("A",m,n,pa+1); - ConViewTypeB b("B",n,pa+1); - ConViewTypeC c("c",m,pa+1); - - AlignedFadType a(pa, 1.0); - for (size_t k=0; k(1.2*nkernels*block_size); - const size_t superblock_size = std::max(nkernels / 100, 1) * block_size; - execution_space space; - Sacado::createGlobalMemoryPool(space, mem_pool_size, - block_size, - block_size, - superblock_size - ); -#endif - - // Execute the kernel once to warm up - run_mat_vec_hierarchical( A, b, c ); - execution_space().fence(); - - wall_clock.reset(); - for (size_t l=0; l -Perf -do_time_val(const size_t m, const size_t n, const size_t nloop, - const bool check) -{ - typedef Kokkos::View ViewTypeA; - typedef Kokkos::View ViewTypeB; - typedef Kokkos::View ViewTypeC; - typedef typename ViewTypeA::execution_space execution_space; - - ViewTypeA A("A",m,n); - ViewTypeB b("B",n); - ViewTypeC c("c",m); - - Kokkos::deep_copy(A, 1.0); - Kokkos::deep_copy(b, 1.0); - - Kokkos::Impl::Timer wall_clock; - Perf perf; - - // Execute the kernel once to warm up - run_mat_vec( A, b, c ); - execution_space().fence(); - - wall_clock.reset(); - for (size_t l=0; l -void -do_times(const size_t m, - const size_t n, - const size_t p, - const size_t ph, - const size_t nloop, - const bool value, - const bool sfad, - const bool slfad, - const bool dfad, - const bool hierarchical, - const bool check) -{ - Perf perf_value; - perf_value.time = 1.0; - - // Run value - if (value) { - Perf perf = do_time_val(m,n,nloop,check); - perf_value = perf; - print_perf(perf, perf_value, p, "Value "); - } - - // Run SFad - if (sfad && p == SFadSize) { - Perf perf = - do_time_fad, ViewArgs...>(m,n,p,nloop,check); - print_perf(perf, perf_value, p, "SFad "); - } - - // Run SLFad - if (slfad && p <= SLFadSize) { - Perf perf = - do_time_fad, ViewArgs...>(m,n,p,nloop,check); - print_perf(perf, perf_value, p, "SLFad "); - } - - // Run DFad - if (dfad) { - Perf perf = - do_time_fad, ViewArgs...>(m,n,p,nloop,check); - print_perf(perf, perf_value, p, "DFad "); - } - - // Run hierarchical - if (hierarchical) { - if (sfad && ph == HierSFadSize) { - Perf perf = - do_time_fad_hierarchical, ViewArgs...>(m,n,ph,nloop,check); - print_perf(perf, perf_value, ph, "Hier SFad "); - } - if (slfad && ph <= HierSLFadSize) { - Perf perf = - do_time_fad_hierarchical, ViewArgs...>(m,n,ph,nloop,check); - print_perf(perf, perf_value, ph, "Hier SLFad"); - } - if (dfad) { - Perf perf = - do_time_fad_hierarchical, ViewArgs...>(m,n,ph,nloop,check); - print_perf(perf, perf_value, ph, "Hier DFad "); - } - } - -} - -enum LayoutType { - LAYOUT_LEFT=0, - LAYOUT_RIGHT, - LAYOUT_DEFAULT -}; -const int num_layout_types = 3; -const LayoutType layout_values[] = { - LAYOUT_LEFT, LAYOUT_RIGHT, LAYOUT_DEFAULT }; -const char *layout_names[] = { "left", "right", "default" }; - -template -void -do_times_layout(const size_t m, - const size_t n, - const size_t p, - const size_t ph, - const size_t nloop, - const bool value, - const bool sfad, - const bool slfad, - const bool dfad, - const bool hierarchical, - const bool check, - const LayoutType& layout, - const std::string& device) -{ - int prec = 2; - std::cout.setf(std::ios::scientific); - std::cout.precision(prec); - std::cout << std::endl - << device - << " performance for layout " - << layout_names[layout] - << " m = " << m << " n = " << n << " p = " << p << " ph = " << ph - << std::endl << std::endl; - std::cout << "Computation \t Time \t Throughput \t Ratio" << std::endl; - - if (layout == LAYOUT_LEFT) - do_times( - m,n,p,ph,nloop,value,sfad,slfad,dfad,hierarchical,check); - else if (layout == LAYOUT_RIGHT) - do_times( - m,n,p,ph,nloop,value,sfad,slfad,dfad,hierarchical,check); - else - do_times - (m,n,p,ph,nloop,value,sfad,slfad,dfad,hierarchical,check); -} - -// Connect executable to vtune for profiling -void connect_vtune() { - std::stringstream cmd; - pid_t my_os_pid=getpid(); - const std::string vtune_loc = - "amplxe-cl"; - const std::string output_dir = "./vtune"; - cmd << vtune_loc - << " -collect hotspots -result-dir " << output_dir - << " -target-pid " << my_os_pid << " &"; - std::cout << cmd.str() << std::endl; - system(cmd.str().c_str()); - system("sleep 10"); -} - -const int SFadSize = 8; -const int SLFadSize = SFadSize; -const int HierSFadSize = 50; -const int HierSLFadSize = HierSFadSize; - -int main(int argc, char* argv[]) { - bool success = true; - try { - - // Set up command line options - Teuchos::CommandLineProcessor clp(false); - clp.setDocString("This program tests the speed of various forward mode AD implementations for simple Kokkos kernel"); - int m = 100000; - clp.setOption("m", &m, "Number of matrix rows"); - int n = 100; - clp.setOption("n", &n, "Number of matrix columns"); - int p = SFadSize; - clp.setOption("p", &p, "Number of derivative components"); - int ph = HierSFadSize; - clp.setOption("ph", &ph, "Number of derivative components for hierarchical"); - int nloop = 10; - clp.setOption("nloop", &nloop, "Number of loops"); -#ifdef KOKKOS_ENABLE_SERIAL - bool serial = 0; - clp.setOption("serial", "no-serial", &serial, "Whether to run Serial"); -#endif -#ifdef KOKKOS_ENABLE_OPENMP - int openmp = 0; - clp.setOption("openmp", &openmp, "Number of OpenMP threads"); -#endif -#ifdef KOKKOS_ENABLE_THREADS - int threads = 0; - clp.setOption("threads", &threads, "Number of pThreads threads"); -#endif -#ifdef KOKKOS_ENABLE_CUDA - bool cuda = 0; - clp.setOption("cuda", "no-cuda", &cuda, "Whether to run CUDA"); -#endif - int numa = 0; - clp.setOption("numa", &numa, - "Number of NUMA domains to use (set to 0 to use all NUMAs"); - int cores_per_numa = 0; - clp.setOption("cores-per-numa", &cores_per_numa, - "Number of CPU cores per NUMA to use (set to 0 to use all cores)"); - bool print_config = false; - clp.setOption("print-config", "no-print-config", &print_config, - "Whether to print Kokkos device configuration"); - LayoutType layout = LAYOUT_DEFAULT; - clp.setOption("layout", &layout, num_layout_types, layout_values, - layout_names, "View layout"); - bool vtune = false; - clp.setOption("vtune", "no-vtune", &vtune, "Profile with vtune"); - bool value = true; - clp.setOption("value", "no-value", &value, "Run value calculation"); - bool sfad = true; - clp.setOption("sfad", "no-sfad", &sfad, "Run SFad derivative calculation"); - bool slfad = true; - clp.setOption("slfad", "no-slfad", &slfad, "Run SLFad derivative calculation"); - bool dfad = true; - clp.setOption("dfad", "no-dfad", &dfad, "Run DFad derivative calculation"); - bool hierarchical = true; - clp.setOption("hierarchical", "no-hierarchical", &hierarchical, "Run hierarchical Fad derivative calculation"); - bool check = false; - clp.setOption("check", "no-check", &check, "Check calculations are correct"); - - // Parse options - switch (clp.parse(argc, argv)) { - case Teuchos::CommandLineProcessor::PARSE_HELP_PRINTED: - return 0; - case Teuchos::CommandLineProcessor::PARSE_ERROR: - case Teuchos::CommandLineProcessor::PARSE_UNRECOGNIZED_OPTION: - return 1; - case Teuchos::CommandLineProcessor::PARSE_SUCCESSFUL: - break; - } - - if (vtune) - connect_vtune(); - -// #ifdef KOKKOS_ENABLE_SERIAL -// if (serial) { -// Kokkos::initialize(); -// if (print_config) -// Kokkos::print_configuration(std::cout, true); -// do_times_layout( -// m,n,p,nloop,value,sfad,slfad,dfad,hierarchical,check,layout,"Serial"); -// Kokkos::finalize(); -// } -// #endif - -#ifdef KOKKOS_ENABLE_OPENMP - if (openmp) { - Kokkos::InitArgs init_args; - init_args.num_threads = openmp; - init_args.num_numa = numa; - Kokkos::initialize( init_args ); - if (print_config) - Kokkos::print_configuration(std::cout, true); - do_times_layout( - m,n,p,ph,nloop,value,sfad,slfad,dfad,hierarchical,check,layout,"OpenMP"); - Kokkos::finalize(); - } -#endif - -#ifdef KOKKOS_ENABLE_THREADS - if (threads) { - Kokkos::InitArgs init_args; - init_args.num_threads = threads; - init_args.num_numa = numa; - Kokkos::initialize( init_args ); - if (print_config) - Kokkos::print_configuration(std::cout, true); - do_times_layout( - m,n,p,ph,nloop,value,sfad,slfad,dfad,hierarchical,check,layout,"Threads"); - Kokkos::finalize(); - } -#endif - -#ifdef KOKKOS_ENABLE_CUDA - if (cuda) { - Kokkos::initialize(); - if (print_config) - Kokkos::print_configuration(std::cout, true); - do_times_layout( - m,n,p,ph,nloop,value,sfad,slfad,dfad,hierarchical,check,layout,"Cuda"); - Kokkos::finalize(); - } -#endif - - } - TEUCHOS_STANDARD_CATCH_STATEMENTS(true, std::cerr, success); - - return !success; -} diff --git a/packages/sacado/test/performance/fad_kokkos_view.cpp b/packages/sacado/test/performance/fad_kokkos_view.cpp deleted file mode 100644 index d214cf434a2d..000000000000 --- a/packages/sacado/test/performance/fad_kokkos_view.cpp +++ /dev/null @@ -1,804 +0,0 @@ -// @HEADER -// *********************************************************************** -// -// Sacado Package -// Copyright (2006) Sandia Corporation -// -// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, -// the U.S. Government retains certain rights in this software. -// -// This library is free software; you can redistribute it and/or modify -// it under the terms of the GNU Lesser General Public License as -// published by the Free Software Foundation; either version 2.1 of the -// License, or (at your option) any later version. -// -// This library is distributed in the hope that it will be useful, but -// WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -// Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public -// License along with this library; if not, write to the Free Software -// Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 -// USA -// Questions? Contact David M. Gay (dmgay@sandia.gov) or Eric T. Phipps -// (etphipp@sandia.gov). -// -// *********************************************************************** -// @HEADER - -//#define SACADO_DISABLE_FAD_VIEW_SPEC -#include "Sacado.hpp" - -#include "Teuchos_CommandLineProcessor.hpp" -#include "Teuchos_StandardCatchMacros.hpp" -#include "Teuchos_Time.hpp" - -#include "impl/Kokkos_Timer.hpp" - -// For vtune -#include -#include - -// A performance test that computes the derivative of a simple Kokkos kernel -// using various Fad classes - -// Our Kokkos kernel: computes c = A*b for A mxn and b nx1 -template -struct MatVecFunctor { - - // The scalar type used in this calculation, e.g., double - typedef typename ViewTypeC::value_type scalar_type; - - // The best ordinal type for the architecture we are running on, - // e.g., int or size_t - //typedef typename ViewTypeC::size_type size_type; - typedef int size_type; - - // The execution space where this functor will run - typedef typename ViewTypeC::execution_space execution_space; - - // Data needed by functor - const ViewTypeA A; - const ViewTypeB b; - const ViewTypeC c; - const size_type n; - - // Constructor - MatVecFunctor(const ViewTypeA& A_arg, - const ViewTypeB& b_arg, - const ViewTypeC& c_arg) : - A(A_arg), b(b_arg), c(c_arg), n(A.extent(1)) - {} - - // Function to compute matrix-vector product for a given row i - KOKKOS_INLINE_FUNCTION - void operator() (const size_type i) const - { - scalar_type t = 0.0; - for (size_type j=0; j -struct MatVecDerivFunctor { - - // The scalar type used in this calculation, e.g., double - typedef typename ViewTypeC::value_type scalar_type; - - // The best ordinal type for the architecture we are running on, - // e.g., int or size_t - //typedef typename ViewTypeC::size_type size_type; - typedef int size_type; - - // The execution space where this functor will run - typedef typename ViewTypeC::execution_space execution_space; - - // Data needed by functor - const ViewTypeA A; - const ViewTypeB b; - const ViewTypeC c; - const size_type n; - const size_type p; - - // Constructor - MatVecDerivFunctor(const ViewTypeA& A_arg, - const ViewTypeB& b_arg, - const ViewTypeC& c_arg) : - A(A_arg), b(b_arg), c(c_arg), n(A.extent(1)), p(A.extent(2)-1) - {} - - KOKKOS_INLINE_FUNCTION - void operator() (const size_type i) const - { - c(i,p) = 0.0; - for (size_type k=0; k -struct SLMatVecDerivFunctor { - - // The scalar type used in this calculation, e.g., double - typedef typename ViewTypeC::value_type scalar_type; - - // The best ordinal type for the architecture we are running on, - // e.g., int or size_t - //typedef typename ViewTypeC::size_type size_type; - typedef int size_type; - - // The execution space where this functor will run - typedef typename ViewTypeC::execution_space execution_space; - - // Data needed by functor - const ViewTypeA A; - const ViewTypeB b; - const ViewTypeC c; - const size_type n; - const size_type p; - - // Constructor - SLMatVecDerivFunctor(const ViewTypeA& A_arg, - const ViewTypeB& b_arg, - const ViewTypeC& c_arg) : - A(A_arg), b(b_arg), c(c_arg), n(A.extent(1)), p(A.extent(2)-1) - {} - - KOKKOS_INLINE_FUNCTION - void operator() (const size_type i) const - { - scalar_type cv = 0.0; - scalar_type t[MaxP]; - for (size_type k=0; k -struct SMatVecDerivFunctor { - - // The scalar type used in this calculation, e.g., double - typedef typename ViewTypeC::value_type scalar_type; - - // The best ordinal type for the architecture we are running on, - // e.g., int or size_t - //typedef typename ViewTypeC::size_type size_type; - typedef int size_type; - - // The execution space where this functor will run - typedef typename ViewTypeC::execution_space execution_space; - - // Data needed by functor - const ViewTypeA A; - const ViewTypeB b; - const ViewTypeC c; - const size_type n; - - // Constructor - SMatVecDerivFunctor(const ViewTypeA& A_arg, - const ViewTypeB& b_arg, - const ViewTypeC& c_arg) : - A(A_arg), b(b_arg), c(c_arg), n(A.extent(1)) - {} - - KOKKOS_INLINE_FUNCTION - void operator() (const size_type i) const - { - scalar_type cv = 0.0; - scalar_type t[p]; - for (size_type k=0; k -void -run_mat_vec(const ViewTypeA& A, const ViewTypeB& b, const ViewTypeC& c) -{ - MatVecFunctor f( A, b, c ); - Kokkos::parallel_for( A.extent(0), f ); -} - -// Create a mat-vec derivative functor from given A, b, c -template -void -run_mat_vec_deriv(const ViewTypeA& A, const ViewTypeB& b, const ViewTypeC& c) -{ - MatVecDerivFunctor f( A, b, c ); - Kokkos::parallel_for( A.extent(0), f ); -} - -// Create a mat-vec derivative functor from given A, b, c -template -void -run_mat_vec_deriv_sl(const ViewTypeA& A, const ViewTypeB& b, const ViewTypeC& c) -{ - SLMatVecDerivFunctor f( A, b, c ); - Kokkos::parallel_for( A.extent(0), f ); -} - -// Create a mat-vec derivative functor from given A, b, c -template -void -run_mat_vec_deriv_s(const ViewTypeA& A, const ViewTypeB& b, const ViewTypeC& c) -{ - SMatVecDerivFunctor f( A, b, c ); - Kokkos::parallel_for( A.extent(0), f ); -} - -template -void -check_val(const ViewTypeA& A, const ViewTypeB& b, const ViewTypeC& c) -{ - const double tol = 1.0e-14; - typedef typename ViewTypeC::value_type value_type; - typename ViewTypeC::HostMirror h_c = Kokkos::create_mirror_view(c); - Kokkos::deep_copy(h_c, c); - const size_t m = A.extent(0); - const size_t n = A.extent(1); - for (size_t i=0; i tol) { - std::cout << "Comparison failed! " << i << " : " << h_c(i) << " , " << t - << std::endl; - } - } -} - -template -void -check_deriv(const ViewTypeA& A, const ViewTypeB& b, const ViewTypeC& c) -{ - const double tol = 1.0e-14; - typedef typename ViewTypeC::value_type value_type; - typename ViewTypeC::HostMirror h_c = Kokkos::create_mirror_view(c); - Kokkos::deep_copy(h_c, c); - const size_t m = A.extent(0); - const size_t n = A.extent(1); - const size_t p = A.extent(2); - for (size_t i=0; i tol) { - std::cout << "Comparison failed! " << i << "," << j << " : " - << h_c(i,j) << " , " << t << std::endl; - } - } - } -} - -struct Perf { - double time; - double flops; - double throughput; -}; - -template -Perf -do_time_fad(const size_t m, const size_t n, const size_t p, const size_t nloop, - const bool check) -{ - typedef Kokkos::View ViewTypeA; - typedef Kokkos::View ViewTypeB; - typedef Kokkos::View ViewTypeC; - typedef typename ViewTypeA::execution_space execution_space; - - ViewTypeA A("A",m,n,p+1); - ViewTypeB b("B",n,p+1); - ViewTypeC c("c",m,p+1); - - FadType a(p, 1.0); - for (size_t k=0; k -Perf -do_time_analytic(const size_t m, const size_t n, const size_t p, - const size_t nloop, const bool check) -{ - typedef Kokkos::View ViewTypeA; - typedef Kokkos::View ViewTypeB; - typedef Kokkos::View ViewTypeC; - typedef typename ViewTypeA::execution_space execution_space; - - ViewTypeA A("A",m,n,p+1); - ViewTypeB b("B",n,p+1); - ViewTypeC c("c",m,p+1); - - Kokkos::deep_copy(A, 1.0); - Kokkos::deep_copy(b, 1.0); - - Kokkos::Impl::Timer wall_clock; - Perf perf; - - // Execute the kernel once to warm up - run_mat_vec_deriv( A, b, c ); - execution_space().fence(); - - Teuchos::Time timer("mult", false); - timer.start(true); - for (size_t l=0; l -Perf -do_time_analytic_sl(const size_t m, const size_t n, const size_t p, - const size_t nloop, const bool check) -{ - typedef Kokkos::View ViewTypeA; - typedef Kokkos::View ViewTypeB; - typedef Kokkos::View ViewTypeC; - typedef typename ViewTypeA::execution_space execution_space; - - ViewTypeA A("A",m,n,p+1); - ViewTypeB b("B",n,p+1); - ViewTypeC c("c",m,p+1); - - Kokkos::deep_copy(A, 1.0); - Kokkos::deep_copy(b, 1.0); - - Kokkos::Impl::Timer wall_clock; - Perf perf; - - // Execute the kernel once to warm up - run_mat_vec_deriv_sl( A, b, c ); - execution_space().fence(); - - Teuchos::Time timer("mult", false); - timer.start(true); - for (size_t l=0; l( A, b, c ); - } - execution_space().fence(); - timer.stop(); - - perf.time = wall_clock.seconds() / nloop; - perf.flops = m*n*(2+4*p); - perf.throughput = perf.flops / perf.time / 1.0e9; - - if (check) - check_deriv(A,b,c); - - return perf; -} - -template -Perf -do_time_analytic_s(const size_t m, const size_t n, - const size_t nloop, const bool check) -{ - typedef Kokkos::View ViewTypeA; - typedef Kokkos::View ViewTypeB; - typedef Kokkos::View ViewTypeC; - typedef typename ViewTypeA::execution_space execution_space; - - ViewTypeA A("A",m,n,p+1); - ViewTypeB b("B",n,p+1); - ViewTypeC c("c",m,p+1); - - Kokkos::deep_copy(A, 1.0); - Kokkos::deep_copy(b, 1.0); - - Kokkos::Impl::Timer wall_clock; - Perf perf; - - // Execute the kernel once to warm up - run_mat_vec_deriv_s

( A, b, c ); - execution_space().fence(); - - Teuchos::Time timer("mult", false); - timer.start(true); - for (size_t l=0; l( A, b, c ); - } - execution_space().fence(); - timer.stop(); - - perf.time = wall_clock.seconds() / nloop; - perf.flops = m*n*(2+4*p); - perf.throughput = perf.flops / perf.time / 1.0e9; - - if (check) - check_deriv(A,b,c); - - return perf; -} - -template -Perf -do_time_val(const size_t m, const size_t n, const size_t nloop, - const bool check) -{ - typedef Kokkos::View ViewTypeA; - typedef Kokkos::View ViewTypeB; - typedef Kokkos::View ViewTypeC; - typedef typename ViewTypeA::execution_space execution_space; - - ViewTypeA A("A",m,n); - ViewTypeB b("B",n); - ViewTypeC c("c",m); - - Kokkos::deep_copy(A, 1.0); - Kokkos::deep_copy(b, 1.0); - - Kokkos::Impl::Timer wall_clock; - Perf perf; - - // Execute the kernel once to warm up - run_mat_vec( A, b, c ); - execution_space().fence(); - - wall_clock.reset(); - for (size_t l=0; l -void -do_times(const size_t m, - const size_t n, - const size_t p, - const size_t nloop, - const bool value, - const bool analytic, - const bool sfad, - const bool slfad, - const bool dfad, - const bool check) -{ - Perf perf_analytic; - perf_analytic.time = 1.0; - - // Run analytic - if (analytic) { - perf_analytic = do_time_analytic(m,n,p,nloop,check); - } - - // Run value - if (value) { - Perf perf = do_time_val(m,n,nloop,check); - print_perf(perf, perf_analytic, "Value "); - } - - if (analytic) { - print_perf(perf_analytic, perf_analytic, "Analytic "); - } - - if(analytic && p == SFadSize) { - Perf perf = - do_time_analytic_s(m,n,nloop,check); - print_perf(perf, perf_analytic, "Analytic-s"); - } - - if(analytic && p <= SLFadSize) { - Perf perf = - do_time_analytic_sl(m,n,p,nloop,check); - print_perf(perf, perf_analytic, "Analytic-sl"); - } - - // Run SFad - if (sfad && p == SFadSize) { - Perf perf = - do_time_fad, ViewArgs...>(m,n,p,nloop,check); - print_perf(perf, perf_analytic, "SFad "); - } - - // Run SLFad - if (slfad && p <= SLFadSize) { - Perf perf = - do_time_fad, ViewArgs...>(m,n,p,nloop,check); - print_perf(perf, perf_analytic, "SLFad "); - } - - // Run DFad - if (dfad) { - Perf perf = - do_time_fad, ViewArgs...>(m,n,p,nloop,check); - print_perf(perf, perf_analytic, "DFad "); - } - -} - -enum LayoutType { - LAYOUT_LEFT=0, - LAYOUT_RIGHT, - LAYOUT_DEFAULT -}; -const int num_layout_types = 3; -const LayoutType layout_values[] = { - LAYOUT_LEFT, LAYOUT_RIGHT, LAYOUT_DEFAULT }; -const char *layout_names[] = { "left", "right", "default" }; - -template -void -do_times_layout(const size_t m, - const size_t n, - const size_t p, - const size_t nloop, - const bool value, - const bool analytic, - const bool sfad, - const bool slfad, - const bool dfad, - const bool check, - const LayoutType& layout, - const std::string& device) -{ - int prec = 2; - std::cout.setf(std::ios::scientific); - std::cout.precision(prec); - std::cout << std::endl - << device - << " performance for layout " - << layout_names[layout] - << " m = " << m << " n = " << n << " p = " << p - << std::endl << std::endl; - std::cout << "Computation \t Time \t Throughput \t Ratio" << std::endl; - - if (layout == LAYOUT_LEFT) - do_times( - m,n,p,nloop,value,analytic,sfad,slfad,dfad,check); - else if (layout == LAYOUT_RIGHT) - do_times( - m,n,p,nloop,value,analytic,sfad,slfad,dfad,check); - else - do_times - (m,n,p,nloop,value,analytic,sfad,slfad,dfad,check); -} - -// Connect executable to vtune for profiling -void connect_vtune() { - std::stringstream cmd; - pid_t my_os_pid=getpid(); - const std::string vtune_loc = - "amplxe-cl"; - const std::string output_dir = "./vtune"; - cmd << vtune_loc - << " -collect hotspots -result-dir " << output_dir - << " -target-pid " << my_os_pid << " &"; - std::cout << cmd.str() << std::endl; - system(cmd.str().c_str()); - system("sleep 10"); -} - -const int SFadSize = 8; -const int SLFadSize = SFadSize; - -int main(int argc, char* argv[]) { - bool success = true; - try { - - // Set up command line options - Teuchos::CommandLineProcessor clp(false); - clp.setDocString("This program tests the speed of various forward mode AD implementations for simple Kokkos kernel"); - int m = 100000; - clp.setOption("m", &m, "Number of matrix rows"); - int n = 100; - clp.setOption("n", &n, "Number of matrix columns"); - int p = SFadSize; - clp.setOption("p", &p, "Number of derivative components"); - int nloop = 10; - clp.setOption("nloop", &nloop, "Number of loops"); -#ifdef KOKKOS_ENABLE_SERIAL - bool serial = 0; - clp.setOption("serial", "no-serial", &serial, "Whether to run Serial"); -#endif -#ifdef KOKKOS_ENABLE_OPENMP - int openmp = 0; - clp.setOption("openmp", &openmp, "Number of OpenMP threads"); -#endif -#ifdef KOKKOS_ENABLE_THREADS - int threads = 0; - clp.setOption("threads", &threads, "Number of pThreads threads"); -#endif -#ifdef KOKKOS_ENABLE_CUDA - bool cuda = 0; - clp.setOption("cuda", "no-cuda", &cuda, "Whether to run CUDA"); -#endif - int numa = 0; - clp.setOption("numa", &numa, - "Number of NUMA domains to use (set to 0 to use all NUMAs"); - int cores_per_numa = 0; - clp.setOption("cores-per-numa", &cores_per_numa, - "Number of CPU cores per NUMA to use (set to 0 to use all cores)"); - bool print_config = false; - clp.setOption("print-config", "no-print-config", &print_config, - "Whether to print Kokkos device configuration"); - LayoutType layout = LAYOUT_DEFAULT; - clp.setOption("layout", &layout, num_layout_types, layout_values, - layout_names, "View layout"); - bool vtune = false; - clp.setOption("vtune", "no-vtune", &vtune, "Profile with vtune"); - bool value = true; - clp.setOption("value", "no-value", &value, "Run value calculation"); - bool analytic = true; - clp.setOption("analytic", "no-analytic", &analytic, - "Run analytic derivative calculation"); - bool sfad = true; - clp.setOption("sfad", "no-sfad", &sfad, "Run SFad derivative calculation"); - bool slfad = true; - clp.setOption("slfad", "no-slfad", &slfad, "Run SLFad derivative calculation"); -#if defined(KOKKOS_ENABLE_CUDA_UVM) - bool dfad = true; - clp.setOption("dfad", "no-dfad", &dfad, "Run DFad derivative calculation"); -#else - bool dfad = false; -#endif - bool check = false; - clp.setOption("check", "no-check", &check, "Check calculations are correct"); - - // Parse options - switch (clp.parse(argc, argv)) { - case Teuchos::CommandLineProcessor::PARSE_HELP_PRINTED: - return 0; - case Teuchos::CommandLineProcessor::PARSE_ERROR: - case Teuchos::CommandLineProcessor::PARSE_UNRECOGNIZED_OPTION: - return 1; - case Teuchos::CommandLineProcessor::PARSE_SUCCESSFUL: - break; - } - - if (vtune) - connect_vtune(); - - Kokkos::InitArguments init_args; - init_args.num_threads = cores_per_numa; - init_args.num_numa = numa; - - Kokkos::initialize(init_args); - - if (print_config) - Kokkos::print_configuration(std::cout, true); - -#ifdef KOKKOS_ENABLE_SERIAL - if (serial) { - do_times_layout( - m,n,p,nloop,value,analytic,sfad,slfad,dfad,check,layout,"Serial"); - } -#endif - -#ifdef KOKKOS_ENABLE_OPENMP - if (openmp) { - do_times_layout( - m,n,p,nloop,value,analytic,sfad,slfad,dfad,check,layout,"OpenMP"); - } -#endif - -#ifdef KOKKOS_ENABLE_THREADS - if (threads) { - do_times_layout( - m,n,p,nloop,value,analytic,sfad,slfad,dfad,check,layout,"Threads"); - } -#endif - -#ifdef KOKKOS_ENABLE_CUDA - if (cuda) { - do_times_layout( - m,n,p,nloop,value,analytic,sfad,slfad,dfad,check,layout,"Cuda"); - } -#endif - Kokkos::finalize(); - - } - TEUCHOS_STANDARD_CATCH_STATEMENTS(true, std::cerr, success); - - return !success; -} From 03cfdb48ba8d6f73c8cd7a1546ca69d7f0c01c36 Mon Sep 17 00:00:00 2001 From: Eric Phipps Date: Wed, 3 Jun 2020 15:45:44 -0600 Subject: [PATCH 3/5] Sacado: Add configure option to force SFad to init in default constructor The new design SFad does not directly initialize its data in the default constructor, but instead uses the default compiler generated implementation. This speeds SFad up but has proved to be problematic in some codes that were assuming SFad always initialized (as was the case in the old design). This adds a configure option, Sacado_SFAD_INIT_DEFAULT_CONSTRUCTOR, to recover the old behavior. It defaults to off. Also add KOKKOS_DEFAULTED_FUNCTION in several places for Cuda 9 and lower. --- packages/sacado/CMakeLists.txt | 7 +++++++ packages/sacado/cmake/Sacado_config.h.in | 3 +++ packages/sacado/src/Sacado_ConfigDefs.h | 4 ++++ .../src/new_design/Sacado_Fad_Exp_GeneralFad.hpp | 6 ++++++ .../new_design/Sacado_Fad_Exp_StaticFixedStorage.hpp | 10 ++++++++++ 5 files changed, 30 insertions(+) diff --git a/packages/sacado/CMakeLists.txt b/packages/sacado/CMakeLists.txt index add024520a9a..3be83214791f 100644 --- a/packages/sacado/CMakeLists.txt +++ b/packages/sacado/CMakeLists.txt @@ -31,6 +31,13 @@ TRIBITS_ADD_OPTION_AND_DEFINE( ON ) +TRIBITS_ADD_OPTION_AND_DEFINE( + ${PACKAGE_NAME}_SFAD_INIT_DEFAULT_CONSTRUCTOR + SACADO_SFAD_INIT_DEFAULT_CONSTRUCTOR + "Force SFad (in the new design) to initialize value and derivative components in the default constructor (adds additional runtime cost, but protects against uninitialized use)." + OFF + ) + TRIBITS_ADD_OPTION_AND_DEFINE( ${PACKAGE_NAME}_ENABLE_HIERARCHICAL SACADO_VIEW_CUDA_HIERARCHICAL diff --git a/packages/sacado/cmake/Sacado_config.h.in b/packages/sacado/cmake/Sacado_config.h.in index 7b9a2ac92a91..c0d8fad8d92f 100644 --- a/packages/sacado/cmake/Sacado_config.h.in +++ b/packages/sacado/cmake/Sacado_config.h.in @@ -74,3 +74,6 @@ /* Define if want to make the new Fad design the default, replacing the old one */ #cmakedefine SACADO_NEW_FAD_DESIGN_IS_DEFAULT + +/* Force SFad (in the new design) to initialize value and derivative components in the default constructor (adds additional runtime cost, but protects against uninitialized use). */ +#cmakedefine SACADO_SFAD_INIT_DEFAULT_CONSTRUCTOR diff --git a/packages/sacado/src/Sacado_ConfigDefs.h b/packages/sacado/src/Sacado_ConfigDefs.h index dbbc9e97da26..44269bf193d4 100644 --- a/packages/sacado/src/Sacado_ConfigDefs.h +++ b/packages/sacado/src/Sacado_ConfigDefs.h @@ -86,6 +86,10 @@ Questions? Contact David M. Gay (dmgay@sandia.gov) or Eric T. Phipps #define KOKKOS_FUNCTION /* */ #endif +#ifndef KOKKOS_DEFAULTED_FUNCTION +#define KOKKOS_DEFAULTED_FUNCTION /* */ +#endif + #ifndef KOKKOS_INLINE_FUNCTION #define KOKKOS_INLINE_FUNCTION inline #endif diff --git a/packages/sacado/src/new_design/Sacado_Fad_Exp_GeneralFad.hpp b/packages/sacado/src/new_design/Sacado_Fad_Exp_GeneralFad.hpp index 6dd1299abddf..d1232e3a7cce 100644 --- a/packages/sacado/src/new_design/Sacado_Fad_Exp_GeneralFad.hpp +++ b/packages/sacado/src/new_design/Sacado_Fad_Exp_GeneralFad.hpp @@ -94,12 +94,15 @@ namespace Sacado { using ExtenderType::ExtenderType; //! Default constructor + KOKKOS_DEFAULTED_FUNCTION GeneralFad() = default; //! Copy constructor + KOKKOS_DEFAULTED_FUNCTION GeneralFad(const GeneralFad& x) = default; //! Move constructor + KOKKOS_DEFAULTED_FUNCTION GeneralFad(GeneralFad&& x) = default; //! Constructor with value (disabled for ViewFad) @@ -118,6 +121,7 @@ namespace Sacado { } //! Destructor + KOKKOS_DEFAULTED_FUNCTION ~GeneralFad() = default; //! Set %GeneralFad object as the \c ith independent variable @@ -194,10 +198,12 @@ namespace Sacado { } //! Assignment with GeneralFad right-hand-side + KOKKOS_DEFAULTED_FUNCTION GeneralFad& operator=(const GeneralFad& x) = default; //! Move assignment with GeneralFad right-hand-side + KOKKOS_DEFAULTED_FUNCTION GeneralFad& operator=(GeneralFad&& x) = default; diff --git a/packages/sacado/src/new_design/Sacado_Fad_Exp_StaticFixedStorage.hpp b/packages/sacado/src/new_design/Sacado_Fad_Exp_StaticFixedStorage.hpp index a43425172233..86febf0c83e8 100644 --- a/packages/sacado/src/new_design/Sacado_Fad_Exp_StaticFixedStorage.hpp +++ b/packages/sacado/src/new_design/Sacado_Fad_Exp_StaticFixedStorage.hpp @@ -69,7 +69,16 @@ namespace Sacado { }; //! Default constructor +#ifdef SACADO_SFAD_INIT_DEFAULT_CONSTRUCTOR + KOKKOS_INLINE_FUNCTION + StaticFixedStorage() : + val_(T(0.0)) { + ss_array::zero(dx_, Num); + } +#else + KOKKOS_DEFAULTED_FUNCTION StaticFixedStorage() = default; +#endif //! Constructor with value KOKKOS_INLINE_FUNCTION @@ -128,6 +137,7 @@ namespace Sacado { } //! Destructor + KOKKOS_DEFAULTED_FUNCTION ~StaticFixedStorage() = default; //! Assignment From 598aecaeba9c513ea9a5261a4d5941ee747dfceb Mon Sep 17 00:00:00 2001 From: Eric Phipps Date: Wed, 3 Jun 2020 17:45:04 -0600 Subject: [PATCH 4/5] Sacado: Add Sacado::safe_sqrt(x) function for handling x == 0 The derivative of sqrt(x) is not defined for x == 0. Computing the derivative with x == 0 happens frequently in application codes, and the usual advice is to return 0 in this case, to avoid the NaN. It would be convenient to have a function that does this for you automatically, so I added a safe_sqrt() function that does just that. It includes a default implementation that just calls sqrt(), has overloads for all of the primary Fad types, and is SIMD-safe. --- packages/sacado/src/Sacado_CacheFad_Ops.hpp | 4 + .../sacado/src/Sacado_ELRCacheFad_Ops.hpp | 4 + packages/sacado/src/Sacado_ELRFad_Ops.hpp | 7 + packages/sacado/src/Sacado_Fad_Ops.hpp | 138 ++++++++++++++++ packages/sacado/src/Sacado_Fad_Ops_Fwd.hpp | 2 + packages/sacado/src/Sacado_MathFunctions.hpp | 29 ++++ packages/sacado/src/Sacado_cmath.hpp | 9 + .../Sacado_Fad_Exp_MathFunctions.hpp | 1 + .../src/new_design/Sacado_Fad_Exp_Ops.hpp | 156 ++++++++++++++++++ .../src/new_design/Sacado_Fad_Exp_Ops_Fwd.hpp | 2 + packages/sacado/test/UnitTests/CMakeLists.txt | 9 + .../sacado/test/UnitTests/SafeSqrtTests.cpp | 110 ++++++++++++ 12 files changed, 471 insertions(+) create mode 100644 packages/sacado/test/UnitTests/SafeSqrtTests.cpp diff --git a/packages/sacado/src/Sacado_CacheFad_Ops.hpp b/packages/sacado/src/Sacado_CacheFad_Ops.hpp index 1aefc1775fae..6fb46748f7b1 100644 --- a/packages/sacado/src/Sacado_CacheFad_Ops.hpp +++ b/packages/sacado/src/Sacado_CacheFad_Ops.hpp @@ -433,6 +433,10 @@ FAD_UNARYOP_MACRO(sqrt, SqrtOp, a = value_type(1)/(value_type(2)*std::sqrt(v)), std::sqrt(v)) +FAD_UNARYOP_MACRO(safe_sqrt, + SafeSqrtOp, + a = (v == value_type(0.0) ? value_type(0.0) : value_type(value_type(1)/(value_type(2)*std::sqrt(v)))), + std::sqrt(v)) FAD_UNARYOP_MACRO(cos, CosOp, a = -std::sin(v), diff --git a/packages/sacado/src/Sacado_ELRCacheFad_Ops.hpp b/packages/sacado/src/Sacado_ELRCacheFad_Ops.hpp index dd498af45654..67a37efac62e 100644 --- a/packages/sacado/src/Sacado_ELRCacheFad_Ops.hpp +++ b/packages/sacado/src/Sacado_ELRCacheFad_Ops.hpp @@ -597,6 +597,10 @@ FAD_UNARYOP_MACRO(sqrt, SqrtOp, a = scalar_type(1.0)/(scalar_type(2.0)*std::sqrt(v)), std::sqrt(v)) +FAD_UNARYOP_MACRO(safe_sqrt, + SafeSqrtOp, + a = (v == value_type(0.0) ? value_type(0.0) : value_type(scalar_type(1.0)/(scalar_type(2.0)*std::sqrt(v)))), + std::sqrt(v)) FAD_UNARYOP_MACRO(cos, CosOp, a = -std::sin(v), diff --git a/packages/sacado/src/Sacado_ELRFad_Ops.hpp b/packages/sacado/src/Sacado_ELRFad_Ops.hpp index 4f4cfaddf239..32042cb746b4 100644 --- a/packages/sacado/src/Sacado_ELRFad_Ops.hpp +++ b/packages/sacado/src/Sacado_ELRFad_Ops.hpp @@ -211,6 +211,13 @@ FAD_UNARYOP_MACRO(sqrt, false, expr.dx(i)/(value_type(2)* std::sqrt(expr.val())), expr.fastAccessDx(i)/(value_type(2)* std::sqrt(expr.val()))) +FAD_UNARYOP_MACRO(safe_sqrt, + SafeSqrtOp, + std::sqrt(expr.val()), + expr.val() == value_type(0.0) ? value_type(0.0) : value_type(value_type(0.5)*bar/std::sqrt(expr.val())), + false, + expr.val() == value_type(0.0) ? value_type(0.0) : value_type(expr.dx(i)/(value_type(2)*std::sqrt(expr.val()))), + expr.val() == value_type(0.0) ? value_type(0.0) : value_type(expr.fastAccessDx(i)/(value_type(2)*std::sqrt(expr.val())))) FAD_UNARYOP_MACRO(cos, CosOp, std::cos(expr.val()), diff --git a/packages/sacado/src/Sacado_Fad_Ops.hpp b/packages/sacado/src/Sacado_Fad_Ops.hpp index 9faaf058344a..8a34e94d6026 100644 --- a/packages/sacado/src/Sacado_Fad_Ops.hpp +++ b/packages/sacado/src/Sacado_Fad_Ops.hpp @@ -270,6 +270,144 @@ FAD_UNARYOP_MACRO(cbrt, #undef FAD_UNARYOP_MACRO +// Special handling for safe_sqrt() to provide specializations of SafeSqrtOp for +// "simd" value types that use if_then_else(). The only reason for not using +// if_then_else() always is to avoid evaluating the derivative if the value is +// zero to avoid throwing FPEs. +namespace Sacado { + namespace Fad { + + template + class SafeSqrtOp {}; + + template + struct ExprSpec< SafeSqrtOp > { + typedef typename ExprSpec::type type; + }; + + // + // Implementation for simd type using if_then_else() + // + template + class Expr< SafeSqrt,ExprSpecDefault,true > { + public: + + typedef typename ExprT::value_type value_type; + typedef typename ExprT::scalar_type scalar_type; + typedef typename ExprT::base_expr_type base_expr_type; + + KOKKOS_INLINE_FUNCTION + explicit Expr(const ExprT& expr_) : expr(expr_) {} + + KOKKOS_INLINE_FUNCTION + int size() const { return expr.size(); } + + KOKKOS_INLINE_FUNCTION + bool hasFastAccess() const { return expr.hasFastAccess(); } + + KOKKOS_INLINE_FUNCTION + bool isPassive() const { return expr.isPassive();} + + KOKKOS_INLINE_FUNCTION + bool updateValue() const { return expr.updateValue(); } + + KOKKOS_INLINE_FUNCTION + void cache() const {} + + KOKKOS_INLINE_FUNCTION + value_type val() const { + using std::sqrt; + return sqrt(expr.val()); + } + + KOKKOS_INLINE_FUNCTION + value_type dx(int i) const { + using std::sqrt; + return if_then_else( + expr.val() == value_type(0.0), value_type(0.0), + value_type(expr.dx(i)/(value_type(2)*sqrt(expr.val())))); + } + + KOKKOS_INLINE_FUNCTION + value_type fastAccessDx(int i) const { + using std::sqrt; + return if_then_else( + expr.val() == value_type(0.0), value_type(0.0), + value_type(expr.fastAccessDx(i)/(value_type(2)*sqrt(expr.val())))); + } + + protected: + + const ExprT& expr; + }; + + // + // Specialization for scalar types using ternary operator + // + template + class Expr< SafeSqrt,ExprSpecDefault,false > { + public: + + typedef typename ExprT::value_type value_type; + typedef typename ExprT::scalar_type scalar_type; + typedef typename ExprT::base_expr_type base_expr_type; + + KOKKOS_INLINE_FUNCTION + explicit Expr(const ExprT& expr_) : expr(expr_) {} + + KOKKOS_INLINE_FUNCTION + int size() const { return expr.size(); } + + KOKKOS_INLINE_FUNCTION + bool hasFastAccess() const { return expr.hasFastAccess(); } + + KOKKOS_INLINE_FUNCTION + bool isPassive() const { return expr.isPassive();} + + KOKKOS_INLINE_FUNCTION + bool updateValue() const { return expr.updateValue(); } + + KOKKOS_INLINE_FUNCTION + void cache() const {} + + KOKKOS_INLINE_FUNCTION + value_type val() const { + using std::sqrt; + return sqrt(expr.val()); + } + + KOKKOS_INLINE_FUNCTION + value_type dx(int i) const { + using std::sqrt; + return expr.val() == value_type(0.0) ? value_type(0.0) : + value_type(expr.dx(i)/(value_type(2)*sqrt(expr.val()))); + } + + KOKKOS_INLINE_FUNCTION + value_type fastAccessDx(int i) const { + using std::sqrt; + return expr.val() == value_type(0.0) ? value_type(0.0) : + value_type(expr.fastAccessDx(i)/(value_type(2)*sqrt(expr.val()))); + } + + protected: + + const ExprT& expr; + }; + + template + KOKKOS_INLINE_FUNCTION + Expr< SafeSqrtOp< Expr > > + safe_sqrt (const Expr& expr) + { + typedef OP< Expr > expr_t; + + return Expr(expr); + } + } + +} + #define FAD_BINARYOP_MACRO(OPNAME,OP,USING,VALUE,DX,FASTACCESSDX,VAL_CONST_DX_1,VAL_CONST_DX_2,CONST_DX_1,CONST_DX_2,CONST_FASTACCESSDX_1,CONST_FASTACCESSDX_2) \ namespace Sacado { \ namespace Fad { \ diff --git a/packages/sacado/src/Sacado_Fad_Ops_Fwd.hpp b/packages/sacado/src/Sacado_Fad_Ops_Fwd.hpp index 379d74d3b12c..c124dafc44aa 100644 --- a/packages/sacado/src/Sacado_Fad_Ops_Fwd.hpp +++ b/packages/sacado/src/Sacado_Fad_Ops_Fwd.hpp @@ -81,6 +81,8 @@ namespace Sacado { #ifdef HAVE_SACADO_CXX11 template class CbrtOp; #endif + template ::value> + class SafeSqrtOp; template class AdditionOp; template class SubtractionOp; diff --git a/packages/sacado/src/Sacado_MathFunctions.hpp b/packages/sacado/src/Sacado_MathFunctions.hpp index efc51b716b56..30b4b950fc1d 100644 --- a/packages/sacado/src/Sacado_MathFunctions.hpp +++ b/packages/sacado/src/Sacado_MathFunctions.hpp @@ -139,6 +139,35 @@ UNARYFUNC_MACRO(cbrt, CbrtOp) #undef UNARYFUNC_MACRO +namespace Sacado { + namespace Fad { + template + KOKKOS_INLINE_FUNCTION + Expr< SafeSqrtOp< Expr > > safe_sqrt (const Expr&); + } + + namespace ELRFad { + template class SafeSqrtOp; + template + KOKKOS_INLINE_FUNCTION + Expr< SafeSqrtOp< Expr > > safe_sqrt (const Expr&); + } + + namespace CacheFad { + template class SafeSqrtOp; + template + KOKKOS_INLINE_FUNCTION + Expr< SafeSqrtOp< Expr > > safe_sqrt (const Expr&); + } + + namespace ELRCacheFad { + template class SafeSqrtOp; + template + KOKKOS_INLINE_FUNCTION + Expr< SafeSqrtOp< Expr > > safe_sqrt (const Expr&); + } +} + #define BINARYFUNC_MACRO(OP,FADOP) \ namespace Sacado { \ \ diff --git a/packages/sacado/src/Sacado_cmath.hpp b/packages/sacado/src/Sacado_cmath.hpp index f9f1ccdd02f0..46ab109fc557 100644 --- a/packages/sacado/src/Sacado_cmath.hpp +++ b/packages/sacado/src/Sacado_cmath.hpp @@ -64,6 +64,15 @@ namespace Sacado { return cond ? a : b; } + // Special version of sqrt(x) that avoids the NaN if x==0 in the derivative. + // The default implementation just calls the standard sqrt(x). + template + KOKKOS_INLINE_FUNCTION + T safe_sqrt(const T& x) { + using std::sqrt; + return sqrt(x); + } + } #endif // SACADO_CMATH_HPP diff --git a/packages/sacado/src/new_design/Sacado_Fad_Exp_MathFunctions.hpp b/packages/sacado/src/new_design/Sacado_Fad_Exp_MathFunctions.hpp index 7fb1b65d7a9b..ec62fa98f1c5 100644 --- a/packages/sacado/src/new_design/Sacado_Fad_Exp_MathFunctions.hpp +++ b/packages/sacado/src/new_design/Sacado_Fad_Exp_MathFunctions.hpp @@ -61,6 +61,7 @@ UNARYFUNC_MACRO(exp, ExpOp) UNARYFUNC_MACRO(log, LogOp) UNARYFUNC_MACRO(log10, Log10Op) UNARYFUNC_MACRO(sqrt, SqrtOp) +UNARYFUNC_MACRO(safe_sqrt, SafeSqrtOp) UNARYFUNC_MACRO(cos, CosOp) UNARYFUNC_MACRO(sin, SinOp) UNARYFUNC_MACRO(tan, TanOp) diff --git a/packages/sacado/src/new_design/Sacado_Fad_Exp_Ops.hpp b/packages/sacado/src/new_design/Sacado_Fad_Exp_Ops.hpp index f684bac6f662..c57c4b278fcc 100644 --- a/packages/sacado/src/new_design/Sacado_Fad_Exp_Ops.hpp +++ b/packages/sacado/src/new_design/Sacado_Fad_Exp_Ops.hpp @@ -277,6 +277,162 @@ FAD_UNARYOP_MACRO(cbrt, expr.dx(i)/(value_type(3)*cbrt(expr.val()*expr.val())), expr.fastAccessDx(i)/(value_type(3)*cbrt(expr.val()*expr.val()))) +// Special handling for safe_sqrt() to provide specializations of SafeSqrtOp for +// "simd" value types that use if_then_else(). The only reason for not using +// if_then_else() always is to avoid evaluating the derivative if the value is +// zero to avoid throwing FPEs. +namespace Sacado { + namespace Fad { + namespace Exp { + + template + class SafeSqrtOp {}; + + // + // Implementation for simd type using if_then_else() + // + template + class SafeSqrtOp< T,ExprSpecDefault,true > : + public Expr< SafeSqrtOp > { + public: + + typedef typename std::remove_cv::type ExprT; + typedef typename ExprT::value_type value_type; + typedef typename ExprT::scalar_type scalar_type; + + typedef ExprSpecDefault expr_spec_type; + + KOKKOS_INLINE_FUNCTION + explicit SafeSqrtOp(const T& expr_) : expr(expr_) {} + + KOKKOS_INLINE_FUNCTION + int size() const { return expr.size(); } + + KOKKOS_INLINE_FUNCTION + bool hasFastAccess() const { + return expr.hasFastAccess(); + } + + KOKKOS_INLINE_FUNCTION + value_type val() const { + using std::sqrt; + return sqrt(expr.val()); + } + + KOKKOS_INLINE_FUNCTION + value_type dx(int i) const { + using std::sqrt; + return if_then_else( + expr.val() == value_type(0.0), value_type(0.0), + value_type(expr.dx(i)/(value_type(2)*sqrt(expr.val())))); + } + + KOKKOS_INLINE_FUNCTION + value_type fastAccessDx(int i) const { + using std::sqrt; + return if_then_else( + expr.val() == value_type(0.0), value_type(0.0), + value_type(expr.fastAccessDx(i)/(value_type(2)*sqrt(expr.val())))); + } + + protected: + + const T& expr; + }; + + // + // Specialization for scalar types using ternary operator + // + template + class SafeSqrtOp< T,ExprSpecDefault,false > : + public Expr< SafeSqrtOp > { + public: + + typedef typename std::remove_cv::type ExprT; + typedef typename ExprT::value_type value_type; + typedef typename ExprT::scalar_type scalar_type; + + typedef ExprSpecDefault expr_spec_type; + + KOKKOS_INLINE_FUNCTION + explicit SafeSqrtOp(const T& expr_) : expr(expr_) {} + + KOKKOS_INLINE_FUNCTION + int size() const { return expr.size(); } + + KOKKOS_INLINE_FUNCTION + bool hasFastAccess() const { + return expr.hasFastAccess(); + } + + KOKKOS_INLINE_FUNCTION + value_type val() const { + using std::sqrt; + return sqrt(expr.val()); + } + + KOKKOS_INLINE_FUNCTION + value_type dx(int i) const { + using std::sqrt; + return expr.val() == value_type(0.0) ? value_type(0.0) : + value_type(expr.dx(i)/(value_type(2)*sqrt(expr.val()))); + } + + KOKKOS_INLINE_FUNCTION + value_type fastAccessDx(int i) const { + using std::sqrt; + return expr.val() == value_type(0.0) ? value_type(0.0) : + value_type(expr.fastAccessDx(i)/(value_type(2)*sqrt(expr.val()))); + } + + protected: + + const T& expr; + }; + + template + KOKKOS_INLINE_FUNCTION + SafeSqrtOp< typename Expr::derived_type, + typename T::expr_spec_type > + safe_sqrt (const Expr& expr) + { + typedef SafeSqrtOp< typename Expr::derived_type, + typename T::expr_spec_type > expr_t; + + return expr_t(expr.derived()); + } + + template + struct ExprLevel< SafeSqrtOp< T,E > > { + static const unsigned value = ExprLevel::value; + }; + + template + struct IsFadExpr< SafeSqrtOp< T,E > > { + static const unsigned value = true; + }; + + } + } + + template + struct IsExpr< Fad::Exp::SafeSqrtOp< T,E > > { + static const bool value = true; + }; + + template + struct BaseExprType< Fad::Exp::SafeSqrtOp< T,E > > { + typedef typename BaseExprType::type type; + }; + + template + struct IsSimdType< Fad::Exp::SafeSqrtOp< T,E > > { + static const bool value = + IsSimdType< typename Fad::Exp::SafeSqrtOp< T,E >::scalar_type >::value; + }; + +} + #undef FAD_UNARYOP_MACRO #define FAD_BINARYOP_MACRO(OPNAME,OP,USING,VALUE,DX,CDX1,CDX2,FASTACCESSDX,VAL_CONST_DX_1,VAL_CONST_DX_2,CONST_DX_1,CONST_DX_2,CONST_FASTACCESSDX_1,CONST_FASTACCESSDX_2) \ diff --git a/packages/sacado/src/new_design/Sacado_Fad_Exp_Ops_Fwd.hpp b/packages/sacado/src/new_design/Sacado_Fad_Exp_Ops_Fwd.hpp index 790819c3e37d..a671538292dd 100644 --- a/packages/sacado/src/new_design/Sacado_Fad_Exp_Ops_Fwd.hpp +++ b/packages/sacado/src/new_design/Sacado_Fad_Exp_Ops_Fwd.hpp @@ -58,6 +58,8 @@ namespace Sacado { template class AbsOp; template class FAbsOp; template class CbrtOp; + template ::value> + class SafeSqrtOp; template class AdditionOp; diff --git a/packages/sacado/test/UnitTests/CMakeLists.txt b/packages/sacado/test/UnitTests/CMakeLists.txt index 4420611b8a29..620e9ffb5620 100644 --- a/packages/sacado/test/UnitTests/CMakeLists.txt +++ b/packages/sacado/test/UnitTests/CMakeLists.txt @@ -333,4 +333,13 @@ IF (Sacado_ENABLE_TeuchosCore) NUM_MPI_PROCS 1 STANDARD_PASS_OUTPUT ) + + TRIBITS_ADD_EXECUTABLE_AND_TEST( + SafeSqrtTests + SOURCES SafeSqrtTests.cpp + ARGS + COMM serial mpi + NUM_MPI_PROCS 1 + STANDARD_PASS_OUTPUT + ) ENDIF() diff --git a/packages/sacado/test/UnitTests/SafeSqrtTests.cpp b/packages/sacado/test/UnitTests/SafeSqrtTests.cpp new file mode 100644 index 000000000000..1fbdda678520 --- /dev/null +++ b/packages/sacado/test/UnitTests/SafeSqrtTests.cpp @@ -0,0 +1,110 @@ +// @HEADER +// *********************************************************************** +// +// Sacado Package +// Copyright (2006) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// This library is free software; you can redistribute it and/or modify +// it under the terms of the GNU Lesser General Public License as +// published by the Free Software Foundation; either version 2.1 of the +// License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 +// USA +// Questions? Contact David M. Gay (dmgay@sandia.gov) or Eric T. Phipps +// (etphipp@sandia.gov). +// +// *********************************************************************** +// @HEADER + +#include "Teuchos_UnitTestHarness.hpp" +#include "Teuchos_UnitTestRepository.hpp" +#include "Teuchos_GlobalMPISession.hpp" +#include "Teuchos_TestingHelpers.hpp" + +#include "Sacado.hpp" + +const int N = 10; + +// Check whether the safe_sqrt() function works as expected +TEUCHOS_UNIT_TEST_TEMPLATE_1_DECL( SafeSqrt, SafeSqrt, AD ) +{ + typedef AD ad_type; + + success = true; + + // Check non-zero value + ad_type x(N, 1.5); + for (int i=0; i Fad_DFadType; +typedef Sacado::Fad::SLFad Fad_SLFadType; +typedef Sacado::Fad::SFad Fad_SFadType; +TEUCHOS_UNIT_TEST_TEMPLATE_1_INSTANT( SafeSqrt, SafeSqrt, Fad_DFadType ) +TEUCHOS_UNIT_TEST_TEMPLATE_1_INSTANT( SafeSqrt, SafeSqrt, Fad_SLFadType ) +TEUCHOS_UNIT_TEST_TEMPLATE_1_INSTANT( SafeSqrt, SafeSqrt, Fad_SFadType ) + +typedef Sacado::ELRFad::DFad ELRFad_DFadType; +typedef Sacado::ELRFad::SLFad ELRFad_SLFadType; +typedef Sacado::ELRFad::SFad ELRFad_SFadType; +TEUCHOS_UNIT_TEST_TEMPLATE_1_INSTANT( SafeSqrt, SafeSqrt, ELRFad_DFadType ) +TEUCHOS_UNIT_TEST_TEMPLATE_1_INSTANT( SafeSqrt, SafeSqrt, ELRFad_SLFadType ) +TEUCHOS_UNIT_TEST_TEMPLATE_1_INSTANT( SafeSqrt, SafeSqrt, ELRFad_SFadType ) + +typedef Sacado::CacheFad::DFad CacheFad_DFadType; +typedef Sacado::CacheFad::SLFad CacheFad_SLFadType; +typedef Sacado::CacheFad::SFad CacheFad_SFadType; +TEUCHOS_UNIT_TEST_TEMPLATE_1_INSTANT( SafeSqrt, SafeSqrt, CacheFad_DFadType ) +TEUCHOS_UNIT_TEST_TEMPLATE_1_INSTANT( SafeSqrt, SafeSqrt, CacheFad_SLFadType ) +TEUCHOS_UNIT_TEST_TEMPLATE_1_INSTANT( SafeSqrt, SafeSqrt, CacheFad_SFadType ) + +typedef Sacado::ELRCacheFad::DFad ELRCacheFad_DFadType; +typedef Sacado::ELRCacheFad::SLFad ELRCacheFad_SLFadType; +typedef Sacado::ELRCacheFad::SFad ELRCacheFad_SFadType; +TEUCHOS_UNIT_TEST_TEMPLATE_1_INSTANT( SafeSqrt, SafeSqrt, ELRCacheFad_DFadType ) +TEUCHOS_UNIT_TEST_TEMPLATE_1_INSTANT( SafeSqrt, SafeSqrt, ELRCacheFad_SLFadType ) +TEUCHOS_UNIT_TEST_TEMPLATE_1_INSTANT( SafeSqrt, SafeSqrt, ELRCacheFad_SFadType ) + +#if defined(SACADO_ENABLE_NEW_DESIGN) && !defined(SACADO_NEW_FAD_DESIGN_IS_DEFAULT) +typedef Sacado::Fad::Exp::DFad ExpFad_DFadType; +typedef Sacado::Fad::Exp::SLFad ExpFad_SLFadType; +typedef Sacado::Fad::Exp::SFad ExpFad_SFadType; +TEUCHOS_UNIT_TEST_TEMPLATE_1_INSTANT( SafeSqrt, SafeSqrt, ExpFad_DFadType ) +TEUCHOS_UNIT_TEST_TEMPLATE_1_INSTANT( SafeSqrt, SafeSqrt, ExpFad_SLFadType ) +TEUCHOS_UNIT_TEST_TEMPLATE_1_INSTANT( SafeSqrt, SafeSqrt, ExpFad_SFadType ) +#endif + +int main( int argc, char* argv[] ) { + Teuchos::GlobalMPISession mpiSession(&argc, &argv); + return Teuchos::UnitTestRepository::runUnitTestsFromMain(argc, argv); +} From 3cf736d822d57ec893e26392a2fa048d66324b6f Mon Sep 17 00:00:00 2001 From: Eric Phipps Date: Sat, 6 Jun 2020 09:53:19 -0600 Subject: [PATCH 5/5] Sacado: Fix SafeSqrt expression templates for old design. --- packages/sacado/src/Sacado_Fad_Ops.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/packages/sacado/src/Sacado_Fad_Ops.hpp b/packages/sacado/src/Sacado_Fad_Ops.hpp index 8a34e94d6026..19d22765dce9 100644 --- a/packages/sacado/src/Sacado_Fad_Ops.hpp +++ b/packages/sacado/src/Sacado_Fad_Ops.hpp @@ -289,7 +289,7 @@ namespace Sacado { // Implementation for simd type using if_then_else() // template - class Expr< SafeSqrt,ExprSpecDefault,true > { + class Expr< SafeSqrtOp,ExprSpecDefault > { public: typedef typename ExprT::value_type value_type; @@ -345,7 +345,7 @@ namespace Sacado { // Specialization for scalar types using ternary operator // template - class Expr< SafeSqrt,ExprSpecDefault,false > { + class Expr< SafeSqrtOp,ExprSpecDefault > { public: typedef typename ExprT::value_type value_type; @@ -400,7 +400,7 @@ namespace Sacado { Expr< SafeSqrtOp< Expr > > safe_sqrt (const Expr& expr) { - typedef OP< Expr > expr_t; + typedef SafeSqrtOp< Expr > expr_t; return Expr(expr); }