diff --git a/packages/sacado/src/Sacado_Fad_ViewStorage.hpp b/packages/sacado/src/Sacado_Fad_ViewStorage.hpp index 1364b745d40e..3b41e3e7d98c 100644 --- a/packages/sacado/src/Sacado_Fad_ViewStorage.hpp +++ b/packages/sacado/src/Sacado_Fad_ViewStorage.hpp @@ -56,6 +56,14 @@ namespace Sacado { namespace Fad { +#ifndef SACADO_FAD_DERIV_LOOP +#if defined(SACADO_VIEW_CUDA_HIERARCHICAL_DFAD) && !defined(SACADO_DISABLE_CUDA_IN_KOKKOS) && defined(__CUDA_ARCH__) +#define SACADO_FAD_DERIV_LOOP(I,SZ) for (int I=threadIdx.x; I class ViewFadPtr; @@ -118,10 +126,12 @@ namespace Sacado { reinterpret_cast(x)))) { *val_ = *x.val_; if (stride_one) - for (int i=0; i +void run_fad_flat(const FluxView& flux, const WgbView& wgb, + const SrcView& src, const WbsView& wbs, + const ResidualView& residual) +{ + typedef typename ResidualView::execution_space execution_space; + typedef typename ResidualView::non_const_value_type scalar_type; + + const size_t num_cells = wgb.extent(0); + const int num_basis = wgb.extent(1); + const int num_points = wgb.extent(2); + const int num_dim = wgb.extent(3); + + Kokkos::parallel_for(Kokkos::RangePolicy( 0,num_cells ), + KOKKOS_LAMBDA (const size_t cell) + { + scalar_type value, value2; + for (int basis=0; basis +void run_fad_scratch(const FluxView& flux, const WgbView& wgb, + const SrcView& src, const WbsView& wbs, + const ResidualView& residual) +{ + typedef typename ResidualView::execution_space execution_space; + typedef typename ResidualView::non_const_value_type scalar_type; + typedef Kokkos::TeamPolicy policy_type; + typedef typename policy_type::member_type team_member; + typedef Kokkos::View > tmp_scratch_type; + + const size_t num_cells = wgb.extent(0); + const int num_basis = wgb.extent(1); + const int num_points = wgb.extent(2); + const int num_dim = wgb.extent(3); + + const int vector_size = 1; + const int team_size = is_cuda_space::value ? 32 : 1; + const int fad_size = Kokkos::dimension_scalar(residual); + const size_t range = (num_cells+team_size-1)/team_size; + const size_t bytes = 2*tmp_scratch_type::shmem_size(team_size,fad_size); + policy_type policy(range,team_size,vector_size); + + Kokkos::parallel_for(policy.set_scratch_size(0,Kokkos::PerTeam(bytes)), + KOKKOS_LAMBDA (const team_member& team) + { + const int team_rank = team.team_rank(); + tmp_scratch_type value(team.team_scratch(0), team_size, fad_size); + tmp_scratch_type value2(team.team_scratch(0), team_size, fad_size); + const size_t cell = team.league_rank()*team_size + team_rank; + if (cell < num_cells) { + for (int basis=0; basis +void run_analytic_flat(const FluxView& flux, const WgbView& wgb, + const SrcView& src, const WbsView& wbs, + const ResidualView& residual) +{ + typedef typename ResidualView::execution_space execution_space; + typedef typename ResidualView::non_const_value_type scalar_type; + + const size_t num_cells = wgb.extent(0); + const int num_basis = wgb.extent(1); + const int num_points = wgb.extent(2); + const int num_dim = wgb.extent(3); + + Kokkos::parallel_for(Kokkos::RangePolicy( 0,num_cells ), + KOKKOS_LAMBDA (const size_t cell) + { + scalar_type value[N+1],value2[N+1]; + for (int basis=0; basis +void run_analytic_team(const FluxView& flux, const WgbView& wgb, + const SrcView& src, const WbsView& wbs, + const ResidualView& residual) +{ + typedef typename ResidualView::execution_space execution_space; + typedef typename ResidualView::non_const_value_type scalar_type; + typedef Kokkos::TeamPolicy policy_type; + typedef typename policy_type::member_type team_member; + typedef Kokkos::View > tmp_scratch_type; + + const size_t num_cells = wgb.extent(0); + const int num_basis = wgb.extent(1); + /*const*/ int num_points = wgb.extent(2); + /*const*/ int num_dim = wgb.extent(3); + + const size_t bytes = 2*tmp_scratch_type::shmem_size(); + policy_type policy(num_cells,num_basis,32); + Kokkos::parallel_for(policy.set_scratch_size(0,Kokkos::PerThread(bytes)), + KOKKOS_LAMBDA (const team_member& team) + { + tmp_scratch_type value(team.thread_scratch(0)); + tmp_scratch_type value2(team.thread_scratch(0)); + const size_t cell = team.league_rank(); + Kokkos::parallel_for(Kokkos::TeamThreadRange(team,0,num_basis), + [&] (const int& basis) + { + Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,N+1), + [&] (const int& k) + { + value(k) = 0; + value2(k) = 0; + }); + for (int qp=0; qp +double time_fad_flat(int ncells, int num_basis, int num_points, int ndim, + int ntrial, bool check) +{ + typedef Kokkos::View t_4DView; + typedef Kokkos::View t_3DView; + typedef Kokkos::View t_2DView; + + t_4DView wgb("",ncells,num_basis,num_points,ndim,N+1); + t_3DView flux("",ncells,num_points,ndim,N+1); + t_3DView wbs("",ncells,num_basis,num_points,N+1); + t_2DView src("",ncells,num_points,N+1); + t_2DView residual("",ncells,num_basis,N+1); + init_fad(wgb, wbs, flux, src, residual); + + // Run once to warm up, complete any UVM transfers + run_fad_flat(flux, wgb, src, wbs, residual); + + // Time execution + Kokkos::fence(); + Kokkos::Impl::Timer timer; + for (int i=0; i +double time_fad_scratch(int ncells, int num_basis, int num_points, int ndim, + int ntrial, bool check) +{ + typedef Kokkos::View t_4DView; + typedef Kokkos::View t_3DView; + typedef Kokkos::View t_2DView; + + t_4DView wgb("",ncells,num_basis,num_points,ndim,N+1); + t_3DView flux("",ncells,num_points,ndim,N+1); + t_3DView wbs("",ncells,num_basis,num_points,N+1); + t_2DView src("",ncells,num_points,N+1); + t_2DView residual("",ncells,num_basis,N+1); + init_fad(wgb, wbs, flux, src, residual); + + // Run once to warm up, complete any UVM transfers + run_fad_scratch(flux, wgb, src, wbs, residual); + + // Time execution + Kokkos::fence(); + Kokkos::Impl::Timer timer; + for (int i=0; i +double time_analytic_flat(int ncells, int num_basis, int num_points, int ndim, + int ntrial, bool check) +{ + typedef Kokkos::View t_4DView; + typedef Kokkos::View t_3DView; + typedef Kokkos::View t_2DView; + + t_4DView wgb("",ncells,num_basis,num_points,ndim); + t_3DView flux("",ncells,num_points,ndim); + t_3DView wbs("",ncells,num_basis,num_points); + t_2DView src("",ncells,num_points); + t_2DView residual("",ncells,num_basis); + init_array(wgb, wbs, flux, src, residual); + + // Run once to warm up, complete any UVM transfers + run_analytic_flat(flux, wgb, src, wbs, residual); + + // Time execution + Kokkos::fence(); + Kokkos::Impl::Timer timer; + for (int i=0; i(flux, wgb, src, wbs, residual); + Kokkos::fence(); + double time = timer.seconds() / ntrial / ncells; + + // Check result + if (check) + check_residual(flux, wgb, src, wbs, residual); + + return time; +} + +template +double time_analytic_const(int ncells, int num_basis, int num_points, int ndim, + int ntrial, bool check) +{ + typedef Kokkos::View t_4DView; + typedef Kokkos::View t_3DView; + typedef Kokkos::View t_2DView; + typedef Kokkos::View > t_3DView_const; + + t_4DView wgb("",ncells,num_basis,num_points,ndim); + t_3DView flux("",ncells,num_points,ndim); + t_3DView wbs("",ncells,num_basis,num_points); + t_2DView src("",ncells,num_points); + t_2DView residual("",ncells,num_basis); + init_array(wgb, wbs, flux, src, residual); + + t_3DView_const flux_const = flux; + + // Run once to warm up, complete any UVM transfers + run_analytic_flat(flux_const, wgb, src, wbs, residual); + + // Time execution + Kokkos::fence(); + Kokkos::Impl::Timer timer; + for (int i=0; i(flux_const, wgb, src, wbs, residual); + Kokkos::fence(); + double time = timer.seconds() / ntrial / ncells; + + // Check result + if (check) + check_residual(flux, wgb, src, wbs, residual); + + return time; +} + +template +double time_analytic_team(int ncells, int num_basis, int num_points, int ndim, + int ntrial, bool check) +{ + typedef Kokkos::View t_4DView; + typedef Kokkos::View t_3DView; + typedef Kokkos::View t_2DView; + typedef Kokkos::View > t_3DView_const; + + t_4DView wgb("",ncells,num_basis,num_points,ndim); + t_3DView flux("",ncells,num_points,ndim); + t_3DView wbs("",ncells,num_basis,num_points); + t_2DView src("",ncells,num_points); + t_2DView residual("",ncells,num_basis); + init_array(wgb, wbs, flux, src, residual); + + t_3DView_const flux_const = flux; + + // Run once to warm up, complete any UVM transfers + run_analytic_team(flux_const, wgb, src, wbs, residual); + + // Time execution + Kokkos::fence(); + Kokkos::Impl::Timer timer; + for (int i=0; i(flux_const, wgb, src, wbs, residual); + Kokkos::fence(); + double time = timer.seconds() / ntrial / ncells; + + // Check result + if (check) + check_residual(flux, wgb, src, wbs, residual); + + return time; +} + +#define INST_FUNC_FAD_N_DEV(FAD,N,DEV) \ + template double time_fad_flat< FAD, N, DEV >(int ncells, int num_basis, int num_points, int ndim, int ntrial, bool check); \ + template double time_fad_scratch< FAD, N, DEV >(int ncells, int num_basis, int num_points, int ndim, int ntrial, bool check); + +#define INST_FUNC_N_DEV(N,DEV) \ + INST_FUNC_FAD_N_DEV(SFadType,N,DEV) \ + INST_FUNC_FAD_N_DEV(DFadType,N,DEV) \ + template double time_analytic_flat< N, DEV >(int ncells, int num_basis, int num_points, int ndim, int ntrial, bool check); \ + template double time_analytic_const< N, DEV >(int ncells, int num_basis, int num_points, int ndim, int ntrial, bool check); \ + template double time_analytic_team< N, DEV >(int ncells, int num_basis, int num_points, int ndim, int ntrial, bool check); + +#define INST_FUNC_DEV(DEV) \ + INST_FUNC_N_DEV( fad_dim, DEV ) + +#ifdef KOKKOS_ENABLE_SERIAL +INST_FUNC_DEV(Kokkos::Serial) +#endif + +#ifdef KOKKOS_ENABLE_OPENMP +INST_FUNC_DEV(Kokkos::OpenMP) +#endif + +#ifdef KOKKOS_ENABLE_THREADS +INST_FUNC_DEV(Kokkos::Threads) +#endif + +#ifdef KOKKOS_ENABLE_CUDA +INST_FUNC_DEV(Kokkos::Cuda) +#endif diff --git a/packages/sacado/test/performance/advection/advection.hpp b/packages/sacado/test/performance/advection/advection.hpp new file mode 100644 index 000000000000..adca588cee11 --- /dev/null +++ b/packages/sacado/test/performance/advection/advection.hpp @@ -0,0 +1,50 @@ +// @HEADER +// *********************************************************************** +// +// Sacado Package +// Copyright (2006) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// This library is free software; you can redistribute it and/or modify +// it under the terms of the GNU Lesser General Public License as +// published by the Free Software Foundation; either version 2.1 of the +// License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 +// USA +// Questions? Contact David M. Gay (dmgay@sandia.gov) or Eric T. Phipps +// (etphipp@sandia.gov). +// +// *********************************************************************** +// @HEADER + +#pragma once + +template +double time_fad_flat(int ncells, int num_basis, int num_points, int ndim, + int ntrial, bool check); + +template +double time_fad_scratch(int ncells, int num_basis, int num_points, int ndim, + int ntrial, bool check); + +template +double time_analytic_flat(int ncells, int num_basis, int num_points, int ndim, + int ntrial, bool check); + +template +double time_analytic_const(int ncells, int num_basis, int num_points, int ndim, + int ntrial, bool check); + +template +double time_analytic_team(int ncells, int num_basis, int num_points, int ndim, + int ntrial, bool check); diff --git a/packages/sacado/test/performance/advection/advection_hierarchical.cpp b/packages/sacado/test/performance/advection/advection_hierarchical.cpp new file mode 100644 index 000000000000..c5858b5299b9 --- /dev/null +++ b/packages/sacado/test/performance/advection/advection_hierarchical.cpp @@ -0,0 +1,228 @@ +// @HEADER +// *********************************************************************** +// +// Sacado Package +// Copyright (2006) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// This library is free software; you can redistribute it and/or modify +// it under the terms of the GNU Lesser General Public License as +// published by the Free Software Foundation; either version 2.1 of the +// License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 +// USA +// Questions? Contact David M. Gay (dmgay@sandia.gov) or Eric T. Phipps +// (etphipp@sandia.gov). +// +// *********************************************************************** +// @HEADER + +#define SACADO_VIEW_CUDA_HIERARCHICAL 1 +#define SACADO_ALIGN_SFAD 1 + +#include "Sacado.hpp" +#include "advection_hierarchical.hpp" +#include "common.hpp" + +#include "impl/Kokkos_Timer.hpp" + +template +void run_fad_hierarchical_flat(const FluxView& flux, const WgbView& wgb, + const SrcView& src, const WbsView& wbs, + const ResidualView& residual) +{ + typedef typename ResidualView::execution_space execution_space; + typedef typename Kokkos::ThreadLocalScalarType::type local_scalar_type; + typedef Kokkos::TeamPolicy policy_type; + typedef typename policy_type::member_type team_member; + + const size_t num_cells = wgb.extent(0); + const int num_basis = wgb.extent(1); + const int num_points = wgb.extent(2); + const int num_dim = wgb.extent(3); + + const bool is_cuda = is_cuda_space::value; + const int vector_size = is_cuda ? 32 : 1; + const int team_size = is_cuda ? 256/vector_size : 1; + const size_t range = (num_cells+team_size-1)/team_size; + + policy_type policy(range,team_size,vector_size); + Kokkos::parallel_for(policy, KOKKOS_LAMBDA (const team_member& team) + { + const size_t cell = team.league_rank()*team_size + team.team_rank(); + local_scalar_type value, value2; + for (int basis=0; basis +void run_fad_hierarchical_team(const FluxView& flux, const WgbView& wgb, + const SrcView& src, const WbsView& wbs, + const ResidualView& residual) +{ + typedef typename ResidualView::execution_space execution_space; + typedef typename Kokkos::ThreadLocalScalarType::type local_scalar_type; + typedef Kokkos::TeamPolicy policy_type; + typedef typename policy_type::member_type team_member; + + const size_t num_cells = wgb.extent(0); + const int num_basis = wgb.extent(1); + const int num_points = wgb.extent(2); + const int num_dim = wgb.extent(3); + + const bool is_cuda = is_cuda_space::value; + const int vector_size = is_cuda ? 32 : 1; + const int team_size = is_cuda ? 256/vector_size : 1; + + policy_type policy(num_cells,team_size,vector_size); + Kokkos::parallel_for(policy, KOKKOS_LAMBDA (const team_member& team) + { + const int team_rank = team.team_rank(); + const size_t cell = team.league_rank(); + local_scalar_type value, value2; + for (int basis=team_rank; basis +double time_fad_hierarchical_flat(int ncells, int num_basis, int num_points, + int ndim, int ntrial, bool check) +{ + typedef Sacado::Fad::SFad FadType; + + static const int FadStride = is_cuda_space::value ? 32 : 1; +#if defined(SACADO_ALIGN_SFAD) + static const int Nalign = ((N+FadStride-1)/FadStride)*FadStride; + typedef typename FadType::template apply_N::type AlignedFadType; +#else + typedef FadType AlignedFadType; +#endif + + typedef typename ExecSpace::array_layout DefaultLayout; + typedef Kokkos::LayoutContiguous ContLayout; + typedef Kokkos::View t_4DView; + typedef Kokkos::View t_3DView; + typedef Kokkos::View t_2DView; + + t_4DView wgb("",ncells,num_basis,num_points,ndim,N+1); + t_3DView flux("",ncells,num_points,ndim,N+1); + t_3DView wbs("",ncells,num_basis,num_points,N+1); + t_2DView src("",ncells,num_points,N+1); + t_2DView residual("",ncells,num_basis,N+1); + init_fad(wgb, wbs, flux, src, residual); + + // Run once to warm up, complete any UVM transfers + run_fad_hierarchical_flat(flux, wgb, src, wbs, residual); + + // Time execution + Kokkos::fence(); + Kokkos::Impl::Timer timer; + for (int i=0; i +double time_fad_hierarchical_team(int ncells, int num_basis, int num_points, + int ndim, int ntrial, bool check) +{ + typedef Sacado::Fad::SFad FadType; + + static const int FadStride = is_cuda_space::value ? 32 : 1; +#if defined(SACADO_ALIGN_SFAD) + static const int Nalign = ((N+FadStride-1)/FadStride)*FadStride; + typedef typename FadType::template apply_N::type AlignedFadType; +#else + typedef FadType AlignedFadType; +#endif + + typedef typename ExecSpace::array_layout DefaultLayout; + typedef Kokkos::LayoutContiguous ContLayout; + typedef Kokkos::View t_4DView; + typedef Kokkos::View t_3DView; + typedef Kokkos::View t_2DView; + + t_4DView wgb("",ncells,num_basis,num_points,ndim,N+1); + t_3DView flux("",ncells,num_points,ndim,N+1); + t_3DView wbs("",ncells,num_basis,num_points,N+1); + t_2DView src("",ncells,num_points,N+1); + t_2DView residual("",ncells,num_basis,N+1); + init_fad(wgb, wbs, flux, src, residual); + + // Run once to warm up, complete any UVM transfers + run_fad_hierarchical_team(flux, wgb, src, wbs, residual); + + // Time execution + Kokkos::fence(); + Kokkos::Impl::Timer timer; + for (int i=0; i(int ncells, int num_basis, int num_points, int ndim, int ntrial, bool check); \ + template double time_fad_hierarchical_team< N, DEV >(int ncells, int num_basis, int num_points, int ndim, int ntrial, bool check); + +#define INST_FUNC_DEV(DEV) \ + INST_FUNC_N_DEV( fad_dim, DEV ) + +#ifdef KOKKOS_ENABLE_SERIAL +INST_FUNC_DEV(Kokkos::Serial) +#endif + +#ifdef KOKKOS_ENABLE_OPENMP +INST_FUNC_DEV(Kokkos::OpenMP) +#endif + +#ifdef KOKKOS_ENABLE_THREADS +INST_FUNC_DEV(Kokkos::Threads) +#endif + +#ifdef KOKKOS_ENABLE_CUDA +INST_FUNC_DEV(Kokkos::Cuda) +#endif diff --git a/packages/sacado/test/performance/advection/advection_hierarchical.hpp b/packages/sacado/test/performance/advection/advection_hierarchical.hpp new file mode 100644 index 000000000000..f78f4316897b --- /dev/null +++ b/packages/sacado/test/performance/advection/advection_hierarchical.hpp @@ -0,0 +1,38 @@ +// @HEADER +// *********************************************************************** +// +// Sacado Package +// Copyright (2006) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// This library is free software; you can redistribute it and/or modify +// it under the terms of the GNU Lesser General Public License as +// published by the Free Software Foundation; either version 2.1 of the +// License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 +// USA +// Questions? Contact David M. Gay (dmgay@sandia.gov) or Eric T. Phipps +// (etphipp@sandia.gov). +// +// *********************************************************************** +// @HEADER + +#pragma once + +template +double time_fad_hierarchical_flat(int ncells, int num_basis, int num_points, + int ndim, int ntrial, bool check); + +template +double time_fad_hierarchical_team(int ncells, int num_basis, int num_points, + int ndim, int ntrial, bool check); diff --git a/packages/sacado/test/performance/advection/advection_hierarchical_dfad.cpp b/packages/sacado/test/performance/advection/advection_hierarchical_dfad.cpp new file mode 100644 index 000000000000..8e93b4a74256 --- /dev/null +++ b/packages/sacado/test/performance/advection/advection_hierarchical_dfad.cpp @@ -0,0 +1,237 @@ +// @HEADER +// *********************************************************************** +// +// Sacado Package +// Copyright (2006) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// This library is free software; you can redistribute it and/or modify +// it under the terms of the GNU Lesser General Public License as +// published by the Free Software Foundation; either version 2.1 of the +// License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 +// USA +// Questions? Contact David M. Gay (dmgay@sandia.gov) or Eric T. Phipps +// (etphipp@sandia.gov). +// +// *********************************************************************** +// @HEADER + +#define SACADO_VIEW_CUDA_HIERARCHICAL_DFAD 1 +#define SACADO_KOKKOS_USE_MEMORY_POOL 1 + +#include "Sacado.hpp" +#include "advection_hierarchical_dfad.hpp" +#include "common.hpp" + +#include "impl/Kokkos_Timer.hpp" + +template +void run_dfad_hierarchical_team(const FluxView& flux, const WgbView& wgb, + const SrcView& src, const WbsView& wbs, + const ResidualView& residual) +{ + typedef typename ResidualView::execution_space execution_space; + typedef typename ResidualView::non_const_value_type scalar_type; + typedef Kokkos::TeamPolicy policy_type; + typedef typename policy_type::member_type team_member; + + const size_t num_cells = wgb.extent(0); + const int num_basis = wgb.extent(1); + const int num_points = wgb.extent(2); + const int num_dim = wgb.extent(3); + + const bool is_cuda = is_cuda_space::value; + const int vector_size = is_cuda ? 32 : 1; + const int team_size = is_cuda ? 256/vector_size : 1; + + policy_type policy(num_cells,team_size,vector_size); + Kokkos::parallel_for(policy, KOKKOS_LAMBDA (const team_member& team) + { + const int team_rank = team.team_rank(); + const size_t cell = team.league_rank(); + scalar_type value, value2; + for (int basis=team_rank; basis +void run_dfad_hierarchical_team_scratch( + const FluxView& flux, const WgbView& wgb, + const SrcView& src, const WbsView& wbs, + const ResidualView& residual) +{ + typedef typename ResidualView::execution_space execution_space; + typedef typename ResidualView::non_const_value_type scalar_type; + typedef Kokkos::TeamPolicy policy_type; + typedef typename policy_type::member_type team_member; + typedef Kokkos::View > tmp_scratch_type; + + const size_t num_cells = wgb.extent(0); + const int num_basis = wgb.extent(1); + const int num_points = wgb.extent(2); + const int num_dim = wgb.extent(3); + + const bool is_cuda = is_cuda_space::value; + const int vector_size = is_cuda ? 32 : 1; + const int team_size = is_cuda ? 256/vector_size : 1; + const int fad_size = Kokkos::dimension_scalar(residual); + const size_t bytes = 2*tmp_scratch_type::shmem_size(team_size,fad_size); + policy_type policy(num_cells,team_size,vector_size); + + Kokkos::parallel_for(policy.set_scratch_size(0,Kokkos::PerTeam(bytes)), + KOKKOS_LAMBDA (const team_member& team) + { + const int team_rank = team.team_rank(); + const size_t cell = team.league_rank(); + tmp_scratch_type value(team.team_scratch(0), team_size, fad_size); + tmp_scratch_type value2(team.team_scratch(0), team_size, fad_size); + for (int basis=team_rank; basis +double time_dfad_hierarchical_team(int ncells, int num_basis, int num_points, + int ndim, int ntrial, bool check) +{ + typedef Sacado::Fad::DFad FadType; + + typedef typename ExecSpace::array_layout DefaultLayout; + typedef Kokkos::LayoutContiguous ContLayout; + typedef Kokkos::View t_4DView; + typedef Kokkos::View t_3DView; + typedef Kokkos::View t_2DView; + + t_4DView wgb("",ncells,num_basis,num_points,ndim,N+1); + t_3DView flux("",ncells,num_points,ndim,N+1); + t_3DView wbs("",ncells,num_basis,num_points,N+1); + t_2DView src("",ncells,num_points,N+1); + t_2DView residual("",ncells,num_basis,N+1); + init_fad(wgb, wbs, flux, src, residual); + + // Create memory pool for DFad + // The kernel allocates 2*N double's per warp on Cuda. Approximate + // the maximum number of warps as the maximum concurrency / 32. + // Include a fudge factor of 1.2 since memory pool treats a block as full + // once it reaches 80% capacity + const size_t block_size = N*sizeof(double); + size_t nkernels = ExecSpace::concurrency()*2; + if (is_cuda_space::value) + nkernels /= 32; + const size_t mem_pool_size = static_cast(1.2*nkernels*block_size); + const size_t superblock_size = + std::max(nkernels / 100, 1) * block_size; + ExecSpace exec_space; + Sacado::createGlobalMemoryPool(exec_space, mem_pool_size, + block_size, block_size, superblock_size); + + // Run once to warm up, complete any UVM transfers + run_dfad_hierarchical_team(flux, wgb, src, wbs, residual); + + // Time execution + Kokkos::fence(); + Kokkos::Impl::Timer timer; + for (int i=0; i +double time_dfad_hierarchical_team_scratch( + int ncells, int num_basis, int num_points, int ndim, int ntrial, bool check) +{ + typedef Sacado::Fad::DFad FadType; + + typedef typename ExecSpace::array_layout DefaultLayout; + typedef Kokkos::LayoutContiguous ContLayout; + typedef Kokkos::View t_4DView; + typedef Kokkos::View t_3DView; + typedef Kokkos::View t_2DView; + + t_4DView wgb("",ncells,num_basis,num_points,ndim,N+1); + t_3DView flux("",ncells,num_points,ndim,N+1); + t_3DView wbs("",ncells,num_basis,num_points,N+1); + t_2DView src("",ncells,num_points,N+1); + t_2DView residual("",ncells,num_basis,N+1); + init_fad(wgb, wbs, flux, src, residual); + + // Run once to warm up, complete any UVM transfers + run_dfad_hierarchical_team_scratch(flux, wgb, src, wbs, residual); + + // Time execution + Kokkos::fence(); + Kokkos::Impl::Timer timer; + for (int i=0; i(int ncells, int num_basis, int num_points, int ndim, int ntrial, bool check); \ + template double time_dfad_hierarchical_team_scratch< N, DEV >(int ncells, int num_basis, int num_points, int ndim, int ntrial, bool check); + +#define INST_FUNC_DEV(DEV) \ + INST_FUNC_N_DEV( fad_dim, DEV ) + +#ifdef KOKKOS_ENABLE_SERIAL +INST_FUNC_DEV(Kokkos::Serial) +#endif + +#ifdef KOKKOS_ENABLE_OPENMP +INST_FUNC_DEV(Kokkos::OpenMP) +#endif + +#ifdef KOKKOS_ENABLE_THREADS +INST_FUNC_DEV(Kokkos::Threads) +#endif + +#ifdef KOKKOS_ENABLE_CUDA +INST_FUNC_DEV(Kokkos::Cuda) +#endif diff --git a/packages/sacado/test/performance/advection/advection_hierarchical_dfad.hpp b/packages/sacado/test/performance/advection/advection_hierarchical_dfad.hpp new file mode 100644 index 000000000000..9933d1102f73 --- /dev/null +++ b/packages/sacado/test/performance/advection/advection_hierarchical_dfad.hpp @@ -0,0 +1,39 @@ +// @HEADER +// *********************************************************************** +// +// Sacado Package +// Copyright (2006) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// This library is free software; you can redistribute it and/or modify +// it under the terms of the GNU Lesser General Public License as +// published by the Free Software Foundation; either version 2.1 of the +// License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 +// USA +// Questions? Contact David M. Gay (dmgay@sandia.gov) or Eric T. Phipps +// (etphipp@sandia.gov). +// +// *********************************************************************** +// @HEADER + +#pragma once + +template +double time_dfad_hierarchical_team(int ncells, int num_basis, int num_points, + int ndim, int ntrial, bool check); + +template +double time_dfad_hierarchical_team_scratch(int ncells, int num_basis, + int num_points, int ndim, int ntrial, + bool check); diff --git a/packages/sacado/test/performance/advection/common.hpp b/packages/sacado/test/performance/advection/common.hpp new file mode 100644 index 000000000000..2a518a0b646d --- /dev/null +++ b/packages/sacado/test/performance/advection/common.hpp @@ -0,0 +1,387 @@ +// @HEADER +// *********************************************************************** +// +// Sacado Package +// Copyright (2006) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// This library is free software; you can redistribute it and/or modify +// it under the terms of the GNU Lesser General Public License as +// published by the Free Software Foundation; either version 2.1 of the +// License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 +// USA +// Questions? Contact David M. Gay (dmgay@sandia.gov) or Eric T. Phipps +// (etphipp@sandia.gov). +// +// *********************************************************************** +// @HEADER + +#pragma once + +const int fad_dim = 50; +typedef Sacado::Fad::SFad SFadType; +typedef Sacado::Fad::DFad DFadType; + +template +struct is_cuda_space { + static const bool value = false; +}; + +#ifdef KOKKOS_ENABLE_CUDA +template <> +struct is_cuda_space { + static const bool value = true; +}; +#endif + +template +scalar +generate_fad(const size_t n0, const size_t n1, + const size_t n2, const size_t n3, const int fad_size, + const size_t i0, const size_t i1, + const size_t i2, const size_t i3, + const int i_fad) +{ + const scalar x0 = 10.0 + scalar(n0) / scalar(i0+1); + const scalar x1 = 100.0 + scalar(n1) / scalar(i1+1); + const scalar x2 = 1000.0 + scalar(n2) / scalar(i2+1); + const scalar x3 = 10000.0 + scalar(n3) / scalar(i3+1); + const scalar x = x0 + x1 + x2 + x3; + if (i_fad == fad_size) + return x; + const scalar x_fad = 1.0 + scalar(fad_size) / scalar(i_fad+1); + return x + x_fad; +} + +template +void init_fad(const V1& v1, const V2& v2, const V3& v3, const V4& v4, + const V5& v5) +{ + typedef typename V1::non_const_value_type::value_type scalar; + + const int ncells = v1.extent(0); + const int num_basis = v1.extent(1); + const int num_points = v1.extent(2); + const int ndim = v1.extent(3); + const int N = Kokkos::dimension_scalar(v1)-1; + + // Kokkos::deep_copy(typename V1::array_type(v1), 1.0); + // Kokkos::deep_copy(typename V2::array_type(v2), 2.0); + // Kokkos::deep_copy(typename V3::array_type(v3), 3.0); + // Kokkos::deep_copy(typename V4::array_type(v4), 4.0); + + auto v1_h = Kokkos::create_mirror_view(v1); + auto v2_h = Kokkos::create_mirror_view(v2); + auto v3_h = Kokkos::create_mirror_view(v3); + auto v4_h = Kokkos::create_mirror_view(v4); + for (int cell=0; cell(ncells,num_basis,num_points,ndim,N,cell,basis,qp,dim,i); + v1_h(cell,basis,qp,dim).val() = + generate_fad(ncells,num_basis,num_points,ndim,N,cell,basis,qp,dim,N); + } + for (int i=0; i(ncells,num_basis,num_points,1,N,cell,basis,qp,0,i); + v2_h(cell,basis,qp).val() = + generate_fad(ncells,num_basis,num_points,1,N,cell,basis,qp,0,N); + } + } + for (int qp=0; qp(ncells,1,num_points,ndim,N,cell,0,qp,dim,i); + v3_h(cell,qp,dim).val() = + generate_fad(ncells,1,num_points,ndim,N,cell,0,qp,dim,N); + } + for (int i=0; i(ncells,1,num_points,1,N,cell,0,qp,0,i); + v4_h(cell,qp).val() = + generate_fad(ncells,1,num_points,1,N,cell,0,qp,0,N); + } + } + + Kokkos::deep_copy( v1, v1_h ); + Kokkos::deep_copy( v2, v2_h ); + Kokkos::deep_copy( v3, v3_h ); + Kokkos::deep_copy( v4, v4_h ); + + Kokkos::deep_copy(typename V5::array_type(v5), 0.0); +} + +template +void init_array(const V1& v1, const V2& v2, const V3& v3, const V4& v4, + const V5& v5) +{ + typedef typename V1::non_const_value_type scalar; + + const int ncells = v1.extent(0); + const int num_basis = v1.extent(1); + const int num_points = v1.extent(2); + const int ndim = v1.extent(3); + const int N = v1.extent(4)-1; + + // Kokkos::deep_copy(typename V1::array_type(v1), 1.0); + // Kokkos::deep_copy(typename V2::array_type(v2), 2.0); + // Kokkos::deep_copy(typename V3::array_type(v3), 3.0); + // Kokkos::deep_copy(typename V4::array_type(v4), 4.0); + + auto v1_h = Kokkos::create_mirror_view(v1); + auto v2_h = Kokkos::create_mirror_view(v2); + auto v3_h = Kokkos::create_mirror_view(v3); + auto v4_h = Kokkos::create_mirror_view(v4); + for (int cell=0; cell(ncells,num_basis,num_points,ndim,N,cell,basis,qp,dim,i); + v1_h(cell,basis,qp,dim,N) = + generate_fad(ncells,num_basis,num_points,ndim,N,cell,basis,qp,dim,N); + } + for (int i=0; i(ncells,num_basis,num_points,1,N,cell,basis,qp,0,i); + v2_h(cell,basis,qp,N) = + generate_fad(ncells,num_basis,num_points,1,N,cell,basis,qp,0,N); + } + } + for (int qp=0; qp(ncells,1,num_points,ndim,N,cell,0,qp,dim,i); + v3_h(cell,qp,dim,N) = + generate_fad(ncells,1,num_points,ndim,N,cell,0,qp,dim,N); + } + for (int i=0; i(ncells,1,num_points,1,N,cell,0,qp,0,i); + v4_h(cell,qp,N) = + generate_fad(ncells,1,num_points,1,N,cell,0,qp,0,N); + } + } + + Kokkos::deep_copy( v1, v1_h ); + Kokkos::deep_copy( v2, v2_h ); + Kokkos::deep_copy( v3, v3_h ); + Kokkos::deep_copy( v4, v4_h ); + + Kokkos::deep_copy(typename V5::array_type(v5), 0.0); +} + +template +typename std::enable_if< !Kokkos::is_view_fad::value, bool>::type +check(const View1& v_gold, const View2& v, const double tol) +{ + // Copy to host + typename View1::HostMirror v_gold_h = Kokkos::create_mirror_view(v_gold); + typename View2::HostMirror v_h = Kokkos::create_mirror_view(v); + Kokkos::deep_copy(v_gold_h, v_gold); + Kokkos::deep_copy(v_h, v); + + typedef typename View1::value_type value_type; + + const size_t n0 = v_gold_h.extent(0); + const size_t n1 = v_gold_h.extent(1); + const size_t n2 = v_gold_h.extent(2); + + bool success = true; + for ( size_t i0 = 0 ; i0 < n0 ; ++i0 ) { + for ( size_t i1 = 0 ; i1 < n1 ; ++i1 ) { + for ( size_t i2 = 0 ; i2 < n2 ; ++i2 ) { + value_type x_gold = v_gold_h(i0,i1,i2); + value_type x = v_h(i0,i1,i2); + if (std::abs(x_gold-x) > tol*std::abs(x_gold)) { + std::cout << "Comparison failed! x_gold(" + << i0 << "," << i1 << "," << i2 << ") = " + << x_gold << " , x = " << x + << std::endl; + success = false; + } + } + } + } + + return success; +} + +template +typename std::enable_if< Kokkos::is_view_fad::value, bool>::type +check(const View1& v_gold, const View2& v, const double tol) +{ + // Copy to host + typename View1::HostMirror v_gold_h = Kokkos::create_mirror_view(v_gold); + typename View2::HostMirror v_h = Kokkos::create_mirror_view(v); + Kokkos::deep_copy(v_gold_h, v_gold); + Kokkos::deep_copy(v_h, v); + + typedef typename View1::value_type value_type; + + const size_t n0 = v_gold_h.extent(0); + const size_t n1 = v_gold_h.extent(1); + const size_t n2 = v_gold_h.extent(2); + + bool success = true; + for ( size_t i0 = 0 ; i0 < n0 ; ++i0 ) { + for ( size_t i1 = 0 ; i1 < n1 ; ++i1 ) { + for ( size_t i2 = 0 ; i2 < n2 ; ++i2 ) { + value_type x_gold = v_gold_h(i0,i1,i2); + value_type x = (i2 == n2-1) ? v_h(i0,i1).val() : v_h(i0,i1).dx(i2); + if (std::abs(x_gold-x) > tol*std::abs(x_gold)) { + std::cout << "Comparison failed! x_gold(" + << i0 << "," << i1 << "," << i2 << ") = " + << x_gold << " , x = " << x + << std::endl; + success = false; + } + } + } + } + + return success; +} + +template +Kokkos::View +compute_gold_residual( + const FluxView& flux, const WgbView& wgb, const SrcView& src, + const WbsView& wbs, + typename std::enable_if< Kokkos::is_view_fad::value>::type* = 0) +{ + typedef typename FluxView::execution_space execution_space; + + const size_t num_cells = wgb.extent(0); + const int num_basis = wgb.extent(1); + const int num_points = wgb.extent(2); + const int num_dim = wgb.extent(3); + const int N = Kokkos::dimension_scalar(wgb)-1; + + Kokkos::View residual( + "",num_cells,num_basis,N+1); + + Kokkos::parallel_for(Kokkos::RangePolicy( 0,num_cells ), + KOKKOS_LAMBDA (const size_t cell) + { + double value, value2; + + // Value + for (int basis=0; basis +Kokkos::View +compute_gold_residual( + const FluxView& flux, const WgbView& wgb, const SrcView& src, + const WbsView& wbs, + typename std::enable_if< !Kokkos::is_view_fad::value>::type* = 0) +{ + typedef typename FluxView::execution_space execution_space; + + const size_t num_cells = wgb.extent(0); + const int num_basis = wgb.extent(1); + const int num_points = wgb.extent(2); + const int num_dim = wgb.extent(3); + const int N = wgb.extent(4)-1; + + Kokkos::View residual( + "",num_cells,num_basis,N+1); + + Kokkos::parallel_for(Kokkos::RangePolicy( 0,num_cells ), + KOKKOS_LAMBDA (const size_t cell) + { + double value, value2; + + // Value + for (int basis=0; basis +void check_residual(const FluxView& flux, const WgbView& wgb, + const SrcView& src, const WbsView& wbs, + const ResidualView& residual) +{ + // Generate gold residual + auto residual_gold = compute_gold_residual(flux, wgb, src, wbs); + + // Compare residual and residual_gold + const double tol = 1.0e-14; + check(residual_gold, residual, tol); +} diff --git a/packages/sacado/test/performance/advection/driver.cpp b/packages/sacado/test/performance/advection/driver.cpp new file mode 100644 index 000000000000..1812cc80ed0a --- /dev/null +++ b/packages/sacado/test/performance/advection/driver.cpp @@ -0,0 +1,164 @@ +// @HEADER +// *********************************************************************** +// +// Sacado Package +// Copyright (2006) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// This library is free software; you can redistribute it and/or modify +// it under the terms of the GNU Lesser General Public License as +// published by the Free Software Foundation; either version 2.1 of the +// License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 +// USA +// Questions? Contact David M. Gay (dmgay@sandia.gov) or Eric T. Phipps +// (etphipp@sandia.gov). +// +// *********************************************************************** +// @HEADER + +// A performance test that computes the derivative of a simple Kokkos kernel +// using various Fad classes + +#include "Sacado.hpp" + +#include "advection.hpp" +#include "advection_hierarchical.hpp" +#include "advection_hierarchical_dfad.hpp" +#include "common.hpp" + +#include "Teuchos_CommandLineProcessor.hpp" +#include "Teuchos_StandardCatchMacros.hpp" + +template +void run(const int cell_begin, const int cell_end, const int cell_step, + const int nbasis, const int npoint, const int ntrial, const bool check) +{ + const int ndim = 3; + printf("ncell %12s %12s %12s %12s %12s %12s %12s %12s %12s\n", "flat sfad", "flat dfad", "dfad sc", "analytic", "const", "team", "hier sfad", "hier dfad", "h dfad sc"); + for(int i=cell_begin; i<=cell_end; i+=cell_step) { + double sfad_flat = time_fad_flat( + i,nbasis,npoint,ndim,ntrial,check); + double dfad_flat = time_fad_flat( + i,nbasis,npoint,ndim,ntrial,check); + double dfad_scratch = time_fad_scratch( + i,nbasis,npoint,ndim,ntrial,check); + double analytic = time_analytic_flat( + i,nbasis,npoint,ndim,ntrial,check); + double analytic_const = time_analytic_const( + i,nbasis,npoint,ndim,ntrial,check); + double analytic_team = time_analytic_team( + i,nbasis,npoint,ndim,ntrial,check); + double sfad_hierarchical = time_fad_hierarchical_team( + i,nbasis,npoint,ndim,ntrial,check); + double dfad_hierarchical = time_dfad_hierarchical_team( + i,nbasis,npoint,ndim,ntrial,check); + double dfad_hierarchical_scratch = + time_dfad_hierarchical_team_scratch( + i,nbasis,npoint,ndim,ntrial,check); + printf("%5d %12.3e %12.3e %12.3e %12.3e %12.3e %12.3e %12.3e %12.3e %12.3e\n",i,sfad_flat,dfad_flat,dfad_scratch,analytic,analytic_const,analytic_team,sfad_hierarchical,dfad_hierarchical,dfad_hierarchical_scratch); + } +} + +int main(int argc, char* argv[]) { + Kokkos::initialize(argc,argv); + + bool success = true; + try { + + // Set up command line options + Teuchos::CommandLineProcessor clp(false); + clp.setDocString("This program tests the speed of various forward mode AD implementations for simple Kokkos kernel"); +#ifdef KOKKOS_ENABLE_SERIAL + bool serial = 0; + clp.setOption("serial", "no-serial", &serial, "Whether to run Serial"); +#endif +#ifdef KOKKOS_ENABLE_OPENMP + bool openmp = 0; + clp.setOption("openmp", "no-openmp", &openmp, "Whether to run OpenMP"); +#endif +#ifdef KOKKOS_ENABLE_THREADS + bool threads = 0; + clp.setOption("threads", "no-threads", &threads, "Whether to run Threads"); +#endif +#ifdef KOKKOS_ENABLE_CUDA + bool cuda = 0; + clp.setOption("cuda", "no-cuda", &cuda, "Whether to run CUDA"); +#endif + bool print_config = false; + clp.setOption("print-config", "no-print-config", &print_config, + "Whether to print Kokkos device configuration"); + int cell_begin = 100; + clp.setOption("begin", &cell_begin, "Starting number of cells"); + int cell_end = 8000; + clp.setOption("end", &cell_end, "Ending number of cells"); + int cell_step = 100; + clp.setOption("step", &cell_step, "Cell increment"); + int nbasis = 8; + clp.setOption("basis", &nbasis, "Number of basis functions"); + int npoint = 8; + clp.setOption("point", &npoint, "Number of integration points"); + int ntrial = 5; + clp.setOption("trial", &ntrial, "Number of trials"); + bool check = false; + clp.setOption("check", "no-check", &check, + "Check correctness of results"); + + // Parse options + switch (clp.parse(argc, argv)) { + case Teuchos::CommandLineProcessor::PARSE_HELP_PRINTED: + return 0; + case Teuchos::CommandLineProcessor::PARSE_ERROR: + case Teuchos::CommandLineProcessor::PARSE_UNRECOGNIZED_OPTION: + return 1; + case Teuchos::CommandLineProcessor::PARSE_SUCCESSFUL: + break; + } + + if (print_config) + Kokkos::print_configuration(std::cout, true); + +#ifdef KOKKOS_ENABLE_SERIAL + if (serial) { + using Kokkos::Serial; + run(cell_begin, cell_end, cell_step, nbasis, npoint, ntrial, check); + } +#endif + +#ifdef KOKKOS_ENABLE_OPENMP + if (openmp) { + using Kokkos::OpenMP; + run(cell_begin, cell_end, cell_step, nbasis, npoint, ntrial, check); + } +#endif + +#ifdef KOKKOS_ENABLE_THREADS + if (threads) { + using Kokkos::Threads; + run(cell_begin, cell_end, cell_step, nbasis, npoint, ntrial, check); + } +#endif + +#ifdef KOKKOS_ENABLE_CUDA + if (cuda) { + using Kokkos::Cuda; + run(cell_begin, cell_end, cell_step, nbasis, npoint, ntrial, check); + } +#endif + } + TEUCHOS_STANDARD_CATCH_STATEMENTS(true, std::cerr, success); + + Kokkos::finalize(); + + return !success; +} diff --git a/packages/sacado/test/performance/advection_const_basis/CMakeLists.txt b/packages/sacado/test/performance/advection_const_basis/CMakeLists.txt new file mode 100644 index 000000000000..d2bd03b5b92f --- /dev/null +++ b/packages/sacado/test/performance/advection_const_basis/CMakeLists.txt @@ -0,0 +1,22 @@ +ASSERT_DEFINED(PACKAGE_SOURCE_DIR CMAKE_CURRENT_SOURCE_DIR) + +INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR}) +INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) + +IF (Sacado_ENABLE_KokkosCore AND Sacado_ENABLE_TeuchosCore) + + IF(NOT ((CMAKE_CXX_COMPILER_ID STREQUAL "GNU") AND (CMAKE_CXX_COMPILER_VERSION VERSION_LESS "4.8"))) + + TRIBITS_ADD_EXECUTABLE( + FadAdvectionConstBasisHierarchicalTest + SOURCES common.hpp + advection.hpp advection.cpp + advection_hierarchical.hpp advection_hierarchical.cpp + advection_hierarchical_dfad.hpp advection_hierarchical_dfad.cpp + driver.cpp + COMM serial mpi + ) + + ENDIF() + +ENDIF() diff --git a/packages/sacado/test/performance/advection_const_basis/advection.cpp b/packages/sacado/test/performance/advection_const_basis/advection.cpp new file mode 100644 index 000000000000..fb95df285c5c --- /dev/null +++ b/packages/sacado/test/performance/advection_const_basis/advection.cpp @@ -0,0 +1,427 @@ +// @HEADER +// *********************************************************************** +// +// Sacado Package +// Copyright (2006) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// This library is free software; you can redistribute it and/or modify +// it under the terms of the GNU Lesser General Public License as +// published by the Free Software Foundation; either version 2.1 of the +// License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 +// USA +// Questions? Contact David M. Gay (dmgay@sandia.gov) or Eric T. Phipps +// (etphipp@sandia.gov). +// +// *********************************************************************** +// @HEADER + +#include "Sacado.hpp" +#include "advection.hpp" +#include "common.hpp" + +#include "impl/Kokkos_Timer.hpp" + +template +void run_fad_flat(const FluxView& flux, const WgbView& wgb, + const SrcView& src, const WbsView& wbs, + const ResidualView& residual) +{ + typedef typename ResidualView::execution_space execution_space; + typedef typename ResidualView::non_const_value_type scalar_type; + + const size_t num_cells = wgb.extent(0); + const int num_basis = wgb.extent(1); + const int num_points = wgb.extent(2); + const int num_dim = wgb.extent(3); + + Kokkos::parallel_for(Kokkos::RangePolicy( 0,num_cells ), + KOKKOS_LAMBDA (const size_t cell) + { + scalar_type value, value2; + for (int basis=0; basis +void run_fad_scratch(const FluxView& flux, const WgbView& wgb, + const SrcView& src, const WbsView& wbs, + const ResidualView& residual) +{ + typedef typename ResidualView::execution_space execution_space; + typedef typename ResidualView::non_const_value_type scalar_type; + typedef Kokkos::TeamPolicy policy_type; + typedef typename policy_type::member_type team_member; + typedef Kokkos::View > tmp_scratch_type; + + const size_t num_cells = wgb.extent(0); + const int num_basis = wgb.extent(1); + const int num_points = wgb.extent(2); + const int num_dim = wgb.extent(3); + + const int vector_size = 1; + const int team_size = is_cuda_space::value ? 32 : 1; + const int fad_size = Kokkos::dimension_scalar(residual); + const size_t range = (num_cells+team_size-1)/team_size; + const size_t bytes = 2*tmp_scratch_type::shmem_size(team_size,fad_size); + policy_type policy(range,team_size,vector_size); + + Kokkos::parallel_for(policy.set_scratch_size(0,Kokkos::PerTeam(bytes)), + KOKKOS_LAMBDA (const team_member& team) + { + const int team_rank = team.team_rank(); + tmp_scratch_type value(team.team_scratch(0), team_size, fad_size); + tmp_scratch_type value2(team.team_scratch(0), team_size, fad_size); + const size_t cell = team.league_rank()*team_size + team_rank; + if (cell < num_cells) { + for (int basis=0; basis +void run_analytic_flat(const FluxView& flux, const WgbView& wgb, + const SrcView& src, const WbsView& wbs, + const ResidualView& residual) +{ + typedef typename ResidualView::execution_space execution_space; + typedef typename ResidualView::non_const_value_type scalar_type; + + const size_t num_cells = wgb.extent(0); + const int num_basis = wgb.extent(1); + const int num_points = wgb.extent(2); + const int num_dim = wgb.extent(3); + + Kokkos::parallel_for(Kokkos::RangePolicy( 0,num_cells ), + KOKKOS_LAMBDA (const size_t cell) + { + scalar_type value[N+1],value2[N+1]; + for (int basis=0; basis +void run_analytic_team(const FluxView& flux, const WgbView& wgb, + const SrcView& src, const WbsView& wbs, + const ResidualView& residual) +{ + typedef typename ResidualView::execution_space execution_space; + typedef typename ResidualView::non_const_value_type scalar_type; + typedef Kokkos::TeamPolicy policy_type; + typedef typename policy_type::member_type team_member; + typedef Kokkos::View > tmp_scratch_type; + + const size_t num_cells = wgb.extent(0); + const int num_basis = wgb.extent(1); + /*const*/ int num_points = wgb.extent(2); + /*const*/ int num_dim = wgb.extent(3); + + const size_t bytes = 2*tmp_scratch_type::shmem_size(); + policy_type policy(num_cells,num_basis,32); + Kokkos::parallel_for(policy.set_scratch_size(0,Kokkos::PerThread(bytes)), + KOKKOS_LAMBDA (const team_member& team) + { + tmp_scratch_type value(team.thread_scratch(0)); + tmp_scratch_type value2(team.thread_scratch(0)); + const size_t cell = team.league_rank(); + Kokkos::parallel_for(Kokkos::TeamThreadRange(team,0,num_basis), + [&] (const int& basis) + { + Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,N+1), + [&] (const int& k) + { + value(k) = 0; + value2(k) = 0; + }); + for (int qp=0; qp +double time_fad_flat(int ncells, int num_basis, int num_points, int ndim, + int ntrial, bool check) +{ + typedef Kokkos::View t_4DView_d; + typedef Kokkos::View t_3DView_d; + typedef Kokkos::View t_3DView; + typedef Kokkos::View t_2DView; + + t_4DView_d wgb("",ncells,num_basis,num_points,ndim); + t_3DView_d wbs("",ncells,num_basis,num_points); + t_3DView flux("",ncells,num_points,ndim,N+1); + t_2DView src("",ncells,num_points,N+1); + t_2DView residual("",ncells,num_basis,N+1); + init_fad(wgb, wbs, flux, src, residual); + + // Run once to warm up, complete any UVM transfers + run_fad_flat(flux, wgb, src, wbs, residual); + + // Time execution + Kokkos::fence(); + Kokkos::Impl::Timer timer; + for (int i=0; i +double time_fad_scratch(int ncells, int num_basis, int num_points, int ndim, + int ntrial, bool check) +{ + typedef Kokkos::View t_4DView_d; + typedef Kokkos::View t_3DView_d; + typedef Kokkos::View t_3DView; + typedef Kokkos::View t_2DView; + + t_4DView_d wgb("",ncells,num_basis,num_points,ndim); + t_3DView_d wbs("",ncells,num_basis,num_points); + t_3DView flux("",ncells,num_points,ndim,N+1); + t_2DView src("",ncells,num_points,N+1); + t_2DView residual("",ncells,num_basis,N+1); + init_fad(wgb, wbs, flux, src, residual); + + // Run once to warm up, complete any UVM transfers + run_fad_scratch(flux, wgb, src, wbs, residual); + + // Time execution + Kokkos::fence(); + Kokkos::Impl::Timer timer; + for (int i=0; i +double time_analytic_flat(int ncells, int num_basis, int num_points, int ndim, + int ntrial, bool check) +{ + typedef Kokkos::View t_4DView_d; + typedef Kokkos::View t_3DView_d; + typedef Kokkos::View t_3DView; + typedef Kokkos::View t_2DView; + + t_4DView_d wgb("",ncells,num_basis,num_points,ndim); + t_3DView_d wbs("",ncells,num_basis,num_points); + t_3DView flux("",ncells,num_points,ndim); + t_2DView src("",ncells,num_points); + t_2DView residual("",ncells,num_basis); + init_array(wgb, wbs, flux, src, residual); + + // Run once to warm up, complete any UVM transfers + run_analytic_flat(flux, wgb, src, wbs, residual); + + // Time execution + Kokkos::fence(); + Kokkos::Impl::Timer timer; + for (int i=0; i(flux, wgb, src, wbs, residual); + Kokkos::fence(); + double time = timer.seconds() / ntrial / ncells; + + // Check result + if (check) + check_residual(flux, wgb, src, wbs, residual); + + return time; +} + +template +double time_analytic_const(int ncells, int num_basis, int num_points, int ndim, + int ntrial, bool check) +{ + typedef Kokkos::View t_4DView_d; + typedef Kokkos::View t_3DView_d; + typedef Kokkos::View t_3DView; + typedef Kokkos::View t_2DView; + + t_4DView_d wgb("",ncells,num_basis,num_points,ndim); + t_3DView_d wbs("",ncells,num_basis,num_points); + t_3DView flux("",ncells,num_points,ndim); + t_2DView src("",ncells,num_points); + t_2DView residual("",ncells,num_basis); + init_array(wgb, wbs, flux, src, residual); + + typedef Kokkos::View > t_3DView_const; + t_3DView_const flux_const = flux; + + // Run once to warm up, complete any UVM transfers + run_analytic_flat(flux_const, wgb, src, wbs, residual); + + // Time execution + Kokkos::fence(); + Kokkos::Impl::Timer timer; + for (int i=0; i(flux_const, wgb, src, wbs, residual); + Kokkos::fence(); + double time = timer.seconds() / ntrial / ncells; + + // Check result + if (check) + check_residual(flux, wgb, src, wbs, residual); + + return time; +} + +template +double time_analytic_team(int ncells, int num_basis, int num_points, int ndim, + int ntrial, bool check) +{ + typedef Kokkos::View t_4DView_d; + typedef Kokkos::View t_3DView_d; + typedef Kokkos::View t_3DView; + typedef Kokkos::View t_2DView; + + t_4DView_d wgb("",ncells,num_basis,num_points,ndim); + t_3DView_d wbs("",ncells,num_basis,num_points); + t_3DView flux("",ncells,num_points,ndim); + t_2DView src("",ncells,num_points); + t_2DView residual("",ncells,num_basis); + init_array(wgb, wbs, flux, src, residual); + + typedef Kokkos::View > t_3DView_const; + t_3DView_const flux_const = flux; + + // Run once to warm up, complete any UVM transfers + run_analytic_team(flux_const, wgb, src, wbs, residual); + + // Time execution + Kokkos::fence(); + Kokkos::Impl::Timer timer; + for (int i=0; i(flux_const, wgb, src, wbs, residual); + Kokkos::fence(); + double time = timer.seconds() / ntrial / ncells; + + // Check result + if (check) + check_residual(flux, wgb, src, wbs, residual); + + return time; +} + +#define INST_FUNC_FAD_N_DEV(FAD,N,DEV) \ + template double time_fad_flat< FAD, N, DEV >(int ncells, int num_basis, int num_points, int ndim, int ntrial, bool check); \ + template double time_fad_scratch< FAD, N, DEV >(int ncells, int num_basis, int num_points, int ndim, int ntrial, bool check); + +#define INST_FUNC_N_DEV(N,DEV) \ + INST_FUNC_FAD_N_DEV(SFadType,N,DEV) \ + INST_FUNC_FAD_N_DEV(DFadType,N,DEV) \ + template double time_analytic_flat< N, DEV >(int ncells, int num_basis, int num_points, int ndim, int ntrial, bool check); \ + template double time_analytic_const< N, DEV >(int ncells, int num_basis, int num_points, int ndim, int ntrial, bool check); \ + template double time_analytic_team< N, DEV >(int ncells, int num_basis, int num_points, int ndim, int ntrial, bool check); + +#define INST_FUNC_DEV(DEV) \ + INST_FUNC_N_DEV( fad_dim, DEV ) + +#ifdef KOKKOS_ENABLE_SERIAL +INST_FUNC_DEV(Kokkos::Serial) +#endif + +#ifdef KOKKOS_ENABLE_OPENMP +INST_FUNC_DEV(Kokkos::OpenMP) +#endif + +#ifdef KOKKOS_ENABLE_THREADS +INST_FUNC_DEV(Kokkos::Threads) +#endif + +#ifdef KOKKOS_ENABLE_CUDA +INST_FUNC_DEV(Kokkos::Cuda) +#endif diff --git a/packages/sacado/test/performance/advection_const_basis/advection.hpp b/packages/sacado/test/performance/advection_const_basis/advection.hpp new file mode 100644 index 000000000000..adca588cee11 --- /dev/null +++ b/packages/sacado/test/performance/advection_const_basis/advection.hpp @@ -0,0 +1,50 @@ +// @HEADER +// *********************************************************************** +// +// Sacado Package +// Copyright (2006) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// This library is free software; you can redistribute it and/or modify +// it under the terms of the GNU Lesser General Public License as +// published by the Free Software Foundation; either version 2.1 of the +// License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 +// USA +// Questions? Contact David M. Gay (dmgay@sandia.gov) or Eric T. Phipps +// (etphipp@sandia.gov). +// +// *********************************************************************** +// @HEADER + +#pragma once + +template +double time_fad_flat(int ncells, int num_basis, int num_points, int ndim, + int ntrial, bool check); + +template +double time_fad_scratch(int ncells, int num_basis, int num_points, int ndim, + int ntrial, bool check); + +template +double time_analytic_flat(int ncells, int num_basis, int num_points, int ndim, + int ntrial, bool check); + +template +double time_analytic_const(int ncells, int num_basis, int num_points, int ndim, + int ntrial, bool check); + +template +double time_analytic_team(int ncells, int num_basis, int num_points, int ndim, + int ntrial, bool check); diff --git a/packages/sacado/test/performance/advection_const_basis/advection_hierarchical.cpp b/packages/sacado/test/performance/advection_const_basis/advection_hierarchical.cpp new file mode 100644 index 000000000000..e66de586f231 --- /dev/null +++ b/packages/sacado/test/performance/advection_const_basis/advection_hierarchical.cpp @@ -0,0 +1,230 @@ +// @HEADER +// *********************************************************************** +// +// Sacado Package +// Copyright (2006) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// This library is free software; you can redistribute it and/or modify +// it under the terms of the GNU Lesser General Public License as +// published by the Free Software Foundation; either version 2.1 of the +// License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 +// USA +// Questions? Contact David M. Gay (dmgay@sandia.gov) or Eric T. Phipps +// (etphipp@sandia.gov). +// +// *********************************************************************** +// @HEADER + +#define SACADO_VIEW_CUDA_HIERARCHICAL 1 +#define SACADO_ALIGN_SFAD 1 + +#include "Sacado.hpp" +#include "advection_hierarchical.hpp" +#include "common.hpp" + +#include "impl/Kokkos_Timer.hpp" + +template +void run_fad_hierarchical_flat(const FluxView& flux, const WgbView& wgb, + const SrcView& src, const WbsView& wbs, + const ResidualView& residual) +{ + typedef typename ResidualView::execution_space execution_space; + typedef typename Kokkos::ThreadLocalScalarType::type local_scalar_type; + typedef Kokkos::TeamPolicy policy_type; + typedef typename policy_type::member_type team_member; + + const size_t num_cells = wgb.extent(0); + const int num_basis = wgb.extent(1); + const int num_points = wgb.extent(2); + const int num_dim = wgb.extent(3); + + const bool is_cuda = is_cuda_space::value; + const int vector_size = is_cuda ? 32 : 1; + const int team_size = is_cuda ? 256/vector_size : 1; + const size_t range = (num_cells+team_size-1)/team_size; + + policy_type policy(range,team_size,vector_size); + Kokkos::parallel_for(policy, KOKKOS_LAMBDA (const team_member& team) + { + const size_t cell = team.league_rank()*team_size + team.team_rank(); + local_scalar_type value, value2; + for (int basis=0; basis +void run_fad_hierarchical_team(const FluxView& flux, const WgbView& wgb, + const SrcView& src, const WbsView& wbs, + const ResidualView& residual) +{ + typedef typename ResidualView::execution_space execution_space; + typedef typename Kokkos::ThreadLocalScalarType::type local_scalar_type; + typedef Kokkos::TeamPolicy policy_type; + typedef typename policy_type::member_type team_member; + + const size_t num_cells = wgb.extent(0); + const int num_basis = wgb.extent(1); + const int num_points = wgb.extent(2); + const int num_dim = wgb.extent(3); + + const bool is_cuda = is_cuda_space::value; + const int vector_size = is_cuda ? 32 : 1; + const int team_size = is_cuda ? 256/vector_size : 1; + + policy_type policy(num_cells,team_size,vector_size); + Kokkos::parallel_for(policy, KOKKOS_LAMBDA (const team_member& team) + { + const int team_rank = team.team_rank(); + const size_t cell = team.league_rank(); + local_scalar_type value, value2; + for (int basis=team_rank; basis +double time_fad_hierarchical_flat(int ncells, int num_basis, int num_points, + int ndim, int ntrial, bool check) +{ + typedef Sacado::Fad::SFad FadType; + + static const int FadStride = is_cuda_space::value ? 32 : 1; +#if defined(SACADO_ALIGN_SFAD) + static const int Nalign = ((N+FadStride-1)/FadStride)*FadStride; + typedef typename FadType::template apply_N::type AlignedFadType; +#else + typedef FadType AlignedFadType; +#endif + + typedef typename ExecSpace::array_layout DefaultLayout; + typedef Kokkos::LayoutContiguous ContLayout; + typedef Kokkos::View t_4DView_d; + typedef Kokkos::View t_3DView_d; + typedef Kokkos::View t_3DView; + typedef Kokkos::View t_2DView; + + t_4DView_d wgb("",ncells,num_basis,num_points,ndim); + t_3DView_d wbs("",ncells,num_basis,num_points); + t_3DView flux("",ncells,num_points,ndim,N+1); + t_2DView src("",ncells,num_points,N+1); + t_2DView residual("",ncells,num_basis,N+1); + init_fad(wgb, wbs, flux, src, residual); + + // Run once to warm up, complete any UVM transfers + run_fad_hierarchical_flat(flux, wgb, src, wbs, residual); + + // Time execution + Kokkos::fence(); + Kokkos::Impl::Timer timer; + for (int i=0; i +double time_fad_hierarchical_team(int ncells, int num_basis, int num_points, + int ndim, int ntrial, bool check) +{ + typedef Sacado::Fad::SFad FadType; + + static const int FadStride = is_cuda_space::value ? 32 : 1; +#if defined(SACADO_ALIGN_SFAD) + static const int Nalign = ((N+FadStride-1)/FadStride)*FadStride; + typedef typename FadType::template apply_N::type AlignedFadType; +#else + typedef FadType AlignedFadType; +#endif + + typedef typename ExecSpace::array_layout DefaultLayout; + typedef Kokkos::LayoutContiguous ContLayout; + typedef Kokkos::View t_4DView_d; + typedef Kokkos::View t_3DView_d; + typedef Kokkos::View t_3DView; + typedef Kokkos::View t_2DView; + + t_4DView_d wgb("",ncells,num_basis,num_points,ndim); + t_3DView_d wbs("",ncells,num_basis,num_points); + t_3DView flux("",ncells,num_points,ndim,N+1); + t_2DView src("",ncells,num_points,N+1); + t_2DView residual("",ncells,num_basis,N+1); + init_fad(wgb, wbs, flux, src, residual); + + // Run once to warm up, complete any UVM transfers + run_fad_hierarchical_team(flux, wgb, src, wbs, residual); + + // Time execution + Kokkos::fence(); + Kokkos::Impl::Timer timer; + for (int i=0; i(int ncells, int num_basis, int num_points, int ndim, int ntrial, bool check); \ + template double time_fad_hierarchical_team< N, DEV >(int ncells, int num_basis, int num_points, int ndim, int ntrial, bool check); + +#define INST_FUNC_DEV(DEV) \ + INST_FUNC_N_DEV( fad_dim, DEV ) + +#ifdef KOKKOS_ENABLE_SERIAL +INST_FUNC_DEV(Kokkos::Serial) +#endif + +#ifdef KOKKOS_ENABLE_OPENMP +INST_FUNC_DEV(Kokkos::OpenMP) +#endif + +#ifdef KOKKOS_ENABLE_THREADS +INST_FUNC_DEV(Kokkos::Threads) +#endif + +#ifdef KOKKOS_ENABLE_CUDA +INST_FUNC_DEV(Kokkos::Cuda) +#endif diff --git a/packages/sacado/test/performance/advection_const_basis/advection_hierarchical.hpp b/packages/sacado/test/performance/advection_const_basis/advection_hierarchical.hpp new file mode 100644 index 000000000000..f78f4316897b --- /dev/null +++ b/packages/sacado/test/performance/advection_const_basis/advection_hierarchical.hpp @@ -0,0 +1,38 @@ +// @HEADER +// *********************************************************************** +// +// Sacado Package +// Copyright (2006) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// This library is free software; you can redistribute it and/or modify +// it under the terms of the GNU Lesser General Public License as +// published by the Free Software Foundation; either version 2.1 of the +// License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 +// USA +// Questions? Contact David M. Gay (dmgay@sandia.gov) or Eric T. Phipps +// (etphipp@sandia.gov). +// +// *********************************************************************** +// @HEADER + +#pragma once + +template +double time_fad_hierarchical_flat(int ncells, int num_basis, int num_points, + int ndim, int ntrial, bool check); + +template +double time_fad_hierarchical_team(int ncells, int num_basis, int num_points, + int ndim, int ntrial, bool check); diff --git a/packages/sacado/test/performance/advection_const_basis/advection_hierarchical_dfad.cpp b/packages/sacado/test/performance/advection_const_basis/advection_hierarchical_dfad.cpp new file mode 100644 index 000000000000..980e9b6eec82 --- /dev/null +++ b/packages/sacado/test/performance/advection_const_basis/advection_hierarchical_dfad.cpp @@ -0,0 +1,239 @@ +// @HEADER +// *********************************************************************** +// +// Sacado Package +// Copyright (2006) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// This library is free software; you can redistribute it and/or modify +// it under the terms of the GNU Lesser General Public License as +// published by the Free Software Foundation; either version 2.1 of the +// License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 +// USA +// Questions? Contact David M. Gay (dmgay@sandia.gov) or Eric T. Phipps +// (etphipp@sandia.gov). +// +// *********************************************************************** +// @HEADER + +#define SACADO_VIEW_CUDA_HIERARCHICAL_DFAD 1 +#define SACADO_KOKKOS_USE_MEMORY_POOL 1 + +#include "Sacado.hpp" +#include "advection_hierarchical_dfad.hpp" +#include "common.hpp" + +#include "impl/Kokkos_Timer.hpp" + +template +void run_dfad_hierarchical_team(const FluxView& flux, const WgbView& wgb, + const SrcView& src, const WbsView& wbs, + const ResidualView& residual) +{ + typedef typename ResidualView::execution_space execution_space; + typedef typename ResidualView::non_const_value_type scalar_type; + typedef Kokkos::TeamPolicy policy_type; + typedef typename policy_type::member_type team_member; + + const size_t num_cells = wgb.extent(0); + const int num_basis = wgb.extent(1); + const int num_points = wgb.extent(2); + const int num_dim = wgb.extent(3); + + const bool is_cuda = is_cuda_space::value; + const int vector_size = is_cuda ? 32 : 1; + const int team_size = is_cuda ? 256/vector_size : 1; + + policy_type policy(num_cells,team_size,vector_size); + Kokkos::parallel_for(policy, KOKKOS_LAMBDA (const team_member& team) + { + const int team_rank = team.team_rank(); + const size_t cell = team.league_rank(); + scalar_type value, value2; + for (int basis=team_rank; basis +void run_dfad_hierarchical_team_scratch( + const FluxView& flux, const WgbView& wgb, + const SrcView& src, const WbsView& wbs, + const ResidualView& residual) +{ + typedef typename ResidualView::execution_space execution_space; + typedef typename ResidualView::non_const_value_type scalar_type; + typedef Kokkos::TeamPolicy policy_type; + typedef typename policy_type::member_type team_member; + typedef Kokkos::View > tmp_scratch_type; + + const size_t num_cells = wgb.extent(0); + const int num_basis = wgb.extent(1); + const int num_points = wgb.extent(2); + const int num_dim = wgb.extent(3); + + const bool is_cuda = is_cuda_space::value; + const int vector_size = is_cuda ? 32 : 1; + const int team_size = is_cuda ? 256/vector_size : 1; + const int fad_size = Kokkos::dimension_scalar(residual); + const size_t bytes = 2*tmp_scratch_type::shmem_size(team_size,fad_size); + policy_type policy(num_cells,team_size,vector_size); + + Kokkos::parallel_for(policy.set_scratch_size(0,Kokkos::PerTeam(bytes)), + KOKKOS_LAMBDA (const team_member& team) + { + const int team_rank = team.team_rank(); + const size_t cell = team.league_rank(); + tmp_scratch_type value(team.team_scratch(0), team_size, fad_size); + tmp_scratch_type value2(team.team_scratch(0), team_size, fad_size); + for (int basis=team_rank; basis +double time_dfad_hierarchical_team(int ncells, int num_basis, int num_points, + int ndim, int ntrial, bool check) +{ + typedef Sacado::Fad::DFad FadType; + + typedef typename ExecSpace::array_layout DefaultLayout; + typedef Kokkos::LayoutContiguous ContLayout; + typedef Kokkos::View t_4DView_d; + typedef Kokkos::View t_3DView_d; + typedef Kokkos::View t_3DView; + typedef Kokkos::View t_2DView; + + t_4DView_d wgb("",ncells,num_basis,num_points,ndim); + t_3DView_d wbs("",ncells,num_basis,num_points); + t_3DView flux("",ncells,num_points,ndim,N+1); + t_2DView src("",ncells,num_points,N+1); + t_2DView residual("",ncells,num_basis,N+1); + init_fad(wgb, wbs, flux, src, residual); + + // Create memory pool for DFad + // The kernel allocates 2*N double's per warp on Cuda. Approximate + // the maximum number of warps as the maximum concurrency / 32. + // Include a fudge factor of 1.2 since memory pool treats a block as full + // once it reaches 80% capacity + const size_t block_size = N*sizeof(double); + size_t nkernels = ExecSpace::concurrency()*2; + if (is_cuda_space::value) + nkernels /= 32; + const size_t mem_pool_size = static_cast(1.2*nkernels*block_size); + const size_t superblock_size = + std::max(nkernels / 100, 1) * block_size; + ExecSpace exec_space; + Sacado::createGlobalMemoryPool(exec_space, mem_pool_size, + block_size, block_size, superblock_size); + + // Run once to warm up, complete any UVM transfers + run_dfad_hierarchical_team(flux, wgb, src, wbs, residual); + + // Time execution + Kokkos::fence(); + Kokkos::Impl::Timer timer; + for (int i=0; i +double time_dfad_hierarchical_team_scratch( + int ncells, int num_basis, int num_points, int ndim, int ntrial, bool check) +{ + typedef Sacado::Fad::DFad FadType; + + typedef typename ExecSpace::array_layout DefaultLayout; + typedef Kokkos::LayoutContiguous ContLayout; + typedef Kokkos::View t_4DView_d; + typedef Kokkos::View t_3DView_d; + typedef Kokkos::View t_3DView; + typedef Kokkos::View t_2DView; + + t_4DView_d wgb("",ncells,num_basis,num_points,ndim); + t_3DView_d wbs("",ncells,num_basis,num_points); + t_3DView flux("",ncells,num_points,ndim,N+1); + t_2DView src("",ncells,num_points,N+1); + t_2DView residual("",ncells,num_basis,N+1); + init_fad(wgb, wbs, flux, src, residual); + + // Run once to warm up, complete any UVM transfers + run_dfad_hierarchical_team_scratch(flux, wgb, src, wbs, residual); + + // Time execution + Kokkos::fence(); + Kokkos::Impl::Timer timer; + for (int i=0; i(int ncells, int num_basis, int num_points, int ndim, int ntrial, bool check); \ + template double time_dfad_hierarchical_team_scratch< N, DEV >(int ncells, int num_basis, int num_points, int ndim, int ntrial, bool check); + +#define INST_FUNC_DEV(DEV) \ + INST_FUNC_N_DEV( fad_dim, DEV ) + +#ifdef KOKKOS_ENABLE_SERIAL +INST_FUNC_DEV(Kokkos::Serial) +#endif + +#ifdef KOKKOS_ENABLE_OPENMP +INST_FUNC_DEV(Kokkos::OpenMP) +#endif + +#ifdef KOKKOS_ENABLE_THREADS +INST_FUNC_DEV(Kokkos::Threads) +#endif + +#ifdef KOKKOS_ENABLE_CUDA +INST_FUNC_DEV(Kokkos::Cuda) +#endif diff --git a/packages/sacado/test/performance/advection_const_basis/advection_hierarchical_dfad.hpp b/packages/sacado/test/performance/advection_const_basis/advection_hierarchical_dfad.hpp new file mode 100644 index 000000000000..9933d1102f73 --- /dev/null +++ b/packages/sacado/test/performance/advection_const_basis/advection_hierarchical_dfad.hpp @@ -0,0 +1,39 @@ +// @HEADER +// *********************************************************************** +// +// Sacado Package +// Copyright (2006) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// This library is free software; you can redistribute it and/or modify +// it under the terms of the GNU Lesser General Public License as +// published by the Free Software Foundation; either version 2.1 of the +// License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 +// USA +// Questions? Contact David M. Gay (dmgay@sandia.gov) or Eric T. Phipps +// (etphipp@sandia.gov). +// +// *********************************************************************** +// @HEADER + +#pragma once + +template +double time_dfad_hierarchical_team(int ncells, int num_basis, int num_points, + int ndim, int ntrial, bool check); + +template +double time_dfad_hierarchical_team_scratch(int ncells, int num_basis, + int num_points, int ndim, int ntrial, + bool check); diff --git a/packages/sacado/test/performance/advection_const_basis/common.hpp b/packages/sacado/test/performance/advection_const_basis/common.hpp new file mode 100644 index 000000000000..a7616179f733 --- /dev/null +++ b/packages/sacado/test/performance/advection_const_basis/common.hpp @@ -0,0 +1,348 @@ +// @HEADER +// *********************************************************************** +// +// Sacado Package +// Copyright (2006) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// This library is free software; you can redistribute it and/or modify +// it under the terms of the GNU Lesser General Public License as +// published by the Free Software Foundation; either version 2.1 of the +// License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 +// USA +// Questions? Contact David M. Gay (dmgay@sandia.gov) or Eric T. Phipps +// (etphipp@sandia.gov). +// +// *********************************************************************** +// @HEADER + +#pragma once + +const int fad_dim = 50; +typedef Sacado::Fad::SFad SFadType; +typedef Sacado::Fad::DFad DFadType; + +template +struct is_cuda_space { + static const bool value = false; +}; + +#ifdef KOKKOS_ENABLE_CUDA +template <> +struct is_cuda_space { + static const bool value = true; +}; +#endif + +template +scalar +generate_fad(const size_t n0, const size_t n1, + const size_t n2, const size_t n3, const int fad_size, + const size_t i0, const size_t i1, + const size_t i2, const size_t i3, + const int i_fad) +{ + const scalar x0 = 10.0 + scalar(n0) / scalar(i0+1); + const scalar x1 = 100.0 + scalar(n1) / scalar(i1+1); + const scalar x2 = 1000.0 + scalar(n2) / scalar(i2+1); + const scalar x3 = 10000.0 + scalar(n3) / scalar(i3+1); + const scalar x = x0 + x1 + x2 + x3; + if (i_fad == fad_size) + return x; + const scalar x_fad = 1.0 + scalar(fad_size) / scalar(i_fad+1); + return x + x_fad; +} + +template +void init_fad(const WgbView& wgb, const WbsView& wbs, const FluxView& flux, + const SrcView& src, const ResidualView& residual) +{ + typedef typename ResidualView::non_const_value_type::value_type scalar; + + const int ncells = wgb.extent(0); + const int num_basis = wgb.extent(1); + const int num_points = wgb.extent(2); + const int ndim = wgb.extent(3); + const int N = Kokkos::dimension_scalar(residual)-1; + + auto wgb_h = Kokkos::create_mirror_view(wgb); + auto wbs_h = Kokkos::create_mirror_view(wbs); + auto flux_h = Kokkos::create_mirror_view(flux); + auto src_h = Kokkos::create_mirror_view(src); + for (int cell=0; cell(ncells,num_basis,num_points,ndim,N,cell,basis,qp,dim,N); + } + wbs_h(cell,basis,qp) = + generate_fad(ncells,num_basis,num_points,1,N,cell,basis,qp,0,N); + } + } + for (int qp=0; qp(ncells,1,num_points,ndim,N,cell,0,qp,dim,i); + flux_h(cell,qp,dim).val() = + generate_fad(ncells,1,num_points,ndim,N,cell,0,qp,dim,N); + } + for (int i=0; i(ncells,1,num_points,1,N,cell,0,qp,0,i); + src_h(cell,qp).val() = + generate_fad(ncells,1,num_points,1,N,cell,0,qp,0,N); + } + } + + Kokkos::deep_copy( wgb, wgb_h ); + Kokkos::deep_copy( wbs, wbs_h ); + Kokkos::deep_copy( flux, flux_h ); + Kokkos::deep_copy( src, src_h ); + + Kokkos::deep_copy(typename ResidualView::array_type(residual), 0.0); +} + +template +void init_array(const WgbView& wgb, const WbsView& wbs, const FluxView& flux, + const SrcView& src, const ResidualView& residual) +{ + typedef typename ResidualView::non_const_value_type scalar; + + const int ncells = wgb.extent(0); + const int num_basis = wgb.extent(1); + const int num_points = wgb.extent(2); + const int ndim = wgb.extent(3); + const int N = residual.extent(2)-1; + + auto wgb_h = Kokkos::create_mirror_view(wgb); + auto wbs_h = Kokkos::create_mirror_view(wbs); + auto flux_h = Kokkos::create_mirror_view(flux); + auto src_h = Kokkos::create_mirror_view(src); + for (int cell=0; cell(ncells,num_basis,num_points,ndim,N,cell,basis,qp,dim,N); + } + wbs_h(cell,basis,qp) = + generate_fad(ncells,num_basis,num_points,1,N,cell,basis,qp,0,N); + } + } + for (int qp=0; qp(ncells,1,num_points,ndim,N,cell,0,qp,dim,i); + flux_h(cell,qp,dim,N) = + generate_fad(ncells,1,num_points,ndim,N,cell,0,qp,dim,N); + } + for (int i=0; i(ncells,1,num_points,1,N,cell,0,qp,0,i); + src_h(cell,qp,N) = + generate_fad(ncells,1,num_points,1,N,cell,0,qp,0,N); + } + } + + Kokkos::deep_copy( wgb, wgb_h ); + Kokkos::deep_copy( wbs, wbs_h ); + Kokkos::deep_copy( flux, flux_h ); + Kokkos::deep_copy( src, src_h ); + + Kokkos::deep_copy(residual, 0.0); +} + +template +typename std::enable_if< !Kokkos::is_view_fad::value, bool>::type +check(const View1& v_gold, const View2& v, const double tol) +{ + // Copy to host + typename View1::HostMirror v_gold_h = Kokkos::create_mirror_view(v_gold); + typename View2::HostMirror v_h = Kokkos::create_mirror_view(v); + Kokkos::deep_copy(v_gold_h, v_gold); + Kokkos::deep_copy(v_h, v); + + typedef typename View1::value_type value_type; + + const size_t n0 = v_gold_h.extent(0); + const size_t n1 = v_gold_h.extent(1); + const size_t n2 = v_gold_h.extent(2); + + bool success = true; + for ( size_t i0 = 0 ; i0 < n0 ; ++i0 ) { + for ( size_t i1 = 0 ; i1 < n1 ; ++i1 ) { + for ( size_t i2 = 0 ; i2 < n2 ; ++i2 ) { + value_type x_gold = v_gold_h(i0,i1,i2); + value_type x = v_h(i0,i1,i2); + if (std::abs(x_gold-x) > tol*std::abs(x_gold)) { + std::cout << "Comparison failed! x_gold(" + << i0 << "," << i1 << "," << i2 << ") = " + << x_gold << " , x = " << x + << std::endl; + success = false; + } + } + } + } + + return success; +} + +template +typename std::enable_if< Kokkos::is_view_fad::value, bool>::type +check(const View1& v_gold, const View2& v, const double tol) +{ + // Copy to host + typename View1::HostMirror v_gold_h = Kokkos::create_mirror_view(v_gold); + typename View2::HostMirror v_h = Kokkos::create_mirror_view(v); + Kokkos::deep_copy(v_gold_h, v_gold); + Kokkos::deep_copy(v_h, v); + + typedef typename View1::value_type value_type; + + const size_t n0 = v_gold_h.extent(0); + const size_t n1 = v_gold_h.extent(1); + const size_t n2 = v_gold_h.extent(2); + + bool success = true; + for ( size_t i0 = 0 ; i0 < n0 ; ++i0 ) { + for ( size_t i1 = 0 ; i1 < n1 ; ++i1 ) { + for ( size_t i2 = 0 ; i2 < n2 ; ++i2 ) { + value_type x_gold = v_gold_h(i0,i1,i2); + value_type x = (i2 == n2-1) ? v_h(i0,i1).val() : v_h(i0,i1).dx(i2); + if (std::abs(x_gold-x) > tol*std::abs(x_gold)) { + std::cout << "Comparison failed! x_gold(" + << i0 << "," << i1 << "," << i2 << ") = " + << x_gold << " , x = " << x + << std::endl; + success = false; + } + } + } + } + + return success; +} + +template +Kokkos::View +compute_gold_residual( + const FluxView& flux, const WgbView& wgb, const SrcView& src, + const WbsView& wbs, + typename std::enable_if< Kokkos::is_view_fad::value>::type* = 0) +{ + typedef typename FluxView::execution_space execution_space; + + const size_t num_cells = wgb.extent(0); + const int num_basis = wgb.extent(1); + const int num_points = wgb.extent(2); + const int num_dim = wgb.extent(3); + const int N = Kokkos::dimension_scalar(flux)-1; + + Kokkos::View residual( + "",num_cells,num_basis,N+1); + + Kokkos::parallel_for(Kokkos::RangePolicy( 0,num_cells ), + KOKKOS_LAMBDA (const size_t cell) + { + double value, value2; + + // Value + for (int basis=0; basis +Kokkos::View +compute_gold_residual( + const FluxView& flux, const WgbView& wgb, const SrcView& src, + const WbsView& wbs, + typename std::enable_if< !Kokkos::is_view_fad::value>::type* = 0) +{ + typedef typename FluxView::execution_space execution_space; + + const size_t num_cells = wgb.extent(0); + const int num_basis = wgb.extent(1); + const int num_points = wgb.extent(2); + const int num_dim = wgb.extent(3); + const int N = flux.extent(3)-1; + + Kokkos::View residual( + "",num_cells,num_basis,N+1); + + Kokkos::parallel_for(Kokkos::RangePolicy( 0,num_cells ), + KOKKOS_LAMBDA (const size_t cell) + { + double value, value2; + for (int k=0; k<=N; ++k) { + for (int basis=0; basis +void check_residual(const FluxView& flux, const WgbView& wgb, + const SrcView& src, const WbsView& wbs, + const ResidualView& residual) +{ + // Generate gold residual + auto residual_gold = compute_gold_residual(flux, wgb, src, wbs); + + // Compare residual and residual_gold + const double tol = 1.0e-14; + check(residual_gold, residual, tol); +} diff --git a/packages/sacado/test/performance/advection_const_basis/driver.cpp b/packages/sacado/test/performance/advection_const_basis/driver.cpp new file mode 100644 index 000000000000..1812cc80ed0a --- /dev/null +++ b/packages/sacado/test/performance/advection_const_basis/driver.cpp @@ -0,0 +1,164 @@ +// @HEADER +// *********************************************************************** +// +// Sacado Package +// Copyright (2006) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// This library is free software; you can redistribute it and/or modify +// it under the terms of the GNU Lesser General Public License as +// published by the Free Software Foundation; either version 2.1 of the +// License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 +// USA +// Questions? Contact David M. Gay (dmgay@sandia.gov) or Eric T. Phipps +// (etphipp@sandia.gov). +// +// *********************************************************************** +// @HEADER + +// A performance test that computes the derivative of a simple Kokkos kernel +// using various Fad classes + +#include "Sacado.hpp" + +#include "advection.hpp" +#include "advection_hierarchical.hpp" +#include "advection_hierarchical_dfad.hpp" +#include "common.hpp" + +#include "Teuchos_CommandLineProcessor.hpp" +#include "Teuchos_StandardCatchMacros.hpp" + +template +void run(const int cell_begin, const int cell_end, const int cell_step, + const int nbasis, const int npoint, const int ntrial, const bool check) +{ + const int ndim = 3; + printf("ncell %12s %12s %12s %12s %12s %12s %12s %12s %12s\n", "flat sfad", "flat dfad", "dfad sc", "analytic", "const", "team", "hier sfad", "hier dfad", "h dfad sc"); + for(int i=cell_begin; i<=cell_end; i+=cell_step) { + double sfad_flat = time_fad_flat( + i,nbasis,npoint,ndim,ntrial,check); + double dfad_flat = time_fad_flat( + i,nbasis,npoint,ndim,ntrial,check); + double dfad_scratch = time_fad_scratch( + i,nbasis,npoint,ndim,ntrial,check); + double analytic = time_analytic_flat( + i,nbasis,npoint,ndim,ntrial,check); + double analytic_const = time_analytic_const( + i,nbasis,npoint,ndim,ntrial,check); + double analytic_team = time_analytic_team( + i,nbasis,npoint,ndim,ntrial,check); + double sfad_hierarchical = time_fad_hierarchical_team( + i,nbasis,npoint,ndim,ntrial,check); + double dfad_hierarchical = time_dfad_hierarchical_team( + i,nbasis,npoint,ndim,ntrial,check); + double dfad_hierarchical_scratch = + time_dfad_hierarchical_team_scratch( + i,nbasis,npoint,ndim,ntrial,check); + printf("%5d %12.3e %12.3e %12.3e %12.3e %12.3e %12.3e %12.3e %12.3e %12.3e\n",i,sfad_flat,dfad_flat,dfad_scratch,analytic,analytic_const,analytic_team,sfad_hierarchical,dfad_hierarchical,dfad_hierarchical_scratch); + } +} + +int main(int argc, char* argv[]) { + Kokkos::initialize(argc,argv); + + bool success = true; + try { + + // Set up command line options + Teuchos::CommandLineProcessor clp(false); + clp.setDocString("This program tests the speed of various forward mode AD implementations for simple Kokkos kernel"); +#ifdef KOKKOS_ENABLE_SERIAL + bool serial = 0; + clp.setOption("serial", "no-serial", &serial, "Whether to run Serial"); +#endif +#ifdef KOKKOS_ENABLE_OPENMP + bool openmp = 0; + clp.setOption("openmp", "no-openmp", &openmp, "Whether to run OpenMP"); +#endif +#ifdef KOKKOS_ENABLE_THREADS + bool threads = 0; + clp.setOption("threads", "no-threads", &threads, "Whether to run Threads"); +#endif +#ifdef KOKKOS_ENABLE_CUDA + bool cuda = 0; + clp.setOption("cuda", "no-cuda", &cuda, "Whether to run CUDA"); +#endif + bool print_config = false; + clp.setOption("print-config", "no-print-config", &print_config, + "Whether to print Kokkos device configuration"); + int cell_begin = 100; + clp.setOption("begin", &cell_begin, "Starting number of cells"); + int cell_end = 8000; + clp.setOption("end", &cell_end, "Ending number of cells"); + int cell_step = 100; + clp.setOption("step", &cell_step, "Cell increment"); + int nbasis = 8; + clp.setOption("basis", &nbasis, "Number of basis functions"); + int npoint = 8; + clp.setOption("point", &npoint, "Number of integration points"); + int ntrial = 5; + clp.setOption("trial", &ntrial, "Number of trials"); + bool check = false; + clp.setOption("check", "no-check", &check, + "Check correctness of results"); + + // Parse options + switch (clp.parse(argc, argv)) { + case Teuchos::CommandLineProcessor::PARSE_HELP_PRINTED: + return 0; + case Teuchos::CommandLineProcessor::PARSE_ERROR: + case Teuchos::CommandLineProcessor::PARSE_UNRECOGNIZED_OPTION: + return 1; + case Teuchos::CommandLineProcessor::PARSE_SUCCESSFUL: + break; + } + + if (print_config) + Kokkos::print_configuration(std::cout, true); + +#ifdef KOKKOS_ENABLE_SERIAL + if (serial) { + using Kokkos::Serial; + run(cell_begin, cell_end, cell_step, nbasis, npoint, ntrial, check); + } +#endif + +#ifdef KOKKOS_ENABLE_OPENMP + if (openmp) { + using Kokkos::OpenMP; + run(cell_begin, cell_end, cell_step, nbasis, npoint, ntrial, check); + } +#endif + +#ifdef KOKKOS_ENABLE_THREADS + if (threads) { + using Kokkos::Threads; + run(cell_begin, cell_end, cell_step, nbasis, npoint, ntrial, check); + } +#endif + +#ifdef KOKKOS_ENABLE_CUDA + if (cuda) { + using Kokkos::Cuda; + run(cell_begin, cell_end, cell_step, nbasis, npoint, ntrial, check); + } +#endif + } + TEUCHOS_STANDARD_CATCH_STATEMENTS(true, std::cerr, success); + + Kokkos::finalize(); + + return !success; +} diff --git a/packages/sacado/test/performance/mat_vec/CMakeLists.txt b/packages/sacado/test/performance/mat_vec/CMakeLists.txt new file mode 100644 index 000000000000..34a887346a65 --- /dev/null +++ b/packages/sacado/test/performance/mat_vec/CMakeLists.txt @@ -0,0 +1,22 @@ +ASSERT_DEFINED(PACKAGE_SOURCE_DIR CMAKE_CURRENT_SOURCE_DIR) + +INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR}) +INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) + +IF (Sacado_ENABLE_KokkosCore AND Sacado_ENABLE_TeuchosCore) + + IF(NOT ((CMAKE_CXX_COMPILER_ID STREQUAL "GNU") AND (CMAKE_CXX_COMPILER_VERSION VERSION_LESS "4.8"))) + + TRIBITS_ADD_EXECUTABLE( + FadMatVecTest + SOURCES common.hpp + mat_vec.hpp mat_vec.cpp + mat_vec_hierarchical.hpp mat_vec_hierarchical.cpp + mat_vec_hierarchical_dfad.hpp mat_vec_hierarchical_dfad.cpp + driver.cpp + COMM serial mpi + ) + + ENDIF() + +ENDIF() diff --git a/packages/sacado/test/performance/mat_vec/common.hpp b/packages/sacado/test/performance/mat_vec/common.hpp new file mode 100644 index 000000000000..246efd24e5d1 --- /dev/null +++ b/packages/sacado/test/performance/mat_vec/common.hpp @@ -0,0 +1,45 @@ +// @HEADER +// *********************************************************************** +// +// Sacado Package +// Copyright (2006) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// This library is free software; you can redistribute it and/or modify +// it under the terms of the GNU Lesser General Public License as +// published by the Free Software Foundation; either version 2.1 of the +// License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 +// USA +// Questions? Contact David M. Gay (dmgay@sandia.gov) or Eric T. Phipps +// (etphipp@sandia.gov). +// +// *********************************************************************** +// @HEADER + +#pragma once + +//#define SACADO_DISABLE_FAD_VIEW_SPEC + +#include "impl/Kokkos_Timer.hpp" + +struct Perf { + double time; + double flops; + double throughput; +}; + +const int SFadSize = 32; +const int SLFadSize = SFadSize; +const int HierSFadSize = 32; +const int HierSLFadSize = HierSFadSize; diff --git a/packages/sacado/test/performance/mat_vec/driver.cpp b/packages/sacado/test/performance/mat_vec/driver.cpp new file mode 100644 index 000000000000..d7a39362e7f7 --- /dev/null +++ b/packages/sacado/test/performance/mat_vec/driver.cpp @@ -0,0 +1,386 @@ +// @HEADER +// *********************************************************************** +// +// Sacado Package +// Copyright (2006) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// This library is free software; you can redistribute it and/or modify +// it under the terms of the GNU Lesser General Public License as +// published by the Free Software Foundation; either version 2.1 of the +// License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 +// USA +// Questions? Contact David M. Gay (dmgay@sandia.gov) or Eric T. Phipps +// (etphipp@sandia.gov). +// +// *********************************************************************** +// @HEADER + +// A performance test that computes the derivative of a simple Kokkos kernel +// using various Fad classes + +#include "mat_vec.hpp" +#include "mat_vec_hierarchical.hpp" +#include "mat_vec_hierarchical_dfad.hpp" + +#include "Sacado.hpp" + +#include "Teuchos_CommandLineProcessor.hpp" +#include "Teuchos_StandardCatchMacros.hpp" + +// For vtune +#include +#include +#include + +void +print_perf(const Perf& perf, const Perf& perf_base, const size_t p, + const std::string& name) +{ + std::cout << name << "\t " + << perf.time << "\t " + << perf.throughput << "\t " + << perf.time / perf_base.time + << std::endl; +} + +template +void +do_times(const size_t m, + const size_t n, + const size_t p, + const size_t nloop, + const bool value, + const bool analytic, + const bool sfad, + const bool slfad, + const bool dfad, + const bool flat, + const bool hierarchical, + const bool check) +{ + Perf perf_value; + perf_value.time = 1.0; + + // Run value + if (value) { + try { + Perf perf = do_time_val(m,n,nloop,check); + perf_value = perf; + print_perf(perf, perf_value, p, "Value "); + } + catch(std::exception& e) { + std::cout << e.what() << std::endl; + } + } + + // Run analytic + if (analytic) { + try { + Perf perf = + do_time_analytic(m,n,p,nloop,check); + print_perf(perf, perf_value, p, "Analytic "); + } + catch(std::exception& e) { + std::cout << e.what() << std::endl; + } + } + if(analytic && p == SFadSize) { + try { + Perf perf = + do_time_analytic_s(m,n,nloop,check); + print_perf(perf, perf_value, p, "Analytic-s"); + } + catch(std::exception& e) { + std::cout << e.what() << std::endl; + } + } + if(analytic && p <= SLFadSize) { + try { + Perf perf = + do_time_analytic_sl(m,n,p,nloop,check); + print_perf(perf, perf_value, p, "Analytic-sl"); + } + catch(std::exception& e) { + std::cout << e.what() << std::endl; + } + } + + // Run flat SFad + if (flat && sfad && p == SFadSize) { + try { + Perf perf = + do_time_fad, ViewArgs...>(m,n,p,nloop,check); + print_perf(perf, perf_value, p, "SFad "); + } + catch(std::exception& e) { + std::cout << e.what() << std::endl; + } + } + + // Run flat SLFad + if (flat && slfad && p <= SLFadSize) { + try { + Perf perf = + do_time_fad, ViewArgs...>(m,n,p,nloop,check); + print_perf(perf, perf_value, p, "SLFad "); + } + catch(std::exception& e) { + std::cout << e.what() << std::endl; + } + } + + // Run flat DFad + if (flat && dfad) { + try { + Perf perf = + do_time_fad, ViewArgs...>(m,n,p,nloop,check); + print_perf(perf, perf_value, p, "DFad "); + } + catch(std::exception& e) { + std::cout << e.what() << std::endl; + } + try { + Perf perf_scratch = + do_time_scratch, ViewArgs...>(m,n,p,nloop,check); + print_perf(perf_scratch, perf_value, p, "DFad Scratch"); + } + catch(std::exception& e) { + std::cout << e.what() << std::endl; + } + } + + // Run hierarchical SFad + if (hierarchical && sfad && p == HierSFadSize) { + try { + Perf perf = + do_time_fad_hierarchical, ViewArgs...>(m,n,p,nloop,check); + print_perf(perf, perf_value, p, "H. SFad "); + } + catch(std::exception& e) { + std::cout << e.what() << std::endl; + } + } + + // Run hierarchical SLFad + if (hierarchical && slfad && p <= HierSLFadSize) { + try { + Perf perf = + do_time_fad_hierarchical, ViewArgs...>(m,n,p,nloop,check); + print_perf(perf, perf_value, p, "H. SLFad "); + } + catch(std::exception& e) { + std::cout << e.what() << std::endl; + } + } + + // Run hierarchical DFad + if (hierarchical && dfad) { + try { + Perf perf = + do_time_fad_hierarchical_dfad, ViewArgs...>(m,n,p,nloop,check); + print_perf(perf, perf_value, p, "H. DFad "); + } + catch(std::exception& e) { + std::cout << e.what() << std::endl; + } + try { + Perf perf_scratch = + do_time_fad_hierarchical_dfad_scratch, ViewArgs...>(m,n,p,nloop,check); + print_perf(perf_scratch, perf_value, p, "H. DFad Scratch"); + } + catch(std::exception& e) { + std::cout << e.what() << std::endl; + } + } + +} + +enum LayoutType { + LAYOUT_LEFT=0, + LAYOUT_RIGHT, + LAYOUT_DEFAULT +}; +const int num_layout_types = 3; +const LayoutType layout_values[] = { + LAYOUT_LEFT, LAYOUT_RIGHT, LAYOUT_DEFAULT }; +const char *layout_names[] = { "left", "right", "default" }; + +template +void +do_times_layout(const size_t m, + const size_t n, + const size_t p, + const size_t nloop, + const bool value, + const bool analytic, + const bool sfad, + const bool slfad, + const bool dfad, + const bool flat, + const bool hierarchical, + const bool check, + const LayoutType& layout, + const std::string& device) +{ + int prec = 2; + std::cout.setf(std::ios::scientific); + std::cout.precision(prec); + std::cout << std::endl + << device + << " performance for layout " + << layout_names[layout] + << " m = " << m << " n = " << n << " p = " << p + << std::endl << std::endl; + std::cout << "Computation \t Time \t Throughput \t Ratio" << std::endl; + + if (layout == LAYOUT_LEFT) + do_times( + m,n,p,nloop,value,analytic,sfad,slfad,dfad,flat,hierarchical,check); + else if (layout == LAYOUT_RIGHT) + do_times( + m,n,p,nloop,value,analytic,sfad,slfad,dfad,flat,hierarchical,check); + else + do_times + (m,n,p,nloop,value,analytic,sfad,slfad,dfad,flat,hierarchical,check); +} + +// Connect executable to vtune for profiling +void connect_vtune() { + std::stringstream cmd; + pid_t my_os_pid=getpid(); + const std::string vtune_loc = + "amplxe-cl"; + const std::string output_dir = "./vtune"; + cmd << vtune_loc + << " -collect hotspots -result-dir " << output_dir + << " -target-pid " << my_os_pid << " &"; + std::cout << cmd.str() << std::endl; + system(cmd.str().c_str()); + system("sleep 10"); +} + +int main(int argc, char* argv[]) { + Kokkos::initialize(argc,argv); + + bool success = true; + try { + + // Set up command line options + Teuchos::CommandLineProcessor clp(false); + clp.setDocString("This program tests the speed of various forward mode AD implementations for simple Kokkos kernel"); + int m = 100000; + clp.setOption("m", &m, "Number of matrix rows"); + int n = 100; + clp.setOption("n", &n, "Number of matrix columns"); + int p = SFadSize; + clp.setOption("p", &p, "Number of derivative components"); + int nloop = 10; + clp.setOption("nloop", &nloop, "Number of loops"); +#ifdef KOKKOS_ENABLE_SERIAL + bool serial = 0; + clp.setOption("serial", "no-serial", &serial, "Whether to run Serial"); +#endif +#ifdef KOKKOS_ENABLE_OPENMP + bool openmp = 0; + clp.setOption("openmp", "no-openmp", &openmp, "Whether to run OpenMP"); +#endif +#ifdef KOKKOS_ENABLE_THREADS + bool threads = 0; + clp.setOption("threads", "no-threads", &threads, "Whether to run Threads"); +#endif +#ifdef KOKKOS_ENABLE_CUDA + bool cuda = 0; + clp.setOption("cuda", "no-cuda", &cuda, "Whether to run CUDA"); +#endif + bool print_config = false; + clp.setOption("print-config", "no-print-config", &print_config, + "Whether to print Kokkos device configuration"); + LayoutType layout = LAYOUT_DEFAULT; + clp.setOption("layout", &layout, num_layout_types, layout_values, + layout_names, "View layout"); + bool vtune = false; + clp.setOption("vtune", "no-vtune", &vtune, "Profile with vtune"); + bool value = true; + clp.setOption("value", "no-value", &value, "Run value calculation"); + bool analytic = true; + clp.setOption("analytic", "no-analytic", &analytic, + "Run analytic derivative calculation"); + bool sfad = true; + clp.setOption("sfad", "no-sfad", &sfad, "Run SFad derivative calculation"); + bool slfad = true; + clp.setOption("slfad", "no-slfad", &slfad, "Run SLFad derivative calculation"); + bool dfad = true; + clp.setOption("dfad", "no-dfad", &dfad, "Run DFad derivative calculation"); + bool flat = true; + clp.setOption("flat", "no-flat", &flat, "Run flat Fad derivative calculation"); + bool hierarchical = true; + clp.setOption("hierarchical", "no-hierarchical", &hierarchical, "Run hierarchical Fad derivative calculation"); + bool check = false; + clp.setOption("check", "no-check", &check, "Check calculations are correct"); + + // Parse options + switch (clp.parse(argc, argv)) { + case Teuchos::CommandLineProcessor::PARSE_HELP_PRINTED: + return 0; + case Teuchos::CommandLineProcessor::PARSE_ERROR: + case Teuchos::CommandLineProcessor::PARSE_UNRECOGNIZED_OPTION: + return 1; + case Teuchos::CommandLineProcessor::PARSE_SUCCESSFUL: + break; + } + + if (vtune) + connect_vtune(); + + if (print_config) + Kokkos::print_configuration(std::cout, true); + +#ifdef KOKKOS_ENABLE_SERIAL + if (serial) { + do_times_layout( + m,n,p,nloop,value,analytic,sfad,slfad,dfad,flat,hierarchical,check,layout,"Serial"); + } +#endif + +#ifdef KOKKOS_ENABLE_OPENMP + if (openmp) { + do_times_layout( + m,n,p,nloop,value,analytic,sfad,slfad,dfad,flat,hierarchical,check,layout,"OpenMP"); + } +#endif + +#ifdef KOKKOS_ENABLE_THREADS + if (threads) { + do_times_layout( + m,n,p,nloop,value,analytic,sfad,slfad,dfad,flat,hierarchical,check,layout,"Threads"); + } +#endif + +#ifdef KOKKOS_ENABLE_CUDA + if (cuda) { + do_times_layout( + m,n,p,nloop,value,analytic,sfad,slfad,dfad,flat,hierarchical,check,layout,"Cuda"); + } +#endif + + } + TEUCHOS_STANDARD_CATCH_STATEMENTS(true, std::cerr, success); + + Kokkos::finalize(); + + return !success; +} diff --git a/packages/sacado/test/performance/mat_vec/fad_kokkos_mat_vec_perf.cpp b/packages/sacado/test/performance/mat_vec/fad_kokkos_mat_vec_perf.cpp new file mode 100644 index 000000000000..e5c49429bb37 --- /dev/null +++ b/packages/sacado/test/performance/mat_vec/fad_kokkos_mat_vec_perf.cpp @@ -0,0 +1,680 @@ +// @HEADER +// *********************************************************************** +// +// Sacado Package +// Copyright (2006) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// This library is free software; you can redistribute it and/or modify +// it under the terms of the GNU Lesser General Public License as +// published by the Free Software Foundation; either version 2.1 of the +// License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 +// USA +// Questions? Contact David M. Gay (dmgay@sandia.gov) or Eric T. Phipps +// (etphipp@sandia.gov). +// +// *********************************************************************** +// @HEADER + +#define SACADO_VIEW_CUDA_HIERARCHICAL 1 +//#define SACADO_VIEW_CUDA_HIERARCHICAL_DFAD 1 +//#define SACADO_KOKKOS_USE_MEMORY_POOL 1 +#define SACADO_ALIGN_SFAD 1 + +//#define SACADO_DISABLE_FAD_VIEW_SPEC +#include "Sacado.hpp" + +#include "Teuchos_CommandLineProcessor.hpp" +#include "Teuchos_StandardCatchMacros.hpp" +#include "Teuchos_Time.hpp" + +#include "impl/Kokkos_Timer.hpp" + +// For vtune +#include +#include +#include + +// A performance test that computes the derivative of a simple Kokkos kernel +// using various Fad classes + +template +void run_mat_vec(const ViewTypeA& A, const ViewTypeB& b, const ViewTypeC& c) { + typedef typename ViewTypeC::value_type scalar_type; + typedef typename ViewTypeC::execution_space execution_space; + + const int m = A.extent(0); + const int n = A.extent(1); + Kokkos::parallel_for( + Kokkos::RangePolicy( 0,m ), + KOKKOS_LAMBDA (const int i) { + scalar_type t = 0.0; + for (int j=0; j +void run_mat_vec_hierarchical(const ViewTypeA& A, const ViewTypeB& b, + const ViewTypeC& c) { + typedef typename ViewTypeC::value_type scalar_type; + typedef typename ViewTypeC::execution_space execution_space; + +#if defined (KOKKOS_ENABLE_CUDA) + const bool is_cuda = std::is_same::value; +#else + const bool is_cuda = false; +#endif + const unsigned vector_size = is_cuda ? 32 : 1; + const unsigned team_size = is_cuda ? 128 / vector_size : 1; + + const int m = A.extent(0); + const int n = A.extent(1); + const int range = (m+team_size-1)/team_size; + + typedef Kokkos::TeamPolicy Policy; + Kokkos::parallel_for( + Policy( range,team_size,vector_size ), + KOKKOS_LAMBDA (const typename Policy::member_type& team) { + const int i = team.league_rank()*team.team_size() + team.team_rank(); + if (i >= m) + return; + + scalar_type t = 0.0; + for (int j=0; j +void run_mat_vec_hierarchical(const ViewTypeA& A, const ViewTypeB& b, + const ViewTypeC& c) { + typedef typename Kokkos::ThreadLocalScalarType::type scalar_type; + typedef typename ViewTypeC::execution_space execution_space; + +#if defined (KOKKOS_ENABLE_CUDA) + const bool is_cuda = std::is_same::value; +#else + const bool is_cuda = false; +#endif + const unsigned vector_size = is_cuda ? 32 : 1; + const unsigned team_size = is_cuda ? 128 / vector_size : 1; + + const int m = A.extent(0); + const int n = A.extent(1); + const int range = (m+team_size-1)/team_size; + + typedef Kokkos::TeamPolicy Policy; + Kokkos::parallel_for( + Policy( range,team_size,vector_size ), + KOKKOS_LAMBDA (const typename Policy::member_type& team) { + const int i = team.league_rank()*team.team_size() + team.team_rank(); + if (i >= m) + return; + + scalar_type t = 0.0; + for (int j=0; j +void run_mat_vec_hierarchical(const ViewTypeA& A, const ViewTypeB& b, + const ViewTypeC& c) { + typedef typename ViewTypeC::value_type scalar_type; + typedef typename ViewTypeC::execution_space execution_space; + +#if defined (KOKKOS_ENABLE_CUDA) + const bool is_cuda = std::is_same::value; +#else + const bool is_cuda = false; +#endif + const unsigned vector_size = 1; + const unsigned team_size = is_cuda ? 128 / vector_size : 1; + + const int m = A.extent(0); + const int n = A.extent(1); + const int range = (m+team_size-1)/team_size; + + typedef Kokkos::TeamPolicy Policy; + Kokkos::parallel_for( + Policy( range,team_size,vector_size ), + KOKKOS_LAMBDA (const typename Policy::member_type& team) { + const int i = team.league_rank()*team.team_size() + team.team_rank(); + if (i >= m) + return; + + scalar_type t = 0.0; + for (int j=0; j +void +check_val(const ViewTypeA& A, const ViewTypeB& b, const ViewTypeC& c) +{ + const double tol = 1.0e-14; + typedef typename ViewTypeC::value_type value_type; + typename ViewTypeC::HostMirror h_c = Kokkos::create_mirror_view(c); + Kokkos::deep_copy(h_c, c); + const size_t m = A.extent(0); + const size_t n = A.extent(1); + for (size_t i=0; i tol) { + std::cout << "Comparison failed! " << i << " : " << h_c(i) << " , " << t + << std::endl; + } + } +} + +template +void +check_deriv(const ViewTypeA& A, const ViewTypeB& b, const ViewTypeC& c) +{ + const double tol = 1.0e-14; + typedef typename ViewTypeC::value_type value_type; + typename ViewTypeC::HostMirror h_c = Kokkos::create_mirror_view(c); + Kokkos::deep_copy(h_c, c); + const size_t m = A.extent(0); + const size_t n = A.extent(1); + const size_t p = Kokkos::dimension_scalar(A); + for (size_t i=0; i tol) { + std::cout << "Comparison failed! " << i << "," << j << " : " + << h_c(i).fastAccessDx(j) << " , " << t << std::endl; + } + } + } +} + +struct Perf { + double time; + double flops; + double throughput; +}; + +template +Perf +do_time_fad(const size_t m, const size_t n, const size_t p, const size_t nloop, + const bool check) +{ + typedef Kokkos::View ViewTypeA; + typedef Kokkos::View ViewTypeB; + typedef Kokkos::View ViewTypeC; + typedef typename ViewTypeA::execution_space execution_space; + + ViewTypeA A("A",m,n,p+1); + ViewTypeB b("B",n,p+1); + ViewTypeC c("c",m,p+1); + + FadType a(p, 1.0); + for (size_t k=0; k +Perf +do_time_fad_hierarchical(const size_t m, const size_t n, const size_t p, + const size_t nloop, const bool check) +{ + typedef Kokkos::View ViewTypeA; + typedef Kokkos::View ViewTypeB; + typedef Kokkos::View ViewTypeC; + typedef typename ViewTypeA::execution_space execution_space; + +#if defined (KOKKOS_ENABLE_CUDA) + const bool is_cuda = std::is_same::value; +#else + const bool is_cuda = false; +#endif + +#if defined(SACADO_VIEW_CUDA_HIERARCHICAL) + const int FadStride = is_cuda ? 32 : 1; +#if defined(SACADO_ALIGN_SFAD) + const int N = Sacado::StaticSize::value; + const int Nalign = ((N+FadStride-1)/FadStride)*FadStride; + const size_t pa = N > 0 ? ((p+FadStride-1)/FadStride)*FadStride : p; + typedef typename FadType::template apply_N::type AlignedFadType; +#else + typedef FadType AlignedFadType; + const size_t pa = p; +#endif +#else + const int FadStride = 1; + typedef FadType AlignedFadType; + const size_t pa = p; +#endif + +#if defined(SACADO_VIEW_CUDA_HIERARCHICAL) || defined(SACADO_VIEW_CUDA_HIERARCHICAL_DFAD) + typedef Kokkos::LayoutContiguous ConLayoutA; + typedef Kokkos::LayoutContiguous ConLayoutB; + typedef Kokkos::LayoutContiguous ConLayoutC; +#else + typedef typename ViewTypeA::array_layout ConLayoutA; + typedef typename ViewTypeB::array_layout ConLayoutB; + typedef typename ViewTypeC::array_layout ConLayoutC; + (void) FadStride; +#endif + + + typedef Kokkos::View ConViewTypeA; + typedef Kokkos::View ConViewTypeB; + typedef Kokkos::View ConViewTypeC; + + ConViewTypeA A("A",m,n,pa+1); + ConViewTypeB b("B",n,pa+1); + ConViewTypeC c("c",m,pa+1); + + AlignedFadType a(pa, 1.0); + for (size_t k=0; k(1.2*nkernels*block_size); + const size_t superblock_size = std::max(nkernels / 100, 1) * block_size; + execution_space space; + Sacado::createGlobalMemoryPool(space, mem_pool_size, + block_size, + block_size, + superblock_size + ); +#endif + + // Execute the kernel once to warm up + run_mat_vec_hierarchical( A, b, c ); + execution_space().fence(); + + wall_clock.reset(); + for (size_t l=0; l +Perf +do_time_val(const size_t m, const size_t n, const size_t nloop, + const bool check) +{ + typedef Kokkos::View ViewTypeA; + typedef Kokkos::View ViewTypeB; + typedef Kokkos::View ViewTypeC; + typedef typename ViewTypeA::execution_space execution_space; + + ViewTypeA A("A",m,n); + ViewTypeB b("B",n); + ViewTypeC c("c",m); + + Kokkos::deep_copy(A, 1.0); + Kokkos::deep_copy(b, 1.0); + + Kokkos::Impl::Timer wall_clock; + Perf perf; + + // Execute the kernel once to warm up + run_mat_vec( A, b, c ); + execution_space().fence(); + + wall_clock.reset(); + for (size_t l=0; l +void +do_times(const size_t m, + const size_t n, + const size_t p, + const size_t ph, + const size_t nloop, + const bool value, + const bool sfad, + const bool slfad, + const bool dfad, + const bool hierarchical, + const bool check) +{ + Perf perf_value; + perf_value.time = 1.0; + + // Run value + if (value) { + Perf perf = do_time_val(m,n,nloop,check); + perf_value = perf; + print_perf(perf, perf_value, p, "Value "); + } + + // Run SFad + if (sfad && p == SFadSize) { + Perf perf = + do_time_fad, ViewArgs...>(m,n,p,nloop,check); + print_perf(perf, perf_value, p, "SFad "); + } + + // Run SLFad + if (slfad && p <= SLFadSize) { + Perf perf = + do_time_fad, ViewArgs...>(m,n,p,nloop,check); + print_perf(perf, perf_value, p, "SLFad "); + } + + // Run DFad + if (dfad) { + Perf perf = + do_time_fad, ViewArgs...>(m,n,p,nloop,check); + print_perf(perf, perf_value, p, "DFad "); + } + + // Run hierarchical + if (hierarchical) { + if (sfad && ph == HierSFadSize) { + Perf perf = + do_time_fad_hierarchical, ViewArgs...>(m,n,ph,nloop,check); + print_perf(perf, perf_value, ph, "Hier SFad "); + } + if (slfad && ph <= HierSLFadSize) { + Perf perf = + do_time_fad_hierarchical, ViewArgs...>(m,n,ph,nloop,check); + print_perf(perf, perf_value, ph, "Hier SLFad"); + } + if (dfad) { + Perf perf = + do_time_fad_hierarchical, ViewArgs...>(m,n,ph,nloop,check); + print_perf(perf, perf_value, ph, "Hier DFad "); + } + } + +} + +enum LayoutType { + LAYOUT_LEFT=0, + LAYOUT_RIGHT, + LAYOUT_DEFAULT +}; +const int num_layout_types = 3; +const LayoutType layout_values[] = { + LAYOUT_LEFT, LAYOUT_RIGHT, LAYOUT_DEFAULT }; +const char *layout_names[] = { "left", "right", "default" }; + +template +void +do_times_layout(const size_t m, + const size_t n, + const size_t p, + const size_t ph, + const size_t nloop, + const bool value, + const bool sfad, + const bool slfad, + const bool dfad, + const bool hierarchical, + const bool check, + const LayoutType& layout, + const std::string& device) +{ + int prec = 2; + std::cout.setf(std::ios::scientific); + std::cout.precision(prec); + std::cout << std::endl + << device + << " performance for layout " + << layout_names[layout] + << " m = " << m << " n = " << n << " p = " << p << " ph = " << ph + << std::endl << std::endl; + std::cout << "Computation \t Time \t Throughput \t Ratio" << std::endl; + + if (layout == LAYOUT_LEFT) + do_times( + m,n,p,ph,nloop,value,sfad,slfad,dfad,hierarchical,check); + else if (layout == LAYOUT_RIGHT) + do_times( + m,n,p,ph,nloop,value,sfad,slfad,dfad,hierarchical,check); + else + do_times + (m,n,p,ph,nloop,value,sfad,slfad,dfad,hierarchical,check); +} + +// Connect executable to vtune for profiling +void connect_vtune() { + std::stringstream cmd; + pid_t my_os_pid=getpid(); + const std::string vtune_loc = + "amplxe-cl"; + const std::string output_dir = "./vtune"; + cmd << vtune_loc + << " -collect hotspots -result-dir " << output_dir + << " -target-pid " << my_os_pid << " &"; + std::cout << cmd.str() << std::endl; + system(cmd.str().c_str()); + system("sleep 10"); +} + +//const int SFadSize = 8; +const int SFadSize = 32; +const int SLFadSize = SFadSize; +//const int HierSFadSize = 50; +const int HierSFadSize = 32; +const int HierSLFadSize = HierSFadSize; + +int main(int argc, char* argv[]) { + bool success = true; + try { + + // Set up command line options + Teuchos::CommandLineProcessor clp(false); + clp.setDocString("This program tests the speed of various forward mode AD implementations for simple Kokkos kernel"); + int m = 100000; + clp.setOption("m", &m, "Number of matrix rows"); + int n = 100; + clp.setOption("n", &n, "Number of matrix columns"); + int p = SFadSize; + clp.setOption("p", &p, "Number of derivative components"); + int ph = HierSFadSize; + clp.setOption("ph", &ph, "Number of derivative components for hierarchical"); + int nloop = 10; + clp.setOption("nloop", &nloop, "Number of loops"); +#ifdef KOKKOS_ENABLE_SERIAL + bool serial = 0; + clp.setOption("serial", "no-serial", &serial, "Whether to run Serial"); +#endif +#ifdef KOKKOS_ENABLE_OPENMP + int openmp = 0; + clp.setOption("openmp", &openmp, "Number of OpenMP threads"); +#endif +#ifdef KOKKOS_ENABLE_THREADS + int threads = 0; + clp.setOption("threads", &threads, "Number of pThreads threads"); +#endif +#ifdef KOKKOS_ENABLE_CUDA + bool cuda = 0; + clp.setOption("cuda", "no-cuda", &cuda, "Whether to run CUDA"); +#endif + int numa = 0; + clp.setOption("numa", &numa, + "Number of NUMA domains to use (set to 0 to use all NUMAs"); + int cores_per_numa = 0; + clp.setOption("cores-per-numa", &cores_per_numa, + "Number of CPU cores per NUMA to use (set to 0 to use all cores)"); + bool print_config = false; + clp.setOption("print-config", "no-print-config", &print_config, + "Whether to print Kokkos device configuration"); + LayoutType layout = LAYOUT_DEFAULT; + clp.setOption("layout", &layout, num_layout_types, layout_values, + layout_names, "View layout"); + bool vtune = false; + clp.setOption("vtune", "no-vtune", &vtune, "Profile with vtune"); + bool value = true; + clp.setOption("value", "no-value", &value, "Run value calculation"); + bool sfad = true; + clp.setOption("sfad", "no-sfad", &sfad, "Run SFad derivative calculation"); + bool slfad = true; + clp.setOption("slfad", "no-slfad", &slfad, "Run SLFad derivative calculation"); + bool dfad = true; + clp.setOption("dfad", "no-dfad", &dfad, "Run DFad derivative calculation"); + bool hierarchical = true; + clp.setOption("hierarchical", "no-hierarchical", &hierarchical, "Run hierarchical Fad derivative calculation"); + bool check = false; + clp.setOption("check", "no-check", &check, "Check calculations are correct"); + + // Parse options + switch (clp.parse(argc, argv)) { + case Teuchos::CommandLineProcessor::PARSE_HELP_PRINTED: + return 0; + case Teuchos::CommandLineProcessor::PARSE_ERROR: + case Teuchos::CommandLineProcessor::PARSE_UNRECOGNIZED_OPTION: + return 1; + case Teuchos::CommandLineProcessor::PARSE_SUCCESSFUL: + break; + } + + if (vtune) + connect_vtune(); + + Kokkos::InitArguments init_args; + init_args.num_threads = cores_per_numa; + init_args.num_numa = numa; + + Kokkos::initialize(init_args); + + if (print_config) + Kokkos::print_configuration(std::cout, true); + +#ifdef KOKKOS_ENABLE_SERIAL + if (serial) { + do_times_layout( + m,n,p,ph,nloop,value,sfad,slfad,dfad,hierarchical,check,layout,"Serial"); + } +#endif + +#ifdef KOKKOS_ENABLE_OPENMP + if (openmp) { + do_times_layout( + m,n,p,ph,nloop,value,sfad,slfad,dfad,hierarchical,check,layout,"OpenMP"); + } +#endif + +#ifdef KOKKOS_ENABLE_THREADS + if (threads) { + do_times_layout( + m,n,p,ph,nloop,value,sfad,slfad,dfad,hierarchical,check,layout,"Threads"); + } +#endif + +#ifdef KOKKOS_ENABLE_CUDA + if (cuda) { + do_times_layout( + m,n,p,ph,nloop,value,sfad,slfad,dfad,hierarchical,check,layout,"Cuda"); + } +#endif + + Kokkos::finalize(); + + } + TEUCHOS_STANDARD_CATCH_STATEMENTS(true, std::cerr, success); + + return !success; +} diff --git a/packages/sacado/test/performance/mat_vec/mat_vec.cpp b/packages/sacado/test/performance/mat_vec/mat_vec.cpp new file mode 100644 index 000000000000..57153a0a9765 --- /dev/null +++ b/packages/sacado/test/performance/mat_vec/mat_vec.cpp @@ -0,0 +1,551 @@ +// @HEADER +// *********************************************************************** +// +// Sacado Package +// Copyright (2006) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// This library is free software; you can redistribute it and/or modify +// it under the terms of the GNU Lesser General Public License as +// published by the Free Software Foundation; either version 2.1 of the +// License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 +// USA +// Questions? Contact David M. Gay (dmgay@sandia.gov) or Eric T. Phipps +// (etphipp@sandia.gov). +// +// *********************************************************************** +// @HEADER + +#include "Sacado.hpp" + +#include "mat_vec.hpp" + +#include "impl/Kokkos_Timer.hpp" + +template +void run_mat_vec(const ViewTypeA& A, const ViewTypeB& b, const ViewTypeC& c) { + typedef typename ViewTypeC::value_type scalar_type; + typedef typename ViewTypeC::execution_space execution_space; + + const int m = A.extent(0); + const int n = A.extent(1); + Kokkos::parallel_for( + Kokkos::RangePolicy( 0,m ), + KOKKOS_LAMBDA (const int i) { + scalar_type t = 0.0; + for (int j=0; j +void +run_mat_vec_scratch(const ViewTypeA& A, const ViewTypeB& b, const ViewTypeC& c) +{ + typedef typename ViewTypeC::value_type scalar_type; + typedef typename ViewTypeC::execution_space execution_space; + typedef Kokkos::TeamPolicy Policy; + typedef typename Policy::member_type team_member; + typedef Kokkos::View TmpScratchSpace; + + const int m = A.extent(0); + const int n = A.extent(1); + const int p = dimension_scalar(A); + +#ifdef KOKKOS_ENABLE_CUDA + const bool is_cuda = std::is_same::value; +#else + const bool is_cuda = false; +#endif + const int TeamSize = is_cuda ? 128 : 1; + const int N = (m+TeamSize-1)/TeamSize; + Policy policy(N, TeamSize, 1); + const size_t bytes = TmpScratchSpace::shmem_size(TeamSize,p); + Kokkos::parallel_for( + policy.set_scratch_size(0, Kokkos::PerTeam(bytes)), + KOKKOS_LAMBDA (const team_member& team) { + const int team_rank = team.team_rank(); + const int team_size = team.team_size(); + TmpScratchSpace t(team.team_scratch(0), team_size, p); + const int i = team.league_rank()*team_size + team_rank; + if (i < m) { + t(team_rank) = 0.0; + for (int j=0; j +void +run_mat_vec_deriv(const ViewTypeA& A, const ViewTypeB& b, const ViewTypeC& c) +{ + typedef typename ViewTypeC::execution_space execution_space; + + const int m = A.extent(0); + const int n = A.extent(1); + const int p = A.extent(2)-1; + Kokkos::parallel_for( + Kokkos::RangePolicy( 0,m ), + KOKKOS_LAMBDA (const int i) { + c(i,p) = 0.0; + for (int k=0; k +void +run_mat_vec_deriv_sl(const ViewTypeA& A, const ViewTypeB& b, const ViewTypeC& c) +{ + typedef typename ViewTypeC::value_type scalar_type; + typedef typename ViewTypeC::execution_space execution_space; + + const int m = A.extent(0); + const int n = A.extent(1); + const int p = A.extent(2)-1; + Kokkos::parallel_for( + Kokkos::RangePolicy( 0,m ), + KOKKOS_LAMBDA (const int i) { + scalar_type cv = 0.0; + scalar_type t[MaxP]; + for (int k=0; k +void +run_mat_vec_deriv_s(const ViewTypeA& A, const ViewTypeB& b, const ViewTypeC& c) +{ + typedef typename ViewTypeC::value_type scalar_type; + typedef typename ViewTypeC::execution_space execution_space; + + const int m = A.extent(0); + const int n = A.extent(1); + Kokkos::parallel_for( + Kokkos::RangePolicy( 0,m ), + KOKKOS_LAMBDA (const int i) { + scalar_type cv = 0.0; + scalar_type t[p]; + for (int k=0; k +void +check_val(const ViewTypeA& A, const ViewTypeB& b, const ViewTypeC& c) +{ + const double tol = 1.0e-14; + typedef typename ViewTypeC::value_type value_type; + typename ViewTypeC::HostMirror h_c = Kokkos::create_mirror_view(c); + Kokkos::deep_copy(h_c, c); + const size_t m = A.extent(0); + const size_t n = A.extent(1); + for (size_t i=0; i tol) { + std::cout << "Comparison failed! " << i << " : " << h_c(i) << " , " << t + << std::endl; + } + } +} + +template +void +check_deriv(const ViewTypeA& A, const ViewTypeB& b, const ViewTypeC& c) +{ + const double tol = 1.0e-14; + typedef typename ViewTypeC::value_type value_type; + typename ViewTypeC::HostMirror h_c = Kokkos::create_mirror_view(c); + Kokkos::deep_copy(h_c, c); + const size_t m = A.extent(0); + const size_t n = A.extent(1); + const size_t p = A.extent(2); + for (size_t i=0; i tol) { + std::cout << "Comparison failed! " << i << "," << j << " : " + << h_c(i,j) << " , " << t << std::endl; + } + } + } +} + +template +Perf +do_time_val(const size_t m, const size_t n, const size_t nloop, + const bool check) +{ + typedef Kokkos::View ViewTypeA; + typedef Kokkos::View ViewTypeB; + typedef Kokkos::View ViewTypeC; + typedef typename ViewTypeA::execution_space execution_space; + + ViewTypeA A("A",m,n); + ViewTypeB b("B",n); + ViewTypeC c("c",m); + + Kokkos::deep_copy(A, 1.0); + Kokkos::deep_copy(b, 1.0); + + Kokkos::Impl::Timer wall_clock; + Perf perf; + + // Execute the kernel once to warm up + run_mat_vec( A, b, c ); + execution_space().fence(); + + wall_clock.reset(); + for (size_t l=0; l +Perf +do_time_fad(const size_t m, const size_t n, const size_t p, const size_t nloop, + const bool check) +{ + typedef Kokkos::View ViewTypeA; + typedef Kokkos::View ViewTypeB; + typedef Kokkos::View ViewTypeC; + typedef typename ViewTypeA::execution_space execution_space; + + // Set amount of memory available for dynamic memory allocation on GPU +#ifdef KOKKOS_ENABLE_CUDA + if (std::is_same::value && + std::is_same >::value) { + const size_t concurrency = execution_space::concurrency(); + const size_t mem = std::min(m,concurrency) * p * sizeof(double); + //std::cout << "mem = " << mem / (1024*1024) << " MB" << std::endl; + cudaDeviceSetLimit(cudaLimitMallocHeapSize, mem); + } +#endif + + ViewTypeA A("A",m,n,p+1); + ViewTypeB b("B",n,p+1); + ViewTypeC c("c",m,p+1); + + // FadType a(p, 1.0); + // for (size_t k=0; k +Perf +do_time_scratch(const size_t m, const size_t n, const size_t p, const size_t nloop, + const bool check) +{ + typedef Kokkos::View ViewTypeA; + typedef Kokkos::View ViewTypeB; + typedef Kokkos::View ViewTypeC; + typedef typename ViewTypeA::execution_space execution_space; + + ViewTypeA A("A",m,n,p+1); + ViewTypeB b("B",n,p+1); + ViewTypeC c("c",m,p+1); + + // FadType a(p, 1.0); + // for (size_t k=0; k +Perf +do_time_analytic(const size_t m, const size_t n, const size_t p, + const size_t nloop, const bool check) +{ + typedef Kokkos::View ViewTypeA; + typedef Kokkos::View ViewTypeB; + typedef Kokkos::View ViewTypeC; + typedef typename ViewTypeA::execution_space execution_space; + + ViewTypeA A("A",m,n,p+1); + ViewTypeB b("B",n,p+1); + ViewTypeC c("c",m,p+1); + + Kokkos::deep_copy(A, 1.0); + Kokkos::deep_copy(b, 1.0); + + Kokkos::Impl::Timer wall_clock; + Perf perf; + + // Execute the kernel once to warm up + run_mat_vec_deriv( A, b, c ); + execution_space().fence(); + + Teuchos::Time timer("mult", false); + timer.start(true); + for (size_t l=0; l +Perf +do_time_analytic_sl(const size_t m, const size_t n, const size_t p, + const size_t nloop, const bool check) +{ + typedef Kokkos::View ViewTypeA; + typedef Kokkos::View ViewTypeB; + typedef Kokkos::View ViewTypeC; + typedef typename ViewTypeA::execution_space execution_space; + + ViewTypeA A("A",m,n,p+1); + ViewTypeB b("B",n,p+1); + ViewTypeC c("c",m,p+1); + + Kokkos::deep_copy(A, 1.0); + Kokkos::deep_copy(b, 1.0); + + Kokkos::Impl::Timer wall_clock; + Perf perf; + + // Execute the kernel once to warm up + run_mat_vec_deriv_sl( A, b, c ); + execution_space().fence(); + + Teuchos::Time timer("mult", false); + timer.start(true); + for (size_t l=0; l( A, b, c ); + } + execution_space().fence(); + timer.stop(); + + perf.time = wall_clock.seconds() / nloop; + perf.flops = m*n*(2+4*p); + perf.throughput = perf.flops / perf.time / 1.0e9; + + if (check) + check_deriv(A,b,c); + + return perf; +} + +template +Perf +do_time_analytic_s(const size_t m, const size_t n, + const size_t nloop, const bool check) +{ + typedef Kokkos::View ViewTypeA; + typedef Kokkos::View ViewTypeB; + typedef Kokkos::View ViewTypeC; + typedef typename ViewTypeA::execution_space execution_space; + + ViewTypeA A("A",m,n); + ViewTypeB b("B",n,p+1); + ViewTypeC c("c",m,p+1); + + Kokkos::deep_copy(A, 1.0); + Kokkos::deep_copy(b, 1.0); + + Kokkos::Impl::Timer wall_clock; + Perf perf; + + // Execute the kernel once to warm up + run_mat_vec_deriv_s

( A, b, c ); + execution_space().fence(); + + Teuchos::Time timer("mult", false); + timer.start(true); + for (size_t l=0; l( A, b, c ); + } + execution_space().fence(); + timer.stop(); + + perf.time = wall_clock.seconds() / nloop; + perf.flops = m*n*(2+4*p); + perf.throughput = perf.flops / perf.time / 1.0e9; + + if (check) + check_deriv(A,b,c); + + return perf; +} + +typedef Sacado::Fad::SFad SFad_type; +typedef Sacado::Fad::SLFad SLFad_type; +typedef Sacado::Fad::DFad DFad_type; + +#define INST_FUNC_VAL_DEV(DEV) \ + template Perf do_time_val< Kokkos::LayoutLeft, DEV > ( const size_t m, const size_t n, const size_t nloop, const bool check ); \ + template Perf do_time_val< Kokkos::LayoutRight, DEV > ( const size_t m, const size_t n, const size_t nloop, const bool check ); \ + template Perf do_time_val< DEV > ( const size_t m, const size_t n, const size_t nloop, const bool check ); \ + template Perf do_time_analytic< Kokkos::LayoutLeft, DEV > ( const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check); \ + template Perf do_time_analytic< Kokkos::LayoutRight, DEV > ( const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check); \ + template Perf do_time_analytic< DEV > ( const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check); \ + template Perf do_time_analytic_sl< SLFadSize, Kokkos::LayoutLeft, DEV > ( const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check); \ + template Perf do_time_analytic_sl< SLFadSize, Kokkos::LayoutRight, DEV > ( const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check); \ + template Perf do_time_analytic_sl< SLFadSize, DEV > ( const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check); \ + template Perf do_time_analytic_s< SFadSize, Kokkos::LayoutLeft, DEV > ( const size_t m, const size_t n, const size_t nloop, const bool check); \ + template Perf do_time_analytic_s< SFadSize, Kokkos::LayoutRight, DEV > ( const size_t m, const size_t n, const size_t nloop, const bool check); \ + template Perf do_time_analytic_s< SFadSize, DEV > ( const size_t m, const size_t n, const size_t nloop, const bool check); + +#define INST_FUNC_FAD_DEV(FAD,DEV) \ + template Perf do_time_fad< FAD, Kokkos::LayoutLeft, DEV > ( const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check ); \ + template Perf do_time_fad< FAD, Kokkos::LayoutRight, DEV > ( const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check ); \ + template Perf do_time_fad< FAD, DEV > ( const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check ); \ + template Perf do_time_scratch< FAD, Kokkos::LayoutLeft, DEV > ( const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check ); \ + template Perf do_time_scratch< FAD, Kokkos::LayoutRight, DEV > ( const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check ); \ + template Perf do_time_scratch< FAD, DEV > ( const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check ); + +#define INST_FUNC_DEV(DEV) \ + INST_FUNC_VAL_DEV( DEV ) \ + INST_FUNC_FAD_DEV( SFad_type, DEV ) \ + INST_FUNC_FAD_DEV( SLFad_type, DEV ) \ + INST_FUNC_FAD_DEV( DFad_type, DEV ) + +#ifdef KOKKOS_ENABLE_SERIAL +INST_FUNC_DEV(Kokkos::Serial) +#endif + +#ifdef KOKKOS_ENABLE_OPENMP +INST_FUNC_DEV(Kokkos::OpenMP) +#endif + +#ifdef KOKKOS_ENABLE_THREADS +INST_FUNC_DEV(Kokkos::Threads) +#endif + +#ifdef KOKKOS_ENABLE_CUDA +INST_FUNC_DEV(Kokkos::Cuda) +#endif diff --git a/packages/sacado/test/performance/mat_vec/mat_vec.hpp b/packages/sacado/test/performance/mat_vec/mat_vec.hpp new file mode 100644 index 000000000000..d5d86d2ea21c --- /dev/null +++ b/packages/sacado/test/performance/mat_vec/mat_vec.hpp @@ -0,0 +1,62 @@ +// @HEADER +// *********************************************************************** +// +// Sacado Package +// Copyright (2006) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// This library is free software; you can redistribute it and/or modify +// it under the terms of the GNU Lesser General Public License as +// published by the Free Software Foundation; either version 2.1 of the +// License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 +// USA +// Questions? Contact David M. Gay (dmgay@sandia.gov) or Eric T. Phipps +// (etphipp@sandia.gov). +// +// *********************************************************************** +// @HEADER + +#pragma once + +#include "common.hpp" + +template +Perf +do_time_val(const size_t m, const size_t n, const size_t nloop, + const bool check); + +template +Perf +do_time_fad(const size_t m, const size_t n, const size_t p, const size_t nloop, + const bool check); + +template +Perf +do_time_scratch(const size_t m, const size_t n, const size_t p, + const size_t nloop, const bool check); + +template +Perf +do_time_analytic(const size_t m, const size_t n, const size_t p, + const size_t nloop, const bool check); + +template +Perf +do_time_analytic_sl(const size_t m, const size_t n, const size_t p, + const size_t nloop, const bool check); + +template +Perf +do_time_analytic_s(const size_t m, const size_t n, + const size_t nloop, const bool check); diff --git a/packages/sacado/test/performance/mat_vec/mat_vec_hierarchical.cpp b/packages/sacado/test/performance/mat_vec/mat_vec_hierarchical.cpp new file mode 100644 index 000000000000..9a6f0783096f --- /dev/null +++ b/packages/sacado/test/performance/mat_vec/mat_vec_hierarchical.cpp @@ -0,0 +1,190 @@ +// @HEADER +// *********************************************************************** +// +// Sacado Package +// Copyright (2006) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// This library is free software; you can redistribute it and/or modify +// it under the terms of the GNU Lesser General Public License as +// published by the Free Software Foundation; either version 2.1 of the +// License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 +// USA +// Questions? Contact David M. Gay (dmgay@sandia.gov) or Eric T. Phipps +// (etphipp@sandia.gov). +// +// *********************************************************************** +// @HEADER + +#define SACADO_VIEW_CUDA_HIERARCHICAL 1 +#define SACADO_ALIGN_SFAD 1 + +#include "Sacado.hpp" + +#include "mat_vec_hierarchical.hpp" + +#include "impl/Kokkos_Timer.hpp" + +template +void run_mat_vec_hierarchical(const ViewTypeA& A, const ViewTypeB& b, + const ViewTypeC& c) { + typedef typename Kokkos::ThreadLocalScalarType::type scalar_type; + typedef typename ViewTypeC::execution_space execution_space; + +#if defined (KOKKOS_ENABLE_CUDA) + const bool is_cuda = std::is_same::value; +#else + const bool is_cuda = false; +#endif + const unsigned vector_size = is_cuda ? 32 : 1; + const unsigned team_size = is_cuda ? 128 / vector_size : 1; + + const int m = A.extent(0); + const int n = A.extent(1); + const int range = (m+team_size-1)/team_size; + + typedef Kokkos::TeamPolicy Policy; + Kokkos::parallel_for( + Policy( range,team_size,vector_size ), + KOKKOS_LAMBDA (const typename Policy::member_type& team) { + const int i = team.league_rank()*team.team_size() + team.team_rank(); + if (i >= m) + return; + + scalar_type t = 0.0; + for (int j=0; j +void +check_deriv_hierarchical(const ViewTypeA& A, const ViewTypeB& b, const ViewTypeC& c) +{ + const double tol = 1.0e-14; + typedef typename ViewTypeC::value_type value_type; + typename ViewTypeC::HostMirror h_c = Kokkos::create_mirror_view(c); + Kokkos::deep_copy(h_c, c); + const size_t m = A.extent(0); + const size_t n = A.extent(1); + const size_t p = Kokkos::dimension_scalar(A); + for (size_t i=0; i tol) { + std::cout << "Comparison failed! " << i << "," << j << " : " + << h_c(i).fastAccessDx(j) << " , " << t << std::endl; + } + } + } +} + +template +Perf +do_time_fad_hierarchical(const size_t m, const size_t n, const size_t p, + const size_t nloop, const bool check) +{ + typedef Kokkos::View ViewTypeA; + typedef Kokkos::View ViewTypeB; + typedef Kokkos::View ViewTypeC; + typedef typename ViewTypeA::execution_space execution_space; + +#if defined (KOKKOS_ENABLE_CUDA) + const bool is_cuda = std::is_same::value; +#else + const bool is_cuda = false; +#endif + + const int FadStride = is_cuda ? 32 : 1; +#if defined(SACADO_ALIGN_SFAD) + const int N = Sacado::StaticSize::value; + const int Nalign = ((N+FadStride-1)/FadStride)*FadStride; + const size_t pa = N > 0 ? ((p+FadStride-1)/FadStride)*FadStride : p; + typedef typename FadType::template apply_N::type AlignedFadType; +#else + typedef FadType AlignedFadType; + const size_t pa = p; +#endif + + typedef Kokkos::LayoutContiguous ConLayoutA; + typedef Kokkos::LayoutContiguous ConLayoutB; + typedef Kokkos::LayoutContiguous ConLayoutC; + + typedef Kokkos::View ConViewTypeA; + typedef Kokkos::View ConViewTypeB; + typedef Kokkos::View ConViewTypeC; + + ConViewTypeA A("A",m,n,pa+1); + ConViewTypeB b("B",n,pa+1); + ConViewTypeC c("c",m,pa+1); + + // AlignedFadType a(pa, 1.0); + // for (size_t k=0; k SFad_type; +typedef Sacado::Fad::SLFad SLFad_type; + +#define INST_FUNC_FAD_DEV(FAD,DEV) \ + template Perf do_time_fad_hierarchical< FAD, Kokkos::LayoutLeft, DEV > ( const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check ); \ + template Perf do_time_fad_hierarchical< FAD, Kokkos::LayoutRight, DEV > ( const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check ); \ + template Perf do_time_fad_hierarchical< FAD, DEV > ( const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check ); + +#define INST_FUNC_DEV(DEV) \ + INST_FUNC_FAD_DEV( SFad_type, DEV ) \ + INST_FUNC_FAD_DEV( SLFad_type, DEV ) + +#ifdef KOKKOS_ENABLE_SERIAL +INST_FUNC_DEV(Kokkos::Serial) +#endif + +#ifdef KOKKOS_ENABLE_OPENMP +INST_FUNC_DEV(Kokkos::OpenMP) +#endif + +#ifdef KOKKOS_ENABLE_THREADS +INST_FUNC_DEV(Kokkos::Threads) +#endif + +#ifdef KOKKOS_ENABLE_CUDA +INST_FUNC_DEV(Kokkos::Cuda) +#endif diff --git a/packages/sacado/test/performance/mat_vec/mat_vec_hierarchical.hpp b/packages/sacado/test/performance/mat_vec/mat_vec_hierarchical.hpp new file mode 100644 index 000000000000..03d89ab62b68 --- /dev/null +++ b/packages/sacado/test/performance/mat_vec/mat_vec_hierarchical.hpp @@ -0,0 +1,37 @@ +// @HEADER +// *********************************************************************** +// +// Sacado Package +// Copyright (2006) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// This library is free software; you can redistribute it and/or modify +// it under the terms of the GNU Lesser General Public License as +// published by the Free Software Foundation; either version 2.1 of the +// License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 +// USA +// Questions? Contact David M. Gay (dmgay@sandia.gov) or Eric T. Phipps +// (etphipp@sandia.gov). +// +// *********************************************************************** +// @HEADER + +#pragma once + +#include "common.hpp" + +template +Perf +do_time_fad_hierarchical(const size_t m, const size_t n, const size_t p, + const size_t nloop, const bool check); diff --git a/packages/sacado/test/performance/mat_vec/mat_vec_hierarchical_dfad.cpp b/packages/sacado/test/performance/mat_vec/mat_vec_hierarchical_dfad.cpp new file mode 100644 index 000000000000..a47fd693f614 --- /dev/null +++ b/packages/sacado/test/performance/mat_vec/mat_vec_hierarchical_dfad.cpp @@ -0,0 +1,285 @@ +// @HEADER +// *********************************************************************** +// +// Sacado Package +// Copyright (2006) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// This library is free software; you can redistribute it and/or modify +// it under the terms of the GNU Lesser General Public License as +// published by the Free Software Foundation; either version 2.1 of the +// License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 +// USA +// Questions? Contact David M. Gay (dmgay@sandia.gov) or Eric T. Phipps +// (etphipp@sandia.gov). +// +// *********************************************************************** +// @HEADER + +#define SACADO_VIEW_CUDA_HIERARCHICAL_DFAD 1 +#define SACADO_KOKKOS_USE_MEMORY_POOL 1 + +#include "Sacado.hpp" + +#include "mat_vec_hierarchical_dfad.hpp" + +#include "impl/Kokkos_Timer.hpp" + +template +void run_mat_vec_hierarchical_dfad(const ViewTypeA& A, const ViewTypeB& b, + const ViewTypeC& c) { + typedef typename ViewTypeC::value_type scalar_type; + typedef typename ViewTypeC::execution_space execution_space; + +#if defined (KOKKOS_ENABLE_CUDA) + const bool is_cuda = std::is_same::value; +#else + const bool is_cuda = false; +#endif + const unsigned vector_size = is_cuda ? 32 : 1; + const unsigned team_size = is_cuda ? 128 / vector_size : 1; + + const int m = A.extent(0); + const int n = A.extent(1); + const int range = (m+team_size-1)/team_size; + + typedef Kokkos::TeamPolicy Policy; + Kokkos::parallel_for( + Policy( range,team_size,vector_size ), + KOKKOS_LAMBDA (const typename Policy::member_type& team) { + const int i = team.league_rank()*team.team_size() + team.team_rank(); + if (i >= m) + return; + + scalar_type t = 0.0; + for (int j=0; j +void run_mat_vec_hierarchical_dfad_scratch( + const ViewTypeA& A, const ViewTypeB& b, const ViewTypeC& c) { + typedef typename ViewTypeC::value_type scalar_type; + typedef typename ViewTypeC::execution_space execution_space; + typedef Kokkos::TeamPolicy Policy; + typedef typename Policy::member_type team_member; + typedef Kokkos::View TmpScratchSpace; + +#if defined (KOKKOS_ENABLE_CUDA) + const bool is_cuda = std::is_same::value; +#else + const bool is_cuda = false; +#endif + const unsigned VectorSize = is_cuda ? 32 : 1; + const unsigned TeamSize = is_cuda ? 128 / VectorSize : 1; + + const int m = A.extent(0); + const int n = A.extent(1); + const int p = dimension_scalar(A); + const int N = (m+TeamSize-1)/TeamSize; + + Policy policy(N, TeamSize, VectorSize); + const size_t bytes = TmpScratchSpace::shmem_size(TeamSize,p); + Kokkos::parallel_for( + policy.set_scratch_size(0, Kokkos::PerTeam(bytes)), + KOKKOS_LAMBDA (const team_member& team) { + const int team_rank = team.team_rank(); + const int team_size = team.team_size(); + TmpScratchSpace t(team.team_scratch(0), team_size, p); + const int i = team.league_rank()*team_size + team_rank; + if (i < m) { + t(team_rank) = 0.0; + for (int j=0; j +void +check_deriv_hierarchical_dfad(const ViewTypeA& A, const ViewTypeB& b, const ViewTypeC& c) +{ + const double tol = 1.0e-14; + typedef typename ViewTypeC::value_type value_type; + typename ViewTypeC::HostMirror h_c = Kokkos::create_mirror_view(c); + Kokkos::deep_copy(h_c, c); + const size_t m = A.extent(0); + const size_t n = A.extent(1); + const size_t p = Kokkos::dimension_scalar(A); + for (size_t i=0; i tol) { + std::cout << "Comparison failed! " << i << "," << j << " : " + << h_c(i).fastAccessDx(j) << " , " << t << std::endl; + } + } + } +} + +template +Perf +do_time_fad_hierarchical_dfad(const size_t m, const size_t n, const size_t p, + const size_t nloop, const bool check) +{ + typedef Kokkos::View ViewTypeA; + typedef Kokkos::View ViewTypeB; + typedef Kokkos::View ViewTypeC; + typedef typename ViewTypeA::execution_space execution_space; + typedef Kokkos::LayoutContiguous ConLayoutA; + typedef Kokkos::LayoutContiguous ConLayoutB; + typedef Kokkos::LayoutContiguous ConLayoutC; + typedef Kokkos::View ConViewTypeA; + typedef Kokkos::View ConViewTypeB; + typedef Kokkos::View ConViewTypeC; + + ConViewTypeA A("A",m,n,p+1); + ConViewTypeB b("B",n,p+1); + ConViewTypeC c("c",m,p+1); + + // FadType a(p, 1.0); + // for (size_t k=0; k::value; +#else + const bool is_cuda = false; +#endif + const size_t concurrency = execution_space::concurrency(); + const size_t warp_dim = is_cuda ? 32 : 1; + const size_t block_size = p*sizeof(double); + const size_t nkernels = concurrency / warp_dim; + const size_t mem_pool_size = + static_cast(1.2*nkernels*block_size); + const size_t superblock_size = std::max(nkernels / 100, 1) * block_size; + execution_space space; + Sacado::createGlobalMemoryPool(space, mem_pool_size, + block_size, + block_size, + superblock_size + ); + + // Execute the kernel once to warm up + run_mat_vec_hierarchical_dfad( A, b, c ); + execution_space().fence(); + + wall_clock.reset(); + for (size_t l=0; l +Perf +do_time_fad_hierarchical_dfad_scratch( + const size_t m, const size_t n, const size_t p, const size_t nloop, + const bool check) +{ + typedef Kokkos::View ViewTypeA; + typedef Kokkos::View ViewTypeB; + typedef Kokkos::View ViewTypeC; + typedef typename ViewTypeA::execution_space execution_space; + typedef Kokkos::LayoutContiguous ConLayoutA; + typedef Kokkos::LayoutContiguous ConLayoutB; + typedef Kokkos::LayoutContiguous ConLayoutC; + typedef Kokkos::View ConViewTypeA; + typedef Kokkos::View ConViewTypeB; + typedef Kokkos::View ConViewTypeC; + + ConViewTypeA A("A",m,n,p+1); + ConViewTypeB b("B",n,p+1); + ConViewTypeC c("c",m,p+1); + + // FadType a(p, 1.0); + // for (size_t k=0; k DFad_type; + +#define INST_FUNC_FAD_DEV(FAD,DEV) \ + template Perf do_time_fad_hierarchical_dfad< FAD, Kokkos::LayoutLeft, DEV > ( const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check ); \ + template Perf do_time_fad_hierarchical_dfad< FAD, Kokkos::LayoutRight, DEV > ( const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check ); \ + template Perf do_time_fad_hierarchical_dfad< FAD, DEV > ( const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check ); \ + template Perf do_time_fad_hierarchical_dfad_scratch< FAD, Kokkos::LayoutLeft, DEV > ( const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check ); \ + template Perf do_time_fad_hierarchical_dfad_scratch< FAD, Kokkos::LayoutRight, DEV > ( const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check ); \ + template Perf do_time_fad_hierarchical_dfad_scratch< FAD, DEV > ( const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check ); + +#define INST_FUNC_DEV(DEV) \ + INST_FUNC_FAD_DEV( DFad_type, DEV ) + +#ifdef KOKKOS_ENABLE_SERIAL +INST_FUNC_DEV(Kokkos::Serial) +#endif + +#ifdef KOKKOS_ENABLE_OPENMP +INST_FUNC_DEV(Kokkos::OpenMP) +#endif + +#ifdef KOKKOS_ENABLE_THREADS +INST_FUNC_DEV(Kokkos::Threads) +#endif + +#ifdef KOKKOS_ENABLE_CUDA +INST_FUNC_DEV(Kokkos::Cuda) +#endif diff --git a/packages/sacado/test/performance/mat_vec/mat_vec_hierarchical_dfad.hpp b/packages/sacado/test/performance/mat_vec/mat_vec_hierarchical_dfad.hpp new file mode 100644 index 000000000000..f5e6b1c89966 --- /dev/null +++ b/packages/sacado/test/performance/mat_vec/mat_vec_hierarchical_dfad.hpp @@ -0,0 +1,43 @@ +// @HEADER +// *********************************************************************** +// +// Sacado Package +// Copyright (2006) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// This library is free software; you can redistribute it and/or modify +// it under the terms of the GNU Lesser General Public License as +// published by the Free Software Foundation; either version 2.1 of the +// License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 +// USA +// Questions? Contact David M. Gay (dmgay@sandia.gov) or Eric T. Phipps +// (etphipp@sandia.gov). +// +// *********************************************************************** +// @HEADER + +#pragma once + +#include "common.hpp" + +template +Perf +do_time_fad_hierarchical_dfad(const size_t m, const size_t n, const size_t p, + const size_t nloop, const bool check); + +template +Perf +do_time_fad_hierarchical_dfad_scratch( + const size_t m, const size_t n, const size_t p, const size_t nloop, + const bool check);