Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Tacho - cuSolver interface #6695

Merged
merged 10 commits into from
Feb 3, 2020
16 changes: 8 additions & 8 deletions packages/shylu/shylu_node/tacho/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,27 +1,27 @@
TRIBITS_SUBPACKAGE(Tacho)

IF (Kokkos_ENABLE_Cuda)
IF (KOKKOS_ENABLE_CUDA)
# Shylu/Tacho requires CUDA >= 8.0.
IF (DEFINED CUDA_VERSION AND (CUDA_VERSION VERSION_LESS "8.0"))
MESSAGE(FATAL_ERROR "ShyLu/Tacho requires CUDA 8 if CUDA is enabled")
ENDIF()
# If RDC is off, emits a warning message
IF (NOT Kokkos_ENABLE_Cuda_Relocatable_Device_Code)
MESSAGE(WARNING "ShyLu/Tacho requires CUDA relocatable device code to be enabled if CUDA is enabled. Set: Kokkos_ENABLE_Cuda_Relocatable_Device_Code=ON ")
IF (NOT KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE)
MESSAGE(WARNING "ShyLu/Tacho requires CUDA relocatable device code to be enabled if CUDA is enabled. Set: KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE=ON ")
ENDIF()
ENDIF()
IF (Kokkos_ENABLE_Pthread)
IF (NOT Kokkos_ENABLE_OpenMP)
MESSAGE(FATAL_ERROR "ShyLu/Tacho can not be build with Pthreads as the Kokkos Host Backend.")
IF (KOKKOS_ENABLE_Pthread)
IF (NOT KOKKOS_ENABLE_OPENMP)
MESSAGE(FATAL_ERROR "ShyLu/Tacho can not be build with Pthreads as the KOKKOS Host Backend.")
ENDIF()
ENDIF()

# Set cmake variable to control examples and tests
IF (Kokkos_ENABLE_Cuda)
IF (KOKKOS_ENABLE_CUDA)
IF (DEFINED CUDA_VERSION AND (CUDA_VERSION VERSION_LESS "8.0"))
SET(TACHO_HAVE_KOKKOS_TASK OFF)
ELSE()
IF (Kokkos_ENABLE_Cuda_Relocatable_Device_Code)
IF (KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE)
SET(TACHO_HAVE_KOKKOS_TASK ON)
ELSE()
SET(TACHO_HAVE_KOKKOS_TASK OFF)
Expand Down
4 changes: 2 additions & 2 deletions packages/shylu/shylu_node/tacho/cmake/Dependencies.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,6 @@ SET(LIB_OPTIONAL_DEP_PACKAGES TrilinosSS)
SET(TEST_REQUIRED_DEP_PACKAGES Kokkos TrilinosSS Gtest)
SET(TEST_OPTIONAL_DEP_PACKAGES)
SET(LIB_REQUIRED_DEP_TPLS)
SET(LIB_OPTIONAL_DEP_TPLS METIS Scotch Cholmod HWLOC HYPRE MKL LAPACK BLAS Pthread QTHREAD VTune)
SET(LIB_OPTIONAL_DEP_TPLS METIS Scotch Cholmod HWLOC HYPRE MKL LAPACK BLAS Pthread QTHREAD VTune CUSOLVER CUSPARSE CUBLAS CUDA)
SET(TEST_REQUIRED_DEP_TPLS BLAS LAPACK)
SET(TEST_OPTIONAL_DEP_TPLS METIS HWLOC Cholmod MKL LAPACK BLAS Pthread QTHREAD)
SET(TEST_OPTIONAL_DEP_TPLS METIS HWLOC Cholmod MKL LAPACK BLAS Pthread QTHREAD CUSOLVER CUSPARSE CUBLAS CUDA)
13 changes: 12 additions & 1 deletion packages/shylu/shylu_node/tacho/example/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,17 @@ IF (TACHO_HAVE_KOKKOS_TASK)
COMM serial mpi
)

#
# NVIDIA cuSolver
#
IF(Kokkos_ENABLE_Cuda)
TRIBITS_ADD_EXECUTABLE(
Tacho_ExampleCholCuSolver
NOEXEPREFIX
SOURCES Tacho_ExampleCholCuSolver.cpp
COMM serial mpi
)
ENDIF()
#
# Intel MKL Pardiso and PerfTest
#
Expand Down Expand Up @@ -219,7 +230,7 @@ IF (TACHO_HAVE_KOKKOS_TASK)
COMM serial mpi
)
ENDIF()

TRIBITS_COPY_FILES_TO_BINARY_DIR(ShyLUTacho_SimpleSparseTest_File
SOURCE_FILES test.mtx
SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,173 @@
#include "ShyLU_NodeTacho_config.h"

#include <Kokkos_Core.hpp>
#include <impl/Kokkos_Timer.hpp>

#include "Tacho.hpp"
#include "Tacho_CommandLineParser.hpp"

#if defined (KOKKOS_ENABLE_CUDA)
#include "Tacho_CuSolver.hpp"
#endif

using namespace Tacho;

#if defined(KOKKOS_ENABLE_CUDA)
/// Runs the cuSolver example on Kokkos::Cuda: reads a matrix from a
/// MatrixMarket file, reorders it, permutes it on the device, then calls
/// cuSolver analyze/factorize/solve and prints the relative residual.
///
/// Must be called between Kokkos::initialize() and Kokkos::finalize(). All
/// Kokkos views and solver objects are local to this function, so they are
/// released before the caller finalizes Kokkos, including on the early error
/// return.
///
/// \param file    path of the input MatrixMarket file
/// \param nrhs    number of right hand side vectors
/// \param verbose print reordering/timing information when true
/// \return 0 on success, -1 if the input file cannot be opened
static int runCholCuSolverExample(std::string file, int nrhs, bool verbose) {
  const bool detail = false;

  typedef double value_type;

  typedef UseThisDevice<Kokkos::Cuda>::device_type device_type;
  typedef UseThisDevice<Kokkos::DefaultHostExecutionSpace>::device_type host_device_type;

  Tacho::printExecSpaceConfiguration<typename device_type::execution_space>("DeviceSpace", detail);
  Tacho::printExecSpaceConfiguration<typename host_device_type::execution_space>("HostSpace", detail);

  Kokkos::Impl::Timer timer;

  ///
  /// read from crs matrix
  ///
  typedef Tacho::CrsMatrixBase<value_type,host_device_type> CrsMatrixBaseTypeHost;
  typedef Tacho::CrsMatrixBase<value_type,device_type> CrsMatrixBaseType;
  typedef Kokkos::View<value_type**,Kokkos::LayoutLeft,device_type> DenseMultiVectorType;

  /// read a spd matrix of matrix market format
  CrsMatrixBaseTypeHost h_A;
  {
    std::ifstream in;
    in.open(file);
    if (!in.good()) {
      std::cout << "Failed in open the file: " << file << std::endl;
      // Report the failure to the caller instead of leaving main directly,
      // so that Kokkos::finalize() is still executed.
      return -1;
    }
    Tacho::MatrixMarket<value_type>::read(file, h_A, verbose);
  }

  ///
  /// cuSolver
  ///
  CuSolver cusolver;
  cusolver.setVerbose(verbose);

  ///
  /// reorder matrix
  ///
#if defined(TACHO_HAVE_METIS)
  typedef GraphTools_Metis graph_tools_type;
#else
  /// not recommend to use CAMD
  typedef GraphTools_CAMD graph_tools_type;
#endif
  Graph graph(h_A.NumRows(), h_A.NumNonZeros(), h_A.RowPtr(), h_A.Cols());
  graph_tools_type G(graph);
  G.reorder(verbose);

  const auto h_perm = G.PermVector();
  const auto h_peri = G.InvPermVector();

  /// copy the permutation vectors to the device memory space
  const auto perm = Kokkos::create_mirror_view(typename device_type::memory_space(), h_perm);
  Kokkos::deep_copy(perm, h_perm);
  const auto peri = Kokkos::create_mirror_view(typename device_type::memory_space(), h_peri);
  Kokkos::deep_copy(peri, h_peri);

  CrsMatrixBaseType A;
  A.createConfTo(h_A);
  A.copy(h_A);

  /// permute ondevice
  CrsMatrixBaseType Ap;
  {
    timer.reset();
    Ap.createConfTo(A);
    Tacho::applyPermutationToCrsMatrixLower(Ap, A, perm, peri);
    Kokkos::fence(); // wait for the device before reading the timer
    const double t_permute_A = timer.seconds();

    if (verbose) {
      printf("ExampleCuSolver: Construction of permuted matrix A\n");
      printf("  Time\n");
      printf("             time for permutation of A:                       %10.6f s\n", t_permute_A);
      printf("\n");
    }
  }

  ///
  /// analyze
  ///
  {
    cusolver.analyze(Ap.NumRows(), Ap.RowPtr(), Ap.Cols());
  }

  ///
  /// factorize
  ///
  {
    cusolver.factorize(Ap.Values());
  }

  ///
  /// random right hand side
  ///
  DenseMultiVectorType
    b("b", A.NumRows(), nrhs),   // rhs multivector
    x("x", A.NumRows(), nrhs),   // solution multivector
    bb("bb", A.NumRows(), nrhs), // temp workspace (store permuted rhs)
    xx("t", A.NumRows(), nrhs);  // temp workspace (store permuted solution)

  {
    Kokkos::Random_XorShift64_Pool<typename device_type::execution_space> random(13718);
    Kokkos::fill_random(b, random, value_type(1));
  }

  ///
  /// solve
  ///
  {
    timer.reset();
    applyRowPermutationToDenseMatrix(bb, b, perm);
    cusolver.solve(xx, bb);
    applyRowPermutationToDenseMatrix(x, xx, peri);
    Kokkos::fence(); // wait for the device before reading the timer
    const double t_solve = timer.seconds();
    if (verbose) {
      printf("ExampleCuSolver: P b, solve, and P^{-1} x\n");
      printf("  Time\n");
      printf("             time for permute and solve:                      %10.6f s\n", t_solve);
      printf("\n");
    }
  }

  ///
  /// compute residual to check solutions
  ///
  const double res = computeRelativeResidual(A, x, b);

  std::cout << "cuSolver: residual = " << res << "\n\n";

  return 0;
}
#endif

/// Example driver: parses the command line, initializes Kokkos, runs the
/// cuSolver example when CUDA is enabled and always finalizes Kokkos before
/// returning.
///
/// \return 0 on success (or when only the help text is printed), -1 when the
///         input file cannot be opened or CUDA is not configured.
int main (int argc, char *argv[]) {
  CommandLineParser opts("This example program measure the performance of cuSolver on Kokkos::Cuda");

  bool verbose = true;
  std::string file = "test.mtx";
  int nrhs = 1;

  opts.set_option<bool>("verbose", "Flag for verbose printing", &verbose);
  opts.set_option<std::string>("file", "Input file (MatrixMarket SPD matrix)", &file);
  opts.set_option<int>("nrhs", "Number of RHS vectors", &nrhs);

  const bool r_parse = opts.parse(argc, argv);
  if (r_parse) return 0; // print help return

  Kokkos::initialize(argc, argv);

  // Declared outside the preprocessor guard so that both branches (and the
  // final return) can use it.
  int r_val = 0;
#if defined(KOKKOS_ENABLE_CUDA)
  r_val = runCholCuSolverExample(file, nrhs, verbose);
#else
  r_val = -1;
  std::cout << "CUDA is NOT configured in Trilinos" << std::endl;
#endif

  Kokkos::finalize();

  return r_val;
}
Original file line number Diff line number Diff line change
Expand Up @@ -35,18 +35,18 @@ int main (int argc, char *argv[]) {
if (r_parse) return 0; // print help return

const bool skip_factorize = false, skip_solve = false;

typedef Kokkos::DefaultHostExecutionSpace host_space;

Kokkos::initialize(argc, argv);
host_space::print_configuration(std::cout, false);

typedef UseThisDevice<Kokkos::DefaultHostExecutionSpace> host_device_type;
printExecSpaceConfiguration<typename host_device_type::execution_space>("HostDevice", false);

int r_val = 0;
#if defined (__INTEL_MKL__)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should ShyLU instead use a TPL macro? Trilinos already has an MKL TPL.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think that makes a difference. `__INTEL_MKL__` is defined in mkl.h, and the inclusion of mkl.h is guarded by "TACHO_HAVE_MKL", which is determined from TPL_ENABLE_MKL.

{
typedef double value_type;
typedef CrsMatrixBase<value_type,host_space> CrsMatrixBaseType;
typedef Kokkos::View<value_type**,Kokkos::LayoutLeft,host_space> DenseMatrixBaseType;
typedef CrsMatrixBase<value_type,host_device_type> CrsMatrixBaseType;
typedef Kokkos::View<value_type**,Kokkos::LayoutLeft,host_device_type> DenseMatrixBaseType;

// mkl nthreads setting
mkl_set_dynamic(0);
Expand Down Expand Up @@ -104,7 +104,7 @@ int main (int argc, char *argv[]) {
t = timer.seconds();

// 32bit vs 64bit integers; A uses size_t for size array
Kokkos::View<ordinal_type*,host_space> rowptr("rowptr", Asym.NumRows()+1);
Kokkos::View<ordinal_type*,host_device_type> rowptr("rowptr", Asym.NumRows()+1);
{
for (ordinal_type i=0;i<=Asym.NumRows();++i)
rowptr(i) = Asym.RowPtrBegin(i);
Expand Down Expand Up @@ -184,7 +184,7 @@ int main (int argc, char *argv[]) {
}

{
const double res = NumericTools<value_type,host_space>::computeRelativeResidual(A, X, B);
const double res = computeRelativeResidual(A, X, B);
std::cout << "PardisoChol:: residual = " << res << std::endl;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -50,9 +50,9 @@ void testTachoSolver(int numRows,
tachoParams[tacho::PANELSIZE] = 32;
#else
# ifdef KOKKOS_ENABLE_DEPRECATED_CODE
tachoParams[tacho::MAXNUMSUPERBLOCKS] = Kokkos::DefaultHostExecutionSpace::thread_pool_size(0)/2;
tachoParams[tacho::MAXNUMSUPERBLOCKS] = tacho::host_space::thread_pool_size(0)/2;
# else
tachoParams[tacho::MAXNUMSUPERBLOCKS] = Kokkos::DefaultHostExecutionSpace::impl_thread_pool_size(0)/2;
tachoParams[tacho::MAXNUMSUPERBLOCKS] = tacho::host_space::impl_thread_pool_size(0)/2;
# endif
tachoParams[tacho::BLOCKSIZE] = 256;
tachoParams[tacho::PANELSIZE] = 128;
Expand Down Expand Up @@ -83,14 +83,14 @@ void testTachoSolver(int numRows,

/// this example only works for single right hand side
const int NRHS = 1;
typedef Kokkos::View<double**, Kokkos::LayoutLeft, Kokkos::DefaultExecutionSpace> ViewVectorType;
typedef Kokkos::View<double**, Kokkos::LayoutLeft, tacho::device_type> ViewVectorType;
ViewVectorType x("x", numRows, NRHS);

#if defined (KOKKOS_ENABLE_CUDA)
/// transfer b into device
ViewVectorType b(Kokkos::ViewAllocateWithoutInitializing("b"), numRows, NRHS);
Kokkos::deep_copy(Kokkos::subview(b, Kokkos::ALL(), 0),
Kokkos::View<double*,Kokkos::DefaultHostExecutionSpace>(rhs.data(), numRows));
Kokkos::View<double*,tacho::device_type>(rhs.data(), numRows));
#else
/// wrap rhs data with view
ViewVectorType b(rhs.data(), numRows, NRHS);
Expand Down Expand Up @@ -135,11 +135,11 @@ int main(int argc, char *argv[]) {

Kokkos::initialize(argc, argv);
const bool detail = false;
Tacho::printExecSpaceConfiguration<Kokkos::DefaultExecutionSpace>("DeviceSpace", detail);
Tacho::printExecSpaceConfiguration<Kokkos::DefaultHostExecutionSpace>("HostSpace", detail);
Tacho::printExecSpaceConfiguration<tacho::exec_space>("DeviceSpace", detail);
Tacho::printExecSpaceConfiguration<tacho::host_space>("HostSpace", detail);
{

Tacho::CrsMatrixBase<double,Kokkos::DefaultHostExecutionSpace> A;
Tacho::CrsMatrixBase<double,tacho::host_device_type> A;
{
std::ifstream in;
in.open(file);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,13 +19,18 @@ namespace tacho {
INDEX_LENGTH
};

typedef Kokkos::View<double*> ViewVectorType;
typedef Kokkos::View<int*> ViewVectorTypeInt;
using device_type = typename Tacho::UseThisDevice<Kokkos::DefaultExecutionSpace>::device_type;
using exec_space = typename device_type::execution_space;

using host_device_type = typename Tacho::UseThisDevice<Kokkos::DefaultHostExecutionSpace>::device_type;
using host_space = typename host_device_type::execution_space;

using ViewVectorType = Kokkos::View<double*,device_type>;
using ViewVectorTypeInt = Kokkos::View<int*,device_type>;

template <class SX> class tachoSolver
{
public:
using exec_space = Kokkos::DefaultExecutionSpace;
using sched_type = Kokkos::TaskSchedulerMultiple<exec_space>;
typedef Tacho::Solver<SX,sched_type> solver_type;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -107,10 +107,13 @@ int main (int argc, char *argv[]) {
typedef int ordinal_type;
typedef double value_type;

/// device type
typedef typename Tacho::UseThisDevice<Kokkos::DefaultHostExecutionSpace>::device_type host_device_type;

/// crs matrix format and dense multi vector
typedef Tacho::CrsMatrixBase<value_type,Kokkos::DefaultHostExecutionSpace> CrsMatrixBaseType;
typedef Kokkos::View<value_type**,Kokkos::LayoutLeft,Kokkos::DefaultHostExecutionSpace> DenseMultiVectorType;
//typedef Kokkos::View<ordinal_type*,Kokkos::DefaultHostExecutionSpace> OrdinalTypeArray;
typedef Tacho::CrsMatrixBase<value_type,host_device_type> CrsMatrixBaseType;
typedef Kokkos::View<value_type**,Kokkos::LayoutLeft,host_device_type> DenseMultiVectorType;
//typedef Kokkos::View<ordinal_type*,host_device_type> OrdinalTypeArray;

///
/// problem setting
Expand Down Expand Up @@ -202,7 +205,7 @@ int main (int argc, char *argv[]) {
}

// 32bit vs 64bit integers; A uses size_t for size array
Kokkos::View<ordinal_type*,Kokkos::DefaultHostExecutionSpace> rowptr("rowptr", Asym.NumRows()+1);
Kokkos::View<ordinal_type*,host_device_type> rowptr("rowptr", Asym.NumRows()+1);
for (ordinal_type i=0;i<=Asym.NumRows();++i)
rowptr(i) = Asym.RowPtrBegin(i);

Expand Down Expand Up @@ -265,7 +268,7 @@ int main (int argc, char *argv[]) {
pardiso.showStat(std::cout, Pardiso::Solve) << std::endl;
}

const double res = Tacho::NumericTools<value_type,Kokkos::DefaultHostExecutionSpace>::computeRelativeResidual(A, x, b);
const double res = Tacho::computeRelativeResidual(A, x, b);
std::cout << "PardisoChol:: residual = " << res << "\n\n";

r_val = pardiso.run(Pardiso::ReleaseAll);
Expand Down
Loading