Skip to content

Commit

Permalink
Merge Pull Request #6695 from kyungjoo-kim/Trilinos/tacho-develop
Browse files Browse the repository at this point in the history
Automatically Merged using Trilinos Pull Request AutoTester
PR Title: Tacho - cuSolver interface
PR Author: kyungjoo-kim
  • Loading branch information
trilinos-autotester authored Feb 3, 2020
2 parents a0f0127 + 3196351 commit cab36fd
Show file tree
Hide file tree
Showing 44 changed files with 2,798 additions and 2,083 deletions.
16 changes: 8 additions & 8 deletions packages/shylu/shylu_node/tacho/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,27 +1,27 @@
TRIBITS_SUBPACKAGE(Tacho)

IF (Kokkos_ENABLE_Cuda)
IF (KOKKOS_ENABLE_CUDA)
# Shylu/Tacho requires CUDA >= 8.0.
IF (DEFINED CUDA_VERSION AND (CUDA_VERSION VERSION_LESS "8.0"))
MESSAGE(FATAL_ERROR "ShyLu/Tacho requires CUDA 8 if CUDA is enabled")
ENDIF()
# If RDC is off, emits a warning message
IF (NOT Kokkos_ENABLE_Cuda_Relocatable_Device_Code)
MESSAGE(WARNING "ShyLu/Tacho requires CUDA relocatable device code to be enabled if CUDA is enabled. Set: Kokkos_ENABLE_Cuda_Relocatable_Device_Code=ON ")
IF (NOT KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE)
MESSAGE(WARNING "ShyLu/Tacho requires CUDA relocatable device code to be enabled if CUDA is enabled. Set: KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE=ON ")
ENDIF()
ENDIF()
IF (Kokkos_ENABLE_Pthread)
IF (NOT Kokkos_ENABLE_OpenMP)
MESSAGE(FATAL_ERROR "ShyLu/Tacho can not be build with Pthreads as the Kokkos Host Backend.")
IF (KOKKOS_ENABLE_Pthread)
IF (NOT KOKKOS_ENABLE_OPENMP)
MESSAGE(FATAL_ERROR "ShyLu/Tacho can not be build with Pthreads as the KOKKOS Host Backend.")
ENDIF()
ENDIF()

# Set cmake variable to control examples and tests
IF (Kokkos_ENABLE_Cuda)
IF (KOKKOS_ENABLE_CUDA)
IF (DEFINED CUDA_VERSION AND (CUDA_VERSION VERSION_LESS "8.0"))
SET(TACHO_HAVE_KOKKOS_TASK OFF)
ELSE()
IF (Kokkos_ENABLE_Cuda_Relocatable_Device_Code)
IF (KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE)
SET(TACHO_HAVE_KOKKOS_TASK ON)
ELSE()
SET(TACHO_HAVE_KOKKOS_TASK OFF)
Expand Down
4 changes: 2 additions & 2 deletions packages/shylu/shylu_node/tacho/cmake/Dependencies.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,6 @@ SET(LIB_OPTIONAL_DEP_PACKAGES TrilinosSS)
SET(TEST_REQUIRED_DEP_PACKAGES Kokkos TrilinosSS Gtest)
SET(TEST_OPTIONAL_DEP_PACKAGES)
SET(LIB_REQUIRED_DEP_TPLS)
SET(LIB_OPTIONAL_DEP_TPLS METIS Scotch Cholmod HWLOC HYPRE MKL LAPACK BLAS Pthread QTHREAD VTune)
SET(LIB_OPTIONAL_DEP_TPLS METIS Scotch Cholmod HWLOC HYPRE MKL LAPACK BLAS Pthread QTHREAD VTune CUSOLVER CUSPARSE CUBLAS CUDA)
SET(TEST_REQUIRED_DEP_TPLS BLAS LAPACK)
SET(TEST_OPTIONAL_DEP_TPLS METIS HWLOC Cholmod MKL LAPACK BLAS Pthread QTHREAD)
SET(TEST_OPTIONAL_DEP_TPLS METIS HWLOC Cholmod MKL LAPACK BLAS Pthread QTHREAD CUSOLVER CUSPARSE CUBLAS CUDA)
13 changes: 12 additions & 1 deletion packages/shylu/shylu_node/tacho/example/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,17 @@ IF (TACHO_HAVE_KOKKOS_TASK)
COMM serial mpi
)

#
# NVIDIA cuSolver
#
IF(Kokkos_ENABLE_Cuda)
TRIBITS_ADD_EXECUTABLE(
Tacho_ExampleCholCuSolver
NOEXEPREFIX
SOURCES Tacho_ExampleCholCuSolver.cpp
COMM serial mpi
)
ENDIF()
#
# Intel MKL Pardiso and PerfTest
#
Expand Down Expand Up @@ -219,7 +230,7 @@ IF (TACHO_HAVE_KOKKOS_TASK)
COMM serial mpi
)
ENDIF()

TRIBITS_COPY_FILES_TO_BINARY_DIR(ShyLUTacho_SimpleSparseTest_File
SOURCE_FILES test.mtx
SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}
Expand Down
173 changes: 173 additions & 0 deletions packages/shylu/shylu_node/tacho/example/Tacho_ExampleCholCuSolver.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,173 @@
#include "ShyLU_NodeTacho_config.h"

#include <Kokkos_Core.hpp>
#include <impl/Kokkos_Timer.hpp>

#include "Tacho.hpp"
#include "Tacho_CommandLineParser.hpp"

#if defined (KOKKOS_ENABLE_CUDA)
#include "Tacho_CuSolver.hpp"
#endif

using namespace Tacho;

#if defined(KOKKOS_ENABLE_CUDA)
/// Run the cuSolver sparse Cholesky example on Kokkos::Cuda.
///
/// Reads a MatrixMarket matrix on the host, reorders it with the available
/// graph tool, permutes it on the device, runs the cuSolver analyze /
/// factorize / solve sequence against a random right hand side, and prints
/// the relative residual of the computed solution.
///
/// Must be called between Kokkos::initialize() and Kokkos::finalize(). Every
/// view allocated here goes out of scope when the function returns, so the
/// caller can finalize Kokkos on every path, including the error path.
///
/// \param file    path to the input matrix (MatrixMarket format)
/// \param nrhs    number of right hand side vectors
/// \param verbose print timing and progress information when true
/// \return 0 on success, -1 when the input file cannot be opened
static int runCholCuSolver(const std::string &file, const int nrhs, const bool verbose) {
  typedef double value_type;

  typedef UseThisDevice<Kokkos::Cuda>::device_type device_type;
  typedef UseThisDevice<Kokkos::DefaultHostExecutionSpace>::device_type host_device_type;

  typedef Tacho::CrsMatrixBase<value_type,host_device_type> CrsMatrixBaseTypeHost;
  typedef Tacho::CrsMatrixBase<value_type,device_type> CrsMatrixBaseType;
  typedef Kokkos::View<value_type**,Kokkos::LayoutLeft,device_type> DenseMultiVectorType;

  Kokkos::Impl::Timer timer;

  ///
  /// read a spd matrix of matrix market format
  ///
  CrsMatrixBaseTypeHost h_A;
  {
    // Probe the file first so that a bad path is reported as a status code
    // instead of leaving main() without finalizing Kokkos.
    std::ifstream in;
    in.open(file);
    if (!in.good()) {
      std::cout << "Failed in open the file: " << file << std::endl;
      return -1;
    }
    Tacho::MatrixMarket<value_type>::read(file, h_A, verbose);
  }

  ///
  /// cuSolver
  ///
  CuSolver cusolver;
  cusolver.setVerbose(verbose);

  ///
  /// reorder matrix
  ///
#if defined(TACHO_HAVE_METIS)
  typedef GraphTools_Metis graph_tools_type;
#else
  /// not recommend to use CAMD
  typedef GraphTools_CAMD graph_tools_type;
#endif
  Graph graph(h_A.NumRows(), h_A.NumNonZeros(), h_A.RowPtr(), h_A.Cols());
  graph_tools_type G(graph);
  G.reorder(verbose);

  const auto h_perm = G.PermVector();
  const auto h_peri = G.InvPermVector();

  // Device copies of the permutation and its inverse.
  const auto perm = Kokkos::create_mirror_view(typename device_type::memory_space(), h_perm);
  Kokkos::deep_copy(perm, h_perm);
  const auto peri = Kokkos::create_mirror_view(typename device_type::memory_space(), h_peri);
  Kokkos::deep_copy(peri, h_peri);

  CrsMatrixBaseType A;
  A.createConfTo(h_A);
  A.copy(h_A);

  ///
  /// permute on device
  ///
  CrsMatrixBaseType Ap;
  {
    timer.reset();
    Ap.createConfTo(A);
    Tacho::applyPermutationToCrsMatrixLower(Ap, A, perm, peri);
    Kokkos::fence(); // wait for the device before reading the timer
    const double t_permute_A = timer.seconds();

    if (verbose) {
      printf("ExampleCuSolver: Construction of permuted matrix A\n");
      printf("  Time\n");
      printf("             time for permutation of A: %10.6f s\n", t_permute_A);
      printf("\n");
    }
  }

  ///
  /// analyze
  ///
  cusolver.analyze(Ap.NumRows(), Ap.RowPtr(), Ap.Cols());

  ///
  /// factorize
  ///
  cusolver.factorize(Ap.Values());

  ///
  /// random right hand side
  ///
  DenseMultiVectorType
    b("b", A.NumRows(), nrhs),   // rhs multivector
    x("x", A.NumRows(), nrhs),   // solution multivector
    bb("bb", A.NumRows(), nrhs), // temp workspace (store permuted rhs)
    xx("t", A.NumRows(), nrhs);  // temp workspace (store permuted solution)

  {
    Kokkos::Random_XorShift64_Pool<typename device_type::execution_space> random(13718);
    Kokkos::fill_random(b, random, value_type(1));
  }

  ///
  /// solve: permute b, solve the permuted system, permute the solution back
  ///
  {
    timer.reset();
    applyRowPermutationToDenseMatrix(bb, b, perm);
    cusolver.solve(xx, bb);
    applyRowPermutationToDenseMatrix(x, xx, peri);
    Kokkos::fence(); // wait for the device before reading the timer
    const double t_solve = timer.seconds();
    if (verbose) {
      printf("ExampleCuSolver: P b, solve, and P^{-1} x\n");
      printf("  Time\n");
      printf("             time for permute and solve: %10.6f s\n", t_solve);
      printf("\n");
    }
  }

  ///
  /// compute residual to check solutions (against the unpermuted A, x, b)
  ///
  const double res = computeRelativeResidual(A, x, b);

  std::cout << "cuSolver: residual = " << res << "\n\n";

  return 0;
}
#endif

/// Example driver: parse command line options, initialize Kokkos, run the
/// cuSolver Cholesky example when CUDA is enabled, and finalize Kokkos.
///
/// \return 0 when help was printed or the example ran; -1 when the input
///         file cannot be opened or CUDA is not enabled in this build
int main (int argc, char *argv[]) {
  CommandLineParser opts("This example program measure the performance of cuSolver on Kokkos::Cuda");

  bool verbose = true;
  std::string file = "test.mtx";
  int nrhs = 1;

  opts.set_option<bool>("verbose", "Flag for verbose printing", &verbose);
  opts.set_option<std::string>("file", "Input file (MatrixMarket SPD matrix)", &file);
  opts.set_option<int>("nrhs", "Number of RHS vectors", &nrhs);

  const bool r_parse = opts.parse(argc, argv);
  if (r_parse) return 0; // print help return

  Kokkos::initialize(argc, argv);

  const bool detail = false;

  typedef UseThisDevice<Kokkos::DefaultHostExecutionSpace>::device_type host_device_type;

  // Declared outside the preprocessor guard: both branches below and the
  // final return statement use it.
  int r_val = 0;

#if defined(KOKKOS_ENABLE_CUDA)
  // Kokkos::Cuda is only named when CUDA is enabled so that the fallback
  // branch below still compiles in a host-only build.
  typedef UseThisDevice<Kokkos::Cuda>::device_type device_type;
  Tacho::printExecSpaceConfiguration<typename device_type::execution_space>("DeviceSpace", detail);
#endif
  Tacho::printExecSpaceConfiguration<typename host_device_type::execution_space>("HostSpace", detail);

#if defined(KOKKOS_ENABLE_CUDA)
  // All views live inside the helper, so they are released before finalize.
  r_val = runCholCuSolver(file, nrhs, verbose);
#else
  r_val = -1;
  std::cout << "CUDA is NOT configured in Trilinos" << std::endl;
#endif

  Kokkos::finalize();

  return r_val;
}
Original file line number Diff line number Diff line change
Expand Up @@ -35,18 +35,18 @@ int main (int argc, char *argv[]) {
if (r_parse) return 0; // print help return

const bool skip_factorize = false, skip_solve = false;

typedef Kokkos::DefaultHostExecutionSpace host_space;

Kokkos::initialize(argc, argv);
host_space::print_configuration(std::cout, false);

typedef UseThisDevice<Kokkos::DefaultHostExecutionSpace> host_device_type;
printExecSpaceConfiguration<typename host_device_type::execution_space>("HostDevice", false);

int r_val = 0;
#if defined (__INTEL_MKL__)
{
typedef double value_type;
typedef CrsMatrixBase<value_type,host_space> CrsMatrixBaseType;
typedef Kokkos::View<value_type**,Kokkos::LayoutLeft,host_space> DenseMatrixBaseType;
typedef CrsMatrixBase<value_type,host_device_type> CrsMatrixBaseType;
typedef Kokkos::View<value_type**,Kokkos::LayoutLeft,host_device_type> DenseMatrixBaseType;

// mkl nthreads setting
mkl_set_dynamic(0);
Expand Down Expand Up @@ -104,7 +104,7 @@ int main (int argc, char *argv[]) {
t = timer.seconds();

// 32bit vs 64bit integers; A uses size_t for size array
Kokkos::View<ordinal_type*,host_space> rowptr("rowptr", Asym.NumRows()+1);
Kokkos::View<ordinal_type*,host_device_type> rowptr("rowptr", Asym.NumRows()+1);
{
for (ordinal_type i=0;i<=Asym.NumRows();++i)
rowptr(i) = Asym.RowPtrBegin(i);
Expand Down Expand Up @@ -184,7 +184,7 @@ int main (int argc, char *argv[]) {
}

{
const double res = NumericTools<value_type,host_space>::computeRelativeResidual(A, X, B);
const double res = computeRelativeResidual(A, X, B);
std::cout << "PardisoChol:: residual = " << res << std::endl;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -50,9 +50,9 @@ void testTachoSolver(int numRows,
tachoParams[tacho::PANELSIZE] = 32;
#else
# ifdef KOKKOS_ENABLE_DEPRECATED_CODE
tachoParams[tacho::MAXNUMSUPERBLOCKS] = Kokkos::DefaultHostExecutionSpace::thread_pool_size(0)/2;
tachoParams[tacho::MAXNUMSUPERBLOCKS] = tacho::host_space::thread_pool_size(0)/2;
# else
tachoParams[tacho::MAXNUMSUPERBLOCKS] = Kokkos::DefaultHostExecutionSpace::impl_thread_pool_size(0)/2;
tachoParams[tacho::MAXNUMSUPERBLOCKS] = tacho::host_space::impl_thread_pool_size(0)/2;
# endif
tachoParams[tacho::BLOCKSIZE] = 256;
tachoParams[tacho::PANELSIZE] = 128;
Expand Down Expand Up @@ -83,14 +83,14 @@ void testTachoSolver(int numRows,

/// this example only works for single right hand side
const int NRHS = 1;
typedef Kokkos::View<double**, Kokkos::LayoutLeft, Kokkos::DefaultExecutionSpace> ViewVectorType;
typedef Kokkos::View<double**, Kokkos::LayoutLeft, tacho::device_type> ViewVectorType;
ViewVectorType x("x", numRows, NRHS);

#if defined (KOKKOS_ENABLE_CUDA)
/// transfer b into device
ViewVectorType b(Kokkos::ViewAllocateWithoutInitializing("b"), numRows, NRHS);
Kokkos::deep_copy(Kokkos::subview(b, Kokkos::ALL(), 0),
Kokkos::View<double*,Kokkos::DefaultHostExecutionSpace>(rhs.data(), numRows));
Kokkos::View<double*,tacho::device_type>(rhs.data(), numRows));
#else
/// wrap rhs data with view
ViewVectorType b(rhs.data(), numRows, NRHS);
Expand Down Expand Up @@ -135,11 +135,11 @@ int main(int argc, char *argv[]) {

Kokkos::initialize(argc, argv);
const bool detail = false;
Tacho::printExecSpaceConfiguration<Kokkos::DefaultExecutionSpace>("DeviceSpace", detail);
Tacho::printExecSpaceConfiguration<Kokkos::DefaultHostExecutionSpace>("HostSpace", detail);
Tacho::printExecSpaceConfiguration<tacho::exec_space>("DeviceSpace", detail);
Tacho::printExecSpaceConfiguration<tacho::host_space>("HostSpace", detail);
{

Tacho::CrsMatrixBase<double,Kokkos::DefaultHostExecutionSpace> A;
Tacho::CrsMatrixBase<double,tacho::host_device_type> A;
{
std::ifstream in;
in.open(file);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,13 +19,18 @@ namespace tacho {
INDEX_LENGTH
};

typedef Kokkos::View<double*> ViewVectorType;
typedef Kokkos::View<int*> ViewVectorTypeInt;
using device_type = typename Tacho::UseThisDevice<Kokkos::DefaultExecutionSpace>::device_type;
using exec_space = typename device_type::execution_space;

using host_device_type = typename Tacho::UseThisDevice<Kokkos::DefaultHostExecutionSpace>::device_type;
using host_space = typename host_device_type::execution_space;

using ViewVectorType = Kokkos::View<double*,device_type>;
using ViewVectorTypeInt = Kokkos::View<int*,device_type>;

template <class SX> class tachoSolver
{
public:
using exec_space = Kokkos::DefaultExecutionSpace;
using sched_type = Kokkos::TaskSchedulerMultiple<exec_space>;
typedef Tacho::Solver<SX,sched_type> solver_type;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -107,10 +107,13 @@ int main (int argc, char *argv[]) {
typedef int ordinal_type;
typedef double value_type;

/// device type
typedef typename Tacho::UseThisDevice<Kokkos::DefaultHostExecutionSpace>::device_type host_device_type;

/// crs matrix format and dense multi vector
typedef Tacho::CrsMatrixBase<value_type,Kokkos::DefaultHostExecutionSpace> CrsMatrixBaseType;
typedef Kokkos::View<value_type**,Kokkos::LayoutLeft,Kokkos::DefaultHostExecutionSpace> DenseMultiVectorType;
//typedef Kokkos::View<ordinal_type*,Kokkos::DefaultHostExecutionSpace> OrdinalTypeArray;
typedef Tacho::CrsMatrixBase<value_type,host_device_type> CrsMatrixBaseType;
typedef Kokkos::View<value_type**,Kokkos::LayoutLeft,host_device_type> DenseMultiVectorType;
//typedef Kokkos::View<ordinal_type*,host_device_type> OrdinalTypeArray;

///
/// problem setting
Expand Down Expand Up @@ -202,7 +205,7 @@ int main (int argc, char *argv[]) {
}

// 32bit vs 64bit integers; A uses size_t for size array
Kokkos::View<ordinal_type*,Kokkos::DefaultHostExecutionSpace> rowptr("rowptr", Asym.NumRows()+1);
Kokkos::View<ordinal_type*,host_device_type> rowptr("rowptr", Asym.NumRows()+1);
for (ordinal_type i=0;i<=Asym.NumRows();++i)
rowptr(i) = Asym.RowPtrBegin(i);

Expand Down Expand Up @@ -265,7 +268,7 @@ int main (int argc, char *argv[]) {
pardiso.showStat(std::cout, Pardiso::Solve) << std::endl;
}

const double res = Tacho::NumericTools<value_type,Kokkos::DefaultHostExecutionSpace>::computeRelativeResidual(A, x, b);
const double res = Tacho::computeRelativeResidual(A, x, b);
std::cout << "PardisoChol:: residual = " << res << "\n\n";

r_val = pardiso.run(Pardiso::ReleaseAll);
Expand Down
Loading

0 comments on commit cab36fd

Please sign in to comment.