diff --git a/cmake/ctest/drivers/atdm/ats2/local-driver.sh b/cmake/ctest/drivers/atdm/ats2/local-driver.sh index 8704a96fb282..eb9ffd6ab5aa 100755 --- a/cmake/ctest/drivers/atdm/ats2/local-driver.sh +++ b/cmake/ctest/drivers/atdm/ats2/local-driver.sh @@ -32,6 +32,8 @@ fi # Allow default setting for TPETRA_ASSUME_CUDA_AWARE_MPI=0 in trilinos_jsrun unset TPETRA_ASSUME_CUDA_AWARE_MPI +atdm_config_ctest_regex_old="$ATDM_CONFIG_CTEST_REGEX" +export ATDM_CONFIG_CTEST_REGEX="$ATDM_CONFIG_CTEST_REGEX -E Adelus*" echo echo "=======================================================================" @@ -44,6 +46,8 @@ set -x $WORKSPACE/Trilinos/cmake/ctest/drivers/atdm/ats2/local-driver-single-build.sh set +x +export ATDM_CONFIG_CTEST_REGEX="$atdm_config_ctest_regex_old" + if [[ "${Trilinos_CTEST_RUN_CUDA_AWARE_MPI}" == "1" ]]; then echo echo "=======================================================================" diff --git a/cmake/std/PullRequestLinuxCuda10.1.105uvmOffTestingSettings.cmake b/cmake/std/PullRequestLinuxCuda10.1.105uvmOffTestingSettings.cmake index aa86835f17de..0de8d2ac8767 100644 --- a/cmake/std/PullRequestLinuxCuda10.1.105uvmOffTestingSettings.cmake +++ b/cmake/std/PullRequestLinuxCuda10.1.105uvmOffTestingSettings.cmake @@ -152,7 +152,6 @@ set (Kokkos_ENABLE_TESTS OFF CACHE BOOL "Turn off tests for non-UVM build") set (KokkosKernels_ENABLE_TESTS OFF CACHE BOOL "Turn off tests for non-UVM build") set (MueLu_ENABLE_TESTS OFF CACHE BOOL "Turn off tests for non-UVM build") set (NOX_ENABLE_TESTS OFF CACHE BOOL "Turn off tests for non-UVM build") -set (Phalanx_ENABLE_TESTS OFF CACHE BOOL "Turn off tests for non-UVM build") set (ROL_ENABLE_TESTS OFF CACHE BOOL "Turn off tests for non-UVM build") set (Sacado_ENABLE_TESTS OFF CACHE BOOL "Turn off tests for non-UVM build") set (SEACAS_ENABLE_TESTS OFF CACHE BOOL "Turn off tests for non-UVM build") @@ -163,127 +162,8 @@ set (Xpetra_ENABLE_TESTS OFF CACHE BOOL "Turn off tests for non-UVM build") set (Zoltan2_ENABLE_TESTS 
OFF CACHE BOOL "Turn off tests for non-UVM build") # Tpetra UVM = OFF tests -set (TpetraCore_BlockCrsMatrix_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") -set (TpetraCore_Bug5072_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") -set (TpetraCore_BlankRowBugTest_MPI_2_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") -set (TpetraCore_iallreduce_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") -set (TpetraCore_idot_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") -set (TpetraCore_CrsGraph_UnitTests0_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") -set (TpetraCore_CrsGraph_UnitTests1_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") -set (TpetraCore_CrsGraph_UnitTests_Swap_MPI_2_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") -set (TpetraCore_CrsGraph_ReindexColumns_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") -set (TpetraCore_Issue601_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") -set (TpetraCore_Issue601_MPI_8_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") -set (TpetraCore_CrsGraph_insertGlobalIndicesFiltered_MPI_2_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") -set (TpetraCore_CrsGraph_PackUnpack_MPI_1_MPI_1_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") -set (TpetraCore_CrsGraph_getNumDiags_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") -set (TpetraCore_CrsGraph_UnpackIntoStaticGraph_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") -set (TpetraCore_CrsGraph_StaticImportExport_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") -set (TpetraCore_CrsGraph_UnpackMerge_MPI_2_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") -set (TpetraCore_CrsMatrix_UnitTests_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") -set (TpetraCore_CrsMatrix_UnitTests2_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") -set 
(TpetraCore_CrsMatrix_UnitTests3_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") -set (TpetraCore_CrsMatrix_UnitTests4_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") -set (TpetraCore_CrsMatrix_UnitTests_Swap_MPI_2_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") -set (TpetraCore_CrsMatrix_NonlocalAfterResume_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") -set (TpetraCore_CrsMatrix_LeftRightScale_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") -set (TpetraCore_CrsMatrix_2DRandomDist_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") -set (TpetraCore_CrsMatrix_WithGraph_Cuda_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") -set (TpetraCore_CrsMatrix_ReplaceDomainMapAndImporter_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") -set (TpetraCore_CrsMatrix_NonlocalSumInto_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") -set (TpetraCore_CrsMatrix_NonlocalSumInto_Ignore_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") -set (TpetraCore_CrsMatrix_Bug5978_MPI_2_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") -set (TpetraCore_CrsMatrix_Bug6069_1_MPI_3_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") -set (TpetraCore_CrsMatrix_Bug6069_2_MPI_2_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") -set (TpetraCore_CrsMatrix_Bug6171_MPI_2_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") -set (TpetraCore_CrsMatrix_ReplaceLocalValues_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") -set (TpetraCore_CrsMatrix_ReplaceDiagonal_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") -set (TpetraCore_CrsMatrix_MultipleFillCompletes_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") -set (TpetraCore_CrsMatrix_ReindexColumns_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") -set (TpetraCore_CrsMatrix_TransformValues_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for 
non-UVM build") -set (TpetraCore_CrsMatrix_GetRowCopy_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") -set (TpetraCore_CrsMatrix_PackUnpack_MPI_1_MPI_1_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") -set (TpetraCore_Equilibration_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") -set (TpetraCore_CrsMatrix_StaticImportExport_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") -set (TpetraCore_sumIntoStaticProfileExtraSpace_MPI_1_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") -set (TpetraCore_CrsMatrix_createDeepCopy_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") -set (TpetraCore_CrsMatrix_UnpackMerge_MPI_2_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") -set (TpetraCore_CrsMatrix_Bug7745_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") -set (TpetraCore_CrsMatrix_Bug8794_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") -set (TpetraCore_CrsMatrix_RemoveEmptyProcesses_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") -set (TpetraCore_Albany182_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") -set (TpetraCore_Distributor_CreateFromSendsAndRecvs_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") -set (TpetraCore_Distributor_CreateFromSendsAndRecvs_MPI_8_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") -set (TpetraCore_Issue1752_MPI_2_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") -set (TpetraCore_FECrsGraph_UnitTests_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") -set (TpetraCore_FECrsMatrix_UnitTests_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") -set (TpetraCore_FEMultiVector_UnitTests_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") -set (TpetraCore_FixedHashTableTest_MPI_1_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") -set (TpetraCore_computeOffsetsFromCounts_MPI_1_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") -set 
(TpetraCore_ImportExport_ImportConstructExpert_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") -set (TpetraCore_UnpackLongRows_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") -set (TpetraCore_ExportToStaticGraphCrsMatrix_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") -set (TpetraCore_ImportExport2_UnitTests_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") -set (TpetraCore_MatrixMarket_Tpetra_CrsMatrix_InOutTest_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") -set (TpetraCore_MatrixMarket_Tpetra_CrsMatrix_Dist_simple_MPI_1_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") -set (TpetraCore_MatrixMarket_Tpetra_CrsMatrix_Dist_simple_MPI_3_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") -set (TpetraCore_MatrixMarket_Tpetra_CrsMatrix_Dist_simple_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") -set (TpetraCore_MatrixMarket_Tpetra_CrsMatrix_Dist_simple_MPI_6_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") -set (TpetraCore_MatrixMarket_Tpetra_CrsMatrix_Dist_simple_MPI_10_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") -set (TpetraCore_MatrixMarket_Tpetra_CrsMatrix_Dist_rmat_MPI_1_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") -set (TpetraCore_MatrixMarket_Tpetra_CrsMatrix_Dist_rmat_MPI_3_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") -set (TpetraCore_MatrixMarket_Tpetra_CrsMatrix_Dist_rmat_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") -set (TpetraCore_MatrixMarket_Tpetra_CrsMatrix_Dist_rmat_MPI_6_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") -set (TpetraCore_MatrixMarket_Tpetra_CrsMatrix_Dist_rmat_MPI_10_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") -set (TpetraCore_MatrixMarket_Tpetra_CrsMatrix_Dist_rmat_nodiag_MPI_1_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") -set (TpetraCore_MatrixMarket_Tpetra_CrsMatrix_Dist_rmat_nodiag_MPI_3_DISABLE ON CACHE BOOL "Turn off 
tests for non-UVM build") -set (TpetraCore_MatrixMarket_Tpetra_CrsMatrix_Dist_rmat_nodiag_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") -set (TpetraCore_MatrixMarket_Tpetra_CrsMatrix_Dist_rmat_nodiag_MPI_6_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") -set (TpetraCore_MatrixMarket_Tpetra_CrsMatrix_Dist_rmat_nodiag_MPI_10_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") -set (TpetraCore_MatrixMarket_Tpetra_CrsMatrix_Dist_Binary_simple_MPI_1_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") -set (TpetraCore_MatrixMarket_Tpetra_CrsMatrix_Dist_Binary_simple_MPI_3_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") -set (TpetraCore_MatrixMarket_Tpetra_CrsMatrix_Dist_Binary_simple_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") -set (TpetraCore_MatrixMarket_Tpetra_CrsMatrix_Dist_Binary_simple_MPI_6_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") -set (TpetraCore_MatrixMarket_Tpetra_CrsMatrix_Dist_Binary_simple_MPI_10_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") -set (TpetraCore_MatrixMarket_Tpetra_CrsMatrix_Dist_Binary_rmat_MPI_1_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") -set (TpetraCore_MatrixMarket_Tpetra_CrsMatrix_Dist_Binary_rmat_MPI_3_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") -set (TpetraCore_MatrixMarket_Tpetra_CrsMatrix_Dist_Binary_rmat_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") -set (TpetraCore_MatrixMarket_Tpetra_CrsMatrix_Dist_Binary_rmat_MPI_6_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") -set (TpetraCore_MatrixMarket_Tpetra_CrsMatrix_Dist_Binary_rmat_MPI_10_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") -set (TpetraCore_MatrixMarket_Tpetra_CrsMatrix_Dist_BinaryPerProcess_simple_MPI_1_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") -set (TpetraCore_MatrixMarket_Tpetra_CrsMatrix_Dist_BinaryPerProcess_simple_MPI_3_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") -set 
(TpetraCore_MatrixMarket_Tpetra_CrsMatrix_Dist_BinaryPerProcess_simple_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") -set (TpetraCore_MatrixMarket_Tpetra_CrsMatrix_Dist_BinaryPerProcess_simple_MPI_6_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") -set (TpetraCore_MatrixMarket_Tpetra_CrsMatrix_Dist_BinaryPerProcess_simple_MPI_10_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") -set (TpetraCore_MatrixMarket_Tpetra_CrsMatrix_Dist_BinaryPerProcess_rmat_MPI_1_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") -set (TpetraCore_MatrixMarket_Tpetra_CrsMatrix_Dist_BinaryPerProcess_rmat_MPI_3_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") -set (TpetraCore_MatrixMarket_Tpetra_CrsMatrix_Dist_BinaryPerProcess_rmat_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") -set (TpetraCore_MatrixMarket_Tpetra_CrsMatrix_Dist_BinaryPerProcess_rmat_MPI_6_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") -set (TpetraCore_MatrixMarket_Tpetra_CrsMatrix_Dist_BinaryPerProcess_rmat_MPI_10_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") -set (TpetraCore_MatrixMarket_Tpetra_CrsGraph_InOutTest_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") -set (TpetraCore_MatrixMarket_Operator_Test_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") -set (TpetraCore_MatrixMatrix_UnitTests_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") -set (TpetraCore_FECrs_MatrixMatrix_UnitTests_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") -set (TpetraCore_copyConvert_MPI_1_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") -set (TpetraCore_StaticView_MPI_1_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") -set (TpetraCore_RowMatrixTransposer_test_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") -set (TpetraCore_RowMatrixTransposer_UnitTests_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") -set 
(TpetraCore_CrsMatrix_transpose_sortedRows_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") -set (TpetraCore_lesson03_power_method_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") -set (TpetraCore_lesson05_redistribution_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") -set (TpetraCore_FEMAssembly_InsertGlobalIndicesFESP_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") -set (TpetraCore_FEMAssembly_InsertGlobalIndicesFESPKokkos_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") -set (TpetraCore_FEMAssembly_TotalElementLoopSP_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") -set (TpetraCore_FEMAssembly_TotalElementLoopSPKokkos_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") -set (TpetraCore_AdditiveSchwarzHalo_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") -set (TpetraCore_BlockCrsPerfTest_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") -set (TpetraCore_NewReaderExample_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") -set (TpetraCore_NewReaderExample_rmat_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") -set (TpetraCore_guide_power_method_1_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") -set (TpetraCore_guide_matrix_fill_1_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") -set (TpetraCore_guide_data_redist_1_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") -set (TpetraCore_EpetraRowMatrix_UnitTests_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") - +set (TpetraCore_BlockCrsMatrix_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for UVM build") +set (TpetraCore_BlockCrsPerfTest_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for UVM build") # ShyLU_DD UVM = OFF tests set (ShyLU_DDFROSch_test_thyra_xpetra_laplace_one_rank_TLP_IPOU_DIM3_TPETRA_MPI_1_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") diff --git 
a/cmake/std/atdm/ats2/tweaks/Tweaks.cmake b/cmake/std/atdm/ats2/tweaks/Tweaks.cmake index f707b9fd1dbe..96d699941c0d 100644 --- a/cmake/std/atdm/ats2/tweaks/Tweaks.cmake +++ b/cmake/std/atdm/ats2/tweaks/Tweaks.cmake @@ -10,6 +10,12 @@ ATDM_SET_CACHE(Trilinos_CUDA_SLOTS_PER_GPU 2 CACHE STRING) # Disables across multiple builds on 'ats2' # +IF (ATDM_NODE_TYPE STREQUAL "CUDA") + # Adelus always needs -M -gpu passed to jsrun, but trilinos_jsrun cannot support this + # for single rank MPI processes without breaking the invocation of other Trilinos tests + ATDM_SET_ENABLE(Adelus_vector_random_MPI_1_DISABLE ON) +ENDIF() + IF (ATDM_CMAKE_BUILD_TYPE STREQUAL "DEBUG") # Disable some expensive KokkosKernels tests in pure debug builds (#6464) diff --git a/packages/amesos2/src/Amesos2_Factory.cpp b/packages/amesos2/src/Amesos2_Factory.cpp index ecfa91bdd245..dfa6a7f5a34d 100644 --- a/packages/amesos2/src/Amesos2_Factory.cpp +++ b/packages/amesos2/src/Amesos2_Factory.cpp @@ -169,11 +169,10 @@ namespace Amesos2 { std::string tolower (const std::string& s) { - std::locale loc; std::string rtn = s; const size_t len = rtn.length (); for (size_t i = 0; i < len; ++i) { - rtn[i] = tolower (rtn[i], loc); + rtn[i] = ::tolower (rtn[i]); } return rtn; } diff --git a/packages/amesos2/src/KLU2/Include/klu2_ordinaltraits.h b/packages/amesos2/src/KLU2/Include/klu2_ordinaltraits.h index 572115c0651d..c7a4cf411178 100644 --- a/packages/amesos2/src/KLU2/Include/klu2_ordinaltraits.h +++ b/packages/amesos2/src/KLU2/Include/klu2_ordinaltraits.h @@ -110,35 +110,37 @@ struct KLU_OrdinalTraits }; template<> -struct KLU_OrdinalTraits +struct KLU_OrdinalTraits { - static inline long int btf_order (long int n, long int *Ap, long int *Ai, - double maxwork, double *work, long int *P, long int *Q, long int *R, long int *nmatch, - long int *Work) +// These should all be UF_long, which I presume is resolving to ptrdiff_t +// ptrdiff_t is ptrdiff_t on Linux64, but just to be safe + static inline ptrdiff_t 
btf_order (ptrdiff_t n, ptrdiff_t *Ap, ptrdiff_t *Ai, + double maxwork, double *work, ptrdiff_t *P, ptrdiff_t *Q, ptrdiff_t *R, ptrdiff_t *nmatch, + ptrdiff_t *Work) { return (trilinos_btf_l_order (n, Ap, Ai, maxwork, work, P, Q, R, nmatch, Work)); } - static inline long int btf_strongcomp (long int n, long int *Ap, long int *Ai, long int *Q, - long int *P, long int *R, long int *Work) + static inline ptrdiff_t btf_strongcomp (ptrdiff_t n, ptrdiff_t *Ap, ptrdiff_t *Ai, ptrdiff_t *Q, + ptrdiff_t *P, ptrdiff_t *R, ptrdiff_t *Work) { return(trilinos_btf_l_strongcomp (n, Ap, Ai, Q, P, R, Work)) ; } - static inline long int amd_order (long int n, long int *Ap, long int *Ai, long int *P, + static inline ptrdiff_t amd_order (ptrdiff_t n, ptrdiff_t *Ap, ptrdiff_t *Ai, ptrdiff_t *P, double *Control, double *Info) { return (trilinos_amd_l_order(n, Ap, Ai, P, Control, Info)) ; } - static inline long int colamd (long int n_row, long int n_col, long int Alen, long int *A, - long int *p, double *knobs, long int *stats) + static inline ptrdiff_t colamd (ptrdiff_t n_row, ptrdiff_t n_col, ptrdiff_t Alen, ptrdiff_t *A, + ptrdiff_t *p, double *knobs, ptrdiff_t *stats) { return(trilinos_colamd_l (n_row, n_col, Alen, A, p, knobs, stats)); } - static inline long int colamd_recommended (long int nnz, long int n_row, long int n_col) + static inline ptrdiff_t colamd_recommended (ptrdiff_t nnz, ptrdiff_t n_row, ptrdiff_t n_col) { return(trilinos_colamd_l_recommended(nnz, n_row, n_col)); } diff --git a/packages/anasazi/tpetra/example/TraceMinDavidson/TraceMinDavidsonLaplacianEx.cpp b/packages/anasazi/tpetra/example/TraceMinDavidson/TraceMinDavidsonLaplacianEx.cpp index b211af00e978..d6139da60a32 100644 --- a/packages/anasazi/tpetra/example/TraceMinDavidson/TraceMinDavidsonLaplacianEx.cpp +++ b/packages/anasazi/tpetra/example/TraceMinDavidson/TraceMinDavidsonLaplacianEx.cpp @@ -383,12 +383,10 @@ void formLaplacian(const RCP& A, const bool weighted, const boo if(weighted) { - // These vectors 
hold the actual data - // The ArrayView objects just point to them - std::vector colIndices; - std::vector values; - Teuchos::ArrayView colIndicesView; - Teuchos::ArrayView valuesView; + using indices_view = typename CrsMatrix::nonconst_global_inds_host_view_type; + using values_view = typename CrsMatrix::nonconst_values_host_view_type; + indices_view colIndices("colIndices"); + values_view values("values"); // This vector holds the diagonal RCP diagonal = Teuchos::rcp(new Vector(rowMap)); @@ -406,15 +404,11 @@ void formLaplacian(const RCP& A, const bool weighted, const boo { // Figure out how many entries are in the row size_t numentries = L->getNumEntriesInGlobalRow(i); - colIndices.resize(numentries); - values.resize(numentries); - - // Point the array views to the vectors - colIndicesView = Teuchos::arrayViewFromVector(colIndices); - valuesView = Teuchos::arrayViewFromVector(values); + Kokkos::resize(colIndices,numentries); + Kokkos::resize(values,numentries); // Get a copy of row i - L->getGlobalRowCopy(i,colIndicesView,valuesView,numentries); + L->getGlobalRowCopy(i,colIndices,values,numentries); for(size_t j=0; j& A, const bool weighted, const boo } // Reinsert the updated row - L->replaceGlobalValues(i, colIndicesView, valuesView); + L->replaceGlobalValues(i, colIndices, values); } } diff --git a/packages/belos/epetra/test/MINRES/CMakeLists.txt b/packages/belos/epetra/test/MINRES/CMakeLists.txt index 09a6a9a46fae..877a25fefa80 100644 --- a/packages/belos/epetra/test/MINRES/CMakeLists.txt +++ b/packages/belos/epetra/test/MINRES/CMakeLists.txt @@ -35,4 +35,19 @@ IF (${PACKAGE_NAME}_ENABLE_Triutils) EXEDEPS minres_hb ) + ASSERT_DEFINED(${PACKAGE_NAME}_ENABLE_Ifpack) + IF(${PACKAGE_NAME}_ENABLE_Ifpack) + + TRIBITS_ADD_EXECUTABLE_AND_TEST( + pminres_hb + SOURCES test_pminres_hb.cpp + COMM serial mpi + ARGS + "--verbose --filename=bcsstk14.hb --left-prec --max-iters=100" + "--verbose --filename=bcsstk14.hb --right-prec --max-iters=100" + STANDARD_PASS_OUTPUT + ) 
+ + ENDIF(${PACKAGE_NAME}_ENABLE_Ifpack) + ENDIF(${PACKAGE_NAME}_ENABLE_Triutils) diff --git a/packages/belos/epetra/test/MINRES/test_pminres_hb.cpp b/packages/belos/epetra/test/MINRES/test_pminres_hb.cpp new file mode 100644 index 000000000000..f4f76ef86b02 --- /dev/null +++ b/packages/belos/epetra/test/MINRES/test_pminres_hb.cpp @@ -0,0 +1,255 @@ +//@HEADER +// ************************************************************************ +// +// Belos: Block Linear Solvers Package +// Copyright 2004 Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Michael A. Heroux (maherou@sandia.gov) +// +// ************************************************************************ +//@HEADER +// +// This driver reads a problem from a Harwell-Boeing (HB) file. +// Multiple right-hand-sides are created randomly. +// The initial guesses are all set to zero. +// +#include "BelosConfigDefs.hpp" +#include "BelosLinearProblem.hpp" +#include "BelosEpetraAdapter.hpp" +#include "BelosMinresSolMgr.hpp" +#include "BelosEpetraUtils.h" +#include "Trilinos_Util.h" +#include "Epetra_CrsMatrix.h" +#include "Epetra_Map.h" +#include "Teuchos_CommandLineProcessor.hpp" +#include "Teuchos_ParameterList.hpp" +#include "Teuchos_StandardCatchMacros.hpp" + +#include "Ifpack.h" +// +int main(int argc, char *argv[]) { + // + Teuchos::GlobalMPISession session(&argc, &argv, NULL); + // + using Teuchos::ParameterList; + using Teuchos::RCP; + using Teuchos::rcp; + + bool success = false; + bool verbose = false; + try { + // + // Get test parameters from command-line processor + // + bool proc_verbose = false; + bool leftprec = true; // left preconditioning or right. 
+ int frequency = -1; // how often residuals are printed by solver + int numrhs = 5; // total number of right-hand sides to solve for + int maxiters = -1; // maximum number of iterations for the solver to use + std::string filename("bcsstk14.hb"); + double tol = 1.0e-5; // relative residual tolerance + + Teuchos::CommandLineProcessor cmdp(false,true); + cmdp.setOption("verbose","quiet",&verbose,"Print messages and results."); + cmdp.setOption("left-prec","right-prec",&leftprec,"Left preconditioning or right."); + cmdp.setOption("frequency",&frequency,"Solvers frequency for printing residuals (#iters)."); + cmdp.setOption("filename",&filename,"Filename for Harwell-Boeing test matrix."); + cmdp.setOption("tol",&tol,"Relative residual tolerance used by Minres solver."); + cmdp.setOption("num-rhs",&numrhs,"Number of right-hand sides to be solved for."); + cmdp.setOption("max-iters",&maxiters,"Maximum number of iterations per linear system (-1 := adapted to problem/block size)."); + + if (cmdp.parse(argc,argv) != Teuchos::CommandLineProcessor::PARSE_SUCCESSFUL) { + return -1; + } + if (!verbose) + frequency = -1; // Reset frequency if verbosity is off + // + // Get the problem + // + int MyPID; + RCP A; + RCP X, B; + int return_val =Belos::Util::createEpetraProblem(filename,NULL,&A,&B,&X,&MyPID); + if(return_val != 0) return return_val; + proc_verbose = ( verbose && (MyPID==0) ); + // + // Solve using Belos + // + typedef double ST; + typedef Epetra_Operator OP; + typedef Epetra_MultiVector MV; + typedef Belos::OperatorTraits OPT; + typedef Belos::MultiVecTraits MVT; + // + // *****Construct initial guess and random right-hand-sides ***** + // + if (numrhs != 1) { + X = rcp( new Epetra_MultiVector( A->Map(), numrhs ) ); + MVT::MvRandom( *X ); + B = rcp( new Epetra_MultiVector( A->Map(), numrhs ) ); + OPT::Apply( *A, *X, *B ); + MVT::MvInit( *X, 0.0 ); + } + // + // ************Construct preconditioner************* + // + ParameterList ifpackList; + + // allocates an 
IFPACK factory. No data is associated + // to this object (only method Create()). + Ifpack Factory; + + // create the preconditioner. For valid PrecType values, + // please check the documentation + std::string PrecType = "ICT"; // incomplete Cholesky + int OverlapLevel = 0; // must be >= 0. If Comm.NumProc() == 1, + // it is ignored. + + RCP Prec = Teuchos::rcp( Factory.Create(PrecType, &*A, OverlapLevel) ); + assert(Prec != Teuchos::null); + + // specify parameters for ICT + ifpackList.set("fact: drop tolerance", 1e-9); + ifpackList.set("fact: ict level-of-fill", 1.0); + // the combine mode is on the following: + // "Add", "Zero", "Insert", "InsertAdd", "Average", "AbsMax" + // Their meaning is as defined in file Epetra_CombineMode.h + ifpackList.set("schwarz: combine mode", "Add"); + // sets the parameters + IFPACK_CHK_ERR(Prec->SetParameters(ifpackList)); + + // initialize the preconditioner. At this point the matrix must + // have been FillComplete()'d, but actual values are ignored. + IFPACK_CHK_ERR(Prec->Initialize()); + + // Builds the preconditioners, by looking for the values of + // the matrix. + IFPACK_CHK_ERR(Prec->Compute()); + + // Create the Belos preconditioned operator from the Ifpack preconditioner. + // NOTE: This is necessary because Belos expects an operator to apply the + // preconditioner with Apply() NOT ApplyInverse(). 
+ RCP belosPrec = rcp( new Belos::EpetraPrecOp( Prec ) ); + + // + // *****Create parameter list for the Minres solver manager***** + // + const int NumGlobalElements = B->GlobalLength(); + if (maxiters == -1) + maxiters = NumGlobalElements - 1; // maximum number of iterations to run + // + ParameterList belosList; + belosList.set( "Maximum Iterations", maxiters ); // Maximum number of iterations allowed + belosList.set( "Convergence Tolerance", tol ); // Relative convergence tolerance requested + if (verbose) { + belosList.set( "Verbosity", Belos::Errors + Belos::Warnings + + Belos::TimingDetails + Belos::FinalSummary + Belos::StatusTestDetails ); + if (frequency > 0) + belosList.set( "Output Frequency", frequency ); + } + else + belosList.set( "Verbosity", Belos::Errors + Belos::Warnings ); + // + // *******Construct a preconditioned linear problem******** + // + RCP > problem + = rcp( new Belos::LinearProblem( A, X, B ) ); + if (leftprec) { + problem->setLeftPrec( belosPrec ); + } + else { + problem->setRightPrec( belosPrec ); + } + + bool set = problem->setProblem(); + if (set == false) { + if (proc_verbose) + std::cout << std::endl << "ERROR: Belos::LinearProblem failed to set up correctly!" << std::endl; + return -1; + } + + // Create an iterative solver manager. 
+ RCP< Belos::SolverManager > solver + = rcp( new Belos::MinresSolMgr(problem, rcp(&belosList,false)) ); + + // + // ******************************************************************* + // *************Start the Minres iteration************************* + // ******************************************************************* + if (proc_verbose) { + std::cout << std::endl << std::endl; + std::cout << "Dimension of matrix: " << NumGlobalElements << std::endl; + std::cout << "Number of right-hand sides: " << numrhs << std::endl; + std::cout << "Max number of Minres iterations: " << maxiters << std::endl; + std::cout << "Relative residual tolerance: " << tol << std::endl; + std::cout << std::endl; + } + // + // Perform solve + // + Belos::ReturnType ret = solver->solve(); + // + // Compute actual residuals. + // + bool badRes = false; + std::vector actual_resids( numrhs ); + std::vector rhs_norm( numrhs ); + Epetra_MultiVector resid(A->Map(), numrhs); + OPT::Apply( *A, *X, resid ); + MVT::MvAddMv( -1.0, resid, 1.0, *B, resid ); + MVT::MvNorm( resid, actual_resids ); + MVT::MvNorm( *B, rhs_norm ); + if (proc_verbose) { + std::cout<< "---------- Actual Residuals (normalized) ----------"< tol) badRes = true; + } + } + + success = ret==Belos::Converged && !badRes; + + if (success) { + if (proc_verbose) + std::cout << std::endl << "End Result: TEST PASSED" << std::endl; + } else { + if (proc_verbose) + std::cout << std::endl << "End Result: TEST FAILED" << std::endl; + } + } + TEUCHOS_STANDARD_CATCH_STATEMENTS(verbose, std::cerr, success); + + return ( success ? EXIT_SUCCESS : EXIT_FAILURE ); +} // end test_bl_pcg_hb.cpp + diff --git a/packages/belos/src/BelosMinresIter.hpp b/packages/belos/src/BelosMinresIter.hpp index 5cc4a452c6ce..4ca2de6e82ea 100644 --- a/packages/belos/src/BelosMinresIter.hpp +++ b/packages/belos/src/BelosMinresIter.hpp @@ -86,8 +86,8 @@ namespace Belos { /// Implementation of the preconditioned Minimal Residual Method /// (MINRES) iteration. 
This a bilinear form implementation, that /// uses inner products of the form to solve the preconditioned -/// linear system M^{-1}*A x = b. Thus, it is necessary that the -/// left preconditioner M is positive definite. +/// linear system. Thus, it is necessary that the left preconditioner +/// M is positive definite. /// /// \ingroup belos_solver_framework /// @@ -402,7 +402,6 @@ class MinresIter : virtual public MinresIteration { // Create convenience variables for zero, one. const ScalarType one = SCT::one(); - const MagnitudeType zero = SMT::zero(); const MagnitudeType m_zero = SMT::zero(); // Set up y and v for the first Lanczos vector v_1. @@ -417,6 +416,13 @@ class MinresIter : virtual public MinresIteration { if ( lp_->getLeftPrec() != Teuchos::null ) { lp_->applyLeftPrec( *newstate.Y, *Y_ ); + if ( lp_->getRightPrec() != Teuchos::null ) { + Teuchos::RCP tmp = MVT::CloneCopy( *Y_ ); + lp_->applyRightPrec( *tmp, *Y_ ); + } + } + else if ( lp_->getRightPrec() != Teuchos::null ) { + lp_->applyRightPrec( *newstate.Y, *Y_ ); } else { if (newstate.Y != Y_) { @@ -433,7 +439,7 @@ class MinresIter : virtual public MinresIteration { std::invalid_argument, "The preconditioner is not positive definite." ); - if( SCT::magnitude(beta1_(0,0)) == zero ) + if( SCT::magnitude(beta1_(0,0)) == m_zero ) { // X = 0 Teuchos::RCP cur_soln_vec = lp_->getCurrLHSVec(); @@ -470,7 +476,6 @@ class MinresIter : virtual public MinresIteration { Teuchos::SerialDenseMatrix alpha( 1, 1 ); Teuchos::SerialDenseMatrix beta( beta1_ ); phibar_ = Teuchos::ScalarTraits::magnitude( beta1_(0,0) ); - ScalarType shift = zero; // TODO Allow for proper shift. // Initialize a few variables. ScalarType oldBeta = zero; @@ -513,10 +518,6 @@ class MinresIter : virtual public MinresIteration { // Apply operator. 
lp_->applyOp (*V, *Y_); - // Apply shift - if (shift != zero) - MVT::MvAddMv (one, *Y_, -shift, *V, *Y_); - if (iter_ > 1) MVT::MvAddMv (one, *Y_, -beta(0,0)/oldBeta, *R1_, *Y_); @@ -533,12 +534,19 @@ class MinresIter : virtual public MinresIteration { R2_ = Y_; Y_ = tmpY; - // apply left preconditioner + // apply preconditioner if ( lp_->getLeftPrec() != Teuchos::null ) { lp_->applyLeftPrec( *R2_, *Y_ ); + if ( lp_->getRightPrec() != Teuchos::null ) { + Teuchos::RCP tmp = MVT::CloneCopy( *Y_ ); + lp_->applyRightPrec( *tmp, *Y_ ); + } + } + else if ( lp_->getRightPrec() != Teuchos::null ) { + lp_->applyRightPrec( *R2_, *Y_ ); } // else "y = r2" else { - MVT::MvAddMv( one, *R2_, zero, *R2_, *Y_ ); + MVT::Assign( *R2_, *Y_ ); } // Get new beta. @@ -594,8 +602,8 @@ class MinresIter : virtual public MinresIteration { // Update x: // x = x + phi*w; - //MVT::MvAddMv( one, *cur_soln_vec, phi, *W_, *cur_soln_vec ); - lp_->updateSolution( W_, true, phi ); + MVT::MvAddMv( one, *cur_soln_vec, phi, *W_, *cur_soln_vec ); + lp_->updateSolution(); } // end while (sTest_->checkStatus(this) != Passed) } diff --git a/packages/epetra/src/Epetra_Comm.h b/packages/epetra/src/Epetra_Comm.h index 65c9743f246c..9f482c494c04 100644 --- a/packages/epetra/src/Epetra_Comm.h +++ b/packages/epetra/src/Epetra_Comm.h @@ -460,6 +460,10 @@ class EPETRA_LIB_DLL_EXPORT Epetra_Comm { //! Create a distributor object. virtual Epetra_Distributor * CreateDistributor() const = 0; //! Create a directory object for the given Epetra_BlockMap. +// CreateDirectory is defined in Winbase.h as a macro! 
+#ifdef CreateDirectory +#undef CreateDirectory +#endif virtual Epetra_Directory * CreateDirectory(const Epetra_BlockMap & Map) const = 0; //@} diff --git a/packages/epetra/src/Epetra_CrsGraph.cpp b/packages/epetra/src/Epetra_CrsGraph.cpp index adc1e5c96781..cb460126cd0d 100644 --- a/packages/epetra/src/Epetra_CrsGraph.cpp +++ b/packages/epetra/src/Epetra_CrsGraph.cpp @@ -3065,3 +3065,13 @@ Epetra_CrsGraph& Epetra_CrsGraph::operator = (const Epetra_CrsGraph& Source) { return(*this); } + +//============================================================================= +Epetra_IntSerialDenseVector& Epetra_CrsGraph::ExpertExtractIndexOffset(){ + return CrsGraphData_->IndexOffset_; + } + +//============================================================================= +Epetra_IntSerialDenseVector& Epetra_CrsGraph::ExpertExtractIndices() { + return CrsGraphData_->data->All_Indices_; + } diff --git a/packages/epetra/src/Epetra_CrsGraph.h b/packages/epetra/src/Epetra_CrsGraph.h index 59febb14c9b8..96ac3cfc1638 100644 --- a/packages/epetra/src/Epetra_CrsGraph.h +++ b/packages/epetra/src/Epetra_CrsGraph.h @@ -1004,8 +1004,22 @@ class EPETRA_LIB_DLL_EXPORT Epetra_CrsGraph: public Epetra_DistObject { //! Returns a pointer to the CrsGraphData instance this CrsGraph uses. /*! (Intended for developer use only for testing purposes.) */ const Epetra_CrsGraphData* DataPtr() const {return(CrsGraphData_);} + - //! Forces FillComplete() to locally order ghostnodes associated with each remote processor in ascending order. + //! Returns a reference to the Epetra_IntSerialDenseVector used to hold the local IndexOffsets (CRS rowptr) + /*! + \warning This method is intended for experts only, its use may require user code modifications in future versions of Epetra. + */ + Epetra_IntSerialDenseVector& ExpertExtractIndexOffset(); + + //! Returns a reference to the Epetra_IntSerialDenseVector used to hold the local All_Indices (CRS colind) + /*! 
+ \warning This method is intended for experts only, its use may require user code modifications in future versions of Epetra. + */ + Epetra_IntSerialDenseVector& ExpertExtractIndices(); + + + //! Forces FillComplete() to locally order ghostnodes associated with each remote processor in ascending order. /*! To be compliant with AztecOO, FillComplete() already locally orders ghostnodes such that information received from processor k has a lower local numbering than information received from processor j if k is less than j. SortGhostsAssociatedWithEachProcessor(True) further diff --git a/packages/epetra/src/Epetra_MpiComm.h b/packages/epetra/src/Epetra_MpiComm.h index abcd5723e1c4..6a14cb5d6797 100644 --- a/packages/epetra/src/Epetra_MpiComm.h +++ b/packages/epetra/src/Epetra_MpiComm.h @@ -474,6 +474,10 @@ class EPETRA_LIB_DLL_EXPORT Epetra_MpiComm: public Epetra_Object, public virtual //! Create a distributor object. Epetra_Distributor * CreateDistributor() const; //! Create a directory object for the given Epetra_BlockMap. +// CreateDirectory is defined in Winbase.h as a macro! +#ifdef CreateDirectory +#undef CreateDirectory +#endif Epetra_Directory * CreateDirectory(const Epetra_BlockMap & Map) const; //@} diff --git a/packages/epetra/src/Epetra_SerialComm.h b/packages/epetra/src/Epetra_SerialComm.h index 1a5ca0a47d68..43f38d505bec 100644 --- a/packages/epetra/src/Epetra_SerialComm.h +++ b/packages/epetra/src/Epetra_SerialComm.h @@ -441,6 +441,10 @@ class EPETRA_LIB_DLL_EXPORT Epetra_SerialComm: public Epetra_Object, public virt //! Create a distributor object. Epetra_Distributor * CreateDistributor() const; //! Create a directory object for the given Epetra_BlockMap. +// CreateDirectory is defined in Winbase.h as a macro! 
+#ifdef CreateDirectory +#undef CreateDirectory +#endif Epetra_Directory * CreateDirectory(const Epetra_BlockMap & Map) const; //@} diff --git a/packages/ifpack/src/Ifpack_Hypre.cpp b/packages/ifpack/src/Ifpack_Hypre.cpp index 5c9a1cef6e92..9fe6a6d43624 100644 --- a/packages/ifpack/src/Ifpack_Hypre.cpp +++ b/packages/ifpack/src/Ifpack_Hypre.cpp @@ -53,6 +53,7 @@ #include "HYPRE_parcsr_ls.h" #include "krylov.h" #include "_hypre_parcsr_mv.h" +#include "_hypre_parcsr_ls.h" #include "_hypre_IJ_mv.h" #include "HYPRE_parcsr_mv.h" #include "HYPRE.h" @@ -792,6 +793,40 @@ int Ifpack_Hypre::Compute(){ IFPACK_CHK_ERR(PrecondSetupPtr_(Preconditioner_, ParMatrix_, ParX_, ParY_)); } + // Dump Hierarchy here for BoomerAMG Preconditioner + if(Dump_ && PrecondSolvePtr_ == &HYPRE_BoomerAMGSolve) { + hypre_ParAMGData *amg_data = (hypre_ParAMGData*) Preconditioner_; + hypre_ParCSRMatrix **A_array = hypre_ParAMGDataAArray(amg_data); + hypre_ParCSRMatrix **P_array = hypre_ParAMGDataPArray(amg_data); + HYPRE_Int **CF_marker_array = hypre_ParAMGDataCFMarkerArray(amg_data); + HYPRE_Int num_levels = hypre_ParAMGDataNumLevels(amg_data); + + char ofs[80]; + for(int k=0; k::initializePrec( // precTypeUpper is the upper-case version of preconditionerType. 
std::string precTypeUpper (preconditionerType); if (precTypeUpper.size () > 0) { - std::locale locale; for (size_t k = 0; k < precTypeUpper.size (); ++k) { - precTypeUpper[k] = std::toupper (precTypeUpper[k], locale); + precTypeUpper[k] = ::toupper(precTypeUpper[k]); } } diff --git a/packages/ifpack2/example/RelaxationWithEquilibration.cpp b/packages/ifpack2/example/RelaxationWithEquilibration.cpp index c045c27ed439..4eb218d7664b 100644 --- a/packages/ifpack2/example/RelaxationWithEquilibration.cpp +++ b/packages/ifpack2/example/RelaxationWithEquilibration.cpp @@ -913,26 +913,23 @@ densifyGatheredCrsMatrix (LO& errCode, { const LO numRows = LO (A.getRangeMap ()->getNodeNumElements ()); const LO numCols = LO (A.getDomainMap ()->getNodeNumElements ()); + using lids_type = typename Tpetra::CrsMatrix::local_inds_host_view_type; + using vals_type = typename Tpetra::CrsMatrix::values_host_view_type; using dense_matrix_type = HostDenseMatrix; dense_matrix_type A_dense (label, numRows, numCols); for (LO lclRow = 0; lclRow < numRows; ++lclRow) { - LO numEnt = 0; - const LO* lclColInds = nullptr; - const SC* vals = nullptr; - const LO curErrCode = A.getLocalRowView (lclRow, numEnt, vals, lclColInds); - if (errCode != 0) { - errCode = curErrCode; - } - else { - for (LO k = 0; k < numEnt; ++k) { - const LO lclCol = lclColInds[k]; - using impl_scalar_type = - typename Tpetra::CrsMatrix::impl_scalar_type; - A_dense(lclRow, lclCol) += impl_scalar_type (vals[k]); - } - } + lids_type lclColInds; + vals_type vals; + A.getLocalRowView (lclRow, lclColInds, vals); + LO numEnt = vals.size(); + for (LO k = 0; k < numEnt; ++k) { + const LO lclCol = lclColInds[k]; + using impl_scalar_type = + typename Tpetra::CrsMatrix::impl_scalar_type; + A_dense(lclRow, lclCol) += impl_scalar_type (vals[k]); + } } return A_dense; @@ -1125,8 +1122,8 @@ deepCopyFillCompleteCrsMatrix (const Tpetra::CrsMatrix& A) (! 
A.isFillComplete (), std::invalid_argument, "deepCopyFillCompleteCrsMatrix: Input matrix A must be fillComplete."); RCP A_copy (new crs_matrix_type (A.getCrsGraph ())); - auto A_copy_lcl = A_copy->getLocalMatrix (); - auto A_lcl = A.getLocalMatrix (); + auto A_copy_lcl = A_copy->getLocalMatrixDevice (); + auto A_lcl = A.getLocalMatrixDevice (); Kokkos::deep_copy (A_copy_lcl.values, A_lcl.values); A_copy->fillComplete (A.getDomainMap (), A.getRangeMap ()); return A_copy; diff --git a/packages/ifpack2/src/Ifpack2_BandedContainer_decl.hpp b/packages/ifpack2/src/Ifpack2_BandedContainer_decl.hpp index d36b348160f9..6e01346cd9a8 100644 --- a/packages/ifpack2/src/Ifpack2_BandedContainer_decl.hpp +++ b/packages/ifpack2/src/Ifpack2_BandedContainer_decl.hpp @@ -137,6 +137,7 @@ class BandedContainer using typename Container::HostView; using typename ContainerImpl::HostSubviewLocal; using typename ContainerImpl::ConstHostSubviewLocal; + using typename ContainerImpl::block_crs_matrix_type; using HostViewLocal = typename local_mv_type::dual_view_type::t_host; static_assert(std::is_sametranslateRowToCol(blockRows[j]); colToBlockOffset[localCol] = blockStart + j; } + + using h_inds_type = typename block_crs_matrix_type::local_inds_host_view_type; + using h_vals_type = typename block_crs_matrix_type::values_host_view_type; for(LO blockRow = 0; blockRow < LO(blockRows.size()); blockRow++) { //get a raw view of the whole block row - const LO* indices; - SC* values; - LO numEntries; + h_inds_type indices; + h_vals_type values; LO inputRow = this->blockRows_[blockStart + blockRow]; - this->inputBlockMatrix_->getLocalRowView(inputRow, indices, values, numEntries); + this->inputBlockMatrix_->getLocalRowView(inputRow, indices, values); + LO numEntries = (LO) indices.size(); for(LO k = 0; k < numEntries; k++) { LO colOffset = colToBlockOffset[indices[k]]; @@ -285,14 +288,16 @@ void BandedContainer::extract() LO localCol = this->translateRowToCol(blockRows[j]); colToBlockOffset[localCol] = 
blockStart + j; } + using h_inds_type = typename block_crs_matrix_type::local_inds_host_view_type; + using h_vals_type = typename block_crs_matrix_type::values_host_view_type; for(LO blockRow = 0; blockRow < LO(blockRows.size()); blockRow++) { //get a raw view of the whole block row - const LO* indices; - SC* values; - LO numEntries; + h_inds_type indices; + h_vals_type values; LO inputRow = this->blockRows_[blockStart + blockRow]; - this->inputBlockMatrix_->getLocalRowView(inputRow, indices, values, numEntries); + this->inputBlockMatrix_->getLocalRowView(inputRow, indices, values); + LO numEntries = (LO) indices.size(); for(LO k = 0; k < numEntries; k++) { LO colOffset = colToBlockOffset[indices[k]]; diff --git a/packages/ifpack2/src/Ifpack2_BlockTriDiContainer_impl.hpp b/packages/ifpack2/src/Ifpack2_BlockTriDiContainer_impl.hpp index aad796ef8d8b..4e4e75aadaac 100644 --- a/packages/ifpack2/src/Ifpack2_BlockTriDiContainer_impl.hpp +++ b/packages/ifpack2/src/Ifpack2_BlockTriDiContainer_impl.hpp @@ -363,7 +363,7 @@ namespace Ifpack2 { typedef Tpetra::BlockCrsMatrix tpetra_block_crs_matrix_type; typedef typename tpetra_block_crs_matrix_type::little_block_type tpetra_block_access_view_type; typedef Tpetra::BlockMultiVector tpetra_block_multivector_type; - typedef typename tpetra_block_crs_matrix_type::crs_graph_type::local_graph_type local_crs_graph_type; + typedef typename tpetra_block_crs_matrix_type::crs_graph_type::local_graph_device_type local_crs_graph_type; /// /// simd vectorization @@ -1563,10 +1563,10 @@ namespace Ifpack2 { // construct the D and R graphs in A = D + R. 
{ - const auto& local_graph = g.getLocalGraph(); - const auto& local_graph_rowptr = local_graph.row_map; + const auto local_graph = g.getLocalGraphHost(); + const auto local_graph_rowptr = local_graph.row_map; TEUCHOS_ASSERT(local_graph_rowptr.size() == static_cast(nrows + 1)); - const auto& local_graph_colidx = local_graph.entries; + const auto local_graph_colidx = local_graph.entries; //assume no overlap. @@ -1783,8 +1783,8 @@ namespace Ifpack2 { } // Allocate or view values. - amd.tpetra_values = (const_cast(A.get())-> - template getValues()); + amd.tpetra_values = (const_cast(A.get())->getValuesDeviceNonConst()); + } } } @@ -1914,8 +1914,8 @@ namespace Ifpack2 { packptr(interf_.packptr), max_partsz(interf_.max_partsz), // block crs matrix - A_rowptr(A_->getCrsGraph().getLocalGraph().row_map), - A_values(const_cast(A_.get())->template getValues()), + A_rowptr(A_->getCrsGraph().getLocalGraphDevice().row_map), + A_values(const_cast(A_.get())->getValuesDeviceNonConst()), // block tridiags pack_td_ptr(btdm_.pack_td_ptr), flat_td_ptr(btdm_.flat_td_ptr), @@ -3822,7 +3822,7 @@ namespace Ifpack2 { const local_ordinal_type_1d_view dummy_local_ordinal_type_1d_view; ComputeResidualVector - compute_residual_vector(amd, A->getCrsGraph().getLocalGraph(), blocksize, interf, + compute_residual_vector(amd, A->getCrsGraph().getLocalGraphDevice(), blocksize, interf, is_async_importer_active ? 
async_importer->dm2cm : dummy_local_ordinal_type_1d_view); // norm manager workspace resize diff --git a/packages/ifpack2/src/Ifpack2_Container_decl.hpp b/packages/ifpack2/src/Ifpack2_Container_decl.hpp index 82063ab27de8..f43f89a6cf07 100644 --- a/packages/ifpack2/src/Ifpack2_Container_decl.hpp +++ b/packages/ifpack2/src/Ifpack2_Container_decl.hpp @@ -537,25 +537,33 @@ namespace Details { { using SC = Scalar; using LO = LocalOrdinal; + + using block_crs_matrix_type = Tpetra::BlockCrsMatrix; + + using h_inds_type = typename block_crs_matrix_type::local_inds_host_view_type; + using h_vals_type = typename block_crs_matrix_type::values_host_view_type; //! Constructor for row views (preferred) - StridedRowView(const SC* vals_, const LO* inds_, int blockSize_, size_t nnz_); + StridedRowView(h_vals_type vals_, h_inds_type inds_, int blockSize_, size_t nnz_); + + //! Constructor for row views + // StridedRowView(const SC* vals_, const LO* inds_, int blockSize_, size_t nnz_); //! Constructor for deep copy (fallback, if matrix doesn't support row views) StridedRowView(Teuchos::Array& vals_, Teuchos::Array& inds_); - + SC val(size_t i) const; LO ind(size_t i) const; size_t size() const; private: - const SC* vals; - const LO* inds; - int blockSize; - size_t nnz; - //These arrays are only used if the inputMatrix_ doesn't support row views. - Teuchos::Array valsCopy; - Teuchos::Array indsCopy; + h_vals_type vals; + h_inds_type inds; + int blockSize; + size_t nnz; + //These arrays are only used if the inputMatrix_ doesn't support row views. 
+ Teuchos::Array valsCopy; + Teuchos::Array indsCopy; }; } // namespace Details diff --git a/packages/ifpack2/src/Ifpack2_Container_def.hpp b/packages/ifpack2/src/Ifpack2_Container_def.hpp index b5c9b220a7af..04f693422b8c 100644 --- a/packages/ifpack2/src/Ifpack2_Container_def.hpp +++ b/packages/ifpack2/src/Ifpack2_Container_def.hpp @@ -264,13 +264,15 @@ void ContainerImpl::DoGSBlock( //Use efficient blocked version ArrayView blockRows = this->getBlockRows(i); const size_t localNumRows = this->blockSizes_[i]; + using inds_type = typename block_crs_matrix_type::local_inds_host_view_type; + using vals_type = typename block_crs_matrix_type::values_host_view_type; for(size_t j = 0; j < localNumRows; j++) { LO row = blockRows[j]; // Containers_[i]->ID (j); - LO numEntries; - SC* values; - const LO* colinds; - this->inputBlockMatrix_->getLocalRowView(row, colinds, values, numEntries); + vals_type values; + inds_type colinds; + this->inputBlockMatrix_->getLocalRowView(row, colinds, values); + LO numEntries = (LO) colinds.size(); for(size_t m = 0; m < numVecs; m++) { for (int localR = 0; localR < this->bcrsBlockSize_; localR++) @@ -318,8 +320,8 @@ void ContainerImpl::DoGSBlock( //But, can only do this if the matrix is accessible directly from host, since it's not a DualView using container_exec_space = typename ContainerImpl::crs_matrix_type::execution_space; container_exec_space().fence(); - auto localA = this->inputCrsMatrix_->getLocalMatrix(); - using size_type = typename crs_matrix_type::local_matrix_type::size_type; + auto localA = this->inputCrsMatrix_->getLocalMatrixHost(); + using size_type = typename crs_matrix_type::local_matrix_host_type::size_type; const auto& rowmap = localA.graph.row_map; const auto& entries = localA.graph.entries; const auto& values = localA.values; @@ -844,33 +846,47 @@ Details::StridedRowView< typename ContainerImpl::NO> ContainerImpl:: getInputRowView(LO row) const -{ +{ + + typedef typename MatrixType::nonconst_local_inds_host_view_type 
nonconst_local_inds_host_view_type; + typedef typename MatrixType::nonconst_values_host_view_type nonconst_values_host_view_type; + + typedef typename MatrixType::local_inds_host_view_type local_inds_host_view_type; + typedef typename MatrixType::values_host_view_type values_host_view_type; + using IST = typename row_matrix_type::impl_scalar_type; + if(this->hasBlockCrs_) { - const LO* colinds; - SC* values; - LO numEntries; - this->inputBlockMatrix_->getLocalRowView(row / this->bcrsBlockSize_, colinds, values, numEntries); - return StridedRowView(values + row % this->bcrsBlockSize_, colinds, this->bcrsBlockSize_, numEntries * this->bcrsBlockSize_); + using h_inds_type = typename block_crs_matrix_type::local_inds_host_view_type; + using h_vals_type = typename block_crs_matrix_type::values_host_view_type; + h_inds_type colinds; + h_vals_type values; + this->inputBlockMatrix_->getLocalRowView(row / this->bcrsBlockSize_, colinds, values); + size_t numEntries = colinds.size(); + // CMS: Can't say I understand what this really does + //return StridedRowView(values + row % this->bcrsBlockSize_, colinds, this->bcrsBlockSize_, numEntries * this->bcrsBlockSize_); + h_vals_type subvals = Kokkos::subview(values,std::pair(row % this->bcrsBlockSize_,values.size())); + return StridedRowView(subvals, colinds, this->bcrsBlockSize_, numEntries * this->bcrsBlockSize_); } else if(!this->inputMatrix_->supportsRowViews()) { size_t maxEntries = this->inputMatrix_->getNodeMaxNumRowEntries(); - Teuchos::Array indsCopy(maxEntries); - Teuchos::Array valsCopy(maxEntries); + Teuchos::Array inds(maxEntries); + Teuchos::Array vals(maxEntries); + nonconst_local_inds_host_view_type inds_v(inds.data(),maxEntries); + nonconst_values_host_view_type vals_v(reinterpret_cast(vals.data()),maxEntries); size_t numEntries; - this->inputMatrix_->getLocalRowCopy(row, indsCopy, valsCopy, numEntries); - indsCopy.resize(numEntries); - valsCopy.resize(numEntries); - return StridedRowView(valsCopy, indsCopy); + 
this->inputMatrix_->getLocalRowCopy(row, inds_v, vals_v, numEntries); + vals.resize(numEntries); inds.resize(numEntries); + return StridedRowView(vals, inds); } else { - const LO* colinds; - const SC* values; - LO numEntries; - this->inputMatrix_->getLocalRowViewRaw(row, numEntries, colinds, values); - return StridedRowView(values, colinds, 1, numEntries); + // CMS - This is dangerous and might not work. + local_inds_host_view_type colinds; + values_host_view_type values; + this->inputMatrix_->getLocalRowView(row, colinds, values); + return StridedRowView(values, colinds, 1, colinds.size()); } } @@ -890,14 +906,14 @@ namespace Details { //Implementation of Ifpack2::Details::StridedRowView template StridedRowView:: -StridedRowView(const SC* vals_, const LO* inds_, int blockSize_, size_t nnz_) +StridedRowView(h_vals_type vals_, h_inds_type inds_, int blockSize_, size_t nnz_) : vals(vals_), inds(inds_), blockSize(blockSize_), nnz(nnz_) {} template StridedRowView:: StridedRowView(Teuchos::Array& vals_, Teuchos::Array& inds_) - : vals(nullptr), inds(nullptr), blockSize(1), nnz(vals_.size()) + : vals(), inds(), blockSize(1), nnz(vals_.size()) { valsCopy.swap(vals_); indsCopy.swap(inds_); @@ -911,7 +927,7 @@ val(size_t i) const TEUCHOS_TEST_FOR_EXCEPTION(i >= nnz, std::runtime_error, "Out-of-bounds access into Ifpack2::Container::StridedRowView"); #endif - if(vals) + if(vals.size() > 0) { if(blockSize == 1) return vals[i]; @@ -931,7 +947,7 @@ ind(size_t i) const "Out-of-bounds access into Ifpack2::Container::StridedRowView"); #endif //inds is smaller than vals by a factor of the block size (dofs/node) - if(inds) + if(inds.size() > 0) { if(blockSize == 1) return inds[i]; diff --git a/packages/ifpack2/src/Ifpack2_DenseContainer_def.hpp b/packages/ifpack2/src/Ifpack2_DenseContainer_def.hpp index 058ffac13387..3c96bd09abc4 100644 --- a/packages/ifpack2/src/Ifpack2_DenseContainer_def.hpp +++ b/packages/ifpack2/src/Ifpack2_DenseContainer_def.hpp @@ -160,14 +160,16 @@ void 
DenseContainer::extract() LO localCol = this->translateRowToCol(blockRows[j]); colToBlockOffset[localCol] = blockStart + j; } + using h_inds_type = typename block_crs_matrix_type::local_inds_host_view_type; + using h_vals_type = typename block_crs_matrix_type::values_host_view_type; for(LO blockRow = 0; blockRow < LO(blockRows.size()); blockRow++) { //get a raw view of the whole block row - const LO* indices; - SC* values; - LO numEntries; + h_inds_type indices; + h_vals_type values; LO inputRow = this->blockRows_[blockStart + blockRow]; - this->inputBlockMatrix_->getLocalRowView(inputRow, indices, values, numEntries); + this->inputBlockMatrix_->getLocalRowView(inputRow, indices, values); + LO numEntries = (LO) indices.size(); for(LO k = 0; k < numEntries; k++) { LO colOffset = colToBlockOffset[indices[k]]; diff --git a/packages/ifpack2/src/Ifpack2_Details_ChebyshevKernel_def.hpp b/packages/ifpack2/src/Ifpack2_Details_ChebyshevKernel_def.hpp index c058c876c385..7ec68c63a709 100644 --- a/packages/ifpack2/src/Ifpack2_Details_ChebyshevKernel_def.hpp +++ b/packages/ifpack2/src/Ifpack2_Details_ChebyshevKernel_def.hpp @@ -439,7 +439,7 @@ fusedCase (vector_type& W, using Impl::chebyshev_kernel_vector; using STS = Teuchos::ScalarTraits; - auto A_lcl = A.getLocalMatrix (); + auto A_lcl = A.getLocalMatrixDevice (); //D_inv, B, X and W are all Vectors, so it's safe to take the first column only auto Dinv_lcl = Kokkos::subview(D_inv.getLocalViewDevice(Tpetra::Access::ReadOnly), Kokkos::ALL(), 0); auto B_lcl = Kokkos::subview(B.getLocalViewDevice(Tpetra::Access::ReadOnly), Kokkos::ALL(), 0); diff --git a/packages/ifpack2/src/Ifpack2_Details_DenseSolver_decl.hpp b/packages/ifpack2/src/Ifpack2_Details_DenseSolver_decl.hpp index 03a05e654712..05190e78916a 100644 --- a/packages/ifpack2/src/Ifpack2_Details_DenseSolver_decl.hpp +++ b/packages/ifpack2/src/Ifpack2_Details_DenseSolver_decl.hpp @@ -125,6 +125,12 @@ class DenseSolver : static_assert(std::is_same::value, 
"Ifpack2::Details::DenseSolver: Please use MatrixType = Tpetra::RowMatrix."); + typedef typename row_matrix_type::nonconst_global_inds_host_view_type nonconst_global_inds_host_view_type; + typedef typename row_matrix_type::nonconst_local_inds_host_view_type nonconst_local_inds_host_view_type; + typedef typename row_matrix_type::nonconst_values_host_view_type nonconst_values_host_view_type; + + + //! Specialization of Tpetra::Map used by this class. typedef Tpetra::Map map_type; diff --git a/packages/ifpack2/src/Ifpack2_Details_DenseSolver_def.hpp b/packages/ifpack2/src/Ifpack2_Details_DenseSolver_def.hpp index 9bfea906cf1c..460529cb9097 100644 --- a/packages/ifpack2/src/Ifpack2_Details_DenseSolver_def.hpp +++ b/packages/ifpack2/src/Ifpack2_Details_DenseSolver_def.hpp @@ -642,8 +642,8 @@ extract (Teuchos::SerialDenseMatrix& A_local_dense, // each row of A_local. const size_type maxNumRowEntries = static_cast (A_local.getNodeMaxNumRowEntries ()); - Array localIndices (maxNumRowEntries); - Array values (maxNumRowEntries); + nonconst_local_inds_host_view_type localIndices ("localIndices",maxNumRowEntries); + nonconst_values_host_view_type values ("values",maxNumRowEntries); const LO numLocalRows = static_cast (rowMap.getNodeNumElements ()); const LO minLocalRow = rowMap.getMinLocalIndex (); @@ -661,8 +661,8 @@ extract (Teuchos::SerialDenseMatrix& A_local_dense, static_cast (A_local.getNumEntriesInLocalRow (localRow)); size_t numEntriesOut = 0; // ignored A_local.getLocalRowCopy (localRow, - localIndices (0, numEntriesInRow), - values (0, numEntriesInRow), + localIndices, + values, numEntriesOut); for (LO k = 0; k < numEntriesInRow; ++k) { const LO localCol = localIndices[k]; diff --git a/packages/ifpack2/src/Ifpack2_Details_Factory_def.hpp b/packages/ifpack2/src/Ifpack2_Details_Factory_def.hpp index b62d22c58b8b..35aef8c8344c 100644 --- a/packages/ifpack2/src/Ifpack2_Details_Factory_def.hpp +++ b/packages/ifpack2/src/Ifpack2_Details_Factory_def.hpp @@ -122,9 +122,8 @@ 
create (const std::string& precType, // precTypeUpper is the upper-case version of precType. std::string precTypeUpper (precType); if (precTypeUpper.size () > 0) { - std::locale locale; for (size_t k = 0; k < precTypeUpper.size (); ++k) { - precTypeUpper[k] = std::toupper (precTypeUpper[k], locale); + precTypeUpper[k] = ::toupper(precTypeUpper[k]); } } diff --git a/packages/ifpack2/src/Ifpack2_Details_GaussSeidel.hpp b/packages/ifpack2/src/Ifpack2_Details_GaussSeidel.hpp index 137323cd1b6b..23c2b71d4134 100644 --- a/packages/ifpack2/src/Ifpack2_Details_GaussSeidel.hpp +++ b/packages/ifpack2/src/Ifpack2_Details_GaussSeidel.hpp @@ -51,14 +51,14 @@ namespace Details using crs_matrix_type = Tpetra::CrsMatrix; using bcrs_matrix_type = Tpetra::BlockCrsMatrix; using row_matrix_type = Tpetra::RowMatrix; - using local_matrix_type = typename crs_matrix_type::local_matrix_type; + using local_matrix_device_type = typename crs_matrix_type::local_matrix_device_type; using vector_type = Tpetra::Vector; using multivector_type = Tpetra::MultiVector; using block_multivector_type = Tpetra::BlockMultiVector; - using mem_space_t = typename local_matrix_type::memory_space; - using rowmap_t = typename local_matrix_type::row_map_type::HostMirror; - using entries_t = typename local_matrix_type::index_type::HostMirror; - using values_t = typename local_matrix_type::values_type::HostMirror; + using mem_space_t = typename local_matrix_device_type::memory_space; + using rowmap_t = typename local_matrix_device_type::row_map_type::HostMirror; + using entries_t = typename local_matrix_device_type::index_type::HostMirror; + using values_t = typename local_matrix_device_type::values_type::HostMirror; using Offset = typename rowmap_t::non_const_value_type; using IST = typename crs_matrix_type::impl_scalar_type; using KAT = Kokkos::ArithTraits; @@ -66,6 +66,10 @@ namespace Details using InverseBlocks = Kokkos::View; using InverseBlocksHost = typename InverseBlocks::HostMirror; + typedef typename 
crs_matrix_type::nonconst_global_inds_host_view_type nonconst_global_inds_host_view_type; + typedef typename crs_matrix_type::nonconst_local_inds_host_view_type nonconst_local_inds_host_view_type; + typedef typename crs_matrix_type::nonconst_values_host_view_type nonconst_values_host_view_type; + //Setup for CrsMatrix GaussSeidel(const crs_matrix_type& A, Teuchos::RCP& inverseDiagVec_, Teuchos::ArrayRCP& applyRows_, Scalar omega_) { @@ -74,7 +78,7 @@ namespace Details applyRows = applyRows_; blockSize = 1; omega = omega_; - auto Alocal = A.getLocalMatrix(); + auto Alocal = A.getLocalMatrixDevice(); Arowmap = Kokkos::create_mirror_view(Alocal.graph.row_map); Aentries = Kokkos::create_mirror_view(Alocal.graph.entries); Avalues = Kokkos::create_mirror_view(Alocal.values); @@ -95,8 +99,8 @@ namespace Details Aentries = entries_t(Kokkos::ViewAllocateWithoutInitializing("Aentries"), A.getNodeNumEntries()); Avalues = values_t(Kokkos::ViewAllocateWithoutInitializing("Avalues"), A.getNodeNumEntries()); size_t maxDegree = A.getNodeMaxNumRowEntries(); - Teuchos::Array rowValues(maxDegree); - Teuchos::Array rowEntries(maxDegree); + nonconst_values_host_view_type rowValues("rowValues",maxDegree); + nonconst_local_inds_host_view_type rowEntries("rowEntries",maxDegree); size_t accum = 0; for(LO i = 0; i <= numRows; i++) { @@ -104,7 +108,7 @@ namespace Details if(i == numRows) break; size_t degree; - A.getLocalRowCopy(i, rowEntries(), rowValues(), degree); + A.getLocalRowCopy(i, rowEntries, rowValues, degree); accum += degree; size_t rowBegin = Arowmap(i); for(size_t j = 0; j < degree; j++) @@ -123,9 +127,9 @@ namespace Details Kokkos::deep_copy(inverseBlockDiag, inverseBlockDiag_); applyRows = applyRows_; omega = omega_; - auto AlocalGraph = A.getCrsGraph().getLocalGraph(); + auto AlocalGraph = A.getCrsGraph().getLocalGraphDevice(); //A.sync_host(); //note: this only syncs values, not graph - Avalues = A.getValuesHost(); + Avalues = A.getValuesHostNonConst(); Arowmap = 
Kokkos::create_mirror_view(AlocalGraph.row_map); Aentries = Kokkos::create_mirror_view(AlocalGraph.entries); Kokkos::deep_copy(Arowmap, AlocalGraph.row_map); diff --git a/packages/ifpack2/src/Ifpack2_Details_InverseDiagonalKernel_def.hpp b/packages/ifpack2/src/Ifpack2_Details_InverseDiagonalKernel_def.hpp index 146f6635a74f..a6775a308b39 100644 --- a/packages/ifpack2/src/Ifpack2_Details_InverseDiagonalKernel_def.hpp +++ b/packages/ifpack2/src/Ifpack2_Details_InverseDiagonalKernel_def.hpp @@ -197,7 +197,8 @@ compute (vector_type& D_inv, // Canonicalize template arguments to avoid redundant instantiations. using d_type = typename vector_type::dual_view_type::t_dev; - using matrix_type = typename crs_matrix_type::local_matrix_type; + // using h_matrix_type = typename crs_matrix_type::local_matrix_host_type; + using d_matrix_type = typename crs_matrix_type::local_matrix_device_type; const char kernel_label[] = "inverse_diagonal_kernel"; using execution_space = typename NT::execution_space; @@ -206,7 +207,7 @@ compute (vector_type& D_inv, auto policy = range_type(0, lclNumRows); d_type d = D_inv.getLocalViewDevice(Tpetra::Access::OverwriteAll); - matrix_type a = A_crs_->getLocalMatrix(); + d_matrix_type a = A_crs_->getLocalMatrixDevice(); if (do_l1) { constexpr bool do_l1_template = true; @@ -214,7 +215,7 @@ compute (vector_type& D_inv, constexpr bool fix_tiny_template = true; using functor_type = Impl::InverseDiagonalWithExtraction; @@ -224,7 +225,7 @@ compute (vector_type& D_inv, constexpr bool fix_tiny_template = false; using functor_type = Impl::InverseDiagonalWithExtraction; @@ -237,7 +238,7 @@ compute (vector_type& D_inv, constexpr bool fix_tiny_template = true; using functor_type = Impl::InverseDiagonalWithExtraction; @@ -247,7 +248,7 @@ compute (vector_type& D_inv, constexpr bool fix_tiny_template = false; using functor_type = Impl::InverseDiagonalWithExtraction; diff --git a/packages/ifpack2/src/Ifpack2_Details_OverlappingRowGraph_decl.hpp 
b/packages/ifpack2/src/Ifpack2_Details_OverlappingRowGraph_decl.hpp index f6386c941890..91550d9b075f 100644 --- a/packages/ifpack2/src/Ifpack2_Details_OverlappingRowGraph_decl.hpp +++ b/packages/ifpack2/src/Ifpack2_Details_OverlappingRowGraph_decl.hpp @@ -70,11 +70,16 @@ class OverlappingRowGraph : typedef typename GraphType::local_ordinal_type local_ordinal_type; typedef typename GraphType::global_ordinal_type global_ordinal_type; typedef typename GraphType::node_type node_type; - + typedef typename GraphType::local_inds_host_view_type local_inds_host_view_type; + typedef typename GraphType::nonconst_local_inds_host_view_type nonconst_local_inds_host_view_type; + typedef typename GraphType::global_inds_host_view_type global_inds_host_view_type; + typedef typename GraphType::nonconst_global_inds_host_view_type nonconst_global_inds_host_view_type; + typedef Tpetra::Export export_type; typedef Tpetra::Import import_type; typedef Tpetra::Map map_type; typedef Tpetra::RowGraph row_graph_type; + //@} //! \name Constructors and destructor //@{ @@ -225,10 +230,18 @@ class OverlappingRowGraph : /// \c globalRow does not belong to this process, then \c indices is /// not modified and \c numIndices is set to /// Teuchos::OrdinalTraits::invalid() on output. + virtual void + getGlobalRowCopy (global_ordinal_type globalRow, + nonconst_global_inds_host_view_type& gblColInds, + size_t& numIndices) const; + +#ifdef TPETRA_ENABLE_DEPRECATED_CODE virtual void getGlobalRowCopy (global_ordinal_type globalRow, const Teuchos::ArrayView& indices, size_t& numIndices) const; +#endif + /// \brief Copy out a list of local column indices in the given /// local row that are owned by the calling process. @@ -247,10 +260,79 @@ class OverlappingRowGraph : /// localRow does not belong to this process, then /// indices is not modified and \c numIndices is set to /// Teuchos::OrdinalTraits::invalid() on output. 
+ virtual void + getLocalRowCopy (local_ordinal_type localRow, + nonconst_local_inds_host_view_type& gblColInds, + size_t& numIndices) const; +#ifdef TPETRA_ENABLE_DEPRECATED_CODE virtual void getLocalRowCopy (local_ordinal_type localRow, const Teuchos::ArrayView& indices, size_t& numIndices) const; +#endif + + /// \brief Get a constant, nonpersisting, locally indexed view of + /// the given row of the graph. + /// + /// The returned views of the column indices are not guaranteed to + /// persist beyond the lifetime of this. Furthermore, + /// some RowGraph implementations allow changing the values, or + /// the indices and values. Any such changes invalidate the + /// returned views. + /// + /// This method only gets the entries in the given row that are + /// stored on the calling process. Note that if the graph has an + /// overlapping row Map, it is possible that the calling process + /// does not store all the entries in that row. + /// + /// \pre isLocallyIndexed () && supportsRowViews () + /// \post indices.size () == getNumEntriesInGlobalRow (LocalRow) + /// + /// \param lclRow [in] Local index of the row. + /// \param lclColInds [out] Local indices of the columns in the + /// row. If the given row is not a valid row index on the + /// calling process, then the result has no entries (its size is + /// zero). + /// + /// Subclasses are expected to implement this method. We would + /// have made this method pure virtual, but that would have broken + /// backwards compatibility, since we added the method at least + /// one major release after introducing this class. 
+ virtual void + getLocalRowView (const local_ordinal_type lclRow, + local_inds_host_view_type & lclColInds) const; +#ifdef TPETRA_ENABLE_DEPRECATED_CODE + virtual void + getLocalRowView (const local_ordinal_type lclRow, + Teuchos::ArrayView& lclColInds) const; +#endif // TPETRA_ENABLE_DEPRECATED_CODE + + /// \brief Get a const, non-persisting view of the given global + /// row's global column indices, as a Teuchos::ArrayView. + /// + /// \param gblRow [in] Global index of the row. + /// \param gblColInds [out] Global column indices in the row. If + /// the given row is not a valid row index on the calling + /// process, then the result has no entries (its size is zero). + /// + /// \pre ! isLocallyIndexed() + /// \post gblColInds.size() == getNumEntriesInGlobalRow(gblRow) + /// + /// Subclasses are expected to implement this method. We would + /// have made this method pure virtual, but that would have broken + /// backwards compatibility, since we added the method at least + /// one major release after introducing this class. + virtual void + getGlobalRowView (const global_ordinal_type gblRow, + global_inds_host_view_type& gblColInds) const; + +#ifdef TPETRA_ENABLE_DEPRECATED_CODE + virtual void + getGlobalRowView (const global_ordinal_type gblRow, + Teuchos::ArrayView& gblColInds) const; +#endif + + //@} private: //! 
\name Internal data diff --git a/packages/ifpack2/src/Ifpack2_Details_OverlappingRowGraph_def.hpp b/packages/ifpack2/src/Ifpack2_Details_OverlappingRowGraph_def.hpp index 39126127244a..d9367503f9b9 100644 --- a/packages/ifpack2/src/Ifpack2_Details_OverlappingRowGraph_def.hpp +++ b/packages/ifpack2/src/Ifpack2_Details_OverlappingRowGraph_def.hpp @@ -259,8 +259,27 @@ bool OverlappingRowGraph::isFillComplete () const { return true; } - + +template +void +OverlappingRowGraph:: + getGlobalRowCopy (global_ordinal_type globalRow, + nonconst_global_inds_host_view_type& indices, + size_t& numIndices) const +{ + const local_ordinal_type localRow = rowMap_->getLocalElement (globalRow); + if (localRow == Teuchos::OrdinalTraits::invalid ()) { + numIndices = Teuchos::OrdinalTraits::invalid (); + } else { + if (Teuchos::as (localRow) < nonoverlappingGraph_->getNodeNumRows ()) { + nonoverlappingGraph_->getGlobalRowCopy (globalRow, indices, numIndices); + } else { + overlappingGraph_->getGlobalRowCopy (globalRow, indices, numIndices); + } + } +} +#ifdef TPETRA_ENABLE_DEPRECATED_CODE template void OverlappingRowGraph:: @@ -279,13 +298,13 @@ getGlobalRowCopy (global_ordinal_type globalRow, } } } - +#endif template void OverlappingRowGraph:: -getLocalRowCopy (local_ordinal_type localRow, - const Teuchos::ArrayView& indices, +getLocalRowCopy (local_ordinal_type localRow, + nonconst_local_inds_host_view_type& indices, size_t& numIndices) const { using Teuchos::as; @@ -298,7 +317,101 @@ getLocalRowCopy (local_ordinal_type localRow, overlappingGraph_->getLocalRowCopy (localRowOffset, indices, numIndices); } } - + + +#ifdef TPETRA_ENABLE_DEPRECATED_CODE +template +void +OverlappingRowGraph:: +getLocalRowCopy (local_ordinal_type localRow, + const Teuchos::ArrayView& indices, + size_t& numIndices) const +{ + using Teuchos::as; + const size_t numMyRowsA = nonoverlappingGraph_->getNodeNumRows (); + if (as (localRow) < numMyRowsA) { + nonoverlappingGraph_->getLocalRowCopy (localRow, indices, 
numIndices); + } else { + const local_ordinal_type localRowOffset = + localRow - as (numMyRowsA); + overlappingGraph_->getLocalRowCopy (localRowOffset, indices, numIndices); + } +} +#endif + +template +void +OverlappingRowGraph:: +getGlobalRowView (global_ordinal_type GlobalRow, + global_inds_host_view_type &indices) const { + const local_ordinal_type LocalRow = rowMap_->getLocalElement (GlobalRow); + if (LocalRow == Teuchos::OrdinalTraits::invalid()) { + indices = global_inds_host_view_type(); + } else { + if (Teuchos::as (LocalRow) < nonoverlappingGraph_->getNodeNumRows ()) { + nonoverlappingGraph_->getGlobalRowView (GlobalRow, indices); + } else { + overlappingGraph_->getGlobalRowView (GlobalRow, indices); + } + } +} + +#ifdef TPETRA_ENABLE_DEPRECATED_CODE +template +void +OverlappingRowGraph:: +getGlobalRowView (global_ordinal_type GlobalRow, + Teuchos::ArrayView& indices) const +{ + const local_ordinal_type LocalRow = rowMap_->getLocalElement (GlobalRow); + if (LocalRow == Teuchos::OrdinalTraits::invalid()) { + indices = Teuchos::null; + } else { + if (Teuchos::as (LocalRow) < nonoverlappingGraph_->getNodeNumRows ()) { + nonoverlappingGraph_->getGlobalRowView (GlobalRow, indices); + } else { + overlappingGraph_->getGlobalRowView (GlobalRow, indices); + } + } +} +#endif + + +template +void +OverlappingRowGraph:: + getLocalRowView (local_ordinal_type LocalRow, + local_inds_host_view_type & indices) const { + using Teuchos::as; + const size_t numMyRowsA = nonoverlappingGraph_->getNodeNumRows (); + if (as (LocalRow) < numMyRowsA) { + nonoverlappingGraph_->getLocalRowView (LocalRow, indices); + } else { + overlappingGraph_->getLocalRowView (LocalRow - as (numMyRowsA), + indices); + } + +} + +#ifdef TPETRA_ENABLE_DEPRECATED_CODE +template +void +OverlappingRowGraph:: +getLocalRowView (local_ordinal_type LocalRow, + Teuchos::ArrayView& indices) const +{ + using Teuchos::as; + const size_t numMyRowsA = nonoverlappingGraph_->getNodeNumRows (); + if (as (LocalRow) < 
numMyRowsA) { + nonoverlappingGraph_->getLocalRowView (LocalRow, indices); + } else { + overlappingGraph_->getLocalRowView (LocalRow - as (numMyRowsA), + indices); + } +} +#endif + + } // namespace Details } // namespace Ifpack2 diff --git a/packages/ifpack2/src/Ifpack2_Details_RowGraph.hpp b/packages/ifpack2/src/Ifpack2_Details_RowGraph.hpp index f9c0aa0866e6..862d828acc46 100644 --- a/packages/ifpack2/src/Ifpack2_Details_RowGraph.hpp +++ b/packages/ifpack2/src/Ifpack2_Details_RowGraph.hpp @@ -70,7 +70,10 @@ class RowGraph : typedef typename GraphType::local_ordinal_type local_ordinal_type; typedef typename GraphType::global_ordinal_type global_ordinal_type; typedef typename GraphType::node_type node_type; - + typedef typename GraphType::local_inds_host_view_type local_inds_host_view_type; + typedef typename GraphType::nonconst_local_inds_host_view_type nonconst_local_inds_host_view_type; + typedef typename GraphType::global_inds_host_view_type global_inds_host_view_type; + typedef typename GraphType::nonconst_global_inds_host_view_type nonconst_global_inds_host_view_type; //@} //! 
\name Destructor //@{ diff --git a/packages/ifpack2/src/Ifpack2_Details_ScaledDampedResidual_def.hpp b/packages/ifpack2/src/Ifpack2_Details_ScaledDampedResidual_def.hpp index 79d14e65a9d3..8455f580e2b5 100644 --- a/packages/ifpack2/src/Ifpack2_Details_ScaledDampedResidual_def.hpp +++ b/packages/ifpack2/src/Ifpack2_Details_ScaledDampedResidual_def.hpp @@ -384,7 +384,7 @@ fusedCase (vector_type& W, using Impl::scaled_damped_residual_vector; using STS = Teuchos::ScalarTraits; - auto A_lcl = A.getLocalMatrix (); + auto A_lcl = A.getLocalMatrixDevice (); auto Dinv_lcl = Kokkos::subview(D_inv.getLocalViewDevice(Tpetra::Access::ReadOnly), Kokkos::ALL(), 0); auto B_lcl = Kokkos::subview(B.getLocalViewDevice(Tpetra::Access::ReadOnly), Kokkos::ALL(), 0); auto X_lcl = Kokkos::subview(X_colMap.getLocalViewDevice(Tpetra::Access::ReadOnly), Kokkos::ALL(), 0); diff --git a/packages/ifpack2/src/Ifpack2_Details_TriDiSolver_decl.hpp b/packages/ifpack2/src/Ifpack2_Details_TriDiSolver_decl.hpp index 0f14676cab70..17798192c00f 100644 --- a/packages/ifpack2/src/Ifpack2_Details_TriDiSolver_decl.hpp +++ b/packages/ifpack2/src/Ifpack2_Details_TriDiSolver_decl.hpp @@ -125,6 +125,10 @@ class TriDiSolver : static_assert(std::is_same::value, "Ifpack2::Details::TriDiSolver: The template parameter MatrixType must be a Tpetra::RowMatrix specialization. Please don't use Tpetra::CrsMatrix (a subclass of Tpetra::RowMatrix) here anymore. The constructor can take either a RowMatrix or a CrsMatrix just fine."); + typedef typename row_matrix_type::nonconst_global_inds_host_view_type nonconst_global_inds_host_view_type; + typedef typename row_matrix_type::nonconst_local_inds_host_view_type nonconst_local_inds_host_view_type; + typedef typename row_matrix_type::nonconst_values_host_view_type nonconst_values_host_view_type; + //! Specialization of Tpetra::Map used by this class. 
typedef Tpetra::Map map_type; diff --git a/packages/ifpack2/src/Ifpack2_Details_TriDiSolver_def.hpp b/packages/ifpack2/src/Ifpack2_Details_TriDiSolver_def.hpp index dcaf5e92c624..c843fea63cd5 100644 --- a/packages/ifpack2/src/Ifpack2_Details_TriDiSolver_def.hpp +++ b/packages/ifpack2/src/Ifpack2_Details_TriDiSolver_def.hpp @@ -629,8 +629,8 @@ void TriDiSolver::extract (Teuchos::SerialTriDiMatrix (A_local.getNodeMaxNumRowEntries ()); - Array localIndices (maxNumRowEntries); - Array values (maxNumRowEntries); + nonconst_local_inds_host_view_type localIndices("localIndices",maxNumRowEntries); + nonconst_values_host_view_type values ("values",maxNumRowEntries); const LO numLocalRows = static_cast (rowMap.getNodeNumElements ()); const LO minLocalRow = rowMap.getMinLocalIndex (); @@ -648,8 +648,8 @@ void TriDiSolver::extract (Teuchos::SerialTriDiMatrix (A_local.getNumEntriesInLocalRow (localRow)); size_t numEntriesOut = 0; // ignored A_local.getLocalRowCopy (localRow, - localIndices (0, numEntriesInRow), - values (0, numEntriesInRow), + localIndices, + values, numEntriesOut); for (LO k = 0; k < numEntriesInRow; ++k) { const LO localCol = localIndices[k]; diff --git a/packages/ifpack2/src/Ifpack2_DiagonalFilter_decl.hpp b/packages/ifpack2/src/Ifpack2_DiagonalFilter_decl.hpp index 88e972608b13..b3a9ca21d102 100644 --- a/packages/ifpack2/src/Ifpack2_DiagonalFilter_decl.hpp +++ b/packages/ifpack2/src/Ifpack2_DiagonalFilter_decl.hpp @@ -79,9 +79,17 @@ class DiagonalFilter : typedef typename MatrixType::local_ordinal_type LocalOrdinal; typedef typename MatrixType::global_ordinal_type GlobalOrdinal; typedef typename MatrixType::node_type Node; - typedef typename Teuchos::ScalarTraits::magnitudeType magnitudeType; + typedef typename MatrixType::global_inds_host_view_type global_inds_host_view_type; + typedef typename MatrixType::local_inds_host_view_type local_inds_host_view_type; + typedef typename MatrixType::values_host_view_type values_host_view_type; + + typedef typename 
MatrixType::nonconst_global_inds_host_view_type nonconst_global_inds_host_view_type; + typedef typename MatrixType::nonconst_local_inds_host_view_type nonconst_local_inds_host_view_type; + typedef typename MatrixType::nonconst_values_host_view_type nonconst_values_host_view_type; - typedef typename Tpetra::RowMatrix::mag_type mag_type; + typedef typename Teuchos::ScalarTraits::magnitudeType magnitudeType; + typedef Tpetra::RowMatrix row_matrix_type; + typedef typename row_matrix_type::mag_type mag_type; static_assert(std::is_same >::value, "Ifpack2::DiagonalFilter: The template parameter MatrixType must be a Tpetra::RowMatrix specialization. Please don't use Tpetra::CrsMatrix (a subclass of Tpetra::RowMatrix) here anymore. The constructor can take either a RowMatrix or a CrsMatrix just fine."); @@ -186,10 +194,17 @@ class DiagonalFilter : with row \c GlobalRow. If \c GlobalRow does not belong to this node, then \c Indices and \c Values are unchanged and \c NumIndices is returned as Teuchos::OrdinalTraits::invalid(). */ + virtual void + getGlobalRowCopy (GlobalOrdinal GlobalRow, + nonconst_global_inds_host_view_type &Indices, + nonconst_values_host_view_type &Values, + size_t& NumEntries) const; +#ifdef TPETRA_ENABLE_DEPRECATED_CODE virtual void getGlobalRowCopy(GlobalOrdinal GlobalRow, const Teuchos::ArrayView &Indices, const Teuchos::ArrayView &Values, size_t &NumEntries) const; +#endif //! Extract a list of entries in a specified local row of the graph. Put into storage allocated by calling routine. /*! @@ -202,11 +217,17 @@ class DiagonalFilter : with row \c LocalRow. If \c LocalRow is not valid for this node, then \c Indices and \c Values are unchanged and \c NumIndices is returned as Teuchos::OrdinalTraits::invalid(). 
*/ + virtual void + getLocalRowCopy (LocalOrdinal LocalRow, + nonconst_local_inds_host_view_type &Indices, + nonconst_values_host_view_type &Values, + size_t& NumEntries) const; +#ifdef TPETRA_ENABLE_DEPRECATED_CODE virtual void getLocalRowCopy(LocalOrdinal LocalRow, const Teuchos::ArrayView &Indices, const Teuchos::ArrayView &Values, size_t &NumEntries) const ; - +#endif //! Extract a const, non-persisting view of global indices in a specified row of the matrix. /*! \param GlobalRow - (In) Global row number for which indices are desired. @@ -217,10 +238,15 @@ class DiagonalFilter : Note: If \c GlobalRow does not belong to this node, then \c indices is set to null. */ + virtual void + getGlobalRowView (GlobalOrdinal GlobalRow, + global_inds_host_view_type &indices, + values_host_view_type &values) const; +#ifdef TPETRA_ENABLE_DEPRECATED_CODE virtual void getGlobalRowView(GlobalOrdinal GlobalRow, Teuchos::ArrayView &indices, Teuchos::ArrayView &values) const; - +#endif //! Extract a const, non-persisting view of local indices in a specified row of the matrix. /*! \param LocalRow - (In) Local row number for which indices are desired. @@ -231,10 +257,15 @@ class DiagonalFilter : Note: If \c LocalRow does not belong to this node, then \c indices is set to null. */ + virtual void + getLocalRowView (LocalOrdinal LocalRow, + local_inds_host_view_type & indices, + values_host_view_type & values) const; +#ifdef TPETRA_ENABLE_DEPRECATED_CODE virtual void getLocalRowView(LocalOrdinal LocalRow, Teuchos::ArrayView &indices, Teuchos::ArrayView &values) const; - +#endif //! \brief Get a copy of the diagonal entries owned by this node, with local row indices. /*! Returns a distributed Vector object partitioned according to this matrix's row map, containing the the zero and non-zero diagonals owned by this node. 
*/ diff --git a/packages/ifpack2/src/Ifpack2_DiagonalFilter_def.hpp b/packages/ifpack2/src/Ifpack2_DiagonalFilter_def.hpp index 6e28f00c05e5..cf31bd69562b 100644 --- a/packages/ifpack2/src/Ifpack2_DiagonalFilter_def.hpp +++ b/packages/ifpack2/src/Ifpack2_DiagonalFilter_def.hpp @@ -65,8 +65,8 @@ DiagonalFilter (const Teuchos::RCP(A_->getRowMap())); - std::vector Indices(getNodeMaxNumRowEntries()); - std::vector Values(getNodeMaxNumRowEntries()); + nonconst_local_inds_host_view_type Indices("Indices",getNodeMaxNumRowEntries()); + nonconst_values_host_view_type Values("Values",getNodeMaxNumRowEntries()); size_t NumEntries; magnitudeType mysign; @@ -239,10 +239,11 @@ bool DiagonalFilter::isFillComplete() const template void DiagonalFilter:: -getGlobalRowCopy (GlobalOrdinal GlobalRow, - const Teuchos::ArrayView &Indices, - const Teuchos::ArrayView &Values, - size_t &NumEntries) const + getGlobalRowCopy (GlobalOrdinal GlobalRow, + nonconst_global_inds_host_view_type &Indices, + nonconst_values_host_view_type &Values, + size_t& NumEntries) const + { Teuchos::ArrayRCP< const Scalar > myvals=val_->get1dView(); LocalOrdinal LocalRow=getRowMap()->getLocalElement(GlobalRow); @@ -253,12 +254,27 @@ getGlobalRowCopy (GlobalOrdinal GlobalRow, Values[pos_[LocalRow]] += myvals[LocalRow]; } + +#ifdef TPETRA_ENABLE_DEPRECATED_CODE template void DiagonalFilter:: -getLocalRowCopy (LocalOrdinal LocalRow, - const Teuchos::ArrayView &Indices, - const Teuchos::ArrayView &Values, - size_t &NumEntries) const +getGlobalRowCopy (GlobalOrdinal GlobalRow, + const Teuchos::ArrayView &Indices, + const Teuchos::ArrayView &Values, + size_t &NumEntries) const { + using IST = typename row_matrix_type::impl_scalar_type; + nonconst_global_inds_host_view_type ind_in(Indices.data(),Indices.size()); + nonconst_values_host_view_type val_in(reinterpret_cast(Values.data()),Values.size()); + getGlobalRowCopy(GlobalRow,ind_in,val_in,NumEntries); +} +#endif + +template +void DiagonalFilter:: + getLocalRowCopy 
(LocalOrdinal LocalRow, + nonconst_local_inds_host_view_type &Indices, + nonconst_values_host_view_type &Values, + size_t& NumEntries) const { Teuchos::ArrayRCP< const Scalar > myvals=val_->get1dView(); @@ -268,6 +284,32 @@ getLocalRowCopy (LocalOrdinal LocalRow, Values[pos_[LocalRow]] += myvals[LocalRow]; } +#ifdef TPETRA_ENABLE_DEPRECATED_CODE +template +void +DiagonalFilter:: +getLocalRowCopy (LocalOrdinal LocalRow, + const Teuchos::ArrayView &Indices, + const Teuchos::ArrayView &Values, + size_t &NumEntries) const +{ + using IST = typename row_matrix_type::impl_scalar_type; + nonconst_local_inds_host_view_type ind_in(Indices.data(),Indices.size()); + nonconst_values_host_view_type val_in(reinterpret_cast(Values.data()),Values.size()); + getLocalRowCopy(LocalRow,ind_in,val_in,NumEntries); +} +#endif + + +template +void DiagonalFilter::getGlobalRowView(GlobalOrdinal /* GlobalRow */, + global_inds_host_view_type &/*indices*/, + values_host_view_type &/*values*/) const +{ + throw std::runtime_error("Ifpack2::DiagonalFilter: does not support getGlobalRowView."); +} + +#ifdef TPETRA_ENABLE_DEPRECATED_CODE template void DiagonalFilter:: getGlobalRowView (GlobalOrdinal /* GlobalRow */, @@ -276,7 +318,17 @@ getGlobalRowView (GlobalOrdinal /* GlobalRow */, { throw std::runtime_error("Ifpack2::DiagonalFilter: does not support getGlobalRowView."); } +#endif +template +void DiagonalFilter::getLocalRowView(LocalOrdinal /* LocalRow */, + local_inds_host_view_type & /*indices*/, + values_host_view_type & /*values*/) const +{ + throw std::runtime_error("Ifpack2::DiagonalFilter: does not support getLocalRowView."); +} + +#ifdef TPETRA_ENABLE_DEPRECATED_CODE template void DiagonalFilter:: getLocalRowView (LocalOrdinal /* LocalRow */, @@ -285,6 +337,7 @@ getLocalRowView (LocalOrdinal /* LocalRow */, { throw std::runtime_error("Ifpack2::DiagonalFilter: does not support getLocalRowView."); } +#endif template void DiagonalFilter::getLocalDiagCopy(Tpetra::Vector &diag) const diff 
--git a/packages/ifpack2/src/Ifpack2_DropFilter_decl.hpp b/packages/ifpack2/src/Ifpack2_DropFilter_decl.hpp index 932184265b1c..263dba73d13b 100644 --- a/packages/ifpack2/src/Ifpack2_DropFilter_decl.hpp +++ b/packages/ifpack2/src/Ifpack2_DropFilter_decl.hpp @@ -81,9 +81,19 @@ class DropFilter : typedef typename MatrixType::local_ordinal_type LocalOrdinal; typedef typename MatrixType::global_ordinal_type GlobalOrdinal; typedef typename MatrixType::node_type Node; + typedef typename MatrixType::global_inds_host_view_type global_inds_host_view_type; + typedef typename MatrixType::local_inds_host_view_type local_inds_host_view_type; + typedef typename MatrixType::values_host_view_type values_host_view_type; + + typedef typename MatrixType::nonconst_global_inds_host_view_type nonconst_global_inds_host_view_type; + typedef typename MatrixType::nonconst_local_inds_host_view_type nonconst_local_inds_host_view_type; + typedef typename MatrixType::nonconst_values_host_view_type nonconst_values_host_view_type; + typedef typename Teuchos::ScalarTraits::magnitudeType magnitudeType; - typedef typename Tpetra::RowMatrix::mag_type mag_type; + typedef Tpetra::RowMatrix row_matrix_type; + typedef typename row_matrix_type::mag_type mag_type; + static_assert(std::is_same >::value, "Ifpack2::DropFilter: The template parameter MatrixType must be a Tpetra::RowMatrix specialization. Please don't use Tpetra::CrsMatrix (a subclass of Tpetra::RowMatrix) here anymore. The constructor can take either a RowMatrix or a CrsMatrix just fine."); @@ -186,10 +196,17 @@ class DropFilter : with row \c GlobalRow. If \c GlobalRow does not belong to this node, then \c Indices and \c Values are unchanged and \c NumIndices is returned as Teuchos::OrdinalTraits::invalid(). 
*/ + virtual void + getGlobalRowCopy (GlobalOrdinal GlobalRow, + nonconst_global_inds_host_view_type &Indices, + nonconst_values_host_view_type &Values, + size_t& NumEntries) const; +#ifdef TPETRA_ENABLE_DEPRECATED_CODE virtual void getGlobalRowCopy(GlobalOrdinal GlobalRow, const Teuchos::ArrayView &Indices, const Teuchos::ArrayView &Values, size_t &NumEntries) const; +#endif //! Extract a list of entries in a specified local row of the graph. Put into storage allocated by calling routine. /*! @@ -202,11 +219,17 @@ class DropFilter : with row \c DropRow. If \c DropRow is not valid for this node, then \c Indices and \c Values are unchanged and \c NumIndices is returned as Teuchos::OrdinalTraits::invalid(). */ + virtual void + getLocalRowCopy (LocalOrdinal DropRow, + nonconst_local_inds_host_view_type &Indices, + nonconst_values_host_view_type &Values, + size_t& NumEntries) const; +#ifdef TPETRA_ENABLE_DEPRECATED_CODE virtual void getLocalRowCopy(LocalOrdinal DropRow, const Teuchos::ArrayView &Indices, const Teuchos::ArrayView &Values, size_t &NumEntries) const ; - +#endif //! Extract a const, non-persisting view of global indices in a specified row of the matrix. /*! \param GlobalRow - (In) Global row number for which indices are desired. @@ -217,10 +240,15 @@ class DropFilter : Note: If \c GlobalRow does not belong to this node, then \c indices is set to null. */ + virtual void + getGlobalRowView (GlobalOrdinal GlobalRow, + global_inds_host_view_type &indices, + values_host_view_type &values) const; +#ifdef TPETRA_ENABLE_DEPRECATED_CODE virtual void getGlobalRowView(GlobalOrdinal GlobalRow, Teuchos::ArrayView &indices, Teuchos::ArrayView &values) const; - +#endif //! Extract a const, non-persisting view of local indices in a specified row of the matrix. /*! \param DropRow - (In) Drop row number for which indices are desired. @@ -231,10 +259,15 @@ class DropFilter : Note: If \c DropRow does not belong to this node, then \c indices is set to null. 
*/ + virtual void + getLocalRowView (LocalOrdinal LocalRow, + local_inds_host_view_type & indices, + values_host_view_type & values) const; +#ifdef TPETRA_ENABLE_DEPRECATED_CODE virtual void getLocalRowView(LocalOrdinal DropRow, Teuchos::ArrayView &indices, Teuchos::ArrayView &values) const; - +#endif //! \brief Get a copy of the diagonal entries owned by this node, with local row indices. /*! Returns a distributed Vector object partitioned according to this matrix's row map, containing the the zero and non-zero diagonals owned by this node. */ @@ -307,9 +340,9 @@ class DropFilter : //! NumEntries_[i] contains the nonzero entries in row `i'. std::vector NumEntries_; //! Used in ExtractMyRowCopy, to avoid allocation each time. - mutable Teuchos::Array Indices_; - //! Used in ExtractMyRowCopy, to avoid allocation each time. - mutable Teuchos::Array Values_; + mutable nonconst_local_inds_host_view_type Indices_; + //! Used in ExtractMyRowCopy, to avoid allocation each time + mutable nonconst_values_host_view_type Values_; }; diff --git a/packages/ifpack2/src/Ifpack2_DropFilter_def.hpp b/packages/ifpack2/src/Ifpack2_DropFilter_def.hpp index 32c19689b822..b602141c0374 100644 --- a/packages/ifpack2/src/Ifpack2_DropFilter_def.hpp +++ b/packages/ifpack2/src/Ifpack2_DropFilter_def.hpp @@ -85,8 +85,8 @@ DropFilter::DropFilter(const Teuchos::RCPgetNodeMaxNumRowEntries(); // ExtractMyRowCopy() will use these vectors - Indices_.resize(MaxNumEntries_); - Values_.resize(MaxNumEntries_); + Kokkos::resize(Indices_,MaxNumEntries_); + Kokkos::resize(Values_,MaxNumEntries_); size_t ActualMaxNumEntries = 0; for (size_t i = 0 ; i < NumRows_ ; ++i) { @@ -278,6 +278,18 @@ bool DropFilter::isFillComplete() const //========================================================================== template +void DropFilter:: +getGlobalRowCopy (GlobalOrdinal /*GlobalRow*/, + nonconst_global_inds_host_view_type &/*Indices*/, + nonconst_values_host_view_type &/*Values*/, + size_t& /*NumEntries*/) const 
+{ + throw std::runtime_error("Ifpack2::DropFilter does not implement getGlobalRowCopy."); +} + +//========================================================================== +#ifdef TPETRA_ENABLE_DEPRECATED_CODE +template void DropFilter::getGlobalRowCopy(GlobalOrdinal /* GlobalRow */, const Teuchos::ArrayView &/* Indices */, const Teuchos::ArrayView &/* Values */, @@ -285,13 +297,15 @@ void DropFilter::getGlobalRowCopy(GlobalOrdinal /* GlobalRow */, { throw std::runtime_error("Ifpack2::DropFilter does not implement getGlobalRowCopy."); } +#endif //========================================================================== template -void DropFilter::getLocalRowCopy(LocalOrdinal LocalRow, - const Teuchos::ArrayView &Indices, - const Teuchos::ArrayView &Values, - size_t &NumEntries) const +void DropFilter:: + getLocalRowCopy (LocalOrdinal LocalRow, + nonconst_local_inds_host_view_type &Indices, + nonconst_values_host_view_type &Values, + size_t& NumEntries) const { TEUCHOS_TEST_FOR_EXCEPTION((LocalRow < 0 || (size_t) LocalRow >= NumRows_ || (size_t) Indices.size() < NumEntries_[LocalRow]), std::runtime_error, "Ifpack2::DropFilter::getLocalRowCopy invalid row or array size."); @@ -302,7 +316,7 @@ void DropFilter::getLocalRowCopy(LocalOrdinal LocalRow, // This is because I need more space than that given by // the user (for the external nodes) size_t A_NumEntries=0; - A_->getLocalRowCopy(LocalRow,Indices_(),Values_(),A_NumEntries); + A_->getLocalRowCopy(LocalRow,Indices_,Values_,A_NumEntries); // loop over all nonzero elements of row MyRow, // and drop elements below specified threshold. 
@@ -321,6 +335,30 @@ void DropFilter::getLocalRowCopy(LocalOrdinal LocalRow, } //========================================================================== +#ifdef TPETRA_ENABLE_DEPRECATED_CODE +template +void DropFilter::getLocalRowCopy(LocalOrdinal LocalRow, + const Teuchos::ArrayView &Indices, + const Teuchos::ArrayView &Values, + size_t &NumEntries) const +{ + using IST = typename row_matrix_type::impl_scalar_type; + nonconst_local_inds_host_view_type ind_in(Indices.data(),Indices.size()); + nonconst_values_host_view_type val_in(reinterpret_cast(Values.data()),Values.size()); + getLocalRowCopy(LocalRow,ind_in,val_in,NumEntries); +} +#endif + +//========================================================================== +template +void DropFilter::getGlobalRowView(GlobalOrdinal /* GlobalRow */, + global_inds_host_view_type &/*indices*/, + values_host_view_type &/*values*/) const +{ + throw std::runtime_error("Ifpack2::DropFilter: does not support getGlobalRowView."); +} + +#ifdef TPETRA_ENABLE_DEPRECATED_CODE template void DropFilter::getGlobalRowView(GlobalOrdinal /* GlobalRow */, Teuchos::ArrayView &/* indices */, @@ -328,15 +366,25 @@ void DropFilter::getGlobalRowView(GlobalOrdinal /* GlobalRow */, { throw std::runtime_error("Ifpack2::DropFilter: does not support getGlobalRowView."); } - +#endif //========================================================================== template +void DropFilter::getLocalRowView(LocalOrdinal /* LocalRow */, + local_inds_host_view_type & /*indices*/, + values_host_view_type & /*values*/) const +{ + throw std::runtime_error("Ifpack2::DropFilter: does not support getLocalRowView."); +} + +#ifdef TPETRA_ENABLE_DEPRECATED_CODE +template void DropFilter::getLocalRowView(LocalOrdinal /* LocalRow */, Teuchos::ArrayView &/* indices */, Teuchos::ArrayView &/* values */) const { throw std::runtime_error("Ifpack2::DropFilter: does not support getLocalRowView."); } +#endif 
//========================================================================== template @@ -383,21 +431,22 @@ void DropFilter::apply(const Tpetra::MultiVector(Values_.data()); if (mode==Teuchos::NO_TRANS){ for (size_t j = 0 ; j < Nnz ; ++j) for (size_t k = 0 ; k < NumVectors ; ++k) - y_ptr[k][i] += Values_[j] * x_ptr[k][Indices_[j]]; + y_ptr[k][i] += Values[j] * x_ptr[k][Indices_[j]]; } else if (mode==Teuchos::TRANS){ for (size_t j = 0 ; j < Nnz ; ++j) for (size_t k = 0 ; k < NumVectors ; ++k) - y_ptr[k][Indices_[j]] += Values_[j] * x_ptr[k][i]; + y_ptr[k][Indices_[j]] += Values[j] * x_ptr[k][i]; } else { //mode==Teuchos::CONJ_TRANS for (size_t j = 0 ; j < Nnz ; ++j) for (size_t k = 0 ; k < NumVectors ; ++k) - y_ptr[k][Indices_[j]] += Teuchos::ScalarTraits::conjugate(Values_[j]) * x_ptr[k][i]; + y_ptr[k][Indices_[j]] += Teuchos::ScalarTraits::conjugate(Values[j]) * x_ptr[k][i]; } } } diff --git a/packages/ifpack2/src/Ifpack2_Experimental_RBILUK_decl.hpp b/packages/ifpack2/src/Ifpack2_Experimental_RBILUK_decl.hpp index 2cf6a45defb6..f1dd7af784dc 100644 --- a/packages/ifpack2/src/Ifpack2_Experimental_RBILUK_decl.hpp +++ b/packages/ifpack2/src/Ifpack2_Experimental_RBILUK_decl.hpp @@ -140,9 +140,11 @@ class RBILUK : virtual public Ifpack2::RILUK< Tpetra::RowMatrix< typename Matrix //! The type of local indices in the input MatrixType. typedef typename MatrixType::local_ordinal_type local_ordinal_type; + typedef typename MatrixType::local_ordinal_type LO; //! The type of global indices in the input MatrixType. typedef typename MatrixType::global_ordinal_type global_ordinal_type; + typedef typename MatrixType::global_ordinal_type GO; //! The Node type used by the input MatrixType. typedef typename MatrixType::node_type node_type; @@ -173,13 +175,13 @@ class RBILUK : virtual public Ifpack2::RILUK< Tpetra::RowMatrix< typename Matrix //! \name Implementation of KK ILU(k). 
//@{ - typedef typename crs_matrix_type::local_matrix_type local_matrix_type; - typedef typename local_matrix_type::StaticCrsGraphType::row_map_type lno_row_view_t; - typedef typename local_matrix_type::StaticCrsGraphType::entries_type lno_nonzero_view_t; - typedef typename local_matrix_type::values_type scalar_nonzero_view_t; - typedef typename local_matrix_type::StaticCrsGraphType::device_type::memory_space TemporaryMemorySpace; - typedef typename local_matrix_type::StaticCrsGraphType::device_type::memory_space PersistentMemorySpace; - typedef typename local_matrix_type::StaticCrsGraphType::device_type::execution_space HandleExecSpace; + typedef typename crs_matrix_type::local_matrix_device_type local_matrix_device_type; + typedef typename local_matrix_device_type::StaticCrsGraphType::row_map_type lno_row_view_t; + typedef typename local_matrix_device_type::StaticCrsGraphType::entries_type lno_nonzero_view_t; + typedef typename local_matrix_device_type::values_type scalar_nonzero_view_t; + typedef typename local_matrix_device_type::StaticCrsGraphType::device_type::memory_space TemporaryMemorySpace; + typedef typename local_matrix_device_type::StaticCrsGraphType::device_type::memory_space PersistentMemorySpace; + typedef typename local_matrix_device_type::StaticCrsGraphType::device_type::execution_space HandleExecSpace; typedef typename KokkosKernels::Experimental::KokkosKernelsHandle kk_handle_type; @@ -333,10 +335,16 @@ class RBILUK : virtual public Ifpack2::RILUK< Tpetra::RowMatrix< typename Matrix typedef Teuchos::ScalarTraits STS; typedef Teuchos::ScalarTraits STM; typedef typename block_crs_matrix_type::little_block_type little_block_type; + typedef typename block_crs_matrix_type::little_block_host_type little_block_host_type; typedef typename block_crs_matrix_type::little_vec_type little_vec_type; typedef typename block_crs_matrix_type::little_host_vec_type little_host_vec_type; typedef typename block_crs_matrix_type::const_host_little_vec_type 
const_host_little_vec_type; + using local_inds_host_view_type = typename block_crs_matrix_type::local_inds_host_view_type; + using values_host_view_type = typename block_crs_matrix_type::values_host_view_type; + using local_inds_device_view_type = typename block_crs_matrix_type::local_inds_device_view_type; + using values_device_view_type = typename block_crs_matrix_type::values_device_view_type; + void allocate_L_and_U_blocks(); void initAllValues (const block_crs_matrix_type& A); diff --git a/packages/ifpack2/src/Ifpack2_Experimental_RBILUK_def.hpp b/packages/ifpack2/src/Ifpack2_Experimental_RBILUK_def.hpp index 4dd78f5b66d4..e11343e2d626 100644 --- a/packages/ifpack2/src/Ifpack2_Experimental_RBILUK_def.hpp +++ b/packages/ifpack2/src/Ifpack2_Experimental_RBILUK_def.hpp @@ -277,22 +277,22 @@ RBILUK:: initAllValues (const block_crs_matrix_type& A) { using Teuchos::RCP; - typedef Tpetra::Map map_type; + typedef Tpetra::Map map_type; - local_ordinal_type NumIn = 0, NumL = 0, NumU = 0; + LO NumIn = 0, NumL = 0, NumU = 0; bool DiagFound = false; size_t NumNonzeroDiags = 0; size_t MaxNumEntries = A.getNodeMaxNumRowEntries(); - local_ordinal_type blockMatSize = blockSize_*blockSize_; + LO blockMatSize = blockSize_*blockSize_; // First check that the local row map ordering is the same as the local portion of the column map. // The extraction of the strictly lower/upper parts of A, as well as the factorization, // implicitly assume that this is the case. 
- Teuchos::ArrayView rowGIDs = A.getRowMap()->getNodeElementList(); - Teuchos::ArrayView colGIDs = A.getColMap()->getNodeElementList(); + Teuchos::ArrayView rowGIDs = A.getRowMap()->getNodeElementList(); + Teuchos::ArrayView colGIDs = A.getColMap()->getNodeElementList(); bool gidsAreConsistentlyOrdered=true; - global_ordinal_type indexOfInconsistentGID=0; - for (global_ordinal_type i=0; i LI(MaxNumEntries); - Teuchos::Array UI(MaxNumEntries); + Teuchos::Array LI(MaxNumEntries); + Teuchos::Array UI(MaxNumEntries); Teuchos::Array LV(MaxNumEntries*blockMatSize); Teuchos::Array UV(MaxNumEntries*blockMatSize); @@ -322,6 +322,7 @@ initAllValues (const block_crs_matrix_type& A) // host, so sync to host first. The const_cast is unfortunate but // is our only option to make this correct. + /* const_cast (A).sync_host (); L_block_->sync_host (); U_block_->sync_host (); @@ -330,6 +331,7 @@ initAllValues (const block_crs_matrix_type& A) L_block_->modify_host (); U_block_->modify_host (); D_block_->modify_host (); + */ RCP rowMap = L_block_->getRowMap (); @@ -343,14 +345,17 @@ initAllValues (const block_crs_matrix_type& A) //TODO BMK: Revisit this fence when BlockCrsMatrix is refactored. 
Kokkos::fence(); + using inds_type = typename row_matrix_type::local_inds_host_view_type; + using vals_type = typename row_matrix_type::values_host_view_type; for (size_t myRow=0; myRowRthresh_ * InV[blockOffset+jj] + IFPACK2_SGN(InV[blockOffset+jj]) * this->Athresh_; D_block_->replaceLocalValues(local_row, &InI[j], diagValues.getRawPtr(), 1); } @@ -380,15 +385,15 @@ initAllValues (const block_crs_matrix_type& A) } else if (k < local_row) { LI[NumL] = k; - const local_ordinal_type LBlockOffset = NumL*blockMatSize; - for (local_ordinal_type jj = 0; jj < blockMatSize; ++jj) + const LO LBlockOffset = NumL*blockMatSize; + for (LO jj = 0; jj < blockMatSize; ++jj) LV[LBlockOffset+jj] = InV[blockOffset+jj]; NumL++; } else if (Teuchos::as(k) <= rowMap->getNodeNumElements()) { UI[NumU] = k; - const local_ordinal_type UBlockOffset = NumU*blockMatSize; - for (local_ordinal_type jj = 0; jj < blockMatSize; ++jj) + const LO UBlockOffset = NumU*blockMatSize; + for (LO jj = 0; jj < blockMatSize; ++jj) UV[UBlockOffset+jj] = InV[blockOffset+jj]; NumU++; } @@ -400,7 +405,7 @@ initAllValues (const block_crs_matrix_type& A) ++NumNonzeroDiags; } else { - for (local_ordinal_type jj = 0; jj < blockSize_; ++jj) + for (LO jj = 0; jj < blockSize_; ++jj) diagValues[jj*(blockSize_+1)] = this->Athresh_; D_block_->replaceLocalValues(local_row, &local_row, diagValues.getRawPtr(), 1); } @@ -416,6 +421,7 @@ initAllValues (const block_crs_matrix_type& A) // NOTE (mfh 27 May 2016) Sync back to device, in case compute() // ever gets a device implementation. + /* { typedef typename block_crs_matrix_type::device_type device_type; const_cast (A).template sync (); @@ -423,6 +429,7 @@ initAllValues (const block_crs_matrix_type& A) U_block_->template sync (); D_block_->template sync (); } + */ this->isInitialized_ = true; } @@ -430,7 +437,7 @@ namespace { // (anonymous) // For a given Kokkos::View type, possibly unmanaged, get the // corresponding managed Kokkos::View type. 
This is handy for -// translating from little_block_type or little_vec_type (both +// translating from little_block_type or little_host_vec_type (both // possibly unmanaged) to their managed versions. template struct GetManagedView { @@ -477,8 +484,9 @@ void RBILUK::compute () if (! A_block_.is_null ()) { Teuchos::RCP A_nc = Teuchos::rcp_const_cast (A_block_); - A_nc->sync_host (); + // A_nc->sync_host (); } + /* L_block_->sync_host (); U_block_->sync_host (); D_block_->sync_host (); @@ -486,6 +494,7 @@ void RBILUK::compute () L_block_->modify_host (); U_block_->modify_host (); D_block_->modify_host (); + */ Teuchos::Time timer ("RBILUK::compute"); double startTime = timer.wallTime(); @@ -501,18 +510,18 @@ void RBILUK::compute () initAllValues (*A_block_); size_t NumIn; - local_ordinal_type NumL, NumU, NumURead; + LO NumL, NumU, NumURead; // Get Maximum Row length const size_t MaxNumEntries = L_block_->getNodeMaxNumRowEntries () + U_block_->getNodeMaxNumRowEntries () + 1; - const local_ordinal_type blockMatSize = blockSize_*blockSize_; + const LO blockMatSize = blockSize_*blockSize_; // FIXME (mfh 08 Nov 2015, 24 May 2016) We need to move away from // expressing these strides explicitly, in order to finish #177 // (complete Kokkos-ization of BlockCrsMatrix) thoroughly. 
- const local_ordinal_type rowStride = blockSize_; + const LO rowStride = blockSize_; Teuchos::Array ipiv_teuchos(blockSize_); Kokkos::View::compute () size_t num_cols = U_block_->getColMap()->getNodeNumElements(); Teuchos::Array colflag(num_cols); - typename GetManagedView::managed_non_const_type diagModBlock ("diagModBlock", blockSize_, blockSize_); - typename GetManagedView::managed_non_const_type matTmp ("matTmp", blockSize_, blockSize_); - typename GetManagedView::managed_non_const_type multiplier ("multiplier", blockSize_, blockSize_); + typename GetManagedView::managed_non_const_type diagModBlock ("diagModBlock", blockSize_, blockSize_); + typename GetManagedView::managed_non_const_type matTmp ("matTmp", blockSize_, blockSize_); + typename GetManagedView::managed_non_const_type multiplier ("multiplier", blockSize_, blockSize_); // Teuchos::ArrayRCP DV = D_->get1dViewNonConst(); // Get view of diagonal // Now start the factorization. // Need some integer workspace and pointers - local_ordinal_type NumUU; + LO NumUU; for (size_t j = 0; j < num_cols; ++j) { colflag[j] = -1; } - Teuchos::Array InI(MaxNumEntries, 0); + Teuchos::Array InI(MaxNumEntries, 0); Teuchos::Array InV(MaxNumEntries*blockMatSize,STM::zero()); - const local_ordinal_type numLocalRows = L_block_->getNodeNumRows (); - for (local_ordinal_type local_row = 0; local_row < numLocalRows; ++local_row) { + const LO numLocalRows = L_block_->getNodeNumRows (); + for (LO local_row = 0; local_row < numLocalRows; ++local_row) { // Fill InV, InI with current row of L, D and U combined NumIn = MaxNumEntries; - const local_ordinal_type * colValsL; - scalar_type * valsL; + local_inds_host_view_type colValsL; + values_host_view_type valsL; - L_block_->getLocalRowView(local_row, colValsL, valsL, NumL); - for (local_ordinal_type j = 0; j < NumL; ++j) + L_block_->getLocalRowView(local_row, colValsL, valsL); + NumL = (LO) colValsL.size(); + for (LO j = 0; j < NumL; ++j) { - const local_ordinal_type matOffset = 
blockMatSize*j; - little_block_type lmat((typename little_block_type::value_type*) &valsL[matOffset],blockSize_,rowStride); - little_block_type lmatV((typename little_block_type::value_type*) &InV[matOffset],blockSize_,rowStride); + const LO matOffset = blockMatSize*j; + little_block_host_type lmat((typename little_block_host_type::value_type*) &valsL[matOffset],blockSize_,rowStride); + little_block_host_type lmatV((typename little_block_host_type::value_type*) &InV[matOffset],blockSize_,rowStride); //lmatV.assign(lmat); Tpetra::COPY (lmat, lmatV); InI[j] = colValsL[j]; } - little_block_type dmat = D_block_->getLocalBlock(local_row, local_row); - little_block_type dmatV((typename little_block_type::value_type*) &InV[NumL*blockMatSize], blockSize_, rowStride); + little_block_host_type dmat = D_block_->getLocalBlockHostNonConst(local_row, local_row); + little_block_host_type dmatV((typename little_block_host_type::value_type*) &InV[NumL*blockMatSize], blockSize_, rowStride); //dmatV.assign(dmat); Tpetra::COPY (dmat, dmatV); InI[NumL] = local_row; - const local_ordinal_type * colValsU; - scalar_type * valsU; - U_block_->getLocalRowView(local_row, colValsU, valsU, NumURead); + local_inds_host_view_type colValsU; + values_host_view_type valsU; + U_block_->getLocalRowView(local_row, colValsU, valsU); + NumURead = (LO) colValsU.size(); NumU = 0; - for (local_ordinal_type j = 0; j < NumURead; ++j) + for (LO j = 0; j < NumURead; ++j) { if (!(colValsU[j] < numLocalRows)) continue; InI[NumL+1+j] = colValsU[j]; - const local_ordinal_type matOffset = blockMatSize*(NumL+1+j); - little_block_type umat((typename little_block_type::value_type*) &valsU[blockMatSize*j], blockSize_, rowStride); - little_block_type umatV((typename little_block_type::value_type*) &InV[matOffset], blockSize_, rowStride); + const LO matOffset = blockMatSize*(NumL+1+j); + little_block_host_type umat((typename little_block_host_type::value_type*) &valsU[blockMatSize*j], blockSize_, rowStride); + 
little_block_host_type umatV((typename little_block_host_type::value_type*) &InV[matOffset], blockSize_, rowStride); //umatV.assign(umat); Tpetra::COPY (umat, umatV); NumU += 1; @@ -589,8 +600,8 @@ void RBILUK::compute () } #ifndef IFPACK2_RBILUK_INITIAL - for (local_ordinal_type i = 0; i < blockSize_; ++i) - for (local_ordinal_type j = 0; j < blockSize_; ++j){ + for (LO i = 0; i < blockSize_; ++i) + for (LO j = 0; j < blockSize_; ++j){ { diagModBlock(i,j) = 0; } @@ -600,13 +611,13 @@ void RBILUK::compute () Kokkos::deep_copy (diagModBlock, diagmod); #endif - for (local_ordinal_type jj = 0; jj < NumL; ++jj) { - local_ordinal_type j = InI[jj]; - little_block_type currentVal((typename little_block_type::value_type*) &InV[jj*blockMatSize], blockSize_, rowStride); // current_mults++; + for (LO jj = 0; jj < NumL; ++jj) { + LO j = InI[jj]; + little_block_host_type currentVal((typename little_block_host_type::value_type*) &InV[jj*blockMatSize], blockSize_, rowStride); // current_mults++; //multiplier.assign(currentVal); Tpetra::COPY (currentVal, multiplier); - const little_block_type dmatInverse = D_block_->getLocalBlock(j,j); + const little_block_host_type dmatInverse = D_block_->getLocalBlockHostNonConst(j,j); // alpha = 1, beta = 0 #ifndef IFPACK2_RBILUK_INITIAL_NOKK KokkosBatched::Experimental::SerialGemm @@ -621,18 +632,19 @@ void RBILUK::compute () //blockMatOpts.square_matrix_matrix_multiply(reinterpret_cast (currentVal.data ()), reinterpret_cast (dmatInverse.data ()), reinterpret_cast (matTmp.data ()), blockSize_); //currentVal.assign(matTmp); Tpetra::COPY (matTmp, currentVal); + local_inds_host_view_type UUI; + values_host_view_type UUV; - const local_ordinal_type * UUI; - scalar_type * UUV; - U_block_->getLocalRowView(j, UUI, UUV, NumUU); + U_block_->getLocalRowView(j, UUI, UUV); + NumUU = (LO) UUI.size(); if (this->RelaxValue_ == STM::zero ()) { - for (local_ordinal_type k = 0; k < NumUU; ++k) { + for (LO k = 0; k < NumUU; ++k) { if (!(UUI[k] < numLocalRows)) 
continue; const int kk = colflag[UUI[k]]; if (kk > -1) { - little_block_type kkval((typename little_block_type::value_type*) &InV[kk*blockMatSize], blockSize_, rowStride); - little_block_type uumat((typename little_block_type::value_type*) &UUV[k*blockMatSize], blockSize_, rowStride); + little_block_host_type kkval((typename little_block_host_type::value_type*) &InV[kk*blockMatSize], blockSize_, rowStride); + little_block_host_type uumat((typename little_block_host_type::value_type*) &UUV[k*blockMatSize], blockSize_, rowStride); #ifndef IFPACK2_RBILUK_INITIAL_NOKK KokkosBatched::Experimental::SerialGemm ::compute () } } else { - for (local_ordinal_type k = 0; k < NumUU; ++k) { + for (LO k = 0; k < NumUU; ++k) { if (!(UUI[k] < numLocalRows)) continue; const int kk = colflag[UUI[k]]; - little_block_type uumat((typename little_block_type::value_type*) &UUV[k*blockMatSize], blockSize_, rowStride); + little_block_host_type uumat((typename little_block_host_type::value_type*) &UUV[k*blockMatSize], blockSize_, rowStride); if (kk > -1) { - little_block_type kkval((typename little_block_type::value_type*) &InV[kk*blockMatSize], blockSize_, rowStride); + little_block_host_type kkval((typename little_block_host_type::value_type*) &InV[kk*blockMatSize], blockSize_, rowStride); #ifndef IFPACK2_RBILUK_INITIAL_NOKK KokkosBatched::Experimental::SerialGemm ::compute () "lapackInfo = " << lapackInfo << " which indicates an error in the matrix inverse GETRI."); } - for (local_ordinal_type j = 0; j < NumU; ++j) { - little_block_type currentVal((typename little_block_type::value_type*) &InV[(NumL+1+j)*blockMatSize], blockSize_, rowStride); // current_mults++; + for (LO j = 0; j < NumU; ++j) { + little_block_host_type currentVal((typename little_block_host_type::value_type*) &InV[(NumL+1+j)*blockMatSize], blockSize_, rowStride); // current_mults++; // scale U by the diagonal inverse #ifndef IFPACK2_RBILUK_INITIAL_NOKK KokkosBatched::Experimental::SerialGemm @@ -762,6 +774,7 @@ void 
RBILUK::compute () } // Stop timing // Sync everything back to device, for efficient solves. + /* { typedef typename block_crs_matrix_type::device_type device_type; if (! A_block_.is_null ()) { @@ -773,6 +786,7 @@ void RBILUK::compute () U_block_->template sync (); D_block_->template sync (); } + */ this->isComputed_ = true; this->numCompute_ += 1; @@ -814,15 +828,15 @@ apply (const Tpetra::MultiVectorgetGraph ()->getDomainMap ()), blockSize_); const BMV xBlock (X, * (A_block_->getColMap ()), blockSize_); Teuchos::Array lclarray(blockSize_); - little_vec_type lclvec((typename little_vec_type::value_type*)&lclarray[0], blockSize_); + little_host_vec_type lclvec((typename little_host_vec_type::value_type*)&lclarray[0], blockSize_); const scalar_type one = STM::one (); const scalar_type zero = STM::zero (); @@ -838,14 +852,14 @@ apply (const Tpetra::MultiVectorgetGraph ()->getDomainMap ()), blockSize_, numVectors); BMV rBlock (* (A_block_->getGraph ()->getDomainMap ()), blockSize_, numVectors); - for (local_ordinal_type imv = 0; imv < numVectors; ++imv) + for (LO imv = 0; imv < numVectors; ++imv) { for (size_t i = 0; i < D_block_->getNodeNumRows(); ++i) { - local_ordinal_type local_row = i; + LO local_row = i; const_host_little_vec_type xval = xBlock.getLocalBlockHost(local_row, imv, Tpetra::Access::ReadOnly); little_host_vec_type cval = @@ -853,20 +867,19 @@ apply (const Tpetra::MultiVectorgetLocalRowView(local_row, colValsL, valsL); + LO NumL = (LO) colValsL.size(); - L_block_->getLocalRowView(local_row, colValsL, valsL, NumL); - - for (local_ordinal_type j = 0; j < NumL; ++j) + for (LO j = 0; j < NumL; ++j) { - local_ordinal_type col = colValsL[j]; + LO col = colValsL[j]; const_host_little_vec_type prevVal = cBlock.getLocalBlockHost(col, imv, Tpetra::Access::ReadOnly); - const local_ordinal_type matOffset = blockMatSize*j; - little_block_type lij((typename little_block_type::value_type*) &valsL[matOffset],blockSize_,rowStride); + const LO matOffset = 
blockMatSize*j; + little_block_host_type lij((typename little_block_host_type::value_type*) &valsL[matOffset],blockSize_,rowStride); //cval.matvecUpdate(-one, lij, prevVal); Tpetra::GEMV (-one, lij, prevVal, cval); @@ -878,12 +891,12 @@ apply (const Tpetra::MultiVectorapplyBlock(cBlock, rBlock); // Solve U Y = R. - for (local_ordinal_type imv = 0; imv < numVectors; ++imv) + for (LO imv = 0; imv < numVectors; ++imv) { - const local_ordinal_type numRows = D_block_->getNodeNumRows(); - for (local_ordinal_type i = 0; i < numRows; ++i) + const LO numRows = D_block_->getNodeNumRows(); + for (LO i = 0; i < numRows; ++i) { - local_ordinal_type local_row = (numRows-1)-i; + LO local_row = (numRows-1)-i; const_host_little_vec_type rval = rBlock.getLocalBlockHost(local_row, imv, Tpetra::Access::ReadOnly); little_host_vec_type yval = @@ -891,20 +904,19 @@ apply (const Tpetra::MultiVectorgetLocalRowView(local_row, colValsU, valsU, NumU); + local_inds_host_view_type colValsU; + values_host_view_type valsU; + U_block_->getLocalRowView(local_row, colValsU, valsU); + LO NumU = (LO) colValsU.size(); - for (local_ordinal_type j = 0; j < NumU; ++j) + for (LO j = 0; j < NumU; ++j) { - local_ordinal_type col = colValsU[NumU-1-j]; + LO col = colValsU[NumU-1-j]; const_host_little_vec_type prevVal = yBlock.getLocalBlockHost(col, imv, Tpetra::Access::ReadOnly); - const local_ordinal_type matOffset = blockMatSize*(NumU-1-j); - little_block_type uij((typename little_block_type::value_type*) &valsU[matOffset], blockSize_, rowStride); + const LO matOffset = blockMatSize*(NumU-1-j); + little_block_host_type uij((typename little_block_host_type::value_type*) &valsU[matOffset], blockSize_, rowStride); //yval.matvecUpdate(-one, uij, prevVal); Tpetra::GEMV (-one, uij, prevVal, yval); diff --git a/packages/ifpack2/src/Ifpack2_ILUT_decl.hpp b/packages/ifpack2/src/Ifpack2_ILUT_decl.hpp index 553d70478043..400687d68d34 100644 --- a/packages/ifpack2/src/Ifpack2_ILUT_decl.hpp +++ 
b/packages/ifpack2/src/Ifpack2_ILUT_decl.hpp @@ -123,6 +123,14 @@ class ILUT : global_ordinal_type, node_type> row_matrix_type; + typedef typename row_matrix_type::global_inds_host_view_type global_inds_host_view_type; + typedef typename row_matrix_type::local_inds_host_view_type local_inds_host_view_type; + typedef typename row_matrix_type::values_host_view_type values_host_view_type; + + typedef typename row_matrix_type::nonconst_global_inds_host_view_type nonconst_global_inds_host_view_type; + typedef typename row_matrix_type::nonconst_local_inds_host_view_type nonconst_local_inds_host_view_type; + typedef typename row_matrix_type::nonconst_values_host_view_type nonconst_values_host_view_type; + static_assert(std::is_same::value, "Ifpack2::ILUT: The template parameter MatrixType must be a Tpetra::RowMatrix specialization. Please don't use Tpetra::CrsMatrix (a subclass of Tpetra::RowMatrix) here anymore. The constructor can take either a RowMatrix or a CrsMatrix just fine."); //! Type of the Tpetra::CrsMatrix specialization that this class uses for the L and U factors. diff --git a/packages/ifpack2/src/Ifpack2_ILUT_def.hpp b/packages/ifpack2/src/Ifpack2_ILUT_def.hpp index 6bbab2d5beae..63d962a5b822 100644 --- a/packages/ifpack2/src/Ifpack2_ILUT_def.hpp +++ b/packages/ifpack2/src/Ifpack2_ILUT_def.hpp @@ -491,18 +491,17 @@ void ILUT::compute () // =================== // // start factorization // // =================== // - - ArrayRCP ColIndicesARCP; - ArrayRCP ColValuesARCP; + nonconst_local_inds_host_view_type ColIndicesARCP; + nonconst_values_host_view_type ColValuesARCP; if (! 
A_local_->supportsRowViews ()) { const size_t maxnz = A_local_->getNodeMaxNumRowEntries (); - ColIndicesARCP.resize (maxnz); - ColValuesARCP.resize (maxnz); + Kokkos::resize(ColIndicesARCP,maxnz); + Kokkos::resize(ColValuesARCP,maxnz); } for (local_ordinal_type row_i = 0 ; row_i < myNumRows ; ++row_i) { - ArrayView ColIndicesA; - ArrayView ColValuesA; + local_inds_host_view_type ColIndicesA; + values_host_view_type ColValuesA; size_t RowNnz; if (A_local_->supportsRowViews ()) { @@ -510,9 +509,9 @@ void ILUT::compute () RowNnz = ColIndicesA.size (); } else { - A_local_->getLocalRowCopy (row_i, ColIndicesARCP (), ColValuesARCP (), RowNnz); - ColIndicesA = ColIndicesARCP (0, RowNnz); - ColValuesA = ColValuesARCP (0, RowNnz); + A_local_->getLocalRowCopy (row_i, ColIndicesARCP, ColValuesARCP, RowNnz); + ColIndicesA = Kokkos::subview(ColIndicesARCP,std::make_pair((size_t)0, RowNnz)); + ColValuesA = Kokkos::subview(ColValuesARCP,std::make_pair((size_t)0, RowNnz)); } // Always include the diagonal in the U factor. The value should get @@ -612,7 +611,7 @@ void ILUT::compute () // Put indices and values for L into arrays and then into the L_ matrix. 
// first, the original entries from the L section of A: - for (size_type i = 0; i < ColIndicesA.size (); ++i) { + for (size_type i = 0; i < (size_type)ColIndicesA.size (); ++i) { if (ColIndicesA[i] < row_i) { L_tmp_idx[row_i].push_back(ColIndicesA[i]); L_tmpv[row_i].push_back(cur_row[ColIndicesA[i]]); diff --git a/packages/ifpack2/src/Ifpack2_IlukGraph.hpp b/packages/ifpack2/src/Ifpack2_IlukGraph.hpp index 9242bb5c70b0..cac7ae4a167b 100644 --- a/packages/ifpack2/src/Ifpack2_IlukGraph.hpp +++ b/packages/ifpack2/src/Ifpack2_IlukGraph.hpp @@ -58,6 +58,7 @@ #include #include #include +#include #include #include #include @@ -111,6 +112,13 @@ class IlukGraph : public Teuchos::Describable { global_ordinal_type, node_type> crs_graph_type; + + + typedef typename crs_graph_type::nonconst_global_inds_host_view_type nonconst_global_inds_host_view_type; + typedef typename crs_graph_type::nonconst_local_inds_host_view_type nonconst_local_inds_host_view_type; + typedef typename crs_graph_type::global_inds_host_view_type global_inds_host_view_type; + typedef typename crs_graph_type::local_inds_host_view_type local_inds_host_view_type; + /// \brief Constructor. 
/// /// Create a IlukGraph object using the input graph and specified @@ -280,34 +288,39 @@ void IlukGraph::initialize() using device_type = typename node_type::device_type; using execution_space = typename device_type::execution_space; - Kokkos::DualView numEntPerRow("numEntPerRow", NumMyRows); - auto numEntPerRow_d = numEntPerRow.template view(); - auto localOverlapGraph = OverlapGraph_->getLocalGraph(); + using dual_view_type = Kokkos::DualView; + dual_view_type numEntPerRow_dv("numEntPerRow",NumMyRows); + Tpetra::Details::WrappedDualView numEntPerRow(numEntPerRow_dv); const auto overalloc = Overalloc_; const auto levelfill = LevelFill_; - numEntPerRow.sync_device(); - numEntPerRow.modify_device(); - Kokkos::parallel_for("CountOverlapGraphRowEntries", - Kokkos::RangePolicy(0, NumMyRows), - KOKKOS_LAMBDA(const int i) - { - // Heuristic to get the maximum number of entries per row. - int RowMaxNumIndices = localOverlapGraph.rowConst(i).length; - numEntPerRow_d(i) = (levelfill == 0) ? RowMaxNumIndices // No additional storage needed - : ceil(static_cast(RowMaxNumIndices) - * pow(overalloc, levelfill)); - }); + { + // Scoping for the localOverlapGraph access + auto numEntPerRow_d = numEntPerRow.getDeviceView(Tpetra::Access::OverwriteAll); + auto localOverlapGraph = OverlapGraph_->getLocalGraphDevice(); + Kokkos::parallel_for("CountOverlapGraphRowEntries", + Kokkos::RangePolicy(0, NumMyRows), + KOKKOS_LAMBDA(const int i) + { + // Heuristic to get the maximum number of entries per row. + int RowMaxNumIndices = localOverlapGraph.rowConst(i).length; + numEntPerRow_d(i) = (levelfill == 0) ? 
RowMaxNumIndices // No additional storage needed + : ceil(static_cast(RowMaxNumIndices) + * pow(overalloc, levelfill)); + }); + + }; bool insertError; // No error found yet while inserting entries do { insertError = false; + Teuchos::ArrayView a_numEntPerRow(numEntPerRow.getHostView(Tpetra::Access::ReadOnly).data(),NumMyRows); L_Graph_ = rcp (new crs_graph_type (OverlapGraph_->getRowMap (), OverlapGraph_->getRowMap (), - numEntPerRow)); + a_numEntPerRow)); U_Graph_ = rcp (new crs_graph_type (OverlapGraph_->getRowMap (), OverlapGraph_->getRowMap (), - numEntPerRow)); + a_numEntPerRow)); Array L (MaxNumIndices); Array U (MaxNumIndices); @@ -317,7 +330,7 @@ void IlukGraph::initialize() NumMyDiagonals_ = 0; for (int i = 0; i< NumMyRows; ++i) { - ArrayView my_indices; + local_inds_host_view_type my_indices; OverlapGraph_->getLocalRowView (i, my_indices); // Split into L and U (we don't assume that indices are ordered). @@ -352,12 +365,10 @@ void IlukGraph::initialize() ++NumMyDiagonals_; } if (NumL) { - ArrayView L_view = L.view (0, NumL); - L_Graph_->insertLocalIndices (i, L_view); + L_Graph_->insertLocalIndices (i, NumL, L.data()); } if (NumU) { - ArrayView U_view = U.view (0, NumU); - U_Graph_->insertLocalIndices (i, U_view); + U_Graph_->insertLocalIndices (i, NumU, U.data()); } } @@ -394,16 +405,16 @@ void IlukGraph::initialize() size_t LenL = L_Graph_->getNumEntriesInLocalRow(i); size_t LenU = U_Graph_->getNumEntriesInLocalRow(i); size_t Len = LenL + LenU + 1; - CurrentRow.resize(Len); - - L_Graph_->getLocalRowCopy(i, CurrentRow(), LenL); // Get L Indices + nonconst_local_inds_host_view_type CurrentRow_view(CurrentRow.data(),CurrentRow.size()); + L_Graph_->getLocalRowCopy(i, CurrentRow_view, LenL); // Get L Indices CurrentRow[LenL] = i; // Put in Diagonal if (LenU > 0) { - ArrayView URowView = CurrentRow.view (LenL+1, - LenU); + ArrayView URowView = CurrentRow.view (LenL+1,LenU); + nonconst_local_inds_host_view_type URowView_v(URowView.data(),URowView.size()); + // 
Get U Indices - U_Graph_->getLocalRowCopy (i, URowView, LenU); + U_Graph_->getLocalRowCopy (i, URowView_v, LenU); } // Construct linked list for current row @@ -425,7 +436,7 @@ void IlukGraph::initialize() int NextInList = LinkList[Next]; int RowU = Next; // Get Indices for this row of U - ArrayView IndicesU; + local_inds_host_view_type IndicesU; U_Graph_->getLocalRowView (RowU, IndicesU); // FIXME (mfh 23 Dec 2013) size() returns ptrdiff_t, not int. int LengthRowU = IndicesU.size (); @@ -464,15 +475,13 @@ void IlukGraph::initialize() } // Put pattern into L and U - - CurrentRow.resize (0); + CurrentRow.resize(0); Next = First; // Lower - while (Next < i) { - CurrentRow.push_back (Next); + CurrentRow.push_back(Next); Next = LinkList[Next]; } @@ -481,7 +490,7 @@ void IlukGraph::initialize() // particular, it does not actually change the column Map. L_Graph_->removeLocalIndices (i); // Delete current set of Indices if (CurrentRow.size() > 0) { - L_Graph_->insertLocalIndices (i, CurrentRow ()); + L_Graph_->insertLocalIndices (i, CurrentRow.size(),CurrentRow.data()); } // Diagonal @@ -494,8 +503,7 @@ void IlukGraph::initialize() Next = LinkList[Next]; // Upper - - CurrentRow.resize (0); + CurrentRow.resize(0); LenU = 0; while (Next < NumMyRows) { @@ -511,7 +519,7 @@ void IlukGraph::initialize() U_Graph_->removeLocalIndices (i); // Delete current set of Indices if (LenU > 0) { - U_Graph_->insertLocalIndices (i, CurrentRow ()); + U_Graph_->insertLocalIndices (i, CurrentRow.size(),CurrentRow.data()); } // Allocate and fill Level info for this row @@ -523,8 +531,7 @@ void IlukGraph::initialize() } catch (std::runtime_error &e) { insertError = true; - numEntPerRow.sync_device(); - numEntPerRow.modify_device(); + auto numEntPerRow_d = numEntPerRow.getDeviceView(Tpetra::Access::OverwriteAll); Kokkos::parallel_for("CountOverlapGraphRowEntries", Kokkos::RangePolicy(0, NumMyRows), KOKKOS_LAMBDA(const int i) @@ -564,11 +571,11 @@ void IlukGraph::initialize(const Teuchos::RCP 
lno_row_view_t; typedef typename Kokkos::View lno_nonzero_view_t; @@ -578,7 +585,7 @@ void IlukGraph::initialize(const Teuchos::RCPgetRowMap()->getNodeNumElements(); - auto localOverlapGraph = OverlapGraph_->getLocalGraph(); + auto localOverlapGraph = OverlapGraph_->getLocalGraphDevice(); if (KernelHandle->get_spiluk_handle()->get_nrows() < static_cast(NumMyRows)) { KernelHandle->get_spiluk_handle()->reset_handle(NumMyRows, diff --git a/packages/ifpack2/src/Ifpack2_LinePartitioner_decl.hpp b/packages/ifpack2/src/Ifpack2_LinePartitioner_decl.hpp index bb882a8a810b..1678526e5391 100644 --- a/packages/ifpack2/src/Ifpack2_LinePartitioner_decl.hpp +++ b/packages/ifpack2/src/Ifpack2_LinePartitioner_decl.hpp @@ -82,6 +82,8 @@ class LinePartitioner : public OverlappingPartitioner { typedef Tpetra::RowGraph row_graph_type; typedef Tpetra::MultiVector multivector_type; + typedef typename row_graph_type::nonconst_global_inds_host_view_type nonconst_global_inds_host_view_type; + typedef typename row_graph_type::nonconst_local_inds_host_view_type nonconst_local_inds_host_view_type; //! Constructor. 
LinePartitioner(const Teuchos::RCP& graph); diff --git a/packages/ifpack2/src/Ifpack2_LinePartitioner_def.hpp b/packages/ifpack2/src/Ifpack2_LinePartitioner_def.hpp index 7bf97cc206e9..74fa0f10b927 100644 --- a/packages/ifpack2/src/Ifpack2_LinePartitioner_def.hpp +++ b/packages/ifpack2/src/Ifpack2_LinePartitioner_def.hpp @@ -122,7 +122,7 @@ int LinePartitioner::Compute_Blocks_AutoLine(Teuchos::ArrayVie size_t N = this->Graph_->getNodeNumRows(); size_t allocated_space = this->Graph_->getNodeMaxNumRowEntries(); - Teuchos::Array cols(allocated_space); + nonconst_local_inds_host_view_type cols("cols",allocated_space); Teuchos::Array indices(allocated_space); Teuchos::Array dist(allocated_space); @@ -137,7 +137,7 @@ int LinePartitioner::Compute_Blocks_AutoLine(Teuchos::ArrayVie if(blockIndices[i] != invalid) continue; // Get neighbors and sort by distance - this->Graph_->getLocalRowCopy(i,cols(),nz); + this->Graph_->getLocalRowCopy(i,cols,nz); double x0 = (!xvals.is_null()) ? xvals[i/NumEqns_] : zero; double y0 = (!yvals.is_null()) ? yvals[i/NumEqns_] : zero; double z0 = (!zvals.is_null()) ? zvals[i/NumEqns_] : zero; @@ -190,7 +190,8 @@ void LinePartitioner::local_automatic_line_search(int NumEqns, size_t N = this->Graph_->getNodeNumRows(); size_t allocated_space = this->Graph_->getNodeMaxNumRowEntries(); - Teuchos::ArrayView cols = itemp(); + + nonconst_local_inds_host_view_type cols(itemp.data(),allocated_space); Teuchos::ArrayView indices = itemp.view(allocated_space,allocated_space); Teuchos::ArrayView dist= dtemp(); @@ -199,7 +200,7 @@ void LinePartitioner::local_automatic_line_search(int NumEqns, size_t nz=0; LO neighbors_in_line=0; - this->Graph_->getLocalRowCopy(next,cols(),nz); + this->Graph_->getLocalRowCopy(next,cols,nz); double x0 = (!xvals.is_null()) ? xvals[next/NumEqns_] : zero; double y0 = (!yvals.is_null()) ? yvals[next/NumEqns_] : zero; double z0 = (!zvals.is_null()) ? 
zvals[next/NumEqns_] : zero; diff --git a/packages/ifpack2/src/Ifpack2_LocalFilter_decl.hpp b/packages/ifpack2/src/Ifpack2_LocalFilter_decl.hpp index 4b6bf895def3..444f0d12cb43 100644 --- a/packages/ifpack2/src/Ifpack2_LocalFilter_decl.hpp +++ b/packages/ifpack2/src/Ifpack2_LocalFilter_decl.hpp @@ -189,9 +189,21 @@ class LocalFilter : //! The Node type used by the input MatrixType. typedef typename MatrixType::node_type node_type; + + typedef typename MatrixType::global_inds_host_view_type global_inds_host_view_type; + typedef typename MatrixType::local_inds_host_view_type local_inds_host_view_type; + typedef typename MatrixType::values_host_view_type values_host_view_type; + + typedef typename MatrixType::nonconst_global_inds_host_view_type nonconst_global_inds_host_view_type; + typedef typename MatrixType::nonconst_local_inds_host_view_type nonconst_local_inds_host_view_type; + typedef typename MatrixType::nonconst_values_host_view_type nonconst_values_host_view_type; + + //! The type of the magnitude (absolute value) of a matrix entry. typedef typename Teuchos::ScalarTraits::magnitudeType magnitude_type; + + //! Type of the Tpetra::RowMatrix specialization that this class uses. typedef Tpetra::RowMatrixTeuchos::OrdinalTraits::invalid() /// on output. + virtual void + getGlobalRowCopy (global_ordinal_type GlobalRow, + nonconst_global_inds_host_view_type &Indices, + nonconst_values_host_view_type &Values, + size_t& NumEntries) const; +#ifdef TPETRA_ENABLE_DEPRECATED_CODE virtual void getGlobalRowCopy (global_ordinal_type GlobalRow, const Teuchos::ArrayView &Indices, const Teuchos::ArrayView &Values, size_t &NumEntries) const; +#endif /// \brief Get the entries in the given row, using local indices. /// @@ -348,12 +367,19 @@ class LocalFilter : /// process, then \c Indices and \c Values are unchanged and /// \c NumIndices is Teuchos::OrdinalTraits::invalid() /// on output. 
+ virtual void + getLocalRowCopy (local_ordinal_type LocalRow, + nonconst_local_inds_host_view_type &Indices, + nonconst_values_host_view_type &Values, + size_t& NumEntries) const; +#ifdef TPETRA_ENABLE_DEPRECATED_CODE + virtual void getLocalRowCopy (local_ordinal_type LocalRow, const Teuchos::ArrayView &Indices, const Teuchos::ArrayView &Values, size_t &NumEntries) const ; - +#endif //! Extract a const, non-persisting view of global indices in a specified row of the matrix. /*! \param GlobalRow [in] Global row number for which indices are desired. @@ -365,10 +391,16 @@ class LocalFilter : Note: If \c GlobalRow does not belong to this node, then \c indices is set to null. */ + virtual void + getGlobalRowView (global_ordinal_type GlobalRow, + global_inds_host_view_type &indices, + values_host_view_type &values) const; +#ifdef TPETRA_ENABLE_DEPRECATED_CODE virtual void getGlobalRowView (global_ordinal_type GlobalRow, Teuchos::ArrayView &indices, Teuchos::ArrayView &values) const; +#endif //! Extract a const, non-persisting view of local indices in a specified row of the matrix. /*! @@ -381,11 +413,16 @@ class LocalFilter : Note: If \c LocalRow does not belong to this node, then \c indices is set to null. */ + virtual void + getLocalRowView (local_ordinal_type LocalRow, + local_inds_host_view_type & indices, + values_host_view_type & values) const; +#ifdef TPETRA_ENABLE_DEPRECATED_CODE virtual void getLocalRowView (local_ordinal_type LocalRow, Teuchos::ArrayView &indices, Teuchos::ArrayView &values) const; - +#endif /// \brief Get the diagonal entries of the (locally filtered) matrix. /// /// \param diag [in/out] On input: a Tpetra::Vector whose Map is the @@ -508,11 +545,12 @@ class LocalFilter : //! NumEntries_[i] contains the nonzero entries in row `i'. std::vector NumEntries_; - //! Temporary array used in getLocalRowCopy(). - mutable Teuchos::Array localIndices_; + //! Used in ExtractMyRowCopy, to avoid allocation each time. 
+ mutable nonconst_local_inds_host_view_type localIndices_; + mutable nonconst_local_inds_host_view_type localIndicesForGlobalCopy_; + //! Used in ExtractMyRowCopy, to avoid allocation each time. + mutable nonconst_values_host_view_type Values_; - //! Temporary array used in getLocalRowCopy(). - mutable Teuchos::Array Values_; };// class LocalFilter }// namespace Ifpack2 diff --git a/packages/ifpack2/src/Ifpack2_LocalFilter_def.hpp b/packages/ifpack2/src/Ifpack2_LocalFilter_def.hpp index 5cd4b100cca7..edadd5e192c1 100644 --- a/packages/ifpack2/src/Ifpack2_LocalFilter_def.hpp +++ b/packages/ifpack2/src/Ifpack2_LocalFilter_def.hpp @@ -177,8 +177,9 @@ LocalFilter (const Teuchos::RCP& A) : MaxNumEntriesA_ = A_->getNodeMaxNumRowEntries (); // Allocate temporary arrays for getLocalRowCopy(). - localIndices_.resize (MaxNumEntries_); - Values_.resize (MaxNumEntries_); + Kokkos::resize(localIndices_,MaxNumEntries_); + Kokkos::resize(localIndicesForGlobalCopy_,MaxNumEntries_); + Kokkos::resize(Values_,MaxNumEntries_); // now compute: // - the number of nonzero per row @@ -427,10 +428,10 @@ bool LocalFilter::isFillComplete () const template void LocalFilter:: -getGlobalRowCopy (global_ordinal_type globalRow, - const Teuchos::ArrayView& globalIndices, - const Teuchos::ArrayView& values, - size_t& numEntries) const + getGlobalRowCopy (global_ordinal_type globalRow, + nonconst_global_inds_host_view_type &globalIndices, + nonconst_values_host_view_type &values, + size_t& numEntries) const { typedef local_ordinal_type LO; typedef typename Teuchos::Array::size_type size_type; @@ -452,29 +453,44 @@ getGlobalRowCopy (global_ordinal_type globalRow, // FIXME (mfh 26 Mar 2014) If local_ordinal_type == // global_ordinal_type, we could just alias the input array // instead of allocating a temporary array. 
- Teuchos::Array localIndices (numEntries); - this->getLocalRowCopy (localRow, localIndices (), values, numEntries); + + // In this case, getLocalRowCopy *does* use the localIndices_, so we use a second temp array + this->getLocalRowCopy (localRow, localIndicesForGlobalCopy_, values, numEntries); const map_type& colMap = * (this->getColMap ()); // Don't fill the output array beyond its size. const size_type numEnt = std::min (static_cast (numEntries), - std::min (globalIndices.size (), values.size ())); + std::min ((size_type)globalIndices.size (), (size_type)values.size ())); for (size_type k = 0; k < numEnt; ++k) { - globalIndices[k] = colMap.getGlobalElement (localIndices[k]); + globalIndices[k] = colMap.getGlobalElement (localIndicesForGlobalCopy_[k]); } } } +#ifdef TPETRA_ENABLE_DEPRECATED_CODE +template +void +LocalFilter:: +getGlobalRowCopy (global_ordinal_type globalRow, + const Teuchos::ArrayView& Indices, + const Teuchos::ArrayView& Values, + size_t& numEntries) const { + using IST = typename row_matrix_type::impl_scalar_type; + nonconst_global_inds_host_view_type ind_in(Indices.data(),Indices.size()); + nonconst_values_host_view_type val_in(reinterpret_cast(Values.data()),Values.size()); + getGlobalRowCopy(globalRow,ind_in,val_in,numEntries); +} +#endif template void LocalFilter:: getLocalRowCopy (local_ordinal_type LocalRow, - const Teuchos::ArrayView &Indices, - const Teuchos::ArrayView &Values, - size_t &NumEntries) const + nonconst_local_inds_host_view_type &Indices, + nonconst_values_host_view_type &Values, + size_t& NumEntries) const { typedef local_ordinal_type LO; typedef global_ordinal_type GO; @@ -486,7 +502,7 @@ getLocalRowCopy (local_ordinal_type LocalRow, } if (A_->getRowMap()->getComm()->getSize() == 1) { - A_->getLocalRowCopy (LocalRow, Indices (), Values (), NumEntries); + A_->getLocalRowCopy (LocalRow, Indices, Values, NumEntries); return; } @@ -524,7 +540,7 @@ getLocalRowCopy (local_ordinal_type LocalRow, // column indices. 
CrsMatrix could take a set of column indices, // and return their corresponding values. size_t numEntInMat = 0; - A_->getLocalRowCopy (LocalRow, localIndices_ (), Values_ (), numEntInMat); + A_->getLocalRowCopy (LocalRow, localIndices_, Values_ , numEntInMat); // Fill the user's arrays with the "local" indices and values in // that row. Note that the matrix might have a different column Map @@ -573,7 +589,35 @@ getLocalRowCopy (local_ordinal_type LocalRow, } } +#ifdef TPETRA_ENABLE_DEPRECATED_CODE +template +void +LocalFilter:: +getLocalRowCopy (local_ordinal_type globalRow, + const Teuchos::ArrayView &Indices, + const Teuchos::ArrayView &Values, + size_t &NumEntries) const +{ + using IST = typename row_matrix_type::impl_scalar_type; + nonconst_local_inds_host_view_type ind_in(Indices.data(),Indices.size()); + nonconst_values_host_view_type val_in(reinterpret_cast(Values.data()),Values.size()); + getLocalRowCopy(globalRow,ind_in,val_in,NumEntries); +} +#endif + +template +void +LocalFilter:: +getGlobalRowView (global_ordinal_type /*GlobalRow*/, + global_inds_host_view_type &/*indices*/, + values_host_view_type &/*values*/) const +{ + TEUCHOS_TEST_FOR_EXCEPTION(true, std::runtime_error, + "Ifpack2::LocalFilter does not implement getGlobalRowView."); +} + +#ifdef TPETRA_ENABLE_DEPRECATED_CODE template void LocalFilter:: @@ -584,8 +628,21 @@ getGlobalRowView (global_ordinal_type /* GlobalRow */, TEUCHOS_TEST_FOR_EXCEPTION(true, std::runtime_error, "Ifpack2::LocalFilter does not implement getGlobalRowView."); } +#endif + +template +void +LocalFilter:: +getLocalRowView (local_ordinal_type /*LocalRow*/, + local_inds_host_view_type &/*indices*/, + values_host_view_type &/*values*/) const +{ + TEUCHOS_TEST_FOR_EXCEPTION(true, std::runtime_error, + "Ifpack2::LocalFilter does not implement getLocalRowView."); +} +#ifdef TPETRA_ENABLE_DEPRECATED_CODE template void LocalFilter:: @@ -596,6 +653,7 @@ getLocalRowView (local_ordinal_type /* LocalRow */, 
TEUCHOS_TEST_FOR_EXCEPTION(true, std::runtime_error, "Ifpack2::LocalFilter does not implement getLocalRowView."); } +#endif template @@ -738,13 +796,14 @@ applyNonAliased (const Tpetra::MultiVector(Values_.data()); if (mode == Teuchos::NO_TRANS) { for (size_t j = 0; j < Nnz; ++j) { const local_ordinal_type col = localIndices_[j]; for (size_t k = 0; k < NumVectors; ++k) { y_ptr[i + y_stride*k] += - alpha * Values_[j] * x_ptr[col + x_stride*k]; + alpha * Values[j] * x_ptr[col + x_stride*k]; } } } @@ -753,7 +812,7 @@ applyNonAliased (const Tpetra::MultiVector(Values_.data()); if (mode == Teuchos::NO_TRANS) { for (size_t k = 0; k < NumVectors; ++k) { ArrayView x_local = (x_ptr())[k](); ArrayView y_local = (y_ptr())[k](); for (size_t j = 0; j < Nnz; ++j) { - y_local[i] += alpha * Values_[j] * x_local[localIndices_[j]]; + y_local[i] += alpha * Values[j] * x_local[localIndices_[j]]; } } } @@ -792,7 +852,7 @@ applyNonAliased (const Tpetra::MultiVector x_local = (x_ptr())[k](); ArrayView y_local = (y_ptr())[k](); for (size_t j = 0; j < Nnz; ++j) { - y_local[localIndices_[j]] += alpha * Values_[j] * x_local[i]; + y_local[localIndices_[j]] += alpha * Values[j] * x_local[i]; } } } @@ -802,7 +862,7 @@ applyNonAliased (const Tpetra::MultiVector y_local = (y_ptr())[k](); for (size_t j = 0; j < Nnz; ++j) { y_local[localIndices_[j]] += - alpha * STS::conjugate (Values_[j]) * x_local[i]; + alpha * STS::conjugate (Values[j]) * x_local[i]; } } } @@ -839,15 +899,15 @@ LocalFilter::getFrobeniusNorm () const typedef typename Teuchos::Array::size_type size_type; const size_type maxNumRowEnt = getNodeMaxNumRowEntries (); - Teuchos::Array ind (maxNumRowEnt); - Teuchos::Array val (maxNumRowEnt); + nonconst_local_inds_host_view_type ind ("ind",maxNumRowEnt); + nonconst_values_host_view_type val ("val",maxNumRowEnt); const size_t numRows = static_cast (localRowMap_->getNodeNumElements ()); // FIXME (mfh 03 Apr 2013) Scale during sum to avoid overflow. 
mag_type sumSquared = STM::zero (); for (size_t i = 0; i < numRows; ++i) { size_t numEntries = 0; - this->getLocalRowCopy (i, ind (), val (), numEntries); + this->getLocalRowCopy (i, ind, val, numEntries); for (size_type k = 0; k < static_cast (numEntries); ++k) { const mag_type v_k_abs = STS::magnitude (val[k]); sumSquared += v_k_abs * v_k_abs; diff --git a/packages/ifpack2/src/Ifpack2_LocalSparseTriangularSolver_decl.hpp b/packages/ifpack2/src/Ifpack2_LocalSparseTriangularSolver_decl.hpp index 0cab7ef2e6d6..22e4c368c8db 100644 --- a/packages/ifpack2/src/Ifpack2_LocalSparseTriangularSolver_decl.hpp +++ b/packages/ifpack2/src/Ifpack2_LocalSparseTriangularSolver_decl.hpp @@ -115,14 +115,14 @@ class LocalSparseTriangularSolver : "either a RowMatrix or a CrsMatrix just fine."); // Use the local matrix types - using local_matrix_type = typename crs_matrix_type::local_matrix_type; - using local_matrix_graph_type = typename local_matrix_type::StaticCrsGraphType; - using lno_row_view_t = typename local_matrix_graph_type::row_map_type; - using lno_nonzero_view_t = typename local_matrix_graph_type::entries_type; - using scalar_nonzero_view_t = typename local_matrix_type::values_type; - using TemporaryMemorySpace = typename local_matrix_graph_type::device_type::memory_space; - using PersistentMemorySpace = typename local_matrix_graph_type::device_type::memory_space; - using HandleExecSpace = typename local_matrix_graph_type::device_type::execution_space; + using local_matrix_device_type = typename crs_matrix_type::local_matrix_device_type; + using local_matrix_graph_device_type = typename local_matrix_device_type::StaticCrsGraphType; + using lno_row_view_t = typename local_matrix_graph_device_type::row_map_type; + using lno_nonzero_view_t = typename local_matrix_graph_device_type::entries_type; + using scalar_nonzero_view_t = typename local_matrix_device_type::values_type; + using TemporaryMemorySpace = typename local_matrix_graph_device_type::device_type::memory_space; + 
using PersistentMemorySpace = typename local_matrix_graph_device_type::device_type::memory_space; + using HandleExecSpace = typename local_matrix_graph_device_type::device_type::execution_space; using k_handle = typename KokkosKernels::Experimental::KokkosKernelsHandle; /// \brief Constructor diff --git a/packages/ifpack2/src/Ifpack2_LocalSparseTriangularSolver_def.hpp b/packages/ifpack2/src/Ifpack2_LocalSparseTriangularSolver_def.hpp index 8851131e6236..023364dfa677 100644 --- a/packages/ifpack2/src/Ifpack2_LocalSparseTriangularSolver_def.hpp +++ b/packages/ifpack2/src/Ifpack2_LocalSparseTriangularSolver_def.hpp @@ -377,7 +377,7 @@ initialize () using Tpetra::Details::determineLocalTriangularStructure; using crs_matrix_type = Tpetra::CrsMatrix; - using local_matrix_type = typename crs_matrix_type::local_matrix_type; + using local_matrix_type = typename crs_matrix_type::local_matrix_device_type; using LO = local_ordinal_type; const char prefix[] = "Ifpack2::LocalSparseTriangularSolver::initialize: "; @@ -411,7 +411,7 @@ initialize () // mfh 30 Apr 2018: See GitHub Issue #2658. 
constexpr bool ignoreMapsForTriStructure = true; auto lclTriStructure = [&] { - auto lclMatrix = A_crs_->getLocalMatrix (); + auto lclMatrix = A_crs_->getLocalMatrixDevice (); auto lclRowMap = A_crs_->getRowMap ()->getLocalMap (); auto lclColMap = A_crs_->getColMap ()->getLocalMap (); auto lclTriStruct = @@ -429,7 +429,7 @@ initialize () if (reverseStorage_ && lclTriStructure.couldBeUpperTriangular && htsImpl_.is_null ()) { // Reverse the storage for an upper triangular matrix - auto Alocal = A_crs_->getLocalMatrix(); + auto Alocal = A_crs_->getLocalMatrixDevice(); auto ptr = Alocal.graph.row_map; auto ind = Alocal.graph.entries; auto val = Alocal.values; @@ -563,7 +563,7 @@ compute () if (Teuchos::nonnull(kh_) && this->isKokkosKernelsSptrsv_) { auto A_crs = Teuchos::rcp_dynamic_cast (A_); - auto Alocal = A_crs->getLocalMatrix(); + auto Alocal = A_crs->getLocalMatrixDevice(); auto ptr = Alocal.graph.row_map; auto ind = Alocal.graph.entries; auto val = Alocal.values; @@ -731,7 +731,7 @@ localTriangularSolve (const MV& Y, if (Teuchos::nonnull(kh_) && this->isKokkosKernelsSptrsv_ && trans == "N") { auto A_crs = Teuchos::rcp_dynamic_cast (this->A_); - auto A_lclk = A_crs->getLocalMatrix (); + auto A_lclk = A_crs->getLocalMatrixDevice (); auto ptr = A_lclk.graph.row_map; auto ind = A_lclk.graph.entries; auto val = A_lclk.values; @@ -753,7 +753,11 @@ localTriangularSolve (const MV& Y, else { const std::string diag = this->diag_; - auto A_lcl = this->A_crs_->getLocalMatrix (); + // NOTE (mfh 20 Aug 2017): KokkosSparse::trsv currently is a + // sequential, host-only code. See + // https://github.com/kokkos/kokkos-kernels/issues/48. 
+ + auto A_lcl = this->A_crs_->getLocalMatrixHost (); if (X.isConstantStride () && Y.isConstantStride ()) { auto X_lcl = X.getLocalViewHost (Tpetra::Access::ReadWrite); diff --git a/packages/ifpack2/src/Ifpack2_OverlappingPartitioner_decl.hpp b/packages/ifpack2/src/Ifpack2_OverlappingPartitioner_decl.hpp index 1b8dcba6995a..6578a7256bc4 100644 --- a/packages/ifpack2/src/Ifpack2_OverlappingPartitioner_decl.hpp +++ b/packages/ifpack2/src/Ifpack2_OverlappingPartitioner_decl.hpp @@ -80,6 +80,9 @@ class OverlappingPartitioner : public Partitioner { typedef typename GraphType::local_ordinal_type local_ordinal_type; typedef typename GraphType::global_ordinal_type global_ordinal_type; typedef typename GraphType::node_type node_type; + typedef typename GraphType::nonconst_global_inds_host_view_type nonconst_global_inds_host_view_type; + typedef typename GraphType::nonconst_local_inds_host_view_type nonconst_local_inds_host_view_type; + typedef Tpetra::RowGraph row_graph_type; //! Constructor. diff --git a/packages/ifpack2/src/Ifpack2_OverlappingPartitioner_def.hpp b/packages/ifpack2/src/Ifpack2_OverlappingPartitioner_def.hpp index 058097930b33..1d732ccf911f 100644 --- a/packages/ifpack2/src/Ifpack2_OverlappingPartitioner_def.hpp +++ b/packages/ifpack2/src/Ifpack2_OverlappingPartitioner_def.hpp @@ -309,10 +309,8 @@ void OverlappingPartitioner::computeOverlappingPartitions() // of row `i'. 
int MaxNumEntries_tmp = Graph_->getNodeMaxNumRowEntries(); - Teuchos::Array Indices; - Indices.resize (MaxNumEntries_tmp); - Teuchos::Array newIndices; - newIndices.resize(MaxNumEntries_tmp); + nonconst_local_inds_host_view_type Indices("Indices",MaxNumEntries_tmp); + nonconst_local_inds_host_view_type newIndices("newIndices",MaxNumEntries_tmp); if (!maintainSparsity_) { @@ -322,7 +320,7 @@ void OverlappingPartitioner::computeOverlappingPartitions() const local_ordinal_type LRID = Parts_[part][i]; size_t numIndices; - Graph_->getLocalRowCopy (LRID, Indices (), numIndices); + Graph_->getLocalRowCopy (LRID, Indices, numIndices); for (size_t j = 0; j < numIndices; ++j) { // use *local* indices only @@ -366,12 +364,12 @@ void OverlappingPartitioner::computeOverlappingPartitions() const local_ordinal_type LRID = Parts_[part][i]; size_t numIndices; - Graph_->getLocalRowCopy (LRID, Indices (), numIndices); + Graph_->getLocalRowCopy (LRID, Indices, numIndices); //JJH: the entries in Indices are already sorted. However, the Tpetra documentation states // that we can't count on this always being true, hence we sort. Also note that there are // unused entries at the end of Indices (it's sized to hold any row). This means we can't // just use Indices.end() in sorting and in std::includes - std::sort(Indices.begin(),Indices.begin()+numIndices); + Tpetra::sort(Indices,numIndices); for (size_t j = 0; j < numIndices; ++j) { // use *local* indices only @@ -389,10 +387,12 @@ void OverlappingPartitioner::computeOverlappingPartitions() // Check if row associated with "col" increases connectivity already defined by row LRID's stencil. // If it does and maintainSparsity_ is true, do not add "col" to the current partition (block). 
size_t numNewIndices; - Graph_->getLocalRowCopy(col, newIndices(), numNewIndices); - std::sort(newIndices.begin(),newIndices.begin()+numNewIndices); - bool isSubset = std::includes(Indices.begin(),Indices.begin()+numIndices, - newIndices.begin(),newIndices.begin()+numNewIndices); + Graph_->getLocalRowCopy(col, newIndices, numNewIndices); + Tpetra::sort(newIndices,numNewIndices); + auto Indices_rcp = Kokkos::Compat::persistingView(Indices, 0, numIndices); + auto newIndices_rcp = Kokkos::Compat::persistingView(newIndices, 0, numNewIndices); + bool isSubset = std::includes(Indices_rcp.begin(),Indices_rcp.begin()+numIndices, + newIndices_rcp.begin(),newIndices_rcp.begin()+numNewIndices); if (isSubset) { tmp[part].push_back (col); } diff --git a/packages/ifpack2/src/Ifpack2_OverlappingRowMatrix_decl.hpp b/packages/ifpack2/src/Ifpack2_OverlappingRowMatrix_decl.hpp index 35c28396cd9f..552a5967e96e 100644 --- a/packages/ifpack2/src/Ifpack2_OverlappingRowMatrix_decl.hpp +++ b/packages/ifpack2/src/Ifpack2_OverlappingRowMatrix_decl.hpp @@ -65,6 +65,14 @@ class OverlappingRowMatrix : typedef typename MatrixType::global_ordinal_type global_ordinal_type; typedef typename MatrixType::node_type node_type; typedef typename Teuchos::ScalarTraits::magnitudeType magnitude_type; + typedef typename MatrixType::global_inds_host_view_type global_inds_host_view_type; + typedef typename MatrixType::local_inds_host_view_type local_inds_host_view_type; + typedef typename MatrixType::values_host_view_type values_host_view_type; + + typedef typename MatrixType::nonconst_global_inds_host_view_type nonconst_global_inds_host_view_type; + typedef typename MatrixType::nonconst_local_inds_host_view_type nonconst_local_inds_host_view_type; + typedef typename MatrixType::nonconst_values_host_view_type nonconst_values_host_view_type; + using row_matrix_type = Tpetra::RowMatrix; @@ -207,12 +215,18 @@ class OverlappingRowMatrix : with row \c GlobalRow. 
If \c GlobalRow does not belong to this node, then \c Indices and \c Values are unchanged and \c NumIndices is returned as Teuchos::OrdinalTraits::invalid(). */ + virtual void + getGlobalRowCopy (global_ordinal_type GlobalRow, + nonconst_global_inds_host_view_type &Indices, + nonconst_values_host_view_type &Values, + size_t& NumEntries) const; +#ifdef TPETRA_ENABLE_DEPRECATED_CODE virtual void getGlobalRowCopy (global_ordinal_type GlobalRow, const Teuchos::ArrayView &Indices, const Teuchos::ArrayView &Values, size_t &NumEntries) const; - +#endif //! Extract a list of entries in a specified local row of the graph. Put into storage allocated by calling routine. /*! \param LocalRow - (In) Local row number for which indices are desired. @@ -224,11 +238,18 @@ class OverlappingRowMatrix : with row \c LocalRow. If \c LocalRow is not valid for this node, then \c Indices and \c Values are unchanged and \c NumIndices is returned as Teuchos::OrdinalTraits::invalid(). */ + virtual void + getLocalRowCopy (local_ordinal_type LocalRow, + nonconst_local_inds_host_view_type &Indices, + nonconst_values_host_view_type &Values, + size_t& NumEntries) const; +#ifdef TPETRA_ENABLE_DEPRECATED_CODE virtual void getLocalRowCopy (local_ordinal_type LocalRow, const Teuchos::ArrayView &Indices, const Teuchos::ArrayView &Values, size_t &NumEntries) const; +#endif //! Extract a const, non-persisting view of global indices in a specified row of the matrix. /*! @@ -240,11 +261,16 @@ class OverlappingRowMatrix : Note: If \c GlobalRow does not belong to this node, then \c indices is set to null. */ + virtual void + getGlobalRowView (global_ordinal_type GlobalRow, + global_inds_host_view_type &indices, + values_host_view_type &values) const; +#ifdef TPETRA_ENABLE_DEPRECATED_CODE virtual void getGlobalRowView (global_ordinal_type GlobalRow, Teuchos::ArrayView &indices, Teuchos::ArrayView &values) const; - +#endif //! 
Extract a const, non-persisting view of local indices in a specified row of the matrix. /*! \param LocalRow - (In) Local row number for which indices are desired. @@ -255,10 +281,16 @@ class OverlappingRowMatrix : Note: If \c LocalRow does not belong to this node, then \c indices is set to null. */ + virtual void + getLocalRowView (local_ordinal_type LocalRow, + local_inds_host_view_type & indices, + values_host_view_type & values) const; +#ifdef TPETRA_ENABLE_DEPRECATED_CODE virtual void getLocalRowView (local_ordinal_type LocalRow, Teuchos::ArrayView &indices, Teuchos::ArrayView &values) const; +#endif //! \brief Get a copy of the diagonal entries owned by this node, with local row indices. /*! Returns a distributed Vector object partitioned according to this matrix's row map, containing the @@ -368,11 +400,11 @@ class OverlappingRowMatrix : //! Graph of the matrix (as returned by getGraph()). Teuchos::RCP graph_; - //! Used in apply(), to avoid allocation each time. - mutable Teuchos::Array Indices_; + mutable nonconst_local_inds_host_view_type Indices_; //! Used in apply(), to avoid allocation each time. 
- mutable Teuchos::Array Values_; + mutable nonconst_values_host_view_type Values_; + }; // class OverlappingRowMatrix diff --git a/packages/ifpack2/src/Ifpack2_OverlappingRowMatrix_def.hpp b/packages/ifpack2/src/Ifpack2_OverlappingRowMatrix_def.hpp index 8e220719cbd3..772825300b76 100644 --- a/packages/ifpack2/src/Ifpack2_OverlappingRowMatrix_def.hpp +++ b/packages/ifpack2/src/Ifpack2_OverlappingRowMatrix_def.hpp @@ -223,8 +223,8 @@ OverlappingRowMatrix (const Teuchos::RCP& A, graph_ = Teuchos::rcp_const_cast (Teuchos::rcp_implicit_cast (graph)); // Resize temp arrays - Indices_.resize (MaxNumEntries_); - Values_.resize (MaxNumEntries_); + Kokkos::resize(Indices_,MaxNumEntries_); + Kokkos::resize(Values_,MaxNumEntries_); } @@ -412,10 +412,10 @@ bool OverlappingRowMatrix::isFillComplete() const template void OverlappingRowMatrix:: -getGlobalRowCopy (global_ordinal_type GlobalRow, - const Teuchos::ArrayView &Indices, - const Teuchos::ArrayView& Values, - size_t& NumEntries) const + getGlobalRowCopy (global_ordinal_type GlobalRow, + nonconst_global_inds_host_view_type &Indices, + nonconst_values_host_view_type &Values, + size_t& NumEntries) const { const local_ordinal_type LocalRow = RowMap_->getLocalElement (GlobalRow); if (LocalRow == Teuchos::OrdinalTraits::invalid ()) { @@ -429,14 +429,27 @@ getGlobalRowCopy (global_ordinal_type GlobalRow, } } +#ifdef TPETRA_ENABLE_DEPRECATED_CODE +template +void OverlappingRowMatrix:: +getGlobalRowCopy (global_ordinal_type GlobalRow, + const Teuchos::ArrayView &Indices, + const Teuchos::ArrayView &Values, + size_t &NumEntries) const { + using IST = typename row_matrix_type::impl_scalar_type; + nonconst_global_inds_host_view_type ind_in(Indices.data(),Indices.size()); + nonconst_values_host_view_type val_in(reinterpret_cast(Values.data()),Values.size()); + getGlobalRowCopy(GlobalRow,ind_in,val_in,NumEntries); +} +#endif template void OverlappingRowMatrix:: -getLocalRowCopy (local_ordinal_type LocalRow, - const Teuchos::ArrayView 
&Indices, - const Teuchos::ArrayView &Values, - size_t &NumEntries) const + getLocalRowCopy (local_ordinal_type LocalRow, + nonconst_local_inds_host_view_type &Indices, + nonconst_values_host_view_type &Values, + size_t& NumEntries) const { using Teuchos::as; const size_t numMyRowsA = A_->getNodeNumRows (); @@ -448,7 +461,42 @@ getLocalRowCopy (local_ordinal_type LocalRow, } } +#ifdef TPETRA_ENABLE_DEPRECATED_CODE +template +void +OverlappingRowMatrix:: +getLocalRowCopy (local_ordinal_type LocalRow, + const Teuchos::ArrayView &Indices, + const Teuchos::ArrayView &Values, + size_t &NumEntries) const +{ + using IST = typename row_matrix_type::impl_scalar_type; + nonconst_local_inds_host_view_type ind_in(Indices.data(),Indices.size()); + nonconst_values_host_view_type val_in(reinterpret_cast(Values.data()),Values.size()); + getLocalRowCopy(LocalRow,ind_in,val_in,NumEntries); +} +#endif +template +void +OverlappingRowMatrix:: +getGlobalRowView (global_ordinal_type GlobalRow, + global_inds_host_view_type &indices, + values_host_view_type &values) const { + const local_ordinal_type LocalRow = RowMap_->getLocalElement (GlobalRow); + if (LocalRow == Teuchos::OrdinalTraits::invalid()) { + indices = global_inds_host_view_type(); + values = values_host_view_type(); + } else { + if (Teuchos::as (LocalRow) < A_->getNodeNumRows ()) { + A_->getGlobalRowView (GlobalRow, indices, values); + } else { + ExtMatrix_->getGlobalRowView (GlobalRow, indices, values); + } + } +} + +#ifdef TPETRA_ENABLE_DEPRECATED_CODE template void OverlappingRowMatrix:: @@ -468,8 +516,26 @@ getGlobalRowView (global_ordinal_type GlobalRow, } } } +#endif + +template +void +OverlappingRowMatrix:: + getLocalRowView (local_ordinal_type LocalRow, + local_inds_host_view_type & indices, + values_host_view_type & values) const { + using Teuchos::as; + const size_t numMyRowsA = A_->getNodeNumRows (); + if (as (LocalRow) < numMyRowsA) { + A_->getLocalRowView (LocalRow, indices, values); + } else { + 
ExtMatrix_->getLocalRowView (LocalRow - as (numMyRowsA), + indices, values); + } +} +#ifdef TPETRA_ENABLE_DEPRECATED_CODE template void OverlappingRowMatrix:: @@ -486,6 +552,7 @@ getLocalRowView (local_ordinal_type LocalRow, indices, values); } } +#endif template @@ -776,8 +843,8 @@ void OverlappingRowMatrix::describe(Teuchos::FancyOStream &out, << std::setw(width) << nE; if (vl == VERB_EXTREME) { if (isGloballyIndexed()) { - ArrayView rowinds; - ArrayView rowvals; + global_inds_host_view_type rowinds; + values_host_view_type rowvals; getGlobalRowView (gid, rowinds, rowvals); for (size_t j = 0; j < nE; ++j) { out << " (" << rowinds[j] @@ -786,8 +853,8 @@ void OverlappingRowMatrix::describe(Teuchos::FancyOStream &out, } } else if (isLocallyIndexed()) { - ArrayView rowinds; - ArrayView rowvals; + local_inds_host_view_type rowinds; + values_host_view_type rowvals; getLocalRowView (r, rowinds, rowvals); for (size_t j=0; j < nE; ++j) { out << " (" << getColMap()->getGlobalElement(rowinds[j]) diff --git a/packages/ifpack2/src/Ifpack2_Parameters.cpp b/packages/ifpack2/src/Ifpack2_Parameters.cpp index dcd15337c3dd..519d9db3a4c4 100644 --- a/packages/ifpack2/src/Ifpack2_Parameters.cpp +++ b/packages/ifpack2/src/Ifpack2_Parameters.cpp @@ -132,6 +132,7 @@ void getValidParameters(Teuchos::ParameterList& params) params.set("relaxation: banded container superdiagonals", -1); params.set("relaxation: banded container subdiagonals", -1); params.set("relaxation: mtgs cluster size", 1); + params.set("relaxation: long row threshold", 0); // Ifpack2_SPARSKIT.cpp // ap 25 May 2016: all SPARSKIT for backwards compatibility ONLY diff --git a/packages/ifpack2/src/Ifpack2_RILUK_decl.hpp b/packages/ifpack2/src/Ifpack2_RILUK_decl.hpp index a7be84980ff3..7a067f21a39d 100644 --- a/packages/ifpack2/src/Ifpack2_RILUK_decl.hpp +++ b/packages/ifpack2/src/Ifpack2_RILUK_decl.hpp @@ -294,17 +294,27 @@ class RILUK: template friend class RILUK; + typedef typename 
crs_matrix_type::global_inds_host_view_type global_inds_host_view_type; + typedef typename crs_matrix_type::local_inds_host_view_type local_inds_host_view_type; + typedef typename crs_matrix_type::values_host_view_type values_host_view_type; + + + typedef typename crs_matrix_type::nonconst_global_inds_host_view_type nonconst_global_inds_host_view_type; + typedef typename crs_matrix_type::nonconst_local_inds_host_view_type nonconst_local_inds_host_view_type; + typedef typename crs_matrix_type::nonconst_values_host_view_type nonconst_values_host_view_type; + + //@} //! \name Implementation of Kokkos Kernels ILU(k). //@{ - typedef typename crs_matrix_type::local_matrix_type local_matrix_type; - typedef typename local_matrix_type::StaticCrsGraphType::row_map_type lno_row_view_t; - typedef typename local_matrix_type::StaticCrsGraphType::entries_type lno_nonzero_view_t; - typedef typename local_matrix_type::values_type scalar_nonzero_view_t; - typedef typename local_matrix_type::StaticCrsGraphType::device_type::memory_space TemporaryMemorySpace; - typedef typename local_matrix_type::StaticCrsGraphType::device_type::memory_space PersistentMemorySpace; - typedef typename local_matrix_type::StaticCrsGraphType::device_type::execution_space HandleExecSpace; + typedef typename crs_matrix_type::local_matrix_device_type local_matrix_device_type; + typedef typename local_matrix_device_type::StaticCrsGraphType::row_map_type lno_row_view_t; + typedef typename local_matrix_device_type::StaticCrsGraphType::entries_type lno_nonzero_view_t; + typedef typename local_matrix_device_type::values_type scalar_nonzero_view_t; + typedef typename local_matrix_device_type::StaticCrsGraphType::device_type::memory_space TemporaryMemorySpace; + typedef typename local_matrix_device_type::StaticCrsGraphType::device_type::memory_space PersistentMemorySpace; + typedef typename local_matrix_device_type::StaticCrsGraphType::device_type::execution_space HandleExecSpace; typedef typename 
KokkosKernels::Experimental::KokkosKernelsHandle kk_handle_type; diff --git a/packages/ifpack2/src/Ifpack2_RILUK_def.hpp b/packages/ifpack2/src/Ifpack2_RILUK_def.hpp index c404695ea131..335513595984 100644 --- a/packages/ifpack2/src/Ifpack2_RILUK_def.hpp +++ b/packages/ifpack2/src/Ifpack2_RILUK_def.hpp @@ -47,6 +47,8 @@ #include "Ifpack2_LocalSparseTriangularSolver.hpp" #include "Ifpack2_Details_getParamTryingTypes.hpp" #include "Kokkos_Sort.hpp" +#include "KokkosKernels_SparseUtils.hpp" +#include "KokkosKernels_Sorting.hpp" namespace Ifpack2 { @@ -522,14 +524,12 @@ void RILUK::initialize () A_local_->getColMap (), entriesPerRow())); // copy entries into A_local_crs - Teuchos::Array indices(A_local_->getNodeMaxNumRowEntries()); - Teuchos::Array values(A_local_->getNodeMaxNumRowEntries()); + nonconst_local_inds_host_view_type indices("indices",A_local_->getNodeMaxNumRowEntries()); + nonconst_values_host_view_type values("values",A_local_->getNodeMaxNumRowEntries()); for(local_ordinal_type i = 0; i < numRows; i++) { size_t numEntries = 0; - A_local_->getLocalRowCopy(i, indices(), values(), numEntries); - ArrayView indicesInsert(indices.data(), numEntries); - ArrayView valuesInsert(values.data(), numEntries); - A_local_crs_nc->insertLocalValues(i, indicesInsert, valuesInsert); + A_local_->getLocalRowCopy(i, indices, values, numEntries); + A_local_crs_nc->insertLocalValues(i, numEntries, reinterpret_cast(values.data()), indices.data()); } A_local_crs_nc->fillComplete (A_local_->getDomainMap (), A_local_->getRangeMap ()); A_local_crs = rcp_const_cast (A_local_crs_nc); @@ -604,10 +604,10 @@ initAllValues (const row_matrix_type& A) // Allocate temporary space for extracting the strictly // lower and upper parts of the matrix A. 
- Teuchos::Array InI(MaxNumEntries); + nonconst_local_inds_host_view_type InI("InI",MaxNumEntries); Teuchos::Array LI(MaxNumEntries); Teuchos::Array UI(MaxNumEntries); - Teuchos::Array InV(MaxNumEntries); + nonconst_values_host_view_type InV("InV",MaxNumEntries); Teuchos::Array LV(MaxNumEntries); Teuchos::Array UV(MaxNumEntries); @@ -640,7 +640,7 @@ initAllValues (const row_matrix_type& A) //TODO JJH 4April2014 An optimization is to use getLocalRowView. Not all matrices support this, // we'd need to check via the Tpetra::RowMatrix method supportsRowViews(). - A.getLocalRowCopy (local_row, InI(), InV(), NumIn); // Get Values and Indices + A.getLocalRowCopy (local_row, InI, InV, NumIn); // Get Values and Indices // Split into L and U (we don't assume that indices are ordered). @@ -777,25 +777,30 @@ void RILUK::compute () // Need some integer workspace and pointers size_t NumUU; - Teuchos::ArrayView UUI; - Teuchos::ArrayView UUV; + local_inds_host_view_type UUI; + values_host_view_type UUV; for (size_t j = 0; j < num_cols; ++j) { colflag[j] = -1; } - + using IST = typename row_matrix_type::impl_scalar_type; for (size_t i = 0; i < L_->getNodeNumRows (); ++i) { local_ordinal_type local_row = i; // Fill InV, InI with current row of L, D and U combined NumIn = MaxNumEntries; - L_->getLocalRowCopy (local_row, InI (), InV (), NumL); + nonconst_local_inds_host_view_type InI_v(InI.data(),MaxNumEntries); + nonconst_values_host_view_type InV_v(reinterpret_cast(InV.data()),MaxNumEntries); + + L_->getLocalRowCopy (local_row, InI_v , InV_v, NumL); InV[NumL] = DV(i); // Put in diagonal InI[NumL] = local_row; - U_->getLocalRowCopy (local_row, InI (NumL+1, MaxNumEntries-NumL-1), - InV (NumL+1, MaxNumEntries-NumL-1), NumU); + nonconst_local_inds_host_view_type InI_sub(InI.data()+NumL+1,MaxNumEntries-NumL-1); + nonconst_values_host_view_type InV_sub(reinterpret_cast(InV.data())+NumL+1,MaxNumEntries-NumL-1); + + U_->getLocalRowCopy (local_row, InI_sub,InV_sub, NumU); NumIn = 
NumL+NumU+1; // Set column flags @@ -807,7 +812,7 @@ void RILUK::compute () for (size_t jj = 0; jj < NumL; ++jj) { local_ordinal_type j = InI[jj]; - scalar_type multiplier = InV[jj]; // current_mults++; + IST multiplier = InV[jj]; // current_mults++; InV[jj] *= static_cast(DV(j)); @@ -821,9 +826,10 @@ void RILUK::compute () // colflag above using size_t (which is generally unsigned), // but now we're querying it using int (which is signed). if (kk > -1) { - InV[kk] -= multiplier * UUV[k]; + InV[kk] -= static_cast(multiplier * UUV[k]); } } + } else { for (size_t k = 0; k < NumUU; ++k) { @@ -832,14 +838,15 @@ void RILUK::compute () // but now we're querying it using int (which is signed). const int kk = colflag[UUI[k]]; if (kk > -1) { - InV[kk] -= multiplier*UUV[k]; + InV[kk] -= static_cast(multiplier*UUV[k]); } else { - diagmod -= multiplier*UUV[k]; + diagmod -= static_cast(multiplier*UUV[k]); } } } } + if (NumL) { // Replace current row of L L_->replaceLocalValues (local_row, InI (0, NumL), InV (0, NumL)); @@ -868,7 +875,7 @@ void RILUK::compute () } if (NumU) { - // Replace current row of L and U + // Replace current row of L and U U_->replaceLocalValues (local_row, InI (NumL+1, NumU), InV (NumL+1, NumU)); } @@ -909,20 +916,18 @@ void RILUK::compute () A_local_->getColMap (), entriesPerRow())); // copy entries into A_local_crs - Teuchos::Array indices(A_local_->getNodeMaxNumRowEntries()); - Teuchos::Array values(A_local_->getNodeMaxNumRowEntries()); + nonconst_local_inds_host_view_type indices("indices",A_local_->getNodeMaxNumRowEntries()); + nonconst_values_host_view_type values("values",A_local_->getNodeMaxNumRowEntries()); for(local_ordinal_type i = 0; i < numRows; i++) { size_t numEntries = 0; - A_local_->getLocalRowCopy(i, indices(), values(), numEntries); - ArrayView indicesInsert(indices.data(), numEntries); - ArrayView valuesInsert(values.data(), numEntries); - A_local_crs_nc->insertLocalValues(i, indicesInsert, valuesInsert); + 
A_local_->getLocalRowCopy(i, indices, values, numEntries); + A_local_crs_nc->insertLocalValues(i, numEntries, reinterpret_cast(values.data()),indices.data()); } A_local_crs_nc->fillComplete (A_local_->getDomainMap (), A_local_->getRangeMap ()); A_local_crs = rcp_const_cast (A_local_crs_nc); } - A_local_rowmap_ = A_local_crs->getLocalMatrix().graph.row_map; - A_local_entries_ = A_local_crs->getLocalMatrix().graph.entries; + A_local_rowmap_ = A_local_crs->getLocalMatrixDevice().graph.row_map; + A_local_entries_ = A_local_crs->getLocalMatrixDevice().graph.entries; A_local_values_ = A_local_crs->getLocalValuesView(); } @@ -934,13 +939,15 @@ void RILUK::compute () U_->setAllToScalar (STS::zero ()); } - auto L_rowmap = L_->getLocalMatrix().graph.row_map; - auto L_entries = L_->getLocalMatrix().graph.entries; + using row_map_type = typename crs_matrix_type::local_matrix_device_type::row_map_type; + + row_map_type L_rowmap = L_->getLocalMatrixDevice().graph.row_map; + auto L_entries = L_->getLocalMatrixDevice().graph.entries; auto L_values = L_->getLocalValuesView(); - auto U_rowmap = U_->getLocalMatrix().graph.row_map; - auto U_entries = U_->getLocalMatrix().graph.entries; + row_map_type U_rowmap = U_->getLocalMatrixDevice().graph.row_map; + auto U_entries = U_->getLocalMatrixDevice().graph.entries; auto U_values = U_->getLocalValuesView(); - + KokkosSparse::Experimental::spiluk_numeric( KernelHandle_.getRawPtr(), LevelOfFill_, A_local_rowmap_, A_local_entries_, A_local_values_, L_rowmap, L_entries, L_values, U_rowmap, U_entries, U_values ); diff --git a/packages/ifpack2/src/Ifpack2_Relaxation_decl.hpp b/packages/ifpack2/src/Ifpack2_Relaxation_decl.hpp index 05be9ed133ad..5be4526df414 100644 --- a/packages/ifpack2/src/Ifpack2_Relaxation_decl.hpp +++ b/packages/ifpack2/src/Ifpack2_Relaxation_decl.hpp @@ -208,7 +208,7 @@ option. See the documentation of setParameters() for details. Gauss-Seidel / SOR also comes in a symmetric version. 
This method first does a Forward sweep, then a Backward sweep. Only the symmetric version of this preconditioner is guaranteed to be symmetric (or Hermitian, -if the matrix's data are complex). +if the matrix data are complex). Users may set the relaxation method via the "relaxation: type" parameter. For all relaxation methods, users may specify the number @@ -617,20 +617,22 @@ class Relaxation : typedef Tpetra::Map map_type; typedef Tpetra::Import import_type; - Teuchos::RCP > invDiagKernel_; + typedef typename crs_matrix_type::nonconst_local_inds_host_view_type nonconst_local_inds_host_view_type; + typedef typename crs_matrix_type::nonconst_values_host_view_type nonconst_values_host_view_type; + Teuchos::RCP > invDiagKernel_; //@} //! \name Implementation of multithreaded Gauss-Seidel. //@{ - typedef typename crs_matrix_type::local_matrix_type local_matrix_type; - typedef typename local_matrix_type::StaticCrsGraphType::row_map_type lno_row_view_t; - typedef typename local_matrix_type::StaticCrsGraphType::entries_type lno_nonzero_view_t; - typedef typename local_matrix_type::values_type scalar_nonzero_view_t; - typedef typename local_matrix_type::StaticCrsGraphType::device_type TemporaryWorkSpace; - typedef typename local_matrix_type::StaticCrsGraphType::device_type PersistentWorkSpace; - typedef typename local_matrix_type::StaticCrsGraphType::execution_space MyExecSpace; + typedef typename crs_matrix_type::local_matrix_device_type local_matrix_device_type; + typedef typename local_matrix_device_type::StaticCrsGraphType::row_map_type lno_row_view_t; + typedef typename local_matrix_device_type::StaticCrsGraphType::entries_type lno_nonzero_view_t; + typedef typename local_matrix_device_type::values_type scalar_nonzero_view_t; + typedef typename local_matrix_device_type::StaticCrsGraphType::device_type TemporaryWorkSpace; + typedef typename local_matrix_device_type::StaticCrsGraphType::device_type PersistentWorkSpace; + typedef typename 
local_matrix_device_type::StaticCrsGraphType::execution_space MyExecSpace; typedef typename KokkosKernels::Experimental::KokkosKernelsHandle mt_kernel_handle_type; @@ -797,6 +799,8 @@ class Relaxation : bool checkDiagEntries_ = false; //! For MTSGS, the cluster size (use point coloring if equal to 1) int clusterSize_ = 1; + //! For MTSGS, the threshold for long/bulk rows (rows with at least this many nonzeros) + int longRowThreshold_ = 0; //! Number of outer-sweeps for the two-stage Gauss Seidel int NumOuterSweeps_ = 1; diff --git a/packages/ifpack2/src/Ifpack2_Relaxation_def.hpp b/packages/ifpack2/src/Ifpack2_Relaxation_def.hpp index 1eda34ff938a..e72195b98600 100644 --- a/packages/ifpack2/src/Ifpack2_Relaxation_def.hpp +++ b/packages/ifpack2/src/Ifpack2_Relaxation_def.hpp @@ -327,6 +327,9 @@ Relaxation::getValidParameters () const const int cluster_size = 1; pl->set("relaxation: mtgs cluster size", cluster_size); + const int long_row_threshold = 0; + pl->set("relaxation: long row threshold", long_row_threshold); + validParams_ = rcp_const_cast (pl); } return validParams_; @@ -367,6 +370,9 @@ void Relaxation::setParametersImpl (Teuchos::ParameterList& pl) int cluster_size = 1; if(pl.isParameter ("relaxation: mtgs cluster size")) //optional parameter cluster_size = pl.get ("relaxation: mtgs cluster size"); + int long_row_threshold = 0; + if(pl.isParameter ("relaxation: long row threshold")) //optional parameter + long_row_threshold = pl.get ("relaxation: long row threshold"); Teuchos::ArrayRCP localSmoothingIndices = pl.get >("relaxation: local smoothing indices"); @@ -378,6 +384,18 @@ void Relaxation::setParametersImpl (Teuchos::ParameterList& pl) pl.remove("relaxation: inner damping factor"); pl.set("relaxation: inner damping factor",df); } + //If long row algorithm was requested, make sure non-cluster (point) multicolor Gauss-Seidel (aka MTGS/MTSGS) will be used. 
+ if (long_row_threshold > 0) { + TEUCHOS_TEST_FOR_EXCEPTION( + cluster_size != 1, std::invalid_argument, "Ifpack2::Relaxation: " + "Requested long row MTGS/MTSGS algorithm and cluster GS/SGS, but those are not compatible."); + TEUCHOS_TEST_FOR_EXCEPTION( + precType != Details::RelaxationType::MTGS && precType != Details::RelaxationType::MTSGS, + std::invalid_argument, "Ifpack2::Relaxation: " + "Requested long row MTGS/MTSGS algorithm, but this is only compatible with preconditioner types " + "'MT Gauss-Seidel' and 'MT Symmetric Gauss-Seidel'."); + } + const ST innerDampingFactor = pl.get ("relaxation: inner damping factor"); const int numInnerSweeps = pl.get ("relaxation: inner sweeps"); const int numOuterSweeps = pl.get ("relaxation: outer sweeps"); @@ -396,6 +414,7 @@ void Relaxation::setParametersImpl (Teuchos::ParameterList& pl) fixTinyDiagEntries_ = fixTinyDiagEntries; checkDiagEntries_ = checkDiagEntries; clusterSize_ = cluster_size; + longRowThreshold_ = long_row_threshold; is_matrix_structurally_symmetric_ = is_matrix_structurally_symmetric; ifpack2_dump_matrix_ = ifpack2_dump_matrix; localSmoothingIndices_ = localSmoothingIndices; @@ -726,12 +745,14 @@ void Relaxation::initialize () if (mtKernelHandle_->get_gs_handle () == nullptr) { if (PrecType_ == Details::GS2 || PrecType_ == Details::SGS2) mtKernelHandle_->create_gs_handle (KokkosSparse::GS_TWOSTAGE); - else if(this->clusterSize_ == 1) + else if(this->clusterSize_ == 1) { mtKernelHandle_->create_gs_handle (); + mtKernelHandle_->get_point_gs_handle()->set_long_row_threshold(longRowThreshold_); + } else mtKernelHandle_->create_gs_handle (KokkosSparse::CLUSTER_DEFAULT, this->clusterSize_); } - local_matrix_type kcsr = crsMat->getLocalMatrix (); + local_matrix_device_type kcsr = crsMat->getLocalMatrixDevice (); if (PrecType_ == Details::GS2 || PrecType_ == Details::SGS2) { // set parameters for two-stage GS mtKernelHandle_->set_gs_set_num_inner_sweeps (NumInnerSweeps_); @@ -895,13 +916,14 @@ void 
Relaxation::computeBlockCrs () if (DoL1Method_ && IsParallel_) { const scalar_type two = one + one; const size_t maxLength = A_->getNodeMaxNumRowEntries (); - Array indices (maxLength); - Array values (maxLength * blockSize * blockSize); + nonconst_local_inds_host_view_type indices ("indices",maxLength); + nonconst_values_host_view_type values_ ("values",maxLength * blockSize * blockSize); size_t numEntries = 0; for (LO i = 0; i < lclNumMeshRows; ++i) { // FIXME (mfh 16 Dec 2015) Get views instead of copies. - blockCrsA->getLocalRowCopy (i, indices (), values (), numEntries); + blockCrsA->getLocalRowCopy (i, indices, values_, numEntries); + scalar_type * values = reinterpret_cast(values_.data()); auto diagBlock = Kokkos::subview (blockDiag, i, ALL (), ALL ()); for (LO subRow = 0; subRow < blockSize; ++subRow) { @@ -1226,12 +1248,12 @@ void Relaxation::compute () auto diag = Diagonal->getLocalViewHost(Tpetra::Access::ReadWrite); const magnitude_type two = STM::one () + STM::one (); const size_t maxLength = A_row.getNodeMaxNumRowEntries (); - Array indices (maxLength); - Array values (maxLength); + nonconst_local_inds_host_view_type indices("indices",maxLength); + nonconst_values_host_view_type values("values",maxLength); size_t numEntries; for (LO i = 0; i < numMyRows; ++i) { - A_row.getLocalRowCopy (i, indices (), values (), numEntries); + A_row.getLocalRowCopy (i, indices, values, numEntries); magnitude_type diagonal_boost = STM::zero (); for (size_t k = 0 ; k < numEntries; ++k) { if (indices[k] >= numMyRows) { @@ -1304,7 +1326,7 @@ void Relaxation::compute () (crsMat == nullptr, std::logic_error, methodName << ": " "Multithreaded Gauss-Seidel methods currently only work " "when the input matrix is a Tpetra::CrsMatrix."); - local_matrix_type kcsr = crsMat->getLocalMatrix (); + local_matrix_device_type kcsr = crsMat->getLocalMatrixDevice (); //TODO BMK: This should be ReadOnly, and KokkosKernels should accept a //const-valued view for user-provided D^-1. 
OK for now, Diagonal_ is nonconst. @@ -2080,7 +2102,7 @@ ApplyInverseMTGS_CrsMatrix( */ } - local_matrix_type kcsr = crsMat->getLocalMatrix (); + local_matrix_device_type kcsr = crsMat->getLocalMatrixDevice (); bool update_y_vector = true; //false as it was done up already, and we dont want to zero it in each sweep. diff --git a/packages/ifpack2/src/Ifpack2_ReorderFilter_decl.hpp b/packages/ifpack2/src/Ifpack2_ReorderFilter_decl.hpp index 33a928a8990e..53d039b10181 100644 --- a/packages/ifpack2/src/Ifpack2_ReorderFilter_decl.hpp +++ b/packages/ifpack2/src/Ifpack2_ReorderFilter_decl.hpp @@ -73,6 +73,14 @@ class ReorderFilter : typedef typename MatrixType::local_ordinal_type local_ordinal_type; typedef typename MatrixType::global_ordinal_type global_ordinal_type; typedef typename MatrixType::node_type node_type; + typedef typename MatrixType::global_inds_host_view_type global_inds_host_view_type; + typedef typename MatrixType::local_inds_host_view_type local_inds_host_view_type; + typedef typename MatrixType::values_host_view_type values_host_view_type; + + typedef typename MatrixType::nonconst_global_inds_host_view_type nonconst_global_inds_host_view_type; + typedef typename MatrixType::nonconst_local_inds_host_view_type nonconst_local_inds_host_view_type; + typedef typename MatrixType::nonconst_values_host_view_type nonconst_values_host_view_type; + typedef typename Teuchos::ScalarTraits::magnitudeType magnitude_type; typedef Tpetra::RowMatrix::invalid(). */ + virtual void + getGlobalRowCopy (global_ordinal_type GlobalRow, + nonconst_global_inds_host_view_type &Indices, + nonconst_values_host_view_type &Values, + size_t& NumEntries) const; +#ifdef TPETRA_ENABLE_DEPRECATED_CODE virtual void getGlobalRowCopy(global_ordinal_type GlobalRow, const Teuchos::ArrayView &Indices, const Teuchos::ArrayView &Values, size_t &NumEntries) const; - +#endif //! Extract a list of entries in a specified local row of the graph. Put into storage allocated by calling routine. /*! 
\param LocalRow - (In) Local row number for which indices are desired. @@ -225,10 +239,17 @@ class ReorderFilter : with row \c LocalRow. If \c LocalRow is not valid for this node, then \c Indices and \c Values are unchanged and \c NumIndices is returned as Teuchos::OrdinalTraits::invalid(). */ + virtual void + getLocalRowCopy (local_ordinal_type LocalRow, + nonconst_local_inds_host_view_type &Indices, + nonconst_values_host_view_type &Values, + size_t& NumEntries) const; +#ifdef TPETRA_ENABLE_DEPRECATED_CODE virtual void getLocalRowCopy(local_ordinal_type DropRow, const Teuchos::ArrayView &Indices, const Teuchos::ArrayView &Values, size_t &NumEntries) const ; +#endif //! Extract a const, non-persisting view of global indices in a specified row of the matrix. /*! @@ -239,10 +260,15 @@ class ReorderFilter : \pre isLocallyIndexed() == false Note: If \c GlobalRow does not belong to this node, then \c indices is set to null. */ + virtual void + getGlobalRowView (global_ordinal_type GlobalRow, + global_inds_host_view_type &indices, + values_host_view_type &values) const; +#ifdef TPETRA_ENABLE_DEPRECATED_CODE virtual void getGlobalRowView(global_ordinal_type GlobalRow, Teuchos::ArrayView &indices, Teuchos::ArrayView &values) const; - +#endif //! Extract a const, non-persisting view of local indices in a specified row of the matrix. /*! \param LocalRow - (In) Local row number for which indices are desired. @@ -253,10 +279,16 @@ class ReorderFilter : Note: If \c LocalRow does not belong to this node, then \c indices is set to null. */ + virtual void + getLocalRowView (local_ordinal_type LocalRow, + local_inds_host_view_type & indices, + values_host_view_type & values) const; +#ifdef TPETRA_ENABLE_DEPRECATED_CODE + virtual void getLocalRowView(local_ordinal_type LocalRow, Teuchos::ArrayView &indices, Teuchos::ArrayView &values) const; - +#endif //! \brief Get a copy of the diagonal entries owned by this node, with local row indices. /*! 
Returns a distributed Vector object partitioned according to this matrix's row map, containing the the zero and non-zero diagonals owned by this node. */ @@ -352,11 +384,10 @@ class ReorderFilter : //! Permutation: Reordered to original Teuchos::ArrayRCP reverseperm_; - //! Used in apply, to avoid allocation each time. - mutable Teuchos::Array Indices_; - //! Used in apply, to avoid allocation each time. - mutable Teuchos::Array Values_; - + //! Used in ExtractMyRowCopy, to avoid allocation each time. + mutable nonconst_local_inds_host_view_type Indices_; + //! Used in ExtractMyRowCopy, to avoid allocation each time. + mutable nonconst_values_host_view_type Values_; };// class ReorderFilter }// namespace Ifpack2 diff --git a/packages/ifpack2/src/Ifpack2_ReorderFilter_def.hpp b/packages/ifpack2/src/Ifpack2_ReorderFilter_def.hpp index 1d3671d32f3a..0a216dd96103 100644 --- a/packages/ifpack2/src/Ifpack2_ReorderFilter_def.hpp +++ b/packages/ifpack2/src/Ifpack2_ReorderFilter_def.hpp @@ -79,8 +79,8 @@ ReorderFilter (const Teuchos::RCP& A, "Ifpack2::ReorderFilter: The input matrix is not square."); // Temp arrays for apply - Indices_.resize (A_->getNodeMaxNumRowEntries ()); - Values_.resize (A_->getNodeMaxNumRowEntries ()); + Kokkos::resize(Indices_,A_->getNodeMaxNumRowEntries ()); + Kokkos::resize(Values_,A_->getNodeMaxNumRowEntries ()); } @@ -286,10 +286,10 @@ bool ReorderFilter::isFillComplete() const template void ReorderFilter:: -getGlobalRowCopy (global_ordinal_type globalRow, - const Teuchos::ArrayView& globalInd, - const Teuchos::ArrayView& val, - size_t& numEntries) const + getGlobalRowCopy (global_ordinal_type globalRow, + nonconst_global_inds_host_view_type &globalInd, + nonconst_values_host_view_type &val, + size_t& numEntries) const { using Teuchos::Array; using Teuchos::ArrayView; @@ -306,37 +306,38 @@ getGlobalRowCopy (global_ordinal_type globalRow, << " is not owned by the calling process with rank " << rowMap.getComm ()->getRank () << "."); - if (sizeof 
(GO) == sizeof (LO)) { - // This means we can convert local to global in place. - ArrayView localInd = av_reinterpret_cast (globalInd); - this->getLocalRowCopy (localRow, localInd, val, numEntries); + // The Indices_ temp array is only used in apply, not getLocalRowCopy, so this is safe + numEntries = this->getNumEntriesInLocalRow (localRow); + this->getLocalRowCopy (localRow, Indices_, val, numEntries); - // Convert local indices back to global indices. - for (size_t k = 0; k < numEntries; ++k) { - globalInd[k] = rowMap.getGlobalElement (localInd[k]); - } - } - else { - // LO and GO have different sizes, so we need a temp array - // for converting local to global. - numEntries = this->getNumEntriesInLocalRow (localRow); - Array localInd (numEntries); - this->getLocalRowCopy (localRow, localInd, val, numEntries); - - // Convert local indices back to global indices. - for (size_t k = 0; k < numEntries; ++k) { - globalInd[k] = rowMap.getGlobalElement (localInd[k]); - } + // Convert local indices back to global indices. + for (size_t k = 0; k < numEntries; ++k) { + globalInd[k] = rowMap.getGlobalElement (Indices_[k]); } } +#ifdef TPETRA_ENABLE_DEPRECATED_CODE +template +void ReorderFilter:: +getGlobalRowCopy (global_ordinal_type globalRow, + const Teuchos::ArrayView& Indices, + const Teuchos::ArrayView& Values, + size_t& numEntries) const +{ + using IST = typename row_matrix_type::impl_scalar_type; + nonconst_global_inds_host_view_type ind_in(Indices.data(),Indices.size()); + nonconst_values_host_view_type val_in(reinterpret_cast(Values.data()),Values.size()); + getGlobalRowCopy(globalRow,ind_in,val_in,numEntries); +} +#endif template void ReorderFilter:: getLocalRowCopy (local_ordinal_type LocalRow, - const Teuchos::ArrayView &Indices, - const Teuchos::ArrayView &Values, - size_t &NumEntries) const + nonconst_local_inds_host_view_type &Indices, + nonconst_values_host_view_type &Values, + size_t& NumEntries) const + { TEUCHOS_TEST_FOR_EXCEPTION( ! 
A_->getRowMap ()->isNodeLocalElement (LocalRow), @@ -370,7 +371,29 @@ getLocalRowCopy (local_ordinal_type LocalRow, } } +#ifdef TPETRA_ENABLE_DEPRECATED_CODE +template +void ReorderFilter::getLocalRowCopy (local_ordinal_type LocalRow, + const Teuchos::ArrayView &Indices, + const Teuchos::ArrayView &Values, + size_t &NumEntries) const +{ + using IST = typename row_matrix_type::impl_scalar_type; + nonconst_local_inds_host_view_type ind_in(Indices.data(),Indices.size()); + nonconst_values_host_view_type val_in(reinterpret_cast(Values.data()),Values.size()); + getLocalRowCopy(LocalRow,ind_in,val_in,NumEntries); +} +#endif + +template +void ReorderFilter::getGlobalRowView(global_ordinal_type /* GlobalRow */, + global_inds_host_view_type &/*indices*/, + values_host_view_type &/*values*/) const +{ + throw std::runtime_error("Ifpack2::ReorderFilter: does not support getGlobalRowView."); +} +#ifdef TPETRA_ENABLE_DEPRECATED_CODE template void ReorderFilter:: getGlobalRowView (global_ordinal_type /* GlobalRow */, @@ -379,8 +402,18 @@ getGlobalRowView (global_ordinal_type /* GlobalRow */, { throw std::runtime_error("Ifpack2::ReorderFilter: does not support getGlobalRowView."); } +#endif +template +void ReorderFilter::getLocalRowView(local_ordinal_type /* LocalRow */, + local_inds_host_view_type & /*indices*/, + values_host_view_type & /*values*/) const +{ + throw std::runtime_error("Ifpack2::ReorderFilter: does not support getLocalRowView."); +} + +#ifdef TPETRA_ENABLE_DEPRECATED_CODE template void ReorderFilter:: getLocalRowView (local_ordinal_type /* LocalRow */, @@ -389,6 +422,7 @@ getLocalRowView (local_ordinal_type /* LocalRow */, { throw std::runtime_error("Ifpack2::ReorderFilter: does not support getLocalRowView."); } +#endif template @@ -445,25 +479,26 @@ apply (const Tpetra::MultiVectorgetNodeNumRows (); ++i) { size_t Nnz; // Use this class's getrow to make the below code simpler - getLocalRowCopy (i, Indices_ (), Values_ (), Nnz); + getLocalRowCopy (i, Indices_ , 
Values_ , Nnz); + scalar_type* Values = reinterpret_cast(Values_.data()); if (mode == Teuchos::NO_TRANS) { for (size_t j = 0; j < Nnz; ++j) { for (size_t k = 0; k < NumVectors; ++k) { - y_ptr[k][i] += Values_[j] * x_ptr[k][Indices_[j]]; + y_ptr[k][i] += Values[j] * x_ptr[k][Indices_[j]]; } } } else if (mode == Teuchos::TRANS) { for (size_t j = 0; j < Nnz; ++j) { for (size_t k = 0; k < NumVectors; ++k) { - y_ptr[k][Indices_[j]] += Values_[j] * x_ptr[k][i]; + y_ptr[k][Indices_[j]] += Values[j] * x_ptr[k][i]; } } } else { //mode==Teuchos::CONJ_TRANS for (size_t j = 0; j < Nnz; ++j) { for (size_t k = 0; k < NumVectors; ++k) { - y_ptr[k][Indices_[j]] += STS::conjugate(Values_[j]) * x_ptr[k][i]; + y_ptr[k][Indices_[j]] += STS::conjugate(Values[j]) * x_ptr[k][i]; } } } diff --git a/packages/ifpack2/src/Ifpack2_SingletonFilter_decl.hpp b/packages/ifpack2/src/Ifpack2_SingletonFilter_decl.hpp index d454ab06aa8c..d540bfdccad7 100644 --- a/packages/ifpack2/src/Ifpack2_SingletonFilter_decl.hpp +++ b/packages/ifpack2/src/Ifpack2_SingletonFilter_decl.hpp @@ -68,6 +68,14 @@ class SingletonFilter : typedef typename MatrixType::local_ordinal_type LocalOrdinal; typedef typename MatrixType::global_ordinal_type GlobalOrdinal; typedef typename MatrixType::node_type Node; + typedef typename MatrixType::global_inds_host_view_type global_inds_host_view_type; + typedef typename MatrixType::local_inds_host_view_type local_inds_host_view_type; + typedef typename MatrixType::values_host_view_type values_host_view_type; + + typedef typename MatrixType::nonconst_global_inds_host_view_type nonconst_global_inds_host_view_type; + typedef typename MatrixType::nonconst_local_inds_host_view_type nonconst_local_inds_host_view_type; + typedef typename MatrixType::nonconst_values_host_view_type nonconst_values_host_view_type; + typedef typename Teuchos::ScalarTraits::magnitudeType magnitudeType; typedef Tpetra::RowMatrix row_matrix_type; typedef typename row_matrix_type::mag_type mag_type; @@ -173,11 
+181,17 @@ class SingletonFilter : with row \c GlobalRow. If \c GlobalRow does not belong to this node, then \c Indices and \c Values are unchanged and \c NumIndices is returned as Teuchos::OrdinalTraits::invalid(). */ + virtual void + getGlobalRowCopy (GlobalOrdinal GlobalRow, + nonconst_global_inds_host_view_type &Indices, + nonconst_values_host_view_type &Values, + size_t& NumEntries) const; +#ifdef TPETRA_ENABLE_DEPRECATED_CODE virtual void getGlobalRowCopy(GlobalOrdinal GlobalRow, const Teuchos::ArrayView &Indices, const Teuchos::ArrayView &Values, size_t &NumEntries) const; - +#endif //! Extract a list of entries in a specified local row of the graph. Put into storage allocated by calling routine. /*! \param LocalRow - (In) Local row number for which indices are desired. @@ -189,11 +203,17 @@ class SingletonFilter : with row \c LocalRow. If \c LocalRow is not valid for this node, then \c Indices and \c Values are unchanged and \c NumIndices is returned as Teuchos::OrdinalTraits::invalid(). */ + virtual void + getLocalRowCopy (LocalOrdinal LocalRow, + nonconst_local_inds_host_view_type &Indices, + nonconst_values_host_view_type &Values, + size_t& NumEntries) const; +#ifdef TPETRA_ENABLE_DEPRECATED_CODE virtual void getLocalRowCopy(LocalOrdinal LocalRow, const Teuchos::ArrayView &Indices, const Teuchos::ArrayView &Values, size_t &NumEntries) const ; - +#endif //! Extract a const, non-persisting view of global indices in a specified row of the matrix. /*! \param GlobalRow - (In) Global row number for which indices are desired. @@ -204,10 +224,15 @@ class SingletonFilter : Note: If \c GlobalRow does not belong to this node, then \c indices is set to null. */ + virtual void + getGlobalRowView (GlobalOrdinal GlobalRow, + global_inds_host_view_type &indices, + values_host_view_type &values) const; +#ifdef TPETRA_ENABLE_DEPRECATED_CODE virtual void getGlobalRowView(GlobalOrdinal GlobalRow, Teuchos::ArrayView &indices, Teuchos::ArrayView &values) const; - +#endif //! 
Extract a const, non-persisting view of local indices in a specified row of the matrix. /*! \param LocalRow - (In) Local row number for which indices are desired. @@ -218,10 +243,15 @@ class SingletonFilter : Note: If \c LocalRow does not belong to this node, then \c indices is set to null. */ + virtual void + getLocalRowView (LocalOrdinal LocalRow, + local_inds_host_view_type & indices, + values_host_view_type & values) const; +#ifdef TPETRA_ENABLE_DEPRECATED_CODE virtual void getLocalRowView(LocalOrdinal LocalRow, Teuchos::ArrayView &indices, Teuchos::ArrayView &values) const; - +#endif //! \brief Get a copy of the diagonal entries owned by this node, with local row indices. /*! Returns a distributed Vector object partitioned according to this matrix's row map, containing the the zero and non-zero diagonals owned by this node. */ @@ -330,9 +360,9 @@ class SingletonFilter : //! NumEntries_[i] contains the nonzero entries in row `i'. std::vector NumEntries_; //! Used in ExtractMyRowCopy, to avoid allocation each time. - mutable Teuchos::Array Indices_; + mutable nonconst_local_inds_host_view_type Indices_; //! Used in ExtractMyRowCopy, to avoid allocation each time. 
- mutable Teuchos::Array Values_; + mutable nonconst_values_host_view_type Values_; };// class SingletonFilter }// namespace Ifpack2 diff --git a/packages/ifpack2/src/Ifpack2_SingletonFilter_def.hpp b/packages/ifpack2/src/Ifpack2_SingletonFilter_def.hpp index 2a08ba929665..4390df0818a8 100644 --- a/packages/ifpack2/src/Ifpack2_SingletonFilter_def.hpp +++ b/packages/ifpack2/src/Ifpack2_SingletonFilter_def.hpp @@ -75,8 +75,8 @@ SingletonFilter::SingletonFilter(const Teuchos::RCPgetNodeMaxNumRowEntries(); // ExtractMyRowCopy() will use these vectors - Indices_.resize(MaxNumEntriesA_); - Values_.resize(MaxNumEntriesA_); + Kokkos::resize(Indices_,MaxNumEntriesA_); + Kokkos::resize(Values_,MaxNumEntriesA_); // Initialize reordering vector to -1 Reorder_.resize(NumRowsA_); @@ -285,6 +285,17 @@ bool SingletonFilter::isFillComplete() const return A_->isFillComplete(); } +template +void SingletonFilter:: +getGlobalRowCopy (GlobalOrdinal /*LocalRow*/, + nonconst_global_inds_host_view_type &/*Indices*/, + nonconst_values_host_view_type &/*Values*/, + size_t& /*NumEntries*/) const +{ + throw std::runtime_error("Ifpack2::SingletonFilter does not implement getGlobalRowCopy."); +} + +#ifdef TPETRA_ENABLE_DEPRECATED_CODE template void SingletonFilter::getGlobalRowCopy(GlobalOrdinal /* GlobalRow */, const Teuchos::ArrayView &/* Indices */, @@ -293,18 +304,20 @@ void SingletonFilter::getGlobalRowCopy(GlobalOrdinal /* GlobalRow */ { throw std::runtime_error("Ifpack2::SingletonFilter does not implement getGlobalRowCopy."); } +#endif template -void SingletonFilter::getLocalRowCopy(LocalOrdinal LocalRow, - const Teuchos::ArrayView &Indices, - const Teuchos::ArrayView &Values, - size_t &NumEntries) const +void SingletonFilter:: + getLocalRowCopy (LocalOrdinal LocalRow, + nonconst_local_inds_host_view_type &Indices, + nonconst_values_host_view_type &Values, + size_t& NumEntries) const { TEUCHOS_TEST_FOR_EXCEPTION((LocalRow < 0 || (size_t) LocalRow >= NumRows_ || (size_t) Indices.size() < 
NumEntries_[LocalRow]), std::runtime_error, "Ifpack2::SingletonFilter::getLocalRowCopy invalid row or array size."); size_t Nnz; LocalOrdinal ARow = InvReorder_[LocalRow]; - A_->getLocalRowCopy(ARow,Indices_(),Values_(),Nnz); + A_->getLocalRowCopy(ARow,Indices_,Values_,Nnz); // populate the user's vectors NumEntries = 0; @@ -316,9 +329,32 @@ void SingletonFilter::getLocalRowCopy(LocalOrdinal LocalRow, NumEntries++; } } +} + +#ifdef TPETRA_ENABLE_DEPRECATED_CODE +template +void SingletonFilter::getLocalRowCopy(LocalOrdinal LocalRow, + const Teuchos::ArrayView &Indices, + const Teuchos::ArrayView &Values, + size_t &NumEntries) const +{ + using IST = typename row_matrix_type::impl_scalar_type; + nonconst_local_inds_host_view_type ind_in(Indices.data(),Indices.size()); + nonconst_values_host_view_type val_in(reinterpret_cast(Values.data()),Values.size()); + getLocalRowCopy(LocalRow,ind_in,val_in,NumEntries); +} +#endif + +template +void SingletonFilter::getGlobalRowView(GlobalOrdinal /* GlobalRow */, + global_inds_host_view_type &/*indices*/, + values_host_view_type &/*values*/) const +{ + throw std::runtime_error("Ifpack2::SingletonFilter: does not support getGlobalRowView."); } +#ifdef TPETRA_ENABLE_DEPRECATED_CODE template void SingletonFilter::getGlobalRowView(GlobalOrdinal /* GlobalRow */, Teuchos::ArrayView &/* indices */, @@ -326,7 +362,17 @@ void SingletonFilter::getGlobalRowView(GlobalOrdinal /* GlobalRow */ { throw std::runtime_error("Ifpack2::SingletonFilter: does not support getGlobalRowView."); } +#endif + +template +void SingletonFilter::getLocalRowView(LocalOrdinal /* LocalRow */, + local_inds_host_view_type & /*indices*/, + values_host_view_type & /*values*/) const +{ + throw std::runtime_error("Ifpack2::SingletonFilter: does not support getLocalRowView."); +} +#ifdef TPETRA_ENABLE_DEPRECATED_CODE template void SingletonFilter::getLocalRowView(LocalOrdinal /* LocalRow */, Teuchos::ArrayView &/* indices */, @@ -334,6 +380,7 @@ void 
SingletonFilter::getLocalRowView(LocalOrdinal /* LocalRow */, { throw std::runtime_error("Ifpack2::SingletonFilter: does not support getLocalRowView."); } +#endif template void SingletonFilter::getLocalDiagCopy(Tpetra::Vector &diag) const @@ -380,7 +427,7 @@ void SingletonFilter::apply(const Tpetra::MultiVector::SolveSingletonsTempl(const Tpetra::MultiVector LocalOrdinal ii = SingletonIndex_[i]; // get the diagonal value for the singleton size_t Nnz; - A_->getLocalRowCopy(ii,Indices_(),Values_(),Nnz); + A_->getLocalRowCopy(ii,Indices_,Values_,Nnz); for (size_t j = 0 ; j < Nnz ; ++j) { if (Indices_[j] == ii) { for (size_t k = 0 ; k < LHS.getNumVectors() ; ++k) @@ -467,7 +514,7 @@ void SingletonFilter::CreateReducedRHSTempl(const Tpetra::MultiVecto for (size_t i = 0 ; i < NumRows_ ; ++i) { LocalOrdinal ii = InvReorder_[i]; size_t Nnz; - A_->getLocalRowCopy(ii,Indices_(),Values_(),Nnz); + A_->getLocalRowCopy(ii,Indices_,Values_,Nnz); for (size_t j = 0 ; j < Nnz ; ++j) { if (Reorder_[Indices_[j]] == -1) { diff --git a/packages/ifpack2/src/Ifpack2_SparseContainer_decl.hpp b/packages/ifpack2/src/Ifpack2_SparseContainer_decl.hpp index c931c621e870..a7eb5a2489f1 100644 --- a/packages/ifpack2/src/Ifpack2_SparseContainer_decl.hpp +++ b/packages/ifpack2/src/Ifpack2_SparseContainer_decl.hpp @@ -167,6 +167,8 @@ class SparseContainer using InverseGlobalOrdinal = typename InverseType::global_ordinal_type; using InverseNode = typename InverseType::node_type; + using typename ContainerImpl::block_crs_matrix_type; + using inverse_mv_type = Tpetra::MultiVector; using InverseCrs = Tpetra::CrsMatrix; using InverseMap = typename Tpetra::Map; diff --git a/packages/ifpack2/src/Ifpack2_SparseContainer_def.hpp b/packages/ifpack2/src/Ifpack2_SparseContainer_def.hpp index b83700ff0dd2..75c3d81e9090 100644 --- a/packages/ifpack2/src/Ifpack2_SparseContainer_def.hpp +++ b/packages/ifpack2/src/Ifpack2_SparseContainer_def.hpp @@ -529,14 +529,16 @@ extract () Array rowEntryCounts(blockPointSize, 
0); //blockRow counts the BlockCrs LIDs that are going into this block //Rows are inserted into the CrsMatrix in sequential order + using inds_type = typename block_crs_matrix_type::local_inds_host_view_type; + using vals_type = typename block_crs_matrix_type::values_host_view_type; for(LO blockRow = 0; blockRow < blockSize; blockRow++) { //get a raw view of the whole block row - const LO* indices; - SC* values; - LO numEntries; + inds_type indices; + vals_type values; LO inputRow = this->blockRows_[blockStart + blockRow]; - this->inputBlockMatrix_->getLocalRowView(inputRow, indices, values, numEntries); + this->inputBlockMatrix_->getLocalRowView(inputRow, indices, values); + LO numEntries = (LO) indices.size(); for(LO br = 0; br < this->bcrsBlockSize_; br++) { for(LO k = 0; k < numEntries; k++) @@ -557,11 +559,11 @@ extract () for(LO blockRow = 0; blockRow < blockSize; blockRow++) { //get a raw view of the whole block row - const LO* indices; - SC* values; - LO numEntries; + inds_type indices; + vals_type values; LO inputRow = this->blockRows_[blockStart + blockRow]; - this->inputBlockMatrix_->getLocalRowView(inputRow, indices, values, numEntries); + this->inputBlockMatrix_->getLocalRowView(inputRow, indices, values); + LO numEntries = (LO) indices.size(); for(LO br = 0; br < this->bcrsBlockSize_; br++) { indicesToInsert.clear(); diff --git a/packages/ifpack2/src/Ifpack2_SparsityFilter_decl.hpp b/packages/ifpack2/src/Ifpack2_SparsityFilter_decl.hpp index 8a0c27499f2e..1d3ff1b4d600 100644 --- a/packages/ifpack2/src/Ifpack2_SparsityFilter_decl.hpp +++ b/packages/ifpack2/src/Ifpack2_SparsityFilter_decl.hpp @@ -86,6 +86,14 @@ class SparsityFilter : typedef typename MatrixType::local_ordinal_type LocalOrdinal; typedef typename MatrixType::global_ordinal_type GlobalOrdinal; typedef typename MatrixType::node_type Node; + typedef typename MatrixType::global_inds_host_view_type global_inds_host_view_type; + typedef typename MatrixType::local_inds_host_view_type 
local_inds_host_view_type; + typedef typename MatrixType::values_host_view_type values_host_view_type; + + typedef typename MatrixType::nonconst_global_inds_host_view_type nonconst_global_inds_host_view_type; + typedef typename MatrixType::nonconst_local_inds_host_view_type nonconst_local_inds_host_view_type; + typedef typename MatrixType::nonconst_values_host_view_type nonconst_values_host_view_type; + typedef typename Teuchos::ScalarTraits::magnitudeType magnitudeType; typedef Tpetra::RowMatrix row_matrix_type; typedef typename row_matrix_type::mag_type mag_type; @@ -192,11 +200,17 @@ class SparsityFilter : with row \c GlobalRow. If \c GlobalRow does not belong to this node, then \c Indices and \c Values are unchanged and \c NumIndices is returned as Teuchos::OrdinalTraits::invalid(). */ + virtual void + getGlobalRowCopy (GlobalOrdinal GlobalRow, + nonconst_global_inds_host_view_type &Indices, + nonconst_values_host_view_type &Values, + size_t& NumEntries) const; +#ifdef TPETRA_ENABLE_DEPRECATED_CODE virtual void getGlobalRowCopy(GlobalOrdinal GlobalRow, const Teuchos::ArrayView &Indices, const Teuchos::ArrayView &Values, size_t &NumEntries) const; - +#endif //! Extract a list of entries in a specified local row of the graph. Put into storage allocated by calling routine. /*! \param DropRow - (In) Drop row number for which indices are desired. @@ -208,11 +222,18 @@ class SparsityFilter : with row \c DropRow. If \c DropRow is not valid for this node, then \c Indices and \c Values are unchanged and \c NumIndices is returned as Teuchos::OrdinalTraits::invalid(). */ + + virtual void + getLocalRowCopy (LocalOrdinal LocalRow, + nonconst_local_inds_host_view_type &Indices, + nonconst_values_host_view_type &Values, + size_t& NumEntries) const; +#ifdef TPETRA_ENABLE_DEPRECATED_CODE virtual void getLocalRowCopy(LocalOrdinal DropRow, const Teuchos::ArrayView &Indices, const Teuchos::ArrayView &Values, size_t &NumEntries) const ; - +#endif //! 
Extract a const, non-persisting view of global indices in a specified row of the matrix. /*! \param GlobalRow - (In) Global row number for which indices are desired. @@ -223,10 +244,15 @@ class SparsityFilter : Note: If \c GlobalRow does not belong to this node, then \c indices is set to null. */ + virtual void + getGlobalRowView (GlobalOrdinal GlobalRow, + global_inds_host_view_type &indices, + values_host_view_type &values) const; +#ifdef TPETRA_ENABLE_DEPRECATED_CODE virtual void getGlobalRowView(GlobalOrdinal GlobalRow, Teuchos::ArrayView &indices, Teuchos::ArrayView &values) const; - +#endif //! Extract a const, non-persisting view of local indices in a specified row of the matrix. /*! \param DropRow - (In) Drop row number for which indices are desired. @@ -237,9 +263,15 @@ class SparsityFilter : Note: If \c DropRow does not belong to this node, then \c indices is set to null. */ + virtual void + getLocalRowView (LocalOrdinal LocalRow, + local_inds_host_view_type & indices, + values_host_view_type & values) const; +#ifdef TPETRA_ENABLE_DEPRECATED_CODE virtual void getLocalRowView(LocalOrdinal DropRow, Teuchos::ArrayView &indices, Teuchos::ArrayView &values) const; +#endif //! \brief Get a copy of the diagonal entries owned by this node, with local row indices. /*! Returns a distributed Vector object partitioned according to this matrix's row map, containing the @@ -315,9 +347,9 @@ class SparsityFilter : //! NumEntries_[i] contains the nonzero entries in row `i'. std::vector NumEntries_; //! Used in ExtractMyRowCopy, to avoid allocation each time. - mutable Teuchos::Array Indices_; + mutable nonconst_local_inds_host_view_type Indices_; //! 
Used in ExtractMyRowCopy, to avoid allocation each time - mutable Teuchos::Array Values_; + mutable nonconst_values_host_view_type Values_; };// class SparsityFilter diff --git a/packages/ifpack2/src/Ifpack2_SparsityFilter_def.hpp b/packages/ifpack2/src/Ifpack2_SparsityFilter_def.hpp index c0ae66b9e8a4..814420e2b019 100644 --- a/packages/ifpack2/src/Ifpack2_SparsityFilter_def.hpp +++ b/packages/ifpack2/src/Ifpack2_SparsityFilter_def.hpp @@ -86,8 +86,8 @@ SparsityFilter::SparsityFilter(const Teuchos::RCPgetNodeMaxNumRowEntries(); // ExtractMyRowCopy() will use these vectors - Indices_.resize(MaxNumEntries_); - Values_.resize(MaxNumEntries_); + Kokkos::resize(Indices_,MaxNumEntries_); + Kokkos::resize(Values_,MaxNumEntries_); size_t ActualMaxNumEntries = 0; for (size_t i = 0 ; i < NumRows_ ; ++i) { @@ -274,22 +274,36 @@ bool SparsityFilter::isFillComplete() const //========================================================================== template -void SparsityFilter::getGlobalRowCopy(GlobalOrdinal /* GlobalRow */, +void SparsityFilter:: +getGlobalRowCopy (GlobalOrdinal /*GlobalRow*/, + nonconst_global_inds_host_view_type &/*Indices*/, + nonconst_values_host_view_type &/*Values*/, + size_t& /*NumEntries*/) const { + throw std::runtime_error("Ifpack2::SparsityFilter does not implement getGlobalRowCopy."); +} + +//========================================================================== +#ifdef TPETRA_ENABLE_DEPRECATED_CODE +template +void SparsityFilter:: +getGlobalRowCopy(GlobalOrdinal /* GlobalRow */, const Teuchos::ArrayView &/* Indices */, const Teuchos::ArrayView &/* Values */, size_t &/* NumEntries */) const { throw std::runtime_error("Ifpack2::SparsityFilter does not implement getGlobalRowCopy."); } +#endif //========================================================================== template -void SparsityFilter::getLocalRowCopy(LocalOrdinal LocalRow, - const Teuchos::ArrayView &Indices, - const Teuchos::ArrayView &Values, - size_t &NumEntries) const +void 
SparsityFilter:: + getLocalRowCopy (LocalOrdinal LocalRow, + nonconst_local_inds_host_view_type &Indices, + nonconst_values_host_view_type &Values, + size_t& NumEntries) const { - TEUCHOS_TEST_FOR_EXCEPTION((LocalRow < 0 || (size_t) LocalRow >= NumRows_ || (size_t) Indices.size() < NumEntries_[LocalRow]), std::runtime_error, "Ifpack2::SparsityFilter::getLocalRowCopy invalid row or array size."); +TEUCHOS_TEST_FOR_EXCEPTION((LocalRow < 0 || (size_t) LocalRow >= NumRows_ || (size_t) Indices.size() < NumEntries_[LocalRow]), std::runtime_error, "Ifpack2::SparsityFilter::getLocalRowCopy invalid row or array size."); // Note: This function will work correctly if called by apply, say, with Indices_ and Values_ as // parameters. The structure of the loop below should make that obvious. @@ -298,7 +312,7 @@ void SparsityFilter::getLocalRowCopy(LocalOrdinal LocalRow, // This is because I need more space than that given by // the user (for the external nodes) size_t A_NumEntries=0; - A_->getLocalRowCopy(LocalRow,Indices_(),Values_(),A_NumEntries); + A_->getLocalRowCopy(LocalRow,Indices_,Values_,A_NumEntries); magnitudeType Threshold = Teuchos::ScalarTraits::zero(); std::vector Values2(A_NumEntries,Teuchos::ScalarTraits::zero()); @@ -339,25 +353,61 @@ void SparsityFilter::getLocalRowCopy(LocalOrdinal LocalRow, break; } + } +//========================================================================== +#ifdef TPETRA_ENABLE_DEPRECATED_CODE +template +void SparsityFilter:: +getLocalRowCopy(LocalOrdinal DropRow, + const Teuchos::ArrayView &Indices, + const Teuchos::ArrayView &Values, + size_t &NumEntries) const +{ + using IST = typename row_matrix_type::impl_scalar_type; + nonconst_local_inds_host_view_type ind_in(Indices.data(),Indices.size()); + nonconst_values_host_view_type val_in(reinterpret_cast(Values.data()),Values.size()); + getLocalRowCopy(DropRow,ind_in,val_in,NumEntries); +} +#endif + //========================================================================== template 
+void SparsityFilter::getGlobalRowView(GlobalOrdinal /* GlobalRow */, + global_inds_host_view_type &/*indices*/, + values_host_view_type &/*values*/) const +{ + throw std::runtime_error("Ifpack2::SparsityFilter: does not support getGlobalRowView."); +} + +#ifdef TPETRA_ENABLE_DEPRECATED_CODE +template void SparsityFilter::getGlobalRowView(GlobalOrdinal /* GlobalRow */, Teuchos::ArrayView &/* indices */, Teuchos::ArrayView &/* values */) const { throw std::runtime_error("Ifpack2::SparsityFilter: does not support getGlobalRowView."); } +#endif //========================================================================== template +void SparsityFilter::getLocalRowView(LocalOrdinal /* LocalRow */, + local_inds_host_view_type & /*indices*/, + values_host_view_type & /*values*/) const +{ + throw std::runtime_error("Ifpack2::SparsityFilter: does not support getLocalRowView."); +} +#ifdef TPETRA_ENABLE_DEPRECATED_CODE +template void SparsityFilter::getLocalRowView(LocalOrdinal /* LocalRow */, Teuchos::ArrayView &/* indices */, Teuchos::ArrayView &/* values */) const { throw std::runtime_error("Ifpack2::SparsityFilter: does not support getLocalRowView."); } +#endif //========================================================================== template @@ -404,21 +454,22 @@ void SparsityFilter::apply(const Tpetra::MultiVector(Values_.data()); if (mode==Teuchos::NO_TRANS){ for (size_t j = 0 ; j < Nnz ; ++j) for (size_t k = 0 ; k < NumVectors ; ++k) - y_ptr[k][i] += Values_[j] * x_ptr[k][Indices_[j]]; + y_ptr[k][i] += Values[j] * x_ptr[k][Indices_[j]]; } else if (mode==Teuchos::TRANS){ for (size_t j = 0 ; j < Nnz ; ++j) for (size_t k = 0 ; k < NumVectors ; ++k) - y_ptr[k][Indices_[j]] += Values_[j] * x_ptr[k][i]; + y_ptr[k][Indices_[j]] += Values[j] * x_ptr[k][i]; } else { //mode==Teuchos::CONJ_TRANS for (size_t j = 0 ; j < Nnz ; ++j) for (size_t k = 0 ; k < NumVectors ; ++k) - y_ptr[k][Indices_[j]] += Teuchos::ScalarTraits::conjugate(Values_[j]) * x_ptr[k][i]; + 
y_ptr[k][Indices_[j]] += Teuchos::ScalarTraits::conjugate(Values[j]) * x_ptr[k][i]; } } } diff --git a/packages/ifpack2/src/Ifpack2_TriDiContainer_decl.hpp b/packages/ifpack2/src/Ifpack2_TriDiContainer_decl.hpp index df2d0aca7e76..014ef0ed2f0d 100644 --- a/packages/ifpack2/src/Ifpack2_TriDiContainer_decl.hpp +++ b/packages/ifpack2/src/Ifpack2_TriDiContainer_decl.hpp @@ -138,7 +138,7 @@ class TriDiContainer using HostViewLocal = typename Kokkos::View; using typename ContainerImpl::HostSubviewLocal; using typename ContainerImpl::ConstHostSubviewLocal; - + using typename ContainerImpl::block_crs_matrix_type; static_assert (std::is_same>::value, "Ifpack2::TriDiContainer: MatrixType must be a Tpetra::RowMatrix specialization."); diff --git a/packages/ifpack2/src/Ifpack2_TriDiContainer_def.hpp b/packages/ifpack2/src/Ifpack2_TriDiContainer_def.hpp index a6f345273f47..ca53ad31bce1 100644 --- a/packages/ifpack2/src/Ifpack2_TriDiContainer_def.hpp +++ b/packages/ifpack2/src/Ifpack2_TriDiContainer_def.hpp @@ -161,14 +161,16 @@ void TriDiContainer::extract() LO localCol = this->translateRowToCol(blockRows[j]); colToBlockOffset[localCol] = blockStart + j; } + using h_inds_type = typename block_crs_matrix_type::local_inds_host_view_type; + using h_vals_type = typename block_crs_matrix_type::values_host_view_type; for(LO blockRow = 0; blockRow < LO(blockRows.size()); blockRow++) { //get a raw view of the whole block row - const LO* indices; - SC* values; - LO numEntries; + h_inds_type indices; + h_vals_type values; LO inputRow = this->blockRows_[blockStart + blockRow]; - this->inputBlockMatrix_->getLocalRowView(inputRow, indices, values, numEntries); + this->inputBlockMatrix_->getLocalRowView(inputRow, indices, values); + LO numEntries = (LO) indices.size(); for(LO k = 0; k < numEntries; k++) { LO colOffset = colToBlockOffset[indices[k]]; diff --git a/packages/ifpack2/src/Ifpack2_Utilities.cpp b/packages/ifpack2/src/Ifpack2_Utilities.cpp index 2e3e8807b5ab..e7f4cbc15eab 100644 --- 
a/packages/ifpack2/src/Ifpack2_Utilities.cpp +++ b/packages/ifpack2/src/Ifpack2_Utilities.cpp @@ -49,9 +49,8 @@ namespace Details { // precTypeUpper is the upper-case version of precType. std::string precTypeUpper (precType); if (precTypeUpper.size () > 0) { - std::locale locale; for (size_t k = 0; k < precTypeUpper.size (); ++k) { - precTypeUpper[k] = std::toupper (precTypeUpper[k], locale); + precTypeUpper[k] = ::toupper(precTypeUpper[k]); } } return precTypeUpper; diff --git a/packages/ifpack2/test/belos/build_problem.hpp b/packages/ifpack2/test/belos/build_problem.hpp index 085f527449bf..6654040415ef 100644 --- a/packages/ifpack2/test/belos/build_problem.hpp +++ b/packages/ifpack2/test/belos/build_problem.hpp @@ -206,8 +206,10 @@ build_problem (Teuchos::ParameterList& test_params, // new matrix. RCP A_constGraph (new crs_matrix_type (A->getCrsGraph ())); // Copy the values row by row from A into A_constGraph. - ArrayView ind; - ArrayView val; + using lids_type = typename crs_matrix_type::local_inds_host_view_type; + using vals_type = typename crs_matrix_type::values_host_view_type; + lids_type ind; + vals_type val; const LO numLocalRows = static_cast (A->getNodeNumRows ()); for (LO localRow = 0; localRow < numLocalRows; ++localRow) { A->getLocalRowView (localRow, ind, val); diff --git a/packages/ifpack2/test/belos/tpetra_native.cpp b/packages/ifpack2/test/belos/tpetra_native.cpp index 0be088d9732b..8fe9a760b394 100644 --- a/packages/ifpack2/test/belos/tpetra_native.cpp +++ b/packages/ifpack2/test/belos/tpetra_native.cpp @@ -32,8 +32,8 @@ deepCopyFillCompleteCrsMatrix (const Tpetra::CrsMatrix& A) (! 
A.isFillComplete (), std::invalid_argument, "deepCopyFillCompleteCrsMatrix: Input matrix A must be fillComplete."); RCP A_copy (new crs_matrix_type (A.getCrsGraph ())); - auto A_copy_lcl = A_copy->getLocalMatrix (); - auto A_lcl = A.getLocalMatrix (); + auto A_copy_lcl = A_copy->getLocalMatrixDevice (); + auto A_lcl = A.getLocalMatrixDevice (); Kokkos::deep_copy (A_copy_lcl.values, A_lcl.values); A_copy->fillComplete (A.getDomainMap (), A.getRangeMap ()); return A_copy; diff --git a/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestBlockCrsUtil.hpp b/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestBlockCrsUtil.hpp index feee75b31aad..fb96744ea6ac 100644 --- a/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestBlockCrsUtil.hpp +++ b/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestBlockCrsUtil.hpp @@ -244,9 +244,9 @@ struct BlockCrsMatrixMaker { // of local column abs sums. static void make_row_and_col_diag_dominant (Tpetra_BlockCrsMatrix& a) { const auto& g = a.getCrsGraph(); - const auto& rowptr = g.getLocalGraph().row_map; - const auto& colidx = g.getLocalGraph().entries; - const auto& values = a.getValuesHost(); + const auto& rowptr = g.getLocalGraphHost().row_map; + const auto& colidx = g.getLocalGraphHost().entries; + const auto& values = a.getValuesHostNonConst(); const auto row_map = g.getRowMap(); const auto col_map = g.getColMap(); @@ -385,10 +385,10 @@ struct BlockCrsMatrixMaker { } } - typename Tpetra_CrsGraph::local_graph_type g; + typename Tpetra_CrsGraph::local_graph_device_type g; { - typedef typename Tpetra_CrsGraph::local_graph_type::row_map_type row_map_type; - typedef typename Tpetra_CrsGraph::local_graph_type::entries_type entries_type; + typedef typename Tpetra_CrsGraph::local_graph_device_type::row_map_type row_map_type; + typedef typename Tpetra_CrsGraph::local_graph_device_type::entries_type entries_type; const GO nr = my_row_gids.size(); typename row_map_type::non_const_type::HostMirror rowptr("rowptr", nr + 1); typename 
entries_type::HostMirror colidx; @@ -440,7 +440,7 @@ struct BlockCrsMatrixMaker { Kokkos::deep_copy(row_map_tmp, rowptr); entries_type entries("entries", colidx.size()); Kokkos::deep_copy(entries, colidx); - g = typename Tpetra_CrsGraph::local_graph_type(entries, row_map_tmp); + g = typename Tpetra_CrsGraph::local_graph_device_type(entries, row_map_tmp); } if ( ! tridiags_only) { @@ -490,8 +490,8 @@ struct BlockCrsMatrixMaker { get_offdiag_idxs (const StructuredBlock& sb, const Tpetra_CrsGraph& g, const Tpetra_Map& col_map, const Int& lr, const Int& I, const Int& J, const Int& K, Int offdiag_idxs[2]) { offdiag_idxs[0] = offdiag_idxs[1] = -1; - const auto& rowptr = g.getLocalGraph().row_map; - const auto& colidx = g.getLocalGraph().entries; + const auto& rowptr = g.getLocalGraphHost().row_map; + const auto& colidx = g.getLocalGraphHost().entries; GO rid_offdiags[2]; rid_offdiags[0] = rid_offdiags[1] = Teuchos::OrdinalTraits::invalid(); if (K > 0) rid_offdiags[0] = sb.ijk2id(I, J, K-1); @@ -529,8 +529,8 @@ struct BlockCrsMatrixMaker { // Raw pointers for threading. 
auto m = mr.get(); auto g = gr.get(); - const auto& rowptr = g->getLocalGraph().row_map; - const auto& colidx = g->getLocalGraph().entries; + const auto& rowptr = g->getLocalGraphHost().row_map; + const auto& colidx = g->getLocalGraphHost().entries; const LO nr = rowptr.extent_int(0) - 1; const auto row_map = g->getRowMap().get(); const auto col_map = g->getColMap().get(); @@ -596,7 +596,7 @@ struct BlockCrsMatrixMaker { if (tridiag_is_identity || block_diag) zero_offdiag_idxs(offdiag_idxs, blockrow); for (size_t j = rowptr(lr); j < rowptr(lr+1); ++j) { - auto block = m->getLocalBlock(lr, colidx(j)); + auto block = m->getLocalBlockHostNonConst(lr, colidx(j)); const auto b = j - rowptr(lr); for (Int bi = 0; bi < bs; ++bi) for (Int bj = 0; bj < bs; ++bj) diff --git a/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestBlockRelaxation.cpp b/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestBlockRelaxation.cpp index 1b9689290d30..6c2f944c3937 100644 --- a/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestBlockRelaxation.cpp +++ b/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestBlockRelaxation.cpp @@ -122,14 +122,16 @@ TEUCHOS_UNIT_TEST_TEMPLATE_3_DECL(Ifpack2BlockRelaxation, Test0, Scalar, LocalOr prec.applyMat(x, y); - Teuchos::ArrayRCP yview = y.get1dView(); - //Since crsmatrix is a diagonal matrix with 2 on the diagonal, //y should be full of 2's now. 
Teuchos::ArrayRCP twos(num_rows_per_proc*2, 2); - TEST_COMPARE_FLOATING_ARRAYS(yview, twos(), Teuchos::ScalarTraits::eps()); + { + // Restrict scope of host access + Teuchos::ArrayRCP yview = y.get1dView(); + TEST_COMPARE_FLOATING_ARRAYS(yview, twos(), Teuchos::ScalarTraits::eps()); + } prec.apply(x, y); @@ -137,7 +139,11 @@ TEUCHOS_UNIT_TEST_TEMPLATE_3_DECL(Ifpack2BlockRelaxation, Test0, Scalar, LocalOr Teuchos::ArrayRCP halfs(num_rows_per_proc*2, 0.5); - TEST_COMPARE_FLOATING_ARRAYS(yview, halfs(), Teuchos::ScalarTraits::eps()); + { + // Restrict scope of host access + Teuchos::ArrayRCP yview = y.get1dView(); + TEST_COMPARE_FLOATING_ARRAYS(yview, halfs(), Teuchos::ScalarTraits::eps()); + } } // Test apply() with x == y. @@ -744,16 +750,20 @@ TEUCHOS_UNIT_TEST_TEMPLATE_3_DECL(Ifpack2BlockRelaxation, TestBlockContainers, S //the (block) graph should be diagonal auto crsgraph = tif_utest::create_banded_graph(num_rows_per_proc, 2); - auto bcrsmatrix = Teuchos::rcp(new - Tpetra::BlockCrsMatrix(*crsgraph, blockSize)); + using block_crs_matrix_type = Tpetra::BlockCrsMatrix; + using h_inds = typename block_crs_matrix_type::local_inds_host_view_type; + using h_vals = typename block_crs_matrix_type::nonconst_values_host_view_type; + + auto bcrsmatrix = Teuchos::rcp(new block_crs_matrix_type(*crsgraph, blockSize)); + //Fill in values of the the matrix for(LO l_row = 0; (size_t) l_row < bcrsmatrix->getNodeNumRows(); ++l_row) { - const LO * inds; - Scalar * vals; - LO numInd; - bcrsmatrix->getLocalRowView(l_row, inds, vals, numInd); + h_inds inds; + h_vals vals; + bcrsmatrix->getLocalRowViewNonConst(l_row, inds, vals); + LO numInd = (LO) inds.size(); for(int k = 0; k < blockSize * blockSize * numInd; k++) vals[k] = 0; for (LO j = 0; j < numInd; ++j) @@ -895,16 +905,18 @@ TEUCHOS_UNIT_TEST_TEMPLATE_3_DECL(Ifpack2BlockRelaxation, TestBlockContainersDec //the (block) graph should be diagonal auto crsgraph = tif_utest::create_banded_graph(num_rows_per_proc, 1); - auto bcrsmatrix 
= Teuchos::rcp(new - Tpetra::BlockCrsMatrix(*crsgraph, blockSize)); + using block_crs_matrix_type = Tpetra::BlockCrsMatrix; + using h_inds = typename block_crs_matrix_type::local_inds_host_view_type; + using h_vals = typename block_crs_matrix_type::nonconst_values_host_view_type; + auto bcrsmatrix = Teuchos::rcp(new block_crs_matrix_type(*crsgraph, blockSize)); //Fill in values of the the matrix for(LO l_row = 0; (size_t) l_row < bcrsmatrix->getNodeNumRows(); ++l_row) { - const LO * inds; - Scalar * vals; - LO numInd; - bcrsmatrix->getLocalRowView(l_row, inds, vals, numInd); + h_inds inds; + h_vals vals; + bcrsmatrix->getLocalRowViewNonConst(l_row, inds, vals); + LO numInd = (LO)inds.size(); for (LO j = 0; j < numInd; ++j) { const LO lcl_col = inds[j]; diff --git a/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestBlockTriDiContainerUtil.hpp b/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestBlockTriDiContainerUtil.hpp index ef7573dea2cb..912e8e5e5baa 100644 --- a/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestBlockTriDiContainerUtil.hpp +++ b/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestBlockTriDiContainerUtil.hpp @@ -79,7 +79,7 @@ struct BlockTriDiContainerTester { const auto col_map = g.getColMap(); const auto gid = row_map->getGlobalElement(row_lid_to_match); const auto col_lid = col_map->getLocalElement(gid); - auto block = A.getLocalBlock(row_lid, col_lid); + auto block = A.getLocalBlockHostNonConst(row_lid, col_lid); const Int bs = block.extent(1); for (Int bi = 0; bi < bs; ++bi) for (Int bj = 0; bj < bs; ++bj) diff --git a/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestFactory.cpp b/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestFactory.cpp index 5c400c75db79..b1b1c10af499 100644 --- a/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestFactory.cpp +++ b/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestFactory.cpp @@ -61,6 +61,8 @@ for preconditioners it produces. 
#include #include +#include + namespace { using Tpetra::global_size_t; typedef tif_utest::Node Node; @@ -187,7 +189,28 @@ TEUCHOS_UNIT_TEST_TEMPLATE_3_DECL(Ifpack2Factory, BlockCrs, Scalar, LocalOrdinal check_precond_basics(prec_relax, out, success); check_precond_apply(prec_relax, out, success); - // NOTE: As we expand support for the BlockCrsMatrix to other smoother types besides RELAXATION, tests should be added here. + // Basic block relaxation tests + prec_relax = factory.create ("BLOCKRELAXATION", rowmatrix); + TEST_EQUALITY(prec_relax != Teuchos::null, true); + check_precond_basics(prec_relax, out, success); + check_precond_apply(prec_relax, out, success); + + // Block-Tridiagonal + { + Teuchos::ParameterList params; + params.set("relaxation: container", "BlockTriDi"); + params.set("relaxation: type", "MT Split Jacobi"); + params.set("partitioner: type", "linear"); + params.set("partitioner: local parts", num_rows_per_proc); + + prec_relax = factory.create ("BLOCKRELAXATION", rowmatrix); + TEST_EQUALITY(prec_relax != Teuchos::null, true); + prec_relax->setParameters(params); + check_precond_basics(prec_relax, out, success); + check_precond_apply(prec_relax, out, success); + } + + } diff --git a/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestFiltering.cpp b/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestFiltering.cpp index f3e67a00902e..aedd3f28e200 100644 --- a/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestFiltering.cpp +++ b/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestFiltering.cpp @@ -128,18 +128,22 @@ TEUCHOS_UNIT_TEST_TEMPLATE_3_DECL(Ifpack2Filtering, Test0, Scalar, LocalOrdinal, // Apply w/ GetRow size_t max_nz_per_row=LocalA.getNodeMaxNumRowEntries(); - Teuchos::Array Indices(max_nz_per_row); - Teuchos::Array Values(max_nz_per_row); - Teuchos::ArrayRCP xview=lx.get1dView(); - - for(LocalOrdinal i=0; i < (LocalOrdinal)num_rows_per_proc; i++){ - size_t NumEntries; - LocalA.getLocalRowCopy(i,Indices(),Values(),NumEntries); - Scalar sum=0; - 
for(LocalOrdinal j=0; (size_t) j < NumEntries; j++){ - sum+=Values[j] * xview[Indices[j]]; + using lids_type = typename Tpetra::CrsMatrix::nonconst_local_inds_host_view_type; + using vals_type = typename Tpetra::CrsMatrix::nonconst_values_host_view_type; + lids_type Indices("Indices",max_nz_per_row); + vals_type Values("Values",max_nz_per_row); + {// Host view needs to be scoped + Teuchos::ArrayRCP xview=lx.get1dView(); + + for(LocalOrdinal i=0; i < (LocalOrdinal)num_rows_per_proc; i++){ + size_t NumEntries; + LocalA.getLocalRowCopy(i,Indices,Values,NumEntries); + Scalar sum=0; + for(LocalOrdinal j=0; (size_t) j < NumEntries; j++){ + sum+=Values[j] * xview[Indices[j]]; + } + lz.replaceLocalValue(i,sum); } - lz.replaceLocalValue(i,sum); } // Diff diff --git a/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestHelpers.hpp b/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestHelpers.hpp index 9c1a71e13f40..ba437e387d3d 100644 --- a/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestHelpers.hpp +++ b/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestHelpers.hpp @@ -682,8 +682,11 @@ template Teuchos::RCP > create_banded_block_matrix(const Teuchos::RCP >& graph, const int blockSize, const size_t rbandwidth) { - Teuchos::RCP > bcrsmatrix - = Teuchos::rcp(new Tpetra::BlockCrsMatrix(*graph, blockSize)); + using block_crs_matrix_type = Tpetra::BlockCrsMatrix; + using h_inds = typename block_crs_matrix_type::local_inds_host_view_type; + using h_vals = typename block_crs_matrix_type::nonconst_values_host_view_type; + + Teuchos::RCP bcrsmatrix = Teuchos::rcp(new block_crs_matrix_type(*graph, blockSize)); const Tpetra::Map& meshRowMap = *bcrsmatrix->getRowMap(); const int blockMatSize = blockSize*blockSize; @@ -699,11 +702,10 @@ Teuchos::RCPgetLocalRowView(l_row, inds, vals, numInd); + h_inds inds; + h_vals vals; + bcrsmatrix->getLocalRowViewNonConst(l_row, inds, vals); + LocalOrdinal numInd = (LocalOrdinal)inds.size(); for (LocalOrdinal j = 0; j < numInd; ++j) { const LocalOrdinal 
lcl_col = inds[j]; @@ -954,6 +956,16 @@ Teuchos::RCP > c class NotCrsMatrix : public Ifpack2::Details::RowMatrix > { public: + typedef typename Tpetra::RowMatrix MatrixType; + typedef typename MatrixType::global_inds_host_view_type global_inds_host_view_type; + typedef typename MatrixType::local_inds_host_view_type local_inds_host_view_type; + typedef typename MatrixType::values_host_view_type values_host_view_type; + + typedef typename MatrixType::nonconst_global_inds_host_view_type nonconst_global_inds_host_view_type; + typedef typename MatrixType::nonconst_local_inds_host_view_type nonconst_local_inds_host_view_type; + typedef typename MatrixType::nonconst_values_host_view_type nonconst_values_host_view_type; + + NotCrsMatrix (Teuchos::RCP >& A) : A_(A){;} virtual ~NotCrsMatrix(){;} virtual Teuchos::RCP > getComm() const {return A_->getComm();} @@ -984,26 +996,50 @@ Teuchos::RCP > c virtual bool isFillComplete() const {return A_->isFillComplete();} virtual bool supportsRowViews() const {return A_->supportsRowViews();} + virtual void + getGlobalRowCopy (GlobalOrdinal GlobalRow, + nonconst_global_inds_host_view_type &indices, + nonconst_values_host_view_type &values,size_t &NumEntries) const {A_->getGlobalRowCopy(GlobalRow,indices,values,NumEntries);} +#ifdef TPETRA_ENABLE_DEPRECATED_CODE virtual void getGlobalRowCopy (GlobalOrdinal GlobalRow, const Teuchos::ArrayView &Indices, const Teuchos::ArrayView &Values, size_t &NumEntries) const {A_->getGlobalRowCopy(GlobalRow,Indices,Values,NumEntries);} +#endif + virtual void + getLocalRowCopy (LocalOrdinal LocalRow, + nonconst_local_inds_host_view_type & indices, + nonconst_values_host_view_type & values,size_t &NumEntries) const {A_->getLocalRowCopy(LocalRow,indices,values,NumEntries);} +#ifdef TPETRA_ENABLE_DEPRECATED_CODE virtual void getLocalRowCopy (LocalOrdinal LocalRow, const Teuchos::ArrayView &Indices, const Teuchos::ArrayView &Values, size_t &NumEntries) const 
{A_->getLocalRowCopy(LocalRow,Indices,Values,NumEntries);} +#endif + + virtual void + getGlobalRowView (GlobalOrdinal GlobalRow, + global_inds_host_view_type &indices, + values_host_view_type &values) const {A_->getGlobalRowView(GlobalRow,indices,values);} +#ifdef TPETRA_ENABLE_DEPRECATED_CODE virtual void getGlobalRowView (GlobalOrdinal GlobalRow, Teuchos::ArrayView &indices, Teuchos::ArrayView &values) const {A_->getGlobalRowView(GlobalRow,indices,values);} - +#endif + virtual void + getLocalRowView (LocalOrdinal LocalRow, + local_inds_host_view_type & indices, + values_host_view_type & values) const {A_->getLocalRowView(LocalRow,indices,values);} +#ifdef TPETRA_ENABLE_DEPRECATED_CODE virtual void getLocalRowView (LocalOrdinal LocalRow, Teuchos::ArrayView &indices, Teuchos::ArrayView &values) const {A_->getLocalRowView(LocalRow,indices,values);} +#endif virtual void getLocalDiagCopy (Tpetra::Vector &diag) const {A_->getLocalDiagCopy(diag);} virtual void leftScale (const Tpetra::Vector& x) {A_->leftScale(x);} diff --git a/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestIlukGraph.cpp b/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestIlukGraph.cpp index 69859e9c6a57..9195659ffc11 100644 --- a/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestIlukGraph.cpp +++ b/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestIlukGraph.cpp @@ -72,7 +72,7 @@ TEUCHOS_UNIT_TEST_TEMPLATE_2_DECL(Ifpack2IlukGraph, IlukGraphTest0, LocalOrdinal //Teuchos::FancyOStream& out, bool& success typedef Tpetra::CrsGraph crs_graph_type; - typedef typename crs_graph_type::local_graph_type local_graph_type; + typedef typename crs_graph_type::local_graph_device_type local_graph_type; typedef typename local_graph_type::row_map_type lno_row_view_t; typedef typename local_graph_type::entries_type lno_nonzero_view_t; typedef typename local_graph_type::device_type::memory_space TemporaryMemorySpace; diff --git a/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestLocalSparseTriangularSolver.cpp 
b/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestLocalSparseTriangularSolver.cpp index 82a8e1edc081..4c4e6d0945c9 100644 --- a/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestLocalSparseTriangularSolver.cpp +++ b/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestLocalSparseTriangularSolver.cpp @@ -124,7 +124,7 @@ localSolve (Tpetra::MultiVector< (mode == Teuchos::TRANS ? "T" : "N"); const std::string diag = implicitUnitDiag ? "U" : "N"; - auto A_lcl = A.getLocalMatrix (); + auto A_lcl = A.getLocalMatrixHost (); if (X.isConstantStride () && Y.isConstantStride ()) { auto X_lcl = X.getLocalViewHost (Tpetra::Access::OverwriteAll); @@ -207,6 +207,7 @@ void testCompareToLocalSolve (bool& success, Teuchos::FancyOStream& out, typedef GlobalOrdinal GO; typedef Tpetra::Map map_type; typedef typename map_type::device_type device_type; + typedef Tpetra::CrsGraph crs_graph_type; typedef Tpetra::CrsMatrix crs_matrix_type; typedef Tpetra::RowMatrix row_matrix_type; typedef Tpetra::MultiVector mv_type; @@ -363,10 +364,10 @@ void testCompareToLocalSolve (bool& success, Teuchos::FancyOStream& out, // (it shouldn't). 
RCP A_copy; { - typedef typename crs_matrix_type::local_matrix_type local_matrix_type; - typedef typename crs_matrix_type::local_graph_type local_graph_type; + typedef typename crs_matrix_type::local_matrix_device_type local_matrix_type; + typedef typename crs_graph_type::local_graph_device_type local_graph_type; - local_matrix_type A_lcl = A->getLocalMatrix (); + local_matrix_type A_lcl = A->getLocalMatrixDevice (); typename local_matrix_type::row_map_type::non_const_type ptr ("A_copy.ptr", A_lcl.graph.row_map.extent (0)); Kokkos::deep_copy (ptr, A_lcl.graph.row_map); @@ -763,64 +764,41 @@ testArrowMatrixWithDense (bool& success, Teuchos::FancyOStream& out, const LO lc TEST_EQUALITY( c(lclNumRows-1), c_n_expected ); } -template::scalar_type, - class LO = Tpetra::Vector<>::local_ordinal_type, - class GO = Tpetra::Vector<>::global_ordinal_type> -void testArrowMatrix (bool& success, Teuchos::FancyOStream& out) + +template +bool +testArrowMatrixAssembly(const int lclNumRows, + const bool explicitlyStoreUnitDiagonalOfL, + RCP rowMap, + RCP colMap, + RCP domMap, + RCP ranMap, + RCP & L, + RCP & U, + Teuchos::FancyOStream& out) { - typedef Tpetra::Map map_type; - typedef typename map_type::device_type device_type; - typedef Tpetra::CrsMatrix crs_matrix_type; - typedef Tpetra::RowMatrix row_matrix_type; - typedef Tpetra::Vector vec_type; - typedef Ifpack2::LocalSparseTriangularSolver solver_type; + int gblSuccess=1, lclSuccess=1; + bool success=true; + using LO = typename crs_matrix_type::local_ordinal_type; + using SC = typename crs_matrix_type::scalar_type; + typedef Kokkos::Details::ArithTraits KAT; typedef typename KAT::val_type IST; typedef typename KAT::mag_type mag_type; - int lclSuccess = 1; - int gblSuccess = 1; - - const bool explicitlyStoreUnitDiagonalOfL = false; - - Teuchos::OSTab tab0 (out); - out << "Ifpack2::LocalSparseTriangularSolver: Test with arrow matrix" << endl; - Teuchos::OSTab tab1 (out); - - auto comm = Tpetra::getDefaultComm (); - - const LO 
lclNumRows = 8; // power of two (see above) - const LO lclNumCols = lclNumRows; - const GO gblNumRows = comm->getSize () * lclNumRows; - const GO indexBase = 0; - RCP rowMap = - rcp (new map_type (static_cast (gblNumRows), - static_cast (lclNumRows), - indexBase, comm)); - - // At this point, we know Kokkos has been initialized, so test the - // dense version of the problem. - testArrowMatrixWithDense (success, out, lclNumRows); - - // If we construct an upper or lower triangular matrix with an - // implicit unit diagonal, then we need to specify the column Map - // explicitly. Otherwise, the matrix will report having the wrong - // number of columns. In this case, the local matrix is square and - // every column is populated, so we can set column Map = row Map. - RCP colMap = rowMap; - RCP domMap = rowMap; - RCP ranMap = rowMap; - - typedef typename crs_matrix_type::local_graph_type local_graph_type; - typedef typename crs_matrix_type::local_matrix_type local_matrix_type; + typedef typename crs_matrix_type::local_graph_device_type local_graph_type; + typedef typename crs_matrix_type::local_matrix_device_type local_matrix_type; typedef typename local_matrix_type::row_map_type::non_const_type row_offsets_type; typedef typename local_graph_type::entries_type::non_const_type col_inds_type; typedef typename local_matrix_type::values_type::non_const_type values_type; + const LO lclNumCols = lclNumRows; + + auto comm = rowMap->getComm(); + // // The suffix _d here stands for (GPU) "device," and the suffix _h - // stands for (CPU) "host." + // stands for (CPU) "host." // - row_offsets_type L_ptr_d ("ptr", lclNumRows + 1); auto L_ptr_h = Kokkos::create_mirror_view (L_ptr_d); row_offsets_type U_ptr_d ("ptr", lclNumRows + 1); @@ -906,7 +884,7 @@ void testArrowMatrix (bool& success, Teuchos::FancyOStream& out) TEST_EQUALITY( gblSuccess, 1 ); if (! 
gblSuccess) { out << "Aborting test" << endl; - return; + return gblSuccess; } Kokkos::deep_copy (L_ptr_d, L_ptr_h); @@ -918,7 +896,6 @@ void testArrowMatrix (bool& success, Teuchos::FancyOStream& out) Kokkos::deep_copy (U_val_d, U_val_h); out << "Create the lower triangular Tpetra::CrsMatrix L" << endl; - RCP L; TEST_NOTHROW( L = rcp (new crs_matrix_type (rowMap, colMap, L_ptr_d, L_ind_d, L_val_d)) ); TEST_ASSERT( ! L.is_null () ); lclSuccess = success ? 1 : 0; @@ -927,7 +904,7 @@ void testArrowMatrix (bool& success, Teuchos::FancyOStream& out) TEST_EQUALITY( gblSuccess, 1 ); if (! gblSuccess) { out << "Aborting test" << endl; - return; + return gblSuccess; } out << "Call fillComplete on the lower triangular matrix L" << endl; TEST_NOTHROW( L->fillComplete (domMap, ranMap) ); @@ -937,17 +914,114 @@ void testArrowMatrix (bool& success, Teuchos::FancyOStream& out) TEST_EQUALITY( gblSuccess, 1 ); if (! gblSuccess) { out << "Aborting test" << endl; - return; + return gblSuccess; } + out << "Create the upper triangular Tpetra::CrsMatrix U" << endl; + TEST_NOTHROW( U = rcp (new crs_matrix_type (rowMap, colMap, U_ptr_d, U_ind_d, U_val_d)) ); + TEST_ASSERT( ! U.is_null () ); + lclSuccess = success ? 1 : 0; + gblSuccess = 0; // output argument + reduceAll (*comm, REDUCE_MIN, lclSuccess, outArg (gblSuccess)); + TEST_EQUALITY( gblSuccess, 1 ); + if (! gblSuccess) { + out << "Aborting test" << endl; + return gblSuccess; + } + out << "Call fillComplete on the upper triangular matrix U" << endl; + TEST_NOTHROW( U->fillComplete (domMap, ranMap) ); + lclSuccess = success ? 1 : 0; + gblSuccess = 0; // output argument + reduceAll (*comm, REDUCE_MIN, lclSuccess, outArg (gblSuccess)); + TEST_EQUALITY( gblSuccess, 1 ); + if (! 
gblSuccess) { + out << "Aborting test" << endl; + return gblSuccess; + } + return gblSuccess; +} + + +template::scalar_type, + class LO = Tpetra::Vector<>::local_ordinal_type, + class GO = Tpetra::Vector<>::global_ordinal_type> +void testArrowMatrix (bool& success, Teuchos::FancyOStream& out) +{ + typedef Tpetra::Map map_type; + typedef typename map_type::device_type device_type; + typedef Tpetra::CrsMatrix crs_matrix_type; + typedef Tpetra::RowMatrix row_matrix_type; + typedef Tpetra::Vector vec_type; + typedef Ifpack2::LocalSparseTriangularSolver solver_type; + typedef Kokkos::Details::ArithTraits KAT; + typedef typename KAT::val_type IST; + typedef typename KAT::mag_type mag_type; + int lclSuccess = 1; + int gblSuccess = 1; + + const bool explicitlyStoreUnitDiagonalOfL = false; + + Teuchos::OSTab tab0 (out); + out << "Ifpack2::LocalSparseTriangularSolver: Test with arrow matrix" << endl; + Teuchos::OSTab tab1 (out); + + auto comm = Tpetra::getDefaultComm (); + + const LO lclNumRows = 8; // power of two (see above) + const LO lclNumCols = lclNumRows; + const GO gblNumRows = comm->getSize () * lclNumRows; + const GO indexBase = 0; + RCP rowMap = + rcp (new map_type (static_cast (gblNumRows), + static_cast (lclNumRows), + indexBase, comm)); + + // At this point, we know Kokkos has been initialized, so test the + // dense version of the problem. + testArrowMatrixWithDense (success, out, lclNumRows); + + // If we construct an upper or lower triangular matrix with an + // implicit unit diagonal, then we need to specify the column Map + // explicitly. Otherwise, the matrix will report having the wrong + // number of columns. In this case, the local matrix is square and + // every column is populated, so we can set column Map = row Map. 
+ RCP colMap = rowMap; + RCP domMap = rowMap; + RCP ranMap = rowMap; + + // All of the matrix assembly stuff had to get hived off into a different + // scope to keep the later accessors from violating the "you can't have a + // host and a device view at the same time" assumption + RCP L, U; + + gblSuccess=testArrowMatrixAssembly(lclNumRows, + explicitlyStoreUnitDiagonalOfL, + rowMap,colMap,domMap,ranMap, + L,U,out); + if(!gblSuccess) return; + + typedef typename crs_matrix_type::local_graph_device_type local_graph_type; + typedef typename crs_matrix_type::local_matrix_device_type local_matrix_type; + typedef typename local_matrix_type::row_map_type::non_const_type row_offsets_type; + typedef typename local_graph_type::entries_type::non_const_type col_inds_type; + typedef typename local_matrix_type::values_type::non_const_type values_type; + + typedef typename crs_matrix_type::local_inds_host_view_type const_local_inds_type; + typedef typename crs_matrix_type::values_host_view_type const_values_type; + + const IST ONE = KAT::one (); + const IST TWO = KAT::one () + KAT::one (); + // Don't cast directly from an integer type to IST, + // since if IST is complex, that cast may not exist. + const IST N = static_cast (static_cast (lclNumRows)); + const IST d = TWO * N; + out << "Make sure that the last row of L is correct" << endl; { Teuchos::OSTab tab2 (out); - // FIXME (mfh 23 Aug 2016) This may depend on UVM. - // We should instead rely on dual view semantics here. - Teuchos::ArrayView lclColInds; - Teuchos::ArrayView vals; + const_local_inds_type lclColInds; + const_values_type vals; L->getLocalRowView (lclNumRows - 1, lclColInds, vals); if (explicitlyStoreUnitDiagonalOfL) { @@ -983,28 +1057,6 @@ void testArrowMatrix (bool& success, Teuchos::FancyOStream& out) return; } - out << "Create the upper triangular Tpetra::CrsMatrix U" << endl; - RCP U; - TEST_NOTHROW( U = rcp (new crs_matrix_type (rowMap, colMap, U_ptr_d, U_ind_d, U_val_d)) ); - TEST_ASSERT( ! 
U.is_null () ); - lclSuccess = success ? 1 : 0; - gblSuccess = 0; // output argument - reduceAll (*comm, REDUCE_MIN, lclSuccess, outArg (gblSuccess)); - TEST_EQUALITY( gblSuccess, 1 ); - if (! gblSuccess) { - out << "Aborting test" << endl; - return; - } - out << "Call fillComplete on the upper triangular matrix U" << endl; - TEST_NOTHROW( U->fillComplete (domMap, ranMap) ); - lclSuccess = success ? 1 : 0; - gblSuccess = 0; // output argument - reduceAll (*comm, REDUCE_MIN, lclSuccess, outArg (gblSuccess)); - TEST_EQUALITY( gblSuccess, 1 ); - if (! gblSuccess) { - out << "Aborting test" << endl; - return; - } out << "Create the solver for L" << endl; RCP L_solver; diff --git a/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestLocalSparseTriangularSolver2.cpp b/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestLocalSparseTriangularSolver2.cpp index 061e66a121db..4434a9c6d3ee 100644 --- a/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestLocalSparseTriangularSolver2.cpp +++ b/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestLocalSparseTriangularSolver2.cpp @@ -18,7 +18,7 @@ namespace { const crs_graph_type& G_crs = dynamic_cast (G); - auto G_lcl = G_crs.getLocalGraph (); + auto G_lcl = G_crs.getLocalGraphDevice (); auto lclRowMap = G.getRowMap ()->getLocalMap (); auto lclColMap = G.getColMap ()->getLocalMap (); return determineLocalTriangularStructure (G_lcl, lclRowMap, lclColMap, true); diff --git a/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestOverlappingRowMatrix.cpp b/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestOverlappingRowMatrix.cpp index cb136bd10e61..954d5acc0bc0 100644 --- a/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestOverlappingRowMatrix.cpp +++ b/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestOverlappingRowMatrix.cpp @@ -228,8 +228,8 @@ void reducedMatvec(const OverlappedMatrixClass & A, if(overlapLevel >= (int) hstarts.size()) throw std::runtime_error("reducedMatvec: Exceeded available overlap"); - auto undA_lcl = undA->getLocalMatrix (); - 
auto extA_lcl = extA->getLocalMatrix (); + auto undA_lcl = undA->getLocalMatrixDevice (); + auto extA_lcl = extA->getLocalMatrixDevice (); auto X_lcl = X.getLocalViewDevice (Tpetra::Access::ReadOnly); auto Y_lcl = Y.getLocalViewDevice (Tpetra::Access::OverwriteAll); diff --git a/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestRBILUK.cpp b/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestRBILUK.cpp index b7458a923f8f..61b7a7e77ade 100644 --- a/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestRBILUK.cpp +++ b/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestRBILUK.cpp @@ -392,8 +392,8 @@ TEUCHOS_UNIT_TEST_TEMPLATE_3_DECL(RBILUK, BandedBlockCrsMatrixWithDropping, Scal TEUCHOS_UNIT_TEST_TEMPLATE_3_DECL(RBILUK, BlockMatrixOps, Scalar, LocalOrdinal, GlobalOrdinal) { - typedef Kokkos::View > little_block_type; - typedef Kokkos::View > little_vec_type; + typedef Kokkos::View > little_block_type; + typedef Kokkos::View > little_vec_type; typedef typename Kokkos::Details::ArithTraits::val_type impl_scalar_type; typedef Teuchos::ScalarTraits STS; diff --git a/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestRelaxation.cpp b/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestRelaxation.cpp index 3d383702d1cf..2469df5a8980 100644 --- a/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestRelaxation.cpp +++ b/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestRelaxation.cpp @@ -1133,6 +1133,64 @@ TEUCHOS_UNIT_TEST_TEMPLATE_3_DECL(Ifpack2Relaxation, MTSGS, Scalar, LocalOrdinal } } +TEUCHOS_UNIT_TEST_TEMPLATE_3_DECL(Ifpack2Relaxation, MTSGS_LongRows, Scalar, LocalOrdinal, GlobalOrdinal) +{ + using Teuchos::RCP; + using Teuchos::rcp; + using Teuchos::ParameterList; + using crs_matrix_type = Tpetra::CrsMatrix; + using row_matrix_type = Tpetra::RowMatrix; + using MV = Tpetra::MultiVector; + using map_type = Tpetra::Map; + using prec_type = Ifpack2::Relaxation; + using STS = Teuchos::ScalarTraits; + using STM = typename STS::magnitudeType; + std::string version = Ifpack2::Version(); + out << 
"Ifpack2::Version(): " << version << std::endl; + //Generate banded test matrix + RCP rowmap = tif_utest::create_tpetra_map(100); + RCP A = tif_utest::create_banded_matrix(rowmap, 3); + RCP prec = rcp(new prec_type(A)); + ParameterList goodParams; + goodParams.set("relaxation: type", "MT Symmetric Gauss-Seidel"); + goodParams.set("relaxation: sweeps", 3); + goodParams.set("relaxation: long row threshold", 3); + //Try setting up precondition with incompatible type, and make sure this throws. + { + ParameterList badParams = goodParams; + badParams.set("relaxation: type", "Gauss-Seidel"); + TEST_THROW (prec->setParameters (badParams), std::invalid_argument); + } + //Try setting up cluster GS preconditioner with long row algorithm enabled - should also throw. + { + ParameterList badParams = goodParams; + badParams.set("relaxation: mtgs cluster size", 4); + TEST_THROW(prec->setParameters (badParams), std::invalid_argument); + } + prec->setParameters (goodParams); + prec->initialize(); + prec->compute(); + //Set up linear problem + const int numVecs = 10; + MV x(A->getDomainMap(), numVecs, true); + MV b(rowmap, numVecs, false); + b.randomize(); + Kokkos::View initNorms("Initial norms", numVecs); + //Residual norms for starting solution of zero + b.norm2(initNorms); + prec->apply(b, x); + //Compute residual vector = b - Ax + MV residual(b, Teuchos::Copy); + A->apply(x, residual, Teuchos::NO_TRANS, -STS::one(), STS::one()); + Kokkos::View resNorms("Residual norms", numVecs); + residual.norm2(resNorms); + //Make sure all residual norms are significantly smaller than initial + for(int i = 0; i < numVecs; i++) + { + TEST_COMPARE(resNorms(i), <, 0.5 * initNorms(i)); + } +} + TEUCHOS_UNIT_TEST_TEMPLATE_3_DECL(Ifpack2Relaxation, ClusterMTSGS, Scalar, LocalOrdinal, GlobalOrdinal) { using Teuchos::RCP; @@ -1193,6 +1251,7 @@ TEUCHOS_UNIT_TEST_TEMPLATE_3_DECL(Ifpack2Relaxation, ClusterMTSGS, Scalar, Local TEUCHOS_UNIT_TEST_TEMPLATE_3_INSTANT( Ifpack2Relaxation, 
TestLowerTriangularBlockCrsMatrix, Scalar, LO, GO ) \ TEUCHOS_UNIT_TEST_TEMPLATE_3_INSTANT( Ifpack2Relaxation, TestUpperTriangularBlockCrsMatrix, Scalar, LO, GO ) \ TEUCHOS_UNIT_TEST_TEMPLATE_3_INSTANT( Ifpack2Relaxation, MTSGS, Scalar, LO, GO ) \ + TEUCHOS_UNIT_TEST_TEMPLATE_3_INSTANT( Ifpack2Relaxation, MTSGS_LongRows, Scalar, LO, GO ) \ TEUCHOS_UNIT_TEST_TEMPLATE_3_INSTANT( Ifpack2Relaxation, ClusterMTSGS, Scalar, LO, GO ) //TEUCHOS_UNIT_TEST_TEMPLATE_3_INSTANT( Ifpack2Relaxation, SGS_mult_sweeps, Scalar, LO, GO ) diff --git a/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestSingleProcessRILUK.cpp b/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestSingleProcessRILUK.cpp index d9ac6214f887..404d63f0adb9 100644 --- a/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestSingleProcessRILUK.cpp +++ b/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestSingleProcessRILUK.cpp @@ -73,7 +73,7 @@ template void remove_diags_and_scale(const MatrixType& L, const MatrixType& U, Teuchos::RCP& Ln, Teuchos::RCP& Un, Teuchos::RCP& Dn) { - typedef typename MatrixType::local_matrix_type local_matrix_type; + typedef typename MatrixType::local_matrix_device_type local_matrix_type; typedef typename std::remove_const::type size_type; typedef typename std::remove_const::type ordinal_type; typedef typename std::remove_const::type value_type; @@ -87,11 +87,11 @@ void remove_diags_and_scale(const MatrixType& L, const MatrixType& U, typedef Kokkos::TeamPolicy team_policy; typedef typename Kokkos::TeamPolicy::member_type member_type; - auto L_rowmap = L.getLocalMatrix().graph.row_map; - auto L_entries = L.getLocalMatrix().graph.entries; + auto L_rowmap = L.getLocalMatrixDevice().graph.row_map; + auto L_entries = L.getLocalMatrixDevice().graph.entries; auto L_values = L.getLocalValuesView(); - auto U_rowmap = U.getLocalMatrix().graph.row_map; - auto U_entries = U.getLocalMatrix().graph.entries; + auto U_rowmap = U.getLocalMatrixDevice().graph.row_map; + auto U_entries = 
U.getLocalMatrixDevice().graph.entries; auto U_values = U.getLocalValuesView(); rowmap_type Ln_rowmap ("Ln_rowmap", L_rowmap.extent(0)); @@ -290,6 +290,23 @@ void Ifpack2RILUKSingleProcess_test1 (bool& success, Teuchos::FancyOStream& out, RCP rowmap = tif_utest::create_tpetra_map (num_rows_per_proc); + // Matrix + // [ 2 .1 0 0 0] + // [.1 2 0 0 0] + // [ 0 .1 2 .1 0] + // [ 0 0 .1 2 .1] + // [ 0 0 0 .1 2] + + // Matlab's Factors + // L + // Diagonal = 1 (implied) + // Subdiagonal (approx) = .05, .0501, .0501 .0501 + + // U + // Diagonal (approx) = 2 1.995 1.995 1.995 1.995 + // Superdiagonal (approx) = .1 .1 .1 .1 + + if (rowmap->getComm ()->getSize () > 1) { out << "This test may only be run in serial " "or with a single MPI process." << endl; @@ -300,6 +317,12 @@ void Ifpack2RILUKSingleProcess_test1 (bool& success, Teuchos::FancyOStream& out, RCP crsmatrix = tif_utest::create_test_matrix2(rowmap); + {//CMS + auto out = Teuchos::getFancyOStream (Teuchos::rcpFromRef (std::cout)); + *out<<"***** A *****"<describe(*out,Teuchos::VERB_EXTREME); + } + //----------------Default trisolver----------------// { out << "Creating preconditioner" << endl; @@ -316,7 +339,18 @@ void Ifpack2RILUKSingleProcess_test1 (bool& success, Teuchos::FancyOStream& out, out << "Calling initialize() and compute()" << endl; prec.initialize(); prec.compute(); - + + {//CMS + auto out = Teuchos::getFancyOStream (Teuchos::rcpFromRef (std::cout)); + *out<<"***** Test L *****"< permutedMatrix = Teuchos::rcp(new crs_matrix_type(permRowMap, 5)); - Teuchos::Array Inds(5); - Teuchos::Array pInds(5); - Teuchos::Array Vals(5); - Teuchos::Array pVals(5); + typename crs_matrix_type::nonconst_global_inds_host_view_type Inds("Inds",5), pInds("pInds",5); + typename crs_matrix_type::nonconst_values_host_view_type Vals("Vals",5), pVals("pVals",5); size_t numEntries; for (global_size_t i=0; igetGlobalRowCopy(i,Inds(),Vals(),numEntries); - pInds.resize(numEntries); - pVals.resize(numEntries); + 
crsmatrix->getGlobalRowCopy(i,Inds,Vals,numEntries); + Kokkos::resize(pInds,numEntries); + Kokkos::resize(pVals,numEntries); for (size_t j=0; jinsertGlobalValues(origToPerm[i],pInds(),pVals()); + permutedMatrix->insertGlobalValues(origToPerm[i],numEntries,pVals.data(),pInds.data()); } permutedMatrix->fillComplete(); diff --git a/packages/intrepid2/src/Discretization/Basis/Intrepid2_Basis.hpp b/packages/intrepid2/src/Discretization/Basis/Intrepid2_Basis.hpp index 13954baec214..7ca8e527efd9 100644 --- a/packages/intrepid2/src/Discretization/Basis/Intrepid2_Basis.hpp +++ b/packages/intrepid2/src/Discretization/Basis/Intrepid2_Basis.hpp @@ -362,92 +362,7 @@ using HostBasisPtr = BasisPtr allocateOutputView( const int numPoints, const EOperator operatorType = OPERATOR_VALUE) const - { - const bool operatorSupported = (operatorType == OPERATOR_VALUE) || (operatorType == OPERATOR_GRAD) || (operatorType == OPERATOR_CURL) || (operatorType == OPERATOR_DIV); - INTREPID2_TEST_FOR_EXCEPTION(!operatorSupported, std::invalid_argument, "operator is not supported by allocateOutputView()"); - - const int numFields = this->getCardinality(); - const int spaceDim = basisCellTopology_.getDimension(); - - // KK: this needs to be updated after nate works on tensorthings - using OutputViewAllocatable = Kokkos::DynRankView; - - switch (functionSpace_) - { - case FUNCTION_SPACE_HGRAD: - if (operatorType == OPERATOR_VALUE) - { - // scalar-valued container - OutputViewAllocatable dataView("BasisValues HGRAD VALUE data", numFields, numPoints); - return dataView; - } - else if (operatorType == OPERATOR_GRAD) - { - OutputViewAllocatable dataView("BasisValues HGRAD GRAD data", numFields, numPoints, spaceDim); - return dataView; - } - else - { - INTREPID2_TEST_FOR_EXCEPTION(true, std::invalid_argument, "operator/space combination not supported by allocateOutputView()"); - } - case FUNCTION_SPACE_HDIV: - if (operatorType == OPERATOR_VALUE) - { - // vector-valued container - OutputViewAllocatable 
dataView("BasisValues HDIV VALUE data", numFields, numPoints, spaceDim); - return dataView; - } - else if (operatorType == OPERATOR_DIV) - { - // scalar-valued curl - OutputViewAllocatable dataView("BasisValues HDIV DIV data", numFields, numPoints); - return dataView; - } - else - { - INTREPID2_TEST_FOR_EXCEPTION(true, std::invalid_argument, "operator/space combination not supported by allocateOutputView()"); - } - case FUNCTION_SPACE_HCURL: - if (operatorType == OPERATOR_VALUE) - { - OutputViewAllocatable dataView("BasisValues HCURL VALUE data", numFields, numPoints, spaceDim); - return dataView; - } - else if (operatorType == OPERATOR_CURL) - { - if (spaceDim != 2) - { - // vector-valued curl - OutputViewAllocatable dataView("BasisValues HCURL CURL data", numFields, numPoints, spaceDim); - return dataView; - } - else - { - // scalar-valued curl - OutputViewAllocatable dataView("BasisValues HCURL CURL data (scalar)", numFields, numPoints); - return dataView; - } - } - else - { - INTREPID2_TEST_FOR_EXCEPTION(true, std::invalid_argument, "operator/space combination not supported by allocateOutputView()"); - } - case FUNCTION_SPACE_HVOL: - if (operatorType == OPERATOR_VALUE) - { - // vector-valued container - OutputViewAllocatable dataView("BasisValues HVOL VALUE data", numFields, numPoints); - return dataView; - } - else - { - INTREPID2_TEST_FOR_EXCEPTION(true, std::invalid_argument, "operator/space combination not supported by allocateOutputView()"); - } - default: - INTREPID2_TEST_FOR_EXCEPTION(true, std::invalid_argument, "operator/space combination not supported by allocateOutputView()"); - } - } + Kokkos::DynRankView allocateOutputView( const int numPoints, const EOperator operatorType = OPERATOR_VALUE) const; /** \brief Allocate BasisValues container suitable for passing to the getValues() variant that takes a TensorPoints container as argument. 
@@ -456,7 +371,8 @@ using HostBasisPtr = BasisPtr allocateBasisValues( TensorPoints points, const EOperator operatorType = OPERATOR_VALUE) const { - const bool operatorSupported = (operatorType == OPERATOR_VALUE) || (operatorType == OPERATOR_GRAD) || (operatorType == OPERATOR_CURL) || (operatorType == OPERATOR_DIV); + const bool operatorIsDk = (operatorType >= OPERATOR_D1) && (operatorType <= OPERATOR_D10); + const bool operatorSupported = (operatorType == OPERATOR_VALUE) || (operatorType == OPERATOR_GRAD) || (operatorType == OPERATOR_CURL) || (operatorType == OPERATOR_DIV) || operatorIsDk; INTREPID2_TEST_FOR_EXCEPTION(!operatorSupported, std::invalid_argument, "operator is not supported by allocateBasisValues"); // // this default implementation employs a trivial tensor-product structure; make sure that points also have a trivial tensor product structure: diff --git a/packages/intrepid2/src/Discretization/Basis/Intrepid2_BasisDef.hpp b/packages/intrepid2/src/Discretization/Basis/Intrepid2_BasisDef.hpp index c87fe3a9e0d8..07f018e3a8f2 100644 --- a/packages/intrepid2/src/Discretization/Basis/Intrepid2_BasisDef.hpp +++ b/packages/intrepid2/src/Discretization/Basis/Intrepid2_BasisDef.hpp @@ -862,6 +862,108 @@ namespace Intrepid2 { ">>> ERROR: (Intrepid2::getValues_HGRAD_Args) dim 0 (number of basis functions) of outputValues must equal basis cardinality."); } + template + Kokkos::DynRankView + Basis::allocateOutputView( const int numPoints, const EOperator operatorType) const + { + const bool operatorIsDk = (operatorType >= OPERATOR_D1) && (operatorType <= OPERATOR_D10); + const bool operatorSupported = (operatorType == OPERATOR_VALUE) || (operatorType == OPERATOR_GRAD) || (operatorType == OPERATOR_CURL) || (operatorType == OPERATOR_DIV) || operatorIsDk; + INTREPID2_TEST_FOR_EXCEPTION(!operatorSupported, std::invalid_argument, "operator is not supported by allocateOutputView()"); + + const int numFields = this->getCardinality(); + const int spaceDim = 
basisCellTopology_.getDimension(); + + using OutputViewAllocatable = Kokkos::DynRankView; + + switch (functionSpace_) + { + case FUNCTION_SPACE_HGRAD: + if (operatorType == OPERATOR_VALUE) + { + // scalar-valued container + OutputViewAllocatable dataView("BasisValues HGRAD VALUE data", numFields, numPoints); + return dataView; + } + else if (operatorType == OPERATOR_GRAD) + { + OutputViewAllocatable dataView("BasisValues HGRAD GRAD data", numFields, numPoints, spaceDim); + return dataView; + } + else if (operatorIsDk) + { + ordinal_type dkCardinality = getDkCardinality(operatorType, spaceDim); + OutputViewAllocatable dataView("BasisValues HGRAD Dk data", numFields, numPoints, dkCardinality); + return dataView; + } + else + { + INTREPID2_TEST_FOR_EXCEPTION(true, std::invalid_argument, "operator/space combination not supported by allocateOutputView()"); + } + case FUNCTION_SPACE_HDIV: + if (operatorType == OPERATOR_VALUE) + { + // vector-valued container + OutputViewAllocatable dataView("BasisValues HDIV VALUE data", numFields, numPoints, spaceDim); + return dataView; + } + else if (operatorType == OPERATOR_DIV) + { + // scalar-valued curl + OutputViewAllocatable dataView("BasisValues HDIV DIV data", numFields, numPoints); + return dataView; + } + else + { + INTREPID2_TEST_FOR_EXCEPTION(true, std::invalid_argument, "operator/space combination not supported by allocateOutputView()"); + } + case FUNCTION_SPACE_HCURL: + if (operatorType == OPERATOR_VALUE) + { + OutputViewAllocatable dataView("BasisValues HCURL VALUE data", numFields, numPoints, spaceDim); + return dataView; + } + else if (operatorType == OPERATOR_CURL) + { + if (spaceDim != 2) + { + // vector-valued curl + OutputViewAllocatable dataView("BasisValues HCURL CURL data", numFields, numPoints, spaceDim); + return dataView; + } + else + { + // scalar-valued curl + OutputViewAllocatable dataView("BasisValues HCURL CURL data (scalar)", numFields, numPoints); + return dataView; + } + } + else + { + 
INTREPID2_TEST_FOR_EXCEPTION(true, std::invalid_argument, "operator/space combination not supported by allocateOutputView()"); + } + case FUNCTION_SPACE_HVOL: + if (operatorType == OPERATOR_VALUE) + { + // vector-valued container + OutputViewAllocatable dataView("BasisValues HVOL VALUE data", numFields, numPoints); + return dataView; + } + else if (operatorIsDk) + { + ordinal_type dkCardinality = getDkCardinality(operatorType, spaceDim); + OutputViewAllocatable dataView("BasisValues HVOL Dk data", numFields, numPoints, dkCardinality); + return dataView; + } + else + { + INTREPID2_TEST_FOR_EXCEPTION(true, std::invalid_argument, "operator/space combination not supported by allocateOutputView()"); + } + default: + INTREPID2_TEST_FOR_EXCEPTION(true, std::invalid_argument, "operator/space combination not supported by allocateOutputView()"); + } + } } #endif diff --git a/packages/intrepid2/src/Discretization/Integration/Intrepid2_IntegrationToolsDef.hpp b/packages/intrepid2/src/Discretization/Integration/Intrepid2_IntegrationToolsDef.hpp index 9313684fe262..f02256131951 100644 --- a/packages/intrepid2/src/Discretization/Integration/Intrepid2_IntegrationToolsDef.hpp +++ b/packages/intrepid2/src/Discretization/Integration/Intrepid2_IntegrationToolsDef.hpp @@ -399,7 +399,7 @@ namespace Intrepid2 { } }); - if (composedTransform_.underlyingMatchesNotional()) + if (composedTransform_.underlyingMatchesLogical()) { const auto & composedTransformView = composedTransform_.getUnderlyingView4(); Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,composedTransformView.extent_int(1)), [&] (const int& pointOrdinal) { diff --git a/packages/intrepid2/src/Shared/Intrepid2_ArgExtractor.hpp b/packages/intrepid2/src/Shared/Intrepid2_ArgExtractor.hpp new file mode 100644 index 000000000000..74a5fbba01c5 --- /dev/null +++ b/packages/intrepid2/src/Shared/Intrepid2_ArgExtractor.hpp @@ -0,0 +1,214 @@ +// @HEADER +// ************************************************************************ 
+// +// Intrepid2 Package +// Copyright (2007) Sandia Corporation +// +// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive +// license for use of this work by or on behalf of the U.S. Government. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? 
Contact Kyungjoo Kim (kyukim@sandia.gov), +// Mauro Perego (mperego@sandia.gov), or +// Nate Roberts (nvrober@sandia.gov) +// +// ************************************************************************ +// @HEADER + +/** \file Intrepid2_ArgExtractor.hpp + \brief Header file with various static argument-extractor classes. These are useful for writing efficient, templated code in terms of a subset of the arguments passed into a specified functor. See Intrepid2::Data, and specifically its storeInPlaceCombination() implementation, for an example. + \author Created by Nate Roberts. +*/ + +#ifndef __Intrepid2_ArgExtractor_HPP__ +#define __Intrepid2_ArgExtractor_HPP__ + +#include "Intrepid2_ConfigDefs.hpp" +#include "Intrepid2_DeviceAssert.hpp" +#include "Intrepid2_Types.hpp" +#include "Intrepid2_Utils.hpp" + +#include "Kokkos_Core.hpp" + +namespace Intrepid2 { + /** \class Intrepid2::ConstantArgExtractor + \brief Argument extractor class which ignores the input arguments in favor of passing a single 0 argument to the provided container. + */ + template + struct ConstantArgExtractor + { + template + static KOKKOS_INLINE_FUNCTION reference_type get(const ViewType &view, const IntArgs&... intArgs) + { + return view(0); + } + }; + + /** \class Intrepid2::FullArgExtractor + \brief Argument extractor class which passes all arguments to the provided container. + */ + template + struct FullArgExtractor + { + template + static KOKKOS_INLINE_FUNCTION reference_type get(const ViewType &view, const IntArgs&... intArgs) + { + return view(intArgs...); + } + }; + + /** \class Intrepid2::SingleArgExtractor + \brief Argument extractor class which passes a single argument, indicated by the template parameter whichArg, to the provided container. 
+ */ + template + struct SingleArgExtractor + { + template< bool B, class T = reference_type > + using enable_if_t = typename std::enable_if::type; + + template + static KOKKOS_INLINE_FUNCTION + enable_if_t + get(const ViewType &view, const int_type &i0) + { + return view(i0); + } + + template + static KOKKOS_INLINE_FUNCTION + enable_if_t + get(const ViewType &view, const int_type &i0, const IntArgs&... intArgs) + { + return view(i0); + } + + template + static KOKKOS_INLINE_FUNCTION + enable_if_t + get(const ViewType &view, const int_type &i0, const int_type &i1) + { + return view(i1); + } + + template + static KOKKOS_INLINE_FUNCTION + enable_if_t + get(const ViewType &view, const int_type &i0, const int_type &i1, const IntArgs&... intArgs) + { + return view(i1); + } + + template + static KOKKOS_INLINE_FUNCTION + enable_if_t + get(const ViewType &view, const int_type &i0, const int_type &i1, const int_type &i2) + { + return view(i2); + } + + template + static KOKKOS_INLINE_FUNCTION + enable_if_t + get(const ViewType &view, const int_type &i0, const int_type &i1, const int_type &i2, const IntArgs&... intArgs) + { + return view(i2); + } + + template + static KOKKOS_INLINE_FUNCTION + enable_if_t + get(const ViewType &view, const int_type &i0, const int_type &i1, const int_type &i2, const int_type &i3) + { + return view(i3); + } + + template + static KOKKOS_INLINE_FUNCTION + enable_if_t + get(const ViewType &view, const int_type &i0, const int_type &i1, const int_type &i2, const int_type &i3, const IntArgs&... intArgs) + { + return view(i3); + } + + template + static KOKKOS_INLINE_FUNCTION + enable_if_t + get(const ViewType &view, const int_type &i0, const int_type &i1, const int_type &i2, const int_type &i3, const int_type &i4) + { + return view(i4); + } + + template + static KOKKOS_INLINE_FUNCTION + enable_if_t + get(const ViewType &view, const int_type &i0, const int_type &i1, const int_type &i2, const int_type &i3, const int_type &i4, const IntArgs&... 
intArgs) + { + return view(i4); + } + + template + static KOKKOS_INLINE_FUNCTION + enable_if_t + get(const ViewType &view, const int_type &i0, const int_type &i1, const int_type &i2, const int_type &i3, const int_type &i4, const int_type &i5) + { + return view(i5); + } + + template + static KOKKOS_INLINE_FUNCTION + enable_if_t + get(const ViewType &view, const int_type &i0, const int_type &i1, const int_type &i2, const int_type &i3, const int_type &i4, const int_type &i5, const IntArgs&... intArgs) + { + return view(i5); + } + + // the commented-out code below is a cleaner way to implement the above, but we can't support this on CUDA until we can require KOKKOS_ENABLE_CUDA_CONSTEXPR + /* + template + static KOKKOS_INLINE_FUNCTION + enable_if_t + get(const ViewType &view, const IntArgs&... intArgs) + { + const auto & arg = std::get(std::tuple(intArgs...)); + return view(arg); + } + */ + + template + static KOKKOS_INLINE_FUNCTION + enable_if_t= sizeof...(IntArgs), reference_type> + get(const ViewType &view, const IntArgs&... 
intArgs) + { + INTREPID2_TEST_FOR_EXCEPTION_DEVICE_SAFE(true,std::invalid_argument,"calling SingleArgExtractor with out-of-bounds argument"); + Kokkos::abort("Intrepid2::SingleArgExtractor: calling SingleArgExtractor with out-of-bounds argument\n"); + return view(0); // this line added to avoid missing return statement warning under nvcc + } + }; +} +#endif diff --git a/packages/intrepid2/src/Shared/Intrepid2_Data.hpp b/packages/intrepid2/src/Shared/Intrepid2_Data.hpp index feccd646e6a5..1495bee47c3e 100644 --- a/packages/intrepid2/src/Shared/Intrepid2_Data.hpp +++ b/packages/intrepid2/src/Shared/Intrepid2_Data.hpp @@ -8,11 +8,12 @@ #ifndef Intrepid2_Data_h #define Intrepid2_Data_h +#include "Intrepid2_ArgExtractor.hpp" #include "Intrepid2_ScalarView.hpp" #include "Intrepid2_Utils.hpp" /** \file Intrepid2_Data.hpp - \brief Defines the Data class, a wrapper around a Kokkos::View that allows data that is constant or repeating in various notional dimensions to be stored just once, while providing a similar interface to that of View. + \brief Defines the Data class, a wrapper around a Kokkos::View that allows data that is constant or repeating in various logical dimensions to be stored just once, while providing a similar interface to that of View. \author Created by N.V. Roberts. */ @@ -44,7 +45,7 @@ namespace Intrepid2 { DataVariationType variationType; int dataExtent; int variationModulus; // should be equal to dataExtent variationType other than MODULAR and CONSTANT - int blockPlusDiagonalFirstNonDiagonal = -1; // only relevant for variationType == BLOCK_PLUS_DIAGONAL + int blockPlusDiagonalLastNonDiagonal = -1; // only relevant for variationType == BLOCK_PLUS_DIAGONAL }; //! Returns DimensionInfo for a Data container that combines (through multiplication, say, or addition) the two specified DimensionInfo specifications in one of its dimensions. 
@@ -134,8 +135,8 @@ namespace Intrepid2 { combinedDimensionInfo.variationType = GENERAL; combinedDimensionInfo.dataExtent = max(myDataExtent,otherDataExtent); combinedDimensionInfo.variationModulus = combinedDimensionInfo.dataExtent; - // for this case, we want to take the minimum of the two Data objects' blockPlusDiagonalFirstNonDiagonal as the combined object's blockPlusDiagonalFirstNonDiagonal - combinedDimensionInfo.blockPlusDiagonalFirstNonDiagonal = min(myData.blockPlusDiagonalFirstNonDiagonal, otherData.blockPlusDiagonalFirstNonDiagonal); + // for this case, we want to take the minimum of the two Data objects' blockPlusDiagonalLastNonDiagonal as the combined object's blockPlusDiagonalLastNonDiagonal + combinedDimensionInfo.blockPlusDiagonalLastNonDiagonal = min(myData.blockPlusDiagonalLastNonDiagonal, otherData.blockPlusDiagonalLastNonDiagonal); } break; case GENERAL: @@ -153,19 +154,19 @@ namespace Intrepid2 { /** \class Intrepid2::Data - \brief Wrapper around a Kokkos::View that allows data that is constant or repeating in various notional dimensions to be stored just once, while providing a similar interface to that of View. + \brief Wrapper around a Kokkos::View that allows data that is constant or repeating in various logical dimensions to be stored just once, while providing a similar interface to that of View. - The Data class distinguishes between the notional extent and the data extent. For example, one could construct a data container corresponding to constant (cell, point) data with 100 cells + The Data class distinguishes between the logical extent and the data extent. For example, one could construct a data container corresponding to constant (cell, point) data with 100 cells and 25 points per cell as follows: auto cpData = Data(value, Kokkos::Array{100,25}); - The data extent of the container is 1 in every dimension, while the notional extent is 100 in the first dimension, and 25 in the second. 
Similarly, the notional rank of the container is 2, but the rank of the + The data extent of the container is 1 in every dimension, while the logical extent is 100 in the first dimension, and 25 in the second. Similarly, the logical rank of the container is 2, but the rank of the underlying View is 1. - There are four possible variation types in a notional dimension: - - GENERAL: the data varies arbitrarily. The underlying View will have the same extent in its corresponding dimension (which may be distinct from the notional dimension). + There are four possible variation types in a logical dimension: + - GENERAL: the data varies arbitrarily. The underlying View will have the same extent in its corresponding dimension (which may be distinct from the logical dimension). - CONSTANT: the data does not vary. The underlying View will not have a dimension corresponding to this dimension. - MODULAR: the data varies with a modulus. The underlying View will have a corresponding dimension with extent corresponding to the modulus. - - BLOCK_PLUS_DIAGONAL: the data varies in this notional dimension and one other, corresponding to a square matrix that has some (possibly trivial) full block, with diagonal entries in the remaining dimensions. The underlying View will have one dimension corresponding to the two notional dimensions, with extent corresponding to the number of nonzeros in the matrix. + - BLOCK_PLUS_DIAGONAL: the data varies in this logical dimension and one other, corresponding to a square matrix that has some (possibly trivial) full block, with diagonal entries in the remaining dimensions. The underlying View will have one dimension corresponding to the two logical dimensions, with extent corresponding to the number of nonzeros in the matrix. 
*/ template @@ -187,8 +188,8 @@ namespace Intrepid2 { Kokkos::Array variationModulus_; // for each dimension, a value by which indices should be modulused (only used when variationType_ is MODULAR) int blockPlusDiagonalLastNonDiagonal_ = -1; // last row/column that is part of the non-diagonal part of the matrix indicated by BLOCK_PLUS_DIAGONAL (if any dimensions are thus marked) - bool hasNontrivialModulusUNUSED_; // this is a little nutty, but having this UNUSED member variable improves performance, probably by shifting the alignment of underlyingMatchesNotional_. This is true with nvcc; it may also be true with Apple clang - bool underlyingMatchesNotional_; // if true, this Data object has the same rank and extent as the underlying view + bool hasNontrivialModulusUNUSED_; // this is a little nutty, but having this UNUSED member variable improves performance, probably by shifting the alignment of underlyingMatchesLogical_. This is true with nvcc; it may also be true with Apple clang + bool underlyingMatchesLogical_; // if true, this Data object has the same rank and extent as the underlying view Kokkos::Array activeDims_; int numActiveDims_; // how many of the 7 entries are actually filled in @@ -253,7 +254,7 @@ namespace Intrepid2 { numActiveDims_ = 0; int blockPlusDiagonalCount = 0; - underlyingMatchesNotional_ = true; + underlyingMatchesLogical_ = true; for (ordinal_type i=0; i<7; i++) { if (variationType_[i] == GENERAL) @@ -271,7 +272,7 @@ namespace Intrepid2 { } else if (variationType_[i] == MODULAR) { - underlyingMatchesNotional_ = false; + underlyingMatchesLogical_ = false; if (extents_[i] != getUnderlyingViewExtent(numActiveDims_)) { const int dataExtent = getUnderlyingViewExtent(numActiveDims_); @@ -291,7 +292,7 @@ namespace Intrepid2 { } else if (variationType_[i] == BLOCK_PLUS_DIAGONAL) { - underlyingMatchesNotional_ = false; + underlyingMatchesLogical_ = false; blockPlusDiagonalCount++; if (blockPlusDiagonalCount == 1) // first dimension thus marked 
--> active { @@ -318,7 +319,7 @@ namespace Intrepid2 { { if (i < rank_) { - underlyingMatchesNotional_ = false; + underlyingMatchesLogical_ = false; } variationModulus_[i] = 1; // trivial modulus } @@ -326,7 +327,7 @@ namespace Intrepid2 { if (rank_ != dataRank_) { - underlyingMatchesNotional_ = false; + underlyingMatchesLogical_ = false; } for (int d=numActiveDims_; d<7; d++) @@ -340,75 +341,608 @@ namespace Intrepid2 { INTREPID2_TEST_FOR_EXCEPTION(variationModulus_[d] == 0, std::logic_error, "variationModulus should not ever be 0"); } } - + public: - //! Returns an l-value reference to the specified nominal entry in the underlying view. Note that for variation types other than GENERAL, multiple valid argument sets will refer to the same memory location. Intended for Intrepid2 developers and expert users only. - KOKKOS_INLINE_FUNCTION - reference_type getWritableEntry(const int & i0, const int & i1, const int & i2, - const int & i3, const int & i4, const int & i5, - const int & i6) const + //! For use with Data object into which a value will be stored. + struct FullArgExtractorWritableData { - if (underlyingMatchesNotional_) + template + static KOKKOS_INLINE_FUNCTION reference_type get(const ViewType &view, const IntArgs&... 
intArgs) { - switch (dataRank_) + return view.getWritableEntry(intArgs...); + } + }; + + template + struct InPlaceCombinationFunctor + { + private: + ThisUnderlyingViewType this_underlying_; + AUnderlyingViewType A_underlying_; + BUnderlyingViewType B_underlying_; + BinaryOperator binaryOperator_; + int innerLoopSize_; + public: + InPlaceCombinationFunctor(ThisUnderlyingViewType this_underlying, AUnderlyingViewType A_underlying, BUnderlyingViewType B_underlying, + BinaryOperator binaryOperator) + : + this_underlying_(this_underlying), + A_underlying_(A_underlying), + B_underlying_(B_underlying), + binaryOperator_(binaryOperator) + { + INTREPID2_TEST_FOR_EXCEPTION(includeInnerLoop,std::invalid_argument,"If includeInnerLoop is true, must specify the size of the inner loop"); + } + + InPlaceCombinationFunctor(ThisUnderlyingViewType this_underlying, AUnderlyingViewType A_underlying, BUnderlyingViewType B_underlying, + BinaryOperator binaryOperator, int innerLoopSize) + : + this_underlying_(this_underlying), + A_underlying_(A_underlying), + B_underlying_(B_underlying), + binaryOperator_(binaryOperator), + innerLoopSize_(innerLoopSize) + { + INTREPID2_TEST_FOR_EXCEPTION(includeInnerLoop,std::invalid_argument,"If includeInnerLoop is true, must specify the size of the inner loop"); + } + + template + KOKKOS_INLINE_FUNCTION + enable_if_t + operator()(const IntArgs&... args) const + { + auto & result = ArgExtractorThis::get( this_underlying_, args... ); + const auto & A_val = ArgExtractorA::get( A_underlying_, args... ); + const auto & B_val = ArgExtractorB::get( B_underlying_, args... ); + + result = binaryOperator_(A_val,B_val); + } + + template + KOKKOS_INLINE_FUNCTION + enable_if_t + operator()(const IntArgs&... 
args) const + { + using int_type = std::tuple_element_t<0, std::tuple>; + for (int_type iFinal=0; iFinal(innerLoopSize_); iFinal++) { - case 1: return data1_.access(i0,i1,i2,i3,i4,i5,i6);; - case 2: return data2_.access(i0,i1,i2,i3,i4,i5,i6);; - case 3: return data3_.access(i0,i1,i2,i3,i4,i5,i6);; - case 4: return data4_.access(i0,i1,i2,i3,i4,i5,i6);; - case 5: return data5_.access(i0,i1,i2,i3,i4,i5,i6);; - case 6: return data6_.access(i0,i1,i2,i3,i4,i5,i6);; - case 7: return data7_.access(i0,i1,i2,i3,i4,i5,i6);; - default: - INTREPID2_TEST_FOR_EXCEPTION_DEVICE_SAFE(true, std::logic_error, "invalid dataRank_"); + auto & result = ArgExtractorThis::get( this_underlying_, args..., iFinal ); + const auto & A_val = ArgExtractorA::get( A_underlying_, args..., iFinal ); + const auto & B_val = ArgExtractorB::get( B_underlying_, args..., iFinal ); + + result = binaryOperator_(A_val,B_val); } } + }; + + //! storeInPlaceCombination implementation for rank < 7, with compile-time underlying views and argument interpretation. Intended for internal and expert use. + template + void storeInPlaceCombination(PolicyType &policy, ThisUnderlyingViewType &this_underlying, + AUnderlyingViewType &A_underlying, BUnderlyingViewType &B_underlying, + BinaryOperator &binaryOperator, ArgExtractorThis argThis, ArgExtractorA argA, ArgExtractorB argB) + { + using Functor = InPlaceCombinationFunctor; + Functor functor(this_underlying, A_underlying, B_underlying, binaryOperator); + Kokkos::parallel_for("compute in-place", policy, functor); + } + + //! storeInPlaceCombination with compile-time rank -- implementation for rank < 7. 
+ template + enable_if_t + storeInPlaceCombination(const Data &A, const Data &B, BinaryOperator binaryOperator) + { + auto policy = dataExtentRangePolicy(); + using PolicyType = decltype(policy); + + // shallow copy of this to avoid implicit references to this in calls to getWritableEntry() below + Data thisData = *this; - const Kokkos::Array args {i0,i1,i2,i3,i4,i5,i6}; - Kokkos::Array refEntry; + const bool A_1D = A.getUnderlyingViewRank() == 1; + const bool B_1D = B.getUnderlyingViewRank() == 1; + const bool this_1D = this->getUnderlyingViewRank() == 1; + const bool A_constant = A_1D && (A.getUnderlyingViewSize() == 1); + const bool B_constant = B_1D && (B.getUnderlyingViewSize() == 1); + const bool this_constant = this_1D && (this->getUnderlyingViewSize() == 1); + const bool A_full = A.underlyingMatchesLogical(); + const bool B_full = B.underlyingMatchesLogical(); + const bool this_full = this->underlyingMatchesLogical(); - for (int d=0; d<7; d++) + const ConstantArgExtractor constArg; + + const FullArgExtractor fullArgs; + const FullArgExtractor fullArgsConst; + const FullArgExtractorWritableData fullArgsWritable; + + const SingleArgExtractor arg0; + const SingleArgExtractor arg1; + const SingleArgExtractor arg2; + const SingleArgExtractor arg3; + const SingleArgExtractor arg4; + const SingleArgExtractor arg5; + + // this lambda returns -1 if there is not a rank-1 underlying view whose data extent matches the logical extent in the corresponding dimension; + // otherwise, it returns the logical index of the corresponding dimension. 
+ auto get1DArgIndex = [](const Data &data) -> int + { + const auto & variationTypes = data.getVariationTypes(); + for (int d=0; dgetUnderlyingView<1>(); + auto & A_underlying = A.getUnderlyingView<1>(); + auto & B_underlying = B.getUnderlyingView<1>(); + storeInPlaceCombination(policy, this_underlying, A_underlying, B_underlying, binaryOperator, thisAE, AAE, BAE); + } + else if (this_full && A_full && B_full) + { + auto thisAE = fullArgs; + auto AAE = fullArgs; + auto BAE = fullArgs; + + auto & this_underlying = this->getUnderlyingView(); + auto & A_underlying = A.getUnderlyingView(); + auto & B_underlying = B.getUnderlyingView(); + + storeInPlaceCombination(policy, this_underlying, A_underlying, B_underlying, binaryOperator, thisAE, AAE, BAE); + } + else if (A_constant) + { + auto AAE = constArg; + auto & A_underlying = A.getUnderlyingView<1>(); + if (this_full) + { + auto thisAE = fullArgs; + auto & this_underlying = this->getUnderlyingView(); + + if (B_full) + { + auto BAE = fullArgs; + auto & B_underlying = B.getUnderlyingView(); + storeInPlaceCombination(policy, this_underlying, A_underlying, B_underlying, binaryOperator, thisAE, AAE, BAE); + } + else // this_full, not B_full: B may have modular data, etc. + { + auto BAE = fullArgsConst; + storeInPlaceCombination(policy, this_underlying, A_underlying, B, binaryOperator, thisAE, AAE, BAE); + } + } + else // this is not full + { + // below, we optimize for the case of 1D data in B, when A is constant. 
Still need to handle other cases… + if (B_1D && (get1DArgIndex(B) != -1) ) + { + // since A is constant, that implies that this_1D is true, and has the same 1DArgIndex + const int argIndex = get1DArgIndex(B); + auto & B_underlying = B.getUnderlyingView<1>(); + auto & this_underlying = this->getUnderlyingView<1>(); + switch (argIndex) + { + case 0: storeInPlaceCombination(policy, this_underlying, A_underlying, B_underlying, binaryOperator, arg0, AAE, arg0); break; + case 1: storeInPlaceCombination(policy, this_underlying, A_underlying, B_underlying, binaryOperator, arg1, AAE, arg1); break; + case 2: storeInPlaceCombination(policy, this_underlying, A_underlying, B_underlying, binaryOperator, arg2, AAE, arg2); break; + case 3: storeInPlaceCombination(policy, this_underlying, A_underlying, B_underlying, binaryOperator, arg3, AAE, arg3); break; + case 4: storeInPlaceCombination(policy, this_underlying, A_underlying, B_underlying, binaryOperator, arg4, AAE, arg4); break; + case 5: storeInPlaceCombination(policy, this_underlying, A_underlying, B_underlying, binaryOperator, arg5, AAE, arg5); break; + default: INTREPID2_TEST_FOR_EXCEPTION(true, std::invalid_argument, "Invalid/unexpected arg index"); + } + } + else + { + // since storing to Data object requires a call to getWritableEntry(), we use FullArgExtractorWritableData + auto thisAE = fullArgsWritable; + auto BAE = fullArgsConst; + storeInPlaceCombination(policy, thisData, A_underlying, B, binaryOperator, thisAE, AAE, BAE); + } + } + } + else if (B_constant) { - if (variationType_[d] == GENERAL) + auto BAE = constArg; + auto & B_underlying = B.getUnderlyingView<1>(); + if (this_full) { - refEntry[d] = args[d]; + auto thisAE = fullArgs; + auto & this_underlying = this->getUnderlyingView(); + if (A_full) + { + auto AAE = fullArgs; + auto & A_underlying = A.getUnderlyingView(); + + storeInPlaceCombination(policy, this_underlying, A_underlying, B_underlying, binaryOperator, thisAE, AAE, BAE); + } + else // this_full, not 
A_full: A may have modular data, etc. + { + // use A (the Data object). This could be further optimized by using A's underlying View and an appropriately-defined ArgExtractor. + auto AAE = fullArgsConst; + storeInPlaceCombination(policy, this_underlying, A, B_underlying, binaryOperator, thisAE, AAE, BAE); + } } - else if (variationType_[d] == MODULAR) + else // this is not full { - refEntry[d] = args[d] % variationModulus_[d]; + // below, we optimize for the case of 1D data in A, when B is constant. Still need to handle other cases… + if (A_1D && (get1DArgIndex(A) != -1) ) + { + // since B is constant, that implies that this_1D is true, and has the same 1DArgIndex as A + const int argIndex = get1DArgIndex(A); + auto & A_underlying = A.getUnderlyingView<1>(); + auto & this_underlying = this->getUnderlyingView<1>(); + switch (argIndex) + { + case 0: storeInPlaceCombination(policy, this_underlying, A_underlying, B_underlying, binaryOperator, arg0, arg0, BAE); break; + case 1: storeInPlaceCombination(policy, this_underlying, A_underlying, B_underlying, binaryOperator, arg1, arg1, BAE); break; + case 2: storeInPlaceCombination(policy, this_underlying, A_underlying, B_underlying, binaryOperator, arg2, arg2, BAE); break; + case 3: storeInPlaceCombination(policy, this_underlying, A_underlying, B_underlying, binaryOperator, arg3, arg3, BAE); break; + case 4: storeInPlaceCombination(policy, this_underlying, A_underlying, B_underlying, binaryOperator, arg4, arg4, BAE); break; + case 5: storeInPlaceCombination(policy, this_underlying, A_underlying, B_underlying, binaryOperator, arg5, arg5, BAE); break; + default: INTREPID2_TEST_FOR_EXCEPTION(true, std::invalid_argument, "Invalid/unexpected arg index"); + } + } + else + { + // since storing to Data object requires a call to getWritableEntry(), we use FullArgExtractorWritableData + auto thisAE = fullArgsWritable; + auto AAE = fullArgsConst; + storeInPlaceCombination(policy, thisData, A, B_underlying, binaryOperator, thisAE, AAE, 
BAE); + } } - else if (variationType_[d] == BLOCK_PLUS_DIAGONAL) + } + else // neither A nor B constant + { + if (this_1D && (get1DArgIndex(thisData) != -1)) { - const int numNondiagonalEntries = blockPlusDiagonalNumNondiagonalEntries(blockPlusDiagonalLastNonDiagonal_); + // possible ways that "this" could have full-extent, 1D data + // 1. A constant, B 1D + // 2. A 1D, B constant + // 3. A 1D, B 1D + // The constant possibilities are already addressed above, leaving us with (3). Note that A and B don't have to be full-extent, however + const int argThis = get1DArgIndex(thisData); + const int argA = get1DArgIndex(A); // if not full-extent, will be -1 + const int argB = get1DArgIndex(B); // ditto - const int &i = args[d]; - const int &j = args[d+1]; + auto & A_underlying = A.getUnderlyingView<1>(); + auto & B_underlying = B.getUnderlyingView<1>(); + auto & this_underlying = this->getUnderlyingView<1>(); + if ((argA != -1) && (argB != -1)) + { +#ifdef INTREPID2_HAVE_DEBUG + INTREPID2_TEST_FOR_EXCEPTION(argA != argThis, std::logic_error, "Unexpected 1D arg combination."); + INTREPID2_TEST_FOR_EXCEPTION(argB != argThis, std::logic_error, "Unexpected 1D arg combination."); +#endif + switch (argThis) + { + case 0: storeInPlaceCombination(policy, this_underlying, A_underlying, B_underlying, binaryOperator, arg0, arg0, arg0); break; + case 1: storeInPlaceCombination(policy, this_underlying, A_underlying, B_underlying, binaryOperator, arg1, arg1, arg1); break; + case 2: storeInPlaceCombination(policy, this_underlying, A_underlying, B_underlying, binaryOperator, arg2, arg2, arg2); break; + case 3: storeInPlaceCombination(policy, this_underlying, A_underlying, B_underlying, binaryOperator, arg3, arg3, arg3); break; + case 4: storeInPlaceCombination(policy, this_underlying, A_underlying, B_underlying, binaryOperator, arg4, arg4, arg4); break; + case 5: storeInPlaceCombination(policy, this_underlying, A_underlying, B_underlying, binaryOperator, arg5, arg5, arg5); break; + 
default: INTREPID2_TEST_FOR_EXCEPTION(true, std::invalid_argument, "Invalid/unexpected arg index"); + } + } + else if (argA != -1) + { + // B is not full-extent in dimension argThis; use the Data object + switch (argThis) + { + case 0: storeInPlaceCombination(policy, this_underlying, A_underlying, B, binaryOperator, arg0, arg0, fullArgsConst); break; + case 1: storeInPlaceCombination(policy, this_underlying, A_underlying, B, binaryOperator, arg1, arg1, fullArgsConst); break; + case 2: storeInPlaceCombination(policy, this_underlying, A_underlying, B, binaryOperator, arg2, arg2, fullArgsConst); break; + case 3: storeInPlaceCombination(policy, this_underlying, A_underlying, B, binaryOperator, arg3, arg3, fullArgsConst); break; + case 4: storeInPlaceCombination(policy, this_underlying, A_underlying, B, binaryOperator, arg4, arg4, fullArgsConst); break; + case 5: storeInPlaceCombination(policy, this_underlying, A_underlying, B, binaryOperator, arg5, arg5, fullArgsConst); break; + default: INTREPID2_TEST_FOR_EXCEPTION(true, std::invalid_argument, "Invalid/unexpected arg index"); + } + } + else + { + // A is not full-extent in dimension argThis; use the Data object + switch (argThis) + { + case 0: storeInPlaceCombination(policy, this_underlying, A, B_underlying, binaryOperator, arg0, fullArgsConst, arg0); break; + case 1: storeInPlaceCombination(policy, this_underlying, A, B_underlying, binaryOperator, arg1, fullArgsConst, arg1); break; + case 2: storeInPlaceCombination(policy, this_underlying, A, B_underlying, binaryOperator, arg2, fullArgsConst, arg2); break; + case 3: storeInPlaceCombination(policy, this_underlying, A, B_underlying, binaryOperator, arg3, fullArgsConst, arg3); break; + case 4: storeInPlaceCombination(policy, this_underlying, A, B_underlying, binaryOperator, arg4, fullArgsConst, arg4); break; + case 5: storeInPlaceCombination(policy, this_underlying, A, B_underlying, binaryOperator, arg5, fullArgsConst, arg5); break; + default: 
INTREPID2_TEST_FOR_EXCEPTION(true, std::invalid_argument, "Invalid/unexpected arg index"); + } + } + } + else if (this_full) + { + // This case uses A,B Data objects; could be optimized by dividing into subcases and using underlying Views with appropriate ArgExtractors. + auto & this_underlying = this->getUnderlyingView(); + auto thisAE = fullArgs; - if ((i > blockPlusDiagonalLastNonDiagonal_) || (j > blockPlusDiagonalLastNonDiagonal_)) + if (A_full) { - if (i != j) + auto & A_underlying = A.getUnderlyingView(); + auto AAE = fullArgs; + + if (B_1D && (get1DArgIndex(B) != -1)) { - // off diagonal: zero - return zeroView_(0); // NOTE: this branches in an argument-dependent way; this is not great for CUDA performance. When using BLOCK_PLUS_DIAGONAL, should generally avoid calls to this getEntry() method. (Use methods that directly take advantage of the data packing instead.) + const int argIndex = get1DArgIndex(B); + auto & B_underlying = B.getUnderlyingView<1>(); + switch (argIndex) + { + case 0: storeInPlaceCombination(policy, this_underlying, A_underlying, B_underlying, binaryOperator, thisAE, AAE, arg0); break; + case 1: storeInPlaceCombination(policy, this_underlying, A_underlying, B_underlying, binaryOperator, thisAE, AAE, arg1); break; + case 2: storeInPlaceCombination(policy, this_underlying, A_underlying, B_underlying, binaryOperator, thisAE, AAE, arg2); break; + case 3: storeInPlaceCombination(policy, this_underlying, A_underlying, B_underlying, binaryOperator, thisAE, AAE, arg3); break; + case 4: storeInPlaceCombination(policy, this_underlying, A_underlying, B_underlying, binaryOperator, thisAE, AAE, arg4); break; + case 5: storeInPlaceCombination(policy, this_underlying, A_underlying, B_underlying, binaryOperator, thisAE, AAE, arg5); break; + default: INTREPID2_TEST_FOR_EXCEPTION(true, std::invalid_argument, "Invalid/unexpected arg index"); + } } else { - refEntry[d] = blockPlusDiagonalDiagonalEntryIndex(blockPlusDiagonalLastNonDiagonal_, 
numNondiagonalEntries, i); + // A is full; B is not full, but not constant or full-extent 1D + // unoptimized in B access: + auto BAE = fullArgsConst; + storeInPlaceCombination(policy, this_underlying, A_underlying, B, binaryOperator, thisAE, AAE, BAE); } } - else + else // A is not full { - refEntry[d] = blockPlusDiagonalBlockEntryIndex(blockPlusDiagonalLastNonDiagonal_, numNondiagonalEntries, i, j); + if (A_1D && (get1DArgIndex(A) != -1)) + { + const int argIndex = get1DArgIndex(A); + auto & A_underlying = A.getUnderlyingView<1>(); + if (B_full) + { + auto & B_underlying = B.getUnderlyingView(); + auto BAE = fullArgs; + switch (argIndex) + { + case 0: storeInPlaceCombination(policy, this_underlying, A_underlying, B_underlying, binaryOperator, thisAE, arg0, BAE); break; + case 1: storeInPlaceCombination(policy, this_underlying, A_underlying, B_underlying, binaryOperator, thisAE, arg1, BAE); break; + case 2: storeInPlaceCombination(policy, this_underlying, A_underlying, B_underlying, binaryOperator, thisAE, arg2, BAE); break; + case 3: storeInPlaceCombination(policy, this_underlying, A_underlying, B_underlying, binaryOperator, thisAE, arg3, BAE); break; + case 4: storeInPlaceCombination(policy, this_underlying, A_underlying, B_underlying, binaryOperator, thisAE, arg4, BAE); break; + case 5: storeInPlaceCombination(policy, this_underlying, A_underlying, B_underlying, binaryOperator, thisAE, arg5, BAE); break; + default: INTREPID2_TEST_FOR_EXCEPTION(true, std::invalid_argument, "Invalid/unexpected arg index"); + } + } + else + { + auto BAE = fullArgsConst; + switch (argIndex) + { + case 0: storeInPlaceCombination(policy, this_underlying, A_underlying, B, binaryOperator, thisAE, arg0, BAE); break; + case 1: storeInPlaceCombination(policy, this_underlying, A_underlying, B, binaryOperator, thisAE, arg1, BAE); break; + case 2: storeInPlaceCombination(policy, this_underlying, A_underlying, B, binaryOperator, thisAE, arg2, BAE); break; + case 3: 
storeInPlaceCombination(policy, this_underlying, A_underlying, B, binaryOperator, thisAE, arg3, BAE); break; + case 4: storeInPlaceCombination(policy, this_underlying, A_underlying, B, binaryOperator, thisAE, arg4, BAE); break; + case 5: storeInPlaceCombination(policy, this_underlying, A_underlying, B, binaryOperator, thisAE, arg5, BAE); break; + default: INTREPID2_TEST_FOR_EXCEPTION(true, std::invalid_argument, "Invalid/unexpected arg index"); + } + } + } + else // A not full, and not full-extent 1D + { + // unoptimized in A, B accesses. + auto AAE = fullArgsConst; + auto BAE = fullArgsConst; + storeInPlaceCombination(policy, this_underlying, A, B, binaryOperator, thisAE, AAE, BAE); + } } + } + else + { + // completely un-optimized case: we use Data objects for this, A, B. + auto thisAE = fullArgsWritable; + auto AAE = fullArgsConst; + auto BAE = fullArgsConst; + storeInPlaceCombination(policy, thisData, A, B, binaryOperator, thisAE, AAE, BAE); + } + } + } + + //! storeInPlaceCombination with compile-time rank -- implementation for rank of 7. (Not optimized; expectation is this case will be rarely used.) + template + enable_if_t + storeInPlaceCombination(const Data &A, const Data &B, BinaryOperator binaryOperator) + { + auto policy = dataExtentRangePolicy(); + + using DataType = Data; + using ThisAE = FullArgExtractorWritableData; + using AAE = FullArgExtractor; + using BAE = FullArgExtractor; + + const ordinal_type dim6 = getDataExtent(6); + const bool includeInnerLoop = true; + using Functor = InPlaceCombinationFunctor; + Functor functor(*this, A, B, binaryOperator, dim6); + Kokkos::parallel_for("compute in-place", policy, functor); + } + public: + //! 
applies the specified unary operator to each entry + template + void applyOperator(UnaryOperator unaryOperator) + { + using ExecutionSpace = typename DeviceType::execution_space; + + switch (dataRank_) + { + case 1: + { + const int dataRank = 1; + auto view = getUnderlyingView(); + + const int dataExtent = this->getDataExtent(0); + Kokkos::RangePolicy policy(ExecutionSpace(),0,dataExtent); + Kokkos::parallel_for("apply operator in-place", policy, + KOKKOS_LAMBDA (const int &i0) { + view(i0) = unaryOperator(view(i0)); + }); - // skip next d (this is required also to be BLOCK_PLUS_DIAGONAL, and we've consumed its arg as j above) - refEntry[d+1] = 0; - d++; } - else if (variationType_[d] == CONSTANT) + break; + case 2: { - refEntry[d] = 0; + const int dataRank = 2; + auto policy = dataExtentRangePolicy(); + auto view = getUnderlyingView(); + + Kokkos::parallel_for("apply operator in-place", policy, + KOKKOS_LAMBDA (const int &i0, const int &i1) { + view(i0,i1) = unaryOperator(view(i0,i1)); + }); } + break; + case 3: + { + const int dataRank = 3; + auto policy = dataExtentRangePolicy(); + auto view = getUnderlyingView(); + + Kokkos::parallel_for("apply operator in-place", policy, + KOKKOS_LAMBDA (const int &i0, const int &i1, const int &i2) { + view(i0,i1,i2) = unaryOperator(view(i0,i1,i2)); + }); + } + break; + case 4: + { + const int dataRank = 4; + auto policy = dataExtentRangePolicy(); + auto view = getUnderlyingView(); + + Kokkos::parallel_for("apply operator in-place", policy, + KOKKOS_LAMBDA (const int &i0, const int &i1, const int &i2, const int &i3) { + view(i0,i1,i2,i3) = unaryOperator(view(i0,i1,i2,i3)); + }); + } + break; + case 5: + { + const int dataRank = 5; + auto policy = dataExtentRangePolicy(); + auto view = getUnderlyingView(); + + Kokkos::parallel_for("apply operator in-place", policy, + KOKKOS_LAMBDA (const int &i0, const int &i1, const int &i2, const int &i3, const int &i4) { + view(i0,i1,i2,i3,i4) = unaryOperator(view(i0,i1,i2,i3,i4)); + }); + } 
+ break; + case 6: + { + const int dataRank = 6; + auto policy = dataExtentRangePolicy(); + auto view = getUnderlyingView(); + + Kokkos::parallel_for("apply operator in-place", policy, + KOKKOS_LAMBDA (const int &i0, const int &i1, const int &i2, const int &i3, const int &i4, const int &i5) { + view(i0,i1,i2,i3,i4,i5) = unaryOperator(view(i0,i1,i2,i3,i4,i5)); + }); + } + break; + case 7: + { + const int dataRank = 7; + auto policy6 = dataExtentRangePolicy<6>(); + auto view = getUnderlyingView(); + + const int dim_i6 = view.extent_int(6); + + Kokkos::parallel_for("apply operator in-place", policy6, + KOKKOS_LAMBDA (const int &i0, const int &i1, const int &i2, const int &i3, const int &i4, const int &i5) { + for (int i6=0; i6 + KOKKOS_INLINE_FUNCTION + reference_type getWritableEntry(const IntArgs... intArgs) const + { +#ifdef INTREPID2_HAVE_DEBUG + INTREPID2_TEST_FOR_EXCEPTION_DEVICE_SAFE(numArgs != rank_, std::invalid_argument, "getWritableEntry() should have the same number of arguments as the logical rank."); +#endif + constexpr int numArgs = sizeof...(intArgs); + if (underlyingMatchesLogical_) + { + // in this case, we require that numArgs == dataRank_ + return getUnderlyingView()(intArgs...); + } + + // extract the type of the first argument; use that for the arrays below + using int_type = std::tuple_element_t<0, std::tuple>; + + const Kokkos::Array args {intArgs...}; + Kokkos::Array refEntry; + for (int d=0; d= numArgs) + { + INTREPID2_TEST_FOR_EXCEPTION_DEVICE_SAFE(true, std::invalid_argument, "BLOCK_PLUS_DIAGONAL must be present for two dimensions; here, encountered only one"); + } + else + { + const int_type &j = args[d+1]; + + if ((i > static_cast(blockPlusDiagonalLastNonDiagonal_)) || (j > static_cast(blockPlusDiagonalLastNonDiagonal_))) + { + if (i != j) + { + // off diagonal: zero + return zeroView_(0); // NOTE: this branches in an argument-dependent way; this is not great for CUDA performance. 
When using BLOCK_PLUS_DIAGONAL, should generally avoid calls to this getEntry() method. (Use methods that directly take advantage of the data packing instead.) + } + else + { + refEntry[d] = blockPlusDiagonalDiagonalEntryIndex(blockPlusDiagonalLastNonDiagonal_, numNondiagonalEntries, i); + } + } + else + { + refEntry[d] = blockPlusDiagonalBlockEntryIndex(blockPlusDiagonalLastNonDiagonal_, numNondiagonalEntries, i, j); + } + + // skip next d (this is required also to be BLOCK_PLUS_DIAGONAL, and we've consumed its arg as j above) + refEntry[d+1] = 0; + } + d++; + } + } + } + // refEntry should be zero-filled beyond numArgs, for cases when rank_ < dataRank_ (this only is allowed if the extra dimensions each has extent 1). + for (int d=numArgs; d<7; d++) + { + refEntry[d] = 0; } if (dataRank_ == 1) @@ -489,6 +1023,57 @@ namespace Intrepid2 { } } + //! Constructor in terms of DimensionInfo for each nominal dimension; does not require a View to be specified. Will allocate a View of appropriate rank, zero-filled. + Data(std::vector dimInfoVector) + : + // initialize member variables as if default constructor; if dimInfoVector is empty, we want default constructor behavior. + dataRank_(0), extents_({0,0,0,0,0,0,0}), variationType_({CONSTANT,CONSTANT,CONSTANT,CONSTANT,CONSTANT,CONSTANT,CONSTANT}), blockPlusDiagonalLastNonDiagonal_(-1), rank_(dimInfoVector.size()) + { + // If dimInfoVector is empty, the member initialization above is correct; otherwise, we set as below. + // Either way, once members are initialized, we must call setActiveDims(). 
+ if (dimInfoVector.size() != 0) + { + std::vector dataExtents; + + bool blockPlusDiagonalEncountered = true; + for (int d=0; d("Intrepid2 Data", dataExtents[0]); break; + case 2: data2_ = Kokkos::View("Intrepid2 Data", dataExtents[0], dataExtents[1]); break; + case 3: data3_ = Kokkos::View("Intrepid2 Data", dataExtents[0], dataExtents[1], dataExtents[2]); break; + case 4: data4_ = Kokkos::View("Intrepid2 Data", dataExtents[0], dataExtents[1], dataExtents[2], dataExtents[3]); break; + case 5: data5_ = Kokkos::View("Intrepid2 Data", dataExtents[0], dataExtents[1], dataExtents[2], dataExtents[3], dataExtents[4]); break; + case 6: data6_ = Kokkos::View("Intrepid2 Data", dataExtents[0], dataExtents[1], dataExtents[2], dataExtents[3], dataExtents[4], dataExtents[5]); break; + case 7: data7_ = Kokkos::View("Intrepid2 Data", dataExtents[0], dataExtents[1], dataExtents[2], dataExtents[3], dataExtents[4], dataExtents[5], dataExtents[6]); break; + default: INTREPID2_TEST_FOR_EXCEPTION(true, std::invalid_argument, "Invalid data rank"); + } + } + setActiveDims(); + } + //! DynRankView constructor. Will copy to a View of appropriate rank. Data(const ScalarView &data, int rank, Kokkos::Array extents, Kokkos::Array variationType, const int blockPlusDiagonalLastNonDiagonal = -1) : @@ -779,14 +1364,14 @@ namespace Intrepid2 { if (dimInfo.variationType == BLOCK_PLUS_DIAGONAL) { - dimInfo.blockPlusDiagonalFirstNonDiagonal = blockPlusDiagonalLastNonDiagonal_; + dimInfo.blockPlusDiagonalLastNonDiagonal = blockPlusDiagonalLastNonDiagonal_; } return dimInfo; } //! Returns (DataVariationType, data extent) in the specified dimension for a Data container that combines (through multiplication, say, or addition) this container with otherData. 
KOKKOS_INLINE_FUNCTION - DimensionInfo combinedDimensionInfo(const Data &otherData, const int &dim) const + DimensionInfo combinedDataDimensionInfo(const Data &otherData, const int &dim) const { const DimensionInfo myDimInfo = getDimensionInfo(dim); const DimensionInfo otherDimInfo = otherData.getDimensionInfo(dim); @@ -999,7 +1584,7 @@ namespace Intrepid2 { return dataRank_; } - //! returns the rank of the View that stores the unique data + //! returns the number of entries in the View that stores the unique data KOKKOS_INLINE_FUNCTION ordinal_type getUnderlyingViewSize() const { @@ -1077,7 +1662,7 @@ namespace Intrepid2 { } } - //! returns the true extent of the data corresponding to the notional dimension provided; if the data does not vary in that dimension, returns 1 + //! returns the true extent of the data corresponding to the logical dimension provided; if the data does not vary in that dimension, returns 1 KOKKOS_INLINE_FUNCTION int getDataExtent(const ordinal_type &d) const { for (unsigned i=0; i & getVariationTypes() const { return variationType_; } - //! Returns a value corresponding to the specified notional data location. 
- template - KOKKOS_INLINE_FUNCTION typename std::enable_if< - (std::is_integral::value && std::is_integral::value && - std::is_integral::value && std::is_integral::value && - std::is_integral::value && std::is_integral::value && - std::is_integral::value), - return_type>::type - getEntry(const iType0& i0, const iType1& i1, const iType2& i2, - const iType3& i3, const iType4& i4, const iType5& i5, - const iType6& i6) const - { - const Kokkos::Array args {static_cast(i0),static_cast(i1),static_cast(i2), - static_cast(i3),static_cast(i4),static_cast(i5), - static_cast(i6)}; - Kokkos::Array refEntry; - - for (int d=0; d<7; d++) - { - if (variationType_[d] == GENERAL) - { - refEntry[d] = args[d]; - } - else if (variationType_[d] == MODULAR) - { - refEntry[d] = args[d] % variationModulus_[d]; - } - else if (variationType_[d] == BLOCK_PLUS_DIAGONAL) - { - const int numNondiagonalEntries = blockPlusDiagonalNumNondiagonalEntries(blockPlusDiagonalLastNonDiagonal_); - - const int &i = args[d]; - const int &j = args[d+1]; - - if ((i > blockPlusDiagonalLastNonDiagonal_) || (j > blockPlusDiagonalLastNonDiagonal_)) - { - if (i != j) - { - // off diagonal: zero - return zeroView_(0); // NOTE: this branches in an argument-dependent way; this is not great for CUDA performance. When using BLOCK_PLUS_DIAGONAL, should generally avoid calls to this getEntry() method. (Use methods that directly take advantage of the data packing instead.) 
- } - else - { - refEntry[d] = blockPlusDiagonalDiagonalEntryIndex(blockPlusDiagonalLastNonDiagonal_, numNondiagonalEntries, i); - } - } - else - { - refEntry[d] = blockPlusDiagonalBlockEntryIndex(blockPlusDiagonalLastNonDiagonal_, numNondiagonalEntries, i, j); - } - - // skip next d (this is required also to be BLOCK_PLUS_DIAGONAL, and we've consumed its arg as j above) - refEntry[d+1] = 0; - d++; - } - else if (variationType_[d] == CONSTANT) - { - refEntry[d] = 0; - } - } - - if (dataRank_ == 1) - { - return data1_(refEntry[activeDims_[0]]); - } - else if (dataRank_ == 2) - { - return data2_(refEntry[activeDims_[0]],refEntry[activeDims_[1]]); - } - else if (dataRank_ == 3) - { - return data3_(refEntry[activeDims_[0]],refEntry[activeDims_[1]],refEntry[activeDims_[2]]); - } - else if (dataRank_ == 4) - { - return data4_(refEntry[activeDims_[0]],refEntry[activeDims_[1]],refEntry[activeDims_[2]],refEntry[activeDims_[3]]); - } - else if (dataRank_ == 5) - { - return data5_(refEntry[activeDims_[0]],refEntry[activeDims_[1]],refEntry[activeDims_[2]],refEntry[activeDims_[3]], - refEntry[activeDims_[4]]); - } - else if (dataRank_ == 6) - { - return data6_(refEntry[activeDims_[0]],refEntry[activeDims_[1]],refEntry[activeDims_[2]],refEntry[activeDims_[3]], - refEntry[activeDims_[4]],refEntry[activeDims_[5]]); - } - else // dataRank_ == 7 - { - return data7_(refEntry[activeDims_[0]],refEntry[activeDims_[1]],refEntry[activeDims_[2]],refEntry[activeDims_[3]], - refEntry[activeDims_[4]],refEntry[activeDims_[5]],refEntry[activeDims_[6]]); - } - } - - //! Returns a value corresponding to the specified notional data location. - template - KOKKOS_INLINE_FUNCTION typename std::enable_if< - (std::is_integral::value), - return_type>::type - operator()(const iType& i0) const { - if (underlyingMatchesNotional_) - { - return data1_(i0); - } - return getEntry(i0,0,0,0,0,0,0); - } - - //! Returns a value corresponding to the specified notional data location. 
- template - KOKKOS_INLINE_FUNCTION typename std::enable_if< - (std::is_integral::value && std::is_integral::value), - return_type>::type - operator()(const iType0& i0, const iType1& i1) const { - if (underlyingMatchesNotional_) - { - return data2_(i0,i1); - } - return getEntry(i0,i1,0,0,0,0,0); - } - - //! Returns a value corresponding to the specified notional data location. - template - KOKKOS_INLINE_FUNCTION typename std::enable_if< - (std::is_integral::value && std::is_integral::value && - std::is_integral::value), - return_type>::type - operator()(const iType0& i0, const iType1& i1, const iType2& i2) const { - if (underlyingMatchesNotional_) - { - return data3_(i0,i1,i2); - } - return getEntry(i0,i1,i2,0,0,0,0); - } - - //! Returns a value corresponding to the specified notional data location. - template - KOKKOS_INLINE_FUNCTION typename std::enable_if< - (std::is_integral::value && std::is_integral::value && - std::is_integral::value && std::is_integral::value), - return_type>::type - operator()(const iType0& i0, const iType1& i1, const iType2& i2, - const iType3& i3) const { - if (underlyingMatchesNotional_) - { - return data4_(i0,i1,i2,i3); - } - return getEntry(i0,i1,i2,i3,0,0,0); + //! Returns a (read-only) value corresponding to the specified logical data location. + template + KOKKOS_INLINE_FUNCTION + return_type getEntry(const IntArgs&... intArgs) const + { + return getWritableEntry(intArgs...); } - //! Returns a value corresponding to the specified notional data location. 
- template - KOKKOS_INLINE_FUNCTION typename std::enable_if< - (std::is_integral::value && std::is_integral::value && - std::is_integral::value && std::is_integral::value && - std::is_integral::value), - return_type>::type - operator()(const iType0& i0, const iType1& i1, const iType2& i2, - const iType3& i3, const iType4& i4) const { - if (underlyingMatchesNotional_) - { - return data5_(i0,i1,i2,i3,i4); - } - return getEntry(i0,i1,i2,i3,i4,0,0); - } + template struct bool_pack; + + template + using all_true = std::is_same, bool_pack>; - //! Returns a value corresponding to the specified notional data location. - template - KOKKOS_INLINE_FUNCTION typename std::enable_if< - (std::is_integral::value && std::is_integral::value && - std::is_integral::value && std::is_integral::value && - std::is_integral::value && std::is_integral::value), - return_type>::type - operator()(const iType0& i0, const iType1& i1, const iType2& i2, - const iType3& i3, const iType4& i4, const iType5& i5) const { - if (underlyingMatchesNotional_) - { - return data6_(i0,i1,i2,i3,i4,i5); - } - return getEntry(i0,i1,i2,i3,i4,i5,0); - } + template + using valid_args = all_true{}...>; - //! Returns a value corresponding to the specified notional data location. - template - KOKKOS_INLINE_FUNCTION typename std::enable_if< - (std::is_integral::value && std::is_integral::value && - std::is_integral::value && std::is_integral::value && - std::is_integral::value && std::is_integral::value && - std::is_integral::value), - return_type>::type - operator()(const iType0& i0, const iType1& i1, const iType2& i2, - const iType3& i3, const iType4& i4, const iType5& i5, - const iType6& i6) const { - if (underlyingMatchesNotional_) - { - return data7_(i0,i1,i2,i3,i4,i5,i6); - } - return getEntry(i0,i1,i2,i3,i4,i5,i6); + static_assert(valid_args::value, "valid args works"); + + //! Returns a value corresponding to the specified logical data location. 
+ template + KOKKOS_INLINE_FUNCTION +#ifndef __INTEL_COMPILER + // icc has a bug that prevents compilation with this enable_if_t + // (possibly the same as https://community.intel.com/t5/Intel-C-Compiler/Intel-Compiler-bug-while-deducing-template-arguments-inside/m-p/1164358) + // so with icc we'll just skip the argument type/count check + enable_if_t::value && (sizeof...(IntArgs) <= 7),return_type> +#else + return_type +#endif + operator()(const IntArgs&... intArgs) const { + return getEntry(intArgs...); } - - //! Returns the notional extent in the specified dimension. + + //! Returns the logical extent in the specified dimension. KOKKOS_INLINE_FUNCTION int extent_int(const int& r) const { @@ -1352,6 +1767,25 @@ namespace Intrepid2 { return false; // statement should be unreachable; included because compilers don't necessarily recognize that fact... } + //! Constructs a container suitable for storing the result of an in-place combination of the two provided data containers. The two containers must have the same nominal shape. + //! \see storeInPlaceCombination() + //! \param A [in] - the first data container. + //! \param B [in] - the second data container. Must have the same nominal shape as A. + //! \return A container with the same nominal shape as A and B, with underlying View storage sufficient to store the result of A + B (or any other in-place combination). + static Data allocateInPlaceCombinationResult( const Data &A, const Data &B ) + { + INTREPID2_TEST_FOR_EXCEPTION_DEVICE_SAFE(A.rank() != B.rank(), std::invalid_argument, "A and B must have the same nominal shape"); + const int rank = A.rank(); + std::vector dimInfo(rank); + for (int d=0; d result(dimInfo); + return result; + } + //! Constructs a container suitable for storing the result of a matrix-vector multiply corresponding to the two provided containers. //! \see storeMatMat() //! \param A_MatData [in] - nominally (...,D1,D2)-dimensioned container, where D1,D2 correspond to matrix dimensions. 
@@ -1664,6 +2098,144 @@ namespace Intrepid2 { return Data(data,resultRank,resultExtents,resultVariationTypes); } + //! returns an MDRangePolicy over the underlying data extents (but with the logical shape). + template + enable_if_t<(rank!=1) && (rank!=7), Kokkos::MDRangePolicy> > + dataExtentRangePolicy() + { + using ExecutionSpace = typename DeviceType::execution_space; + Kokkos::Array startingOrdinals; + Kokkos::Array extents; + + for (int d=0; d>(startingOrdinals,extents); + return policy; + } + + //! returns an MDRangePolicy over the first six underlying data extents (but with the logical shape). + template + enable_if_t> > + dataExtentRangePolicy() + { + using ExecutionSpace = typename DeviceType::execution_space; + Kokkos::Array startingOrdinals; + Kokkos::Array extents; + + for (int d=0; d<6; d++) + { + startingOrdinals[d] = 0; + extents[d] = getDataExtent(d); + } + auto policy = Kokkos::MDRangePolicy>(startingOrdinals,extents); + return policy; + } + + template + inline + enable_if_t > + dataExtentRangePolicy() + { + using ExecutionSpace = typename DeviceType::execution_space; + Kokkos::RangePolicy policy(ExecutionSpace(),0,getDataExtent(0)); + return policy; + } + + //! Places the result of an in-place combination (e.g., entrywise sum) into this data container. + template + void storeInPlaceCombination(const Data &A, const Data &B, BinaryOperator binaryOperator) + { + using ExecutionSpace = typename DeviceType::execution_space; + +#ifdef INTREPID2_HAVE_DEBUG + // check nominal extents + for (int d=0; dextent_int(d), std::invalid_argument, "A, B, and this must agree on all nominal extents"); + INTREPID2_TEST_FOR_EXCEPTION(B.extent_int(d) != this->extent_int(d), std::invalid_argument, "A, B, and this must agree on all nominal extents"); + } + // TODO: add some checks that data extent of this suffices to accept combined A + B data. 
+#endif + + const bool this_constant = (this->getUnderlyingViewRank() == 1) && (this->getUnderlyingViewSize() == 1); + + // we special-case for constant output here; since the constant case is essentially all overhead, we want to avoid as much of the overhead of storeInPlaceCombination() as possible… + if (this_constant) + { + // constant data + Kokkos::RangePolicy policy(ExecutionSpace(),0,1); // just 1 entry + auto this_underlying = this->getUnderlyingView<1>(); + auto A_underlying = A.getUnderlyingView<1>(); + auto B_underlying = B.getUnderlyingView<1>(); + Kokkos::parallel_for("compute in-place", policy, + KOKKOS_LAMBDA (const int &i0) { + auto & result = this_underlying(0); + const auto & A_val = A_underlying(0); + const auto & B_val = B_underlying(0); + + result = binaryOperator(A_val,B_val); + }); + } + else + { + switch (rank_) + { + case 1: storeInPlaceCombination(A, B, binaryOperator); break; + case 2: storeInPlaceCombination(A, B, binaryOperator); break; + case 3: storeInPlaceCombination(A, B, binaryOperator); break; + case 4: storeInPlaceCombination(A, B, binaryOperator); break; + case 5: storeInPlaceCombination(A, B, binaryOperator); break; + case 6: storeInPlaceCombination(A, B, binaryOperator); break; + case 7: storeInPlaceCombination(A, B, binaryOperator); break; + default: + INTREPID2_TEST_FOR_EXCEPTION_DEVICE_SAFE(true, std::logic_error, "unhandled rank in switch"); + } + } + } + + //! stores the in-place (entrywise) sum, A .+ B, into this container. + void storeInPlaceSum(const Data &A, const Data &B) + { + auto sum = KOKKOS_LAMBDA(const DataScalar &a, const DataScalar &b) -> DataScalar + { + return a + b; + }; + storeInPlaceCombination(A, B, sum); + } + + //! stores the in-place (entrywise) product, A .* B, into this container. 
+ void storeInPlaceProduct(const Data &A, const Data &B) + { + auto product = KOKKOS_LAMBDA(const DataScalar &a, const DataScalar &b) -> DataScalar + { + return a * b; + }; + storeInPlaceCombination(A, B, product); + } + + //! stores the in-place (entrywise) difference, A .- B, into this container. + void storeInPlaceDifference(const Data &A, const Data &B) + { + auto difference = KOKKOS_LAMBDA(const DataScalar &a, const DataScalar &b) -> DataScalar + { + return a - b; + }; + storeInPlaceCombination(A, B, difference); + } + + //! stores the in-place (entrywise) quotient, A ./ B, into this container. + void storeInPlaceQuotient(const Data &A, const Data &B) + { + auto quotient = KOKKOS_LAMBDA(const DataScalar &a, const DataScalar &b) -> DataScalar + { + return a / b; + }; + storeInPlaceCombination(A, B, quotient); + } + //! Places the result of a matrix-vector multiply corresponding to the two provided containers into this Data container. This Data container should have been constructed by a call to allocateMatVecResult(), or should match such a container in underlying data extent and variation types. 
void storeMatVec( const Data &matData, const Data &vecData ) { @@ -1684,7 +2256,7 @@ namespace Intrepid2 { auto policy = Kokkos::MDRangePolicy>({0,0,0},{getDataExtent(0),getDataExtent(1),matRows}); Kokkos::parallel_for("compute mat-vec", policy, KOKKOS_LAMBDA (const int &cellOrdinal, const int &pointOrdinal, const int &i) { - auto & val_i = thisData.getWritableEntry(cellOrdinal, pointOrdinal, i, 0, 0, 0, 0); + auto & val_i = thisData.getWritableEntry(cellOrdinal, pointOrdinal, i); val_i = 0; for (int j=0; j>({0,0},{getDataExtent(0),matRows}); Kokkos::parallel_for("compute mat-vec", policy, KOKKOS_LAMBDA (const int &vectorOrdinal, const int &i) { - auto & val_i = thisData.getWritableEntry(vectorOrdinal, i, 0, 0, 0, 0, 0); + auto & val_i = thisData.getWritableEntry(vectorOrdinal, i); val_i = 0; for (int j=0; j policy(0,matRows); Kokkos::parallel_for("compute mat-vec", policy, KOKKOS_LAMBDA (const int &i) { - auto & val_i = thisData.getWritableEntry(i, 0, 0, 0, 0, 0, 0); + auto & val_i = thisData.getWritableEntry(i); val_i = 0; for (int j=0; j >({0,0},{getDataExtent(0),getDataExtent(1)}); - if (underlyingMatchesNotional_) // receiving data object is completely expanded + if (underlyingMatchesLogical_) // receiving data object is completely expanded { Kokkos::parallel_for("compute mat-mat", policy, KOKKOS_LAMBDA (const int &cellOrdinal, const int &pointOrdinal) { @@ -1826,7 +2398,7 @@ namespace Intrepid2 { { for (int j=0; j 0; } - //! Returns the notional rank of the Data container. + //! Returns the logical rank of the Data container. KOKKOS_INLINE_FUNCTION unsigned rank() const { return rank_; } - /** \brief sets the notional extent in the specified dimension. If needed, the underlying data container is resized. - \param [in] d - the notional dimension in which the extent is to be changed + /** \brief sets the logical extent in the specified dimension. If needed, the underlying data container is resized. 
+ \param [in] d - the logical dimension in which the extent is to be changed \param [in] newExtent - the new extent \note Not supported for dimensions in which the variation type is BLOCK_PLUS_DIAGONAL. \note If the variation type is MODULAR, the existing modulus must evenly divide the new extent; the underlying data structure will not be resized in this case. @@ -1923,11 +2495,11 @@ namespace Intrepid2 { extents_[d] = newExtent; } - //! Returns true if the underlying container has exactly the same rank and extents as the notional container. + //! Returns true if the underlying container has exactly the same rank and extents as the logical container. KOKKOS_INLINE_FUNCTION - bool underlyingMatchesNotional() const + bool underlyingMatchesLogical() const { - return underlyingMatchesNotional_; + return underlyingMatchesLogical_; } }; } diff --git a/packages/intrepid2/src/Shared/Intrepid2_PointToolsDef.hpp b/packages/intrepid2/src/Shared/Intrepid2_PointToolsDef.hpp index 1065bd240e09..86ff6541142e 100644 --- a/packages/intrepid2/src/Shared/Intrepid2_PointToolsDef.hpp +++ b/packages/intrepid2/src/Shared/Intrepid2_PointToolsDef.hpp @@ -48,6 +48,13 @@ #ifndef __INTREPID2_POINTTOOLS_DEF_HPP__ #define __INTREPID2_POINTTOOLS_DEF_HPP__ +#if defined(_MSC_VER) || defined(_WIN32) && defined(__ICL) +// M_PI, M_SQRT2, etc. are hidden in MSVC by #ifdef _USE_MATH_DEFINES + #ifndef _USE_MATH_DEFINES + #define _USE_MATH_DEFINES + #endif + #include +#endif namespace Intrepid2 { diff --git a/packages/intrepid2/src/Shared/Intrepid2_PolylibDef.hpp b/packages/intrepid2/src/Shared/Intrepid2_PolylibDef.hpp index 5c5ee9b1894e..b58d37c02316 100644 --- a/packages/intrepid2/src/Shared/Intrepid2_PolylibDef.hpp +++ b/packages/intrepid2/src/Shared/Intrepid2_PolylibDef.hpp @@ -95,6 +95,14 @@ #ifndef __INTREPID2_POLYLIB_DEF_HPP__ #define __INTREPID2_POLYLIB_DEF_HPP__ +#if defined(_MSC_VER) || defined(_WIN32) && defined(__ICL) +// M_PI, M_SQRT2, etc. 
are hidden in MSVC by #ifdef _USE_MATH_DEFINES + #ifndef _USE_MATH_DEFINES + #define _USE_MATH_DEFINES + #endif + #include +#endif + namespace Intrepid2 { // ----------------------------------------------------------------------- diff --git a/packages/intrepid2/src/Shared/Intrepid2_TensorArgumentIterator.hpp b/packages/intrepid2/src/Shared/Intrepid2_TensorArgumentIterator.hpp index c6754633e558..377dd2e649f5 100644 --- a/packages/intrepid2/src/Shared/Intrepid2_TensorArgumentIterator.hpp +++ b/packages/intrepid2/src/Shared/Intrepid2_TensorArgumentIterator.hpp @@ -91,6 +91,32 @@ namespace Intrepid2 } } + //! Basic constructor in which only the bounds of the tensor components are required. + TensorArgumentIterator(const std::vector tensorComponentBounds) + : + numTensorComponents_(tensorComponentBounds.size()) + { + for (ordinal_type r=0; r + KOKKOS_INLINE_FUNCTION + TensorArgumentIterator(const Kokkos::Array &tensorComponentBounds) + : + numTensorComponents_(rank) + { + for (ordinal_type r=0; r("A", cellCount, fieldCountA); + auto BView = getView("B", pointCount); + + auto ABView = getView("A+B", cellCount, fieldCountA, pointCount); + + Kokkos::Array extents {cellCount, fieldCount, pointCount}; + Kokkos::Array A_variation {GENERAL, MODULAR, CONSTANT}; + Kokkos::Array B_variation {CONSTANT, CONSTANT, GENERAL}; + + Data A(AView,extents,A_variation); + Data B(BView,extents,B_variation); + + // expected variation for A+B: + Kokkos::Array AB_variation {GENERAL, MODULAR, GENERAL}; + // expected Data object for A+B: + Data AB_expected(ABView,extents,AB_variation); + + auto AB_actual = Data::allocateInPlaceCombinationResult(A, B); + + TEST_EQUALITY(AB_actual.rank(), AB_expected.rank()); + for (int d=0; d actualCombinations {AB_dimInfoActual_LR, AB_dimInfoActual_RL}; + + for (const auto & dimInfoActual : actualCombinations) + { + TEST_EQUALITY(dimInfoActual.nominalExtent, AB_dimInfo.nominalExtent); + TEST_EQUALITY(dimInfoActual.dataExtent, AB_dimInfo.dataExtent); + 
TEST_EQUALITY(dimInfoActual.variationType, AB_dimInfo.variationType); + TEST_EQUALITY(dimInfoActual.variationModulus, AB_dimInfo.variationModulus); + } + } + // #pragma mark Data: EmptyDataMarkedAsInvalid /** \brief When Data containers are constructed without arguments, the isValid() method should return false. This test confirms that that is the case. */ @@ -91,7 +200,7 @@ namespace Kokkos::RangePolicy policy(0,1); // trivial policy: 1 entry Kokkos::parallel_for("set lastVal", policy, KOKKOS_LAMBDA (const int &i) { - auto & lastVal = data.getWritableEntry(numRows-1, numCols-1, 0, 0, 0, 0, 0); + auto & lastVal = data.getWritableEntry(numRows-1, numCols-1); lastVal = lastValueToSet; }); @@ -104,6 +213,90 @@ namespace testFloatingEquality2(expectedView, data, relTol, absTol, out, success); } +/** \brief Data has facilities for in-place combinations of logical data. Suppose you have two containers of nominal shape (C,P), one of which is constant across cells, the other of which is constant across points. To combine these (e.g., sum them together entrywise), you want a container that varies in both cells and points. The test below exercises the facility for allocation of the combined container. +*/ + + TEUCHOS_UNIT_TEST( Data, InPlaceSum ) + { + double relTol = 1e-13; + double absTol = 1e-13; + + // Use two Data objects A and B, each with nominal shape (5,9,15) -- (C,F,P), say. + // with A having variation types of GENERAL, MODULAR, and CONSTANT, + // and B having variation types of CONSTANT, CONSTANT, and GENERAL. + // Result should have variation types of GENERAL, MODULAR, GENERAL. 
+ using DeviceType = DefaultTestDeviceType; + using Scalar = double; + + const int rank = 3; + const int cellCount = 5; + const int fieldCount = 9; + const int pointCount = 15; + + auto formula_A = [] (int cellOrdinal, int fieldOrdinal, int pointOrdinal) -> double + { + // varies modulus 3 in fieldOrdinal; constant pointwise + return double(cellOrdinal) + double(fieldOrdinal % 3); + }; + + auto formula_B = [] (int cellOrdinal, int fieldOrdinal, int pointOrdinal) -> double + { + // constant in cell, field; varies pointwise + return double(pointOrdinal); + }; + + auto sum = [] (const Scalar &a, const Scalar &b) -> Scalar + { + return a + b; + }; + + const int fieldCountA = 3; // A is modular in field dimension, with variation mod 3. + auto AView = getView("A", cellCount, fieldCountA); + auto BView = getView("B", pointCount); + + auto ABView = getView("A+B", cellCount, fieldCountA, pointCount); + + auto AViewHost = Kokkos::create_mirror(AView); + auto BViewHost = Kokkos::create_mirror(BView); + auto ABViewHost = Kokkos::create_mirror(ABView); + for (int cellOrdinal=0; cellOrdinal extents {cellCount, fieldCount, pointCount}; + Kokkos::Array A_variation {GENERAL, MODULAR, CONSTANT}; + Kokkos::Array B_variation {CONSTANT, CONSTANT, GENERAL}; + + Data A(AView,extents,A_variation); + Data B(BView,extents,B_variation); + + // expected variation for A+B: + Kokkos::Array AB_variation {GENERAL, MODULAR, GENERAL}; + // expected Data object for A+B: + Data AB_expected(ABView,extents,AB_variation); + + auto AB_actual = Data::allocateInPlaceCombinationResult(A, B); + + AB_actual.storeInPlaceSum(A, B); + + // test AB_actual equals AB_expected. (This will iterate over the nominal extents.) + testFloatingEquality3(AB_actual, AB_expected, relTol, absTol, out, success); + } + // #pragma mark Data: MatVec /** \brief Data provides matrix-vector multiplication support. This method checks correctness of the computed mat-vec for a particular case involving a 2x2 matrix and a 2x1 vector. 
*/ @@ -209,7 +402,7 @@ namespace } // #pragma mark Data: MatMatExplicitIdentity_PDD -/** \brief Data provides matrix-matrix multiplication support. This method checks correctness of the computed mat-mat for several cases involving 3x3 identity matrices. Here, the notional dimensions (C,P,D,D) differ from the stored dimensions of (P,D,D). We test each possible transpose combination. +/** \brief Data provides matrix-matrix multiplication support. This method checks correctness of the computed mat-mat for several cases involving 3x3 identity matrices. Here, the logical dimensions (C,P,D,D) differ from the stored dimensions of (P,D,D). We test each possible transpose combination. */ TEUCHOS_UNIT_TEST( Data, MatMatExplicitIdentity_PDD ) // (P,D,D) underlying; notionally (C,P,D,D) { @@ -280,7 +473,7 @@ TEUCHOS_UNIT_TEST( Data, MatMatExplicitIdentity_PDD ) // (P,D,D) underlying; not } // #pragma mark Data: MatMatBlockPlusDiagonal -/** \brief Data provides matrix-matrix multiplication support. This method checks correctness of the computed mat-mat for a case involving one 3x3 matrix that has a 2x2 upper left block, and diagonal entry in the (3,3) position, and one 3x3 matrix that is entirely diagonal. Here, the notional dimensions (C,D,D) match the stored dimensions. +/** \brief Data provides matrix-matrix multiplication support. This method checks correctness of the computed mat-mat for a case involving one 3x3 matrix that has a 2x2 upper left block, and diagonal entry in the (3,3) position, and one 3x3 matrix that is entirely diagonal. Here, the logical dimensions (C,D,D) match the stored dimensions. 
*/ TEUCHOS_UNIT_TEST( Data, MatMatBlockPlusDiagonal ) { diff --git a/packages/intrepid2/unit-test/performance/CMakeLists.txt b/packages/intrepid2/unit-test/performance/CMakeLists.txt index 190c839e5cb5..13a28a738f4c 100644 --- a/packages/intrepid2/unit-test/performance/CMakeLists.txt +++ b/packages/intrepid2/unit-test/performance/CMakeLists.txt @@ -1,2 +1,3 @@ +ADD_SUBDIRECTORY(DataCombination) ADD_SUBDIRECTORY(StructuredIntegration) diff --git a/packages/intrepid2/unit-test/performance/DataCombination/CMakeLists.txt b/packages/intrepid2/unit-test/performance/DataCombination/CMakeLists.txt new file mode 100644 index 000000000000..6a73853732ba --- /dev/null +++ b/packages/intrepid2/unit-test/performance/DataCombination/CMakeLists.txt @@ -0,0 +1,13 @@ +SET(SOURCES "") + +FILE(GLOB SOURCES *.cpp) + +SET(LIBRARIES intrepid2) + +TRIBITS_ADD_EXECUTABLE_AND_TEST( + DataCombinationPerformance + SOURCES ${SOURCES} + ARGS + NUM_MPI_PROCS 1 + ADD_DIR_TO_NAME + ) diff --git a/packages/intrepid2/unit-test/performance/DataCombination/DataCombinationPerformance.cpp b/packages/intrepid2/unit-test/performance/DataCombination/DataCombinationPerformance.cpp new file mode 100644 index 000000000000..b965e28b1cb4 --- /dev/null +++ b/packages/intrepid2/unit-test/performance/DataCombination/DataCombinationPerformance.cpp @@ -0,0 +1,399 @@ +// @HEADER +// ************************************************************************ +// +// Intrepid2 Package +// Copyright (2007) Sandia Corporation +// +// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive +// license for use of this work by or on behalf of the U.S. Government. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. 
Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Kyungjoo Kim (kyukim@sandia.gov), +// Mauro Perego (mperego@sandia.gov), or +// Nate Roberts (nvrober@sandia.gov) +// +// ************************************************************************ +// @HEADER + +/** \file DataCombinationPerformance.cpp + \brief Main for performance tests comparing performance when combining Intrepid2 Data objects (as sums and products) with the performance of (expanded) Kokkos View objects. + + Specifically, we consider a few use cases, each with nominal shape (C,P): + 1. Constant data. This case favors Data objects most heavily, since redundancy in the Views will be maximal. + 2. "Affine" data. This has shape (C,P), but only varies in the cell dimension. + 3. General data. There is no redundancy in the data. 
This case favors the View objects most heavily, and will maximally expose overhead from the Data implementation. + + We can define an ideal speedup as the relative reduction in flop count. Any write to a general data container will have an ideal speedup of 1.0 (no reduction in flop count); affine data will have + an ideal speedup of P; constant will have an ideal speedup of C*P. + + In addition to combinations of "like" Data (e.g., constant plus constant), we can also test combinations of "unlike" Data (e.g., constant plus affine). We expect these tests to have performance + characteristics somewhere between the corresponding "like" tests, but the ideal speedups for these tests will correspond to that of the more general container (e.g., constant plus affine will have + the same ideal speedup as affine plus affine: P). + + We fix the cell count at 16,000, and allow the point count to vary. We expect (and observe) that constant/constant operations will not come close to the ideal speedup (because the overhead + surrounding the operation dwarfs the single-flop cost); affine/affine operations fare somewhat better, with some non-negligible fraction of the ideal speedup; general/general operations are quite + close to the 1.0 ideal speedup. The mixed operations that result in general containers (e.g., general/constant) often do substantially better than the ideal speedup, thanks to enhanced data locality. + + After measuring timings, we also confirm that the two algorithms agree on the results. 
+ */ + +#include "Teuchos_GlobalMPISession.hpp" + +#include "Teuchos_StackedTimer.hpp" +#include "Teuchos_TimeMonitor.hpp" +#include "Teuchos_DefaultComm.hpp" + +#include "Kokkos_Core.hpp" + +#include "Intrepid2_Data.hpp" +#include "Intrepid2_TestUtils.hpp" +#include "Intrepid2_Types.hpp" + +enum CaseChoice +{ + Constant, + Affine, + General +}; + +std::string to_string(CaseChoice choice) +{ + switch (choice) { + case Constant: return "Constant"; + case Affine: return "Affine"; + case General: return "General"; + + default: return "Unknown CaseChoice"; + } +} + +using namespace Intrepid2; + +static const int NUM_CELLS = 16000; + +template< typename Scalar, typename DeviceType > +inline +Data getData(CaseChoice caseChoice, const int numPoints, const double baseValue) +{ + using ExecutionSpace = typename DeviceType::execution_space; + const int numCells = NUM_CELLS; + Kokkos::Array extents {numCells, numPoints}; + Kokkos::Array variationTypes {GENERAL,GENERAL}; + + switch (caseChoice) { + case Constant: + return Data(baseValue,extents); + case Affine: + { + // (C,P); varies in C dimension + variationTypes[1] = CONSTANT; + Kokkos::View cellView("affine case - underlying view",numCells); + Kokkos::RangePolicy policy(ExecutionSpace(), 0, numCells); + Kokkos::parallel_for("initialize underlying view data", policy, + KOKKOS_LAMBDA (const int &i0) { + cellView(i0) = i0 * baseValue; + }); + return Data(cellView,extents,variationTypes); + } + case General: + { + // (C,P); varies in C and P dimensions + variationTypes[1] = GENERAL; + Kokkos::View cellView("affine case - underlying view",numCells,numPoints); + Kokkos::MDRangePolicy> policy({0,0},{numCells,numPoints}); + Kokkos::parallel_for("initialize underlying view data", policy, + KOKKOS_LAMBDA (const int &i0, const int &i1) { + cellView(i0,i1) = i0 * baseValue + i1; + }); + return Data(cellView,extents,variationTypes); + } + default: + return Data(); + } +} + +double idealSpeedup(CaseChoice caseChoice, const int 
numPoints) +{ + switch (caseChoice) { + case Constant: + return NUM_CELLS * numPoints; + case Affine: + return numPoints; + case General: + return 1.0; + default: + return -1.0; + } +} + +template< typename Scalar, typename DeviceType > +Kokkos::View allocateView(const int numPoints) +{ + Kokkos::View view("DataCombinationPerformance - View", NUM_CELLS, numPoints); + return view; +} + +template< typename Scalar, typename DeviceType > +inline +void fillView(CaseChoice caseChoice, Kokkos::View view, const double baseValue) +{ + using ExecutionSpace = typename DeviceType::execution_space; + + switch (caseChoice) { + case Constant: + Kokkos::deep_copy(view, baseValue); + break; + case Affine: + { + Kokkos::MDRangePolicy> policy({0,0},{view.extent_int(0),view.extent_int(1)}); + // (C,P); varies in C dimension + Kokkos::parallel_for("initialize underlying view data", policy, + KOKKOS_LAMBDA (const int &i0, const int &i1) { + view(i0,i1) = i0 * baseValue; + }); + } + break; + case General: + { + Kokkos::MDRangePolicy> policy({0,0},{view.extent_int(0),view.extent_int(1)}); + // (C,P); varies in C and P dimensions + Kokkos::parallel_for("initialize underlying view data", policy, + KOKKOS_LAMBDA (const int &i0, const int &i1) { + view(i0,i1) = i0 * baseValue + i1; + }); + } + break; + default: + break; + } + ExecutionSpace().fence(); +} + +template< typename Scalar, typename DeviceType > +void sumViews(Kokkos::View resultView, + Kokkos::View view1, Kokkos::View view2) +{ + using ExecutionSpace = typename DeviceType::execution_space; + Kokkos::MDRangePolicy> policy({0,0},{resultView.extent_int(0),resultView.extent_int(1)}); + + Kokkos::parallel_for("initialize underlying view data", policy, + KOKKOS_LAMBDA (const int &i0, const int &i1) { + resultView(i0,i1) = view1(i0,i1) + view2(i0,i1); + }); +} + +int main( int argc, char* argv[] ) +{ + // Note that the dtor for GlobalMPISession will call Kokkos::finalize_all() but does not call Kokkos::initialize()... 
+ Teuchos::GlobalMPISession mpiSession(&argc, &argv); + Kokkos::initialize(argc,argv); + + using std::cout; + using std::endl; + using std::string; + using std::vector; + + bool success = true; + + { + vector allCaseChoices {Constant, Affine, General}; + + Teuchos::CommandLineProcessor cmdp(false,true); // false: don't throw exceptions; true: do return errors for unrecognized options + + string caseChoiceString = "All"; // alternatives: Standard, NonAffineTensor, AffineTensor, Uniform + + int pointCountFixed = -1; + int pointCountMin = 16; + int pointCountMax = 1024; + + cmdp.setOption("case", &caseChoiceString, "Options: All, Constant, Affine, General"); + cmdp.setOption("pointCount", &pointCountFixed, "Single point count to run with"); + cmdp.setOption("minPointCount", &pointCountMin, "Starting point count (will double until max count is reached)"); + cmdp.setOption("maxPointCount", &pointCountMax, "Maximum point count"); + + if (cmdp.parse(argc,argv) != Teuchos::CommandLineProcessor::PARSE_SUCCESSFUL) + { + #ifdef HAVE_MPI + MPI_Finalize(); + #endif + return -1; + } + + vector caseChoices; + if (caseChoiceString == "All") + { + caseChoices = allCaseChoices; + } + else if (caseChoiceString == "Constant") + { + caseChoices = vector{Constant}; + } + else if (caseChoiceString == "Affine") + { + caseChoices = vector{Affine}; + } + else if (caseChoiceString == "General") + { + caseChoices = vector{General}; + } + else + { + cout << "Unrecognized case choice: " << caseChoiceString << endl; +#ifdef HAVE_MPI + MPI_Finalize(); +#endif + return -1; + } + + if (pointCountFixed > 0) + { + pointCountMin = pointCountFixed; + pointCountMax = pointCountFixed; + } + + using Scalar = double; + using DeviceType = Kokkos::DefaultExecutionSpace::device_type; + + using DataType = Data; + + const int charWidth = 15; + using std::vector; + using std::map; + using std::pair; + using std::make_pair; + using std::tuple; + using std::cout; + using std::endl; + using std::setw; + using 
std::scientific; + using std::fixed; + + const double absTol = 1e-15, relTol = 1e-15; + + for (CaseChoice caseChoice1 : caseChoices) + { + for (CaseChoice caseChoice2 : caseChoices) + { +// { +// // DEBUGGING: +// if ((caseChoice1 != General) && (caseChoice2 == General)) +// { +// cout << "Set breakpoint here.\n"; +// } +// } + + // since constant takes so little time (and measurement is therefore noisy), we do a bunch of measurements and use their average + const bool bothConstant = (caseChoice1 == Constant) && (caseChoice2 == Constant); + const int numMeasurements = bothConstant ? 1000 : 1; + + cout << "\n\n*******************************************\n"; + cout << "****** " << setw(12) << to_string(caseChoice1) << "/" << to_string(caseChoice2) << setw(14) << " ******\n"; + cout << "*******************************************\n"; + for (int pointCount=pointCountMin; pointCount<=pointCountMax; pointCount *= 2) + { + const double baseValue1 = M_PI; + const double baseValue2 = 1.0; + + Data result; + auto dataTimer = Teuchos::TimeMonitor::getNewTimer("Data sum"); + for (int i=0; i(caseChoice1, pointCount, baseValue1); + auto data2 = getData(caseChoice2, pointCount, baseValue2); + + result = DataType::allocateInPlaceCombinationResult(data1, data2); + + DeviceType::execution_space().fence(); + dataTimer->start(); + result.storeInPlaceSum(data1, data2); + DeviceType::execution_space().fence(); + dataTimer->stop(); + } + double dataElapsedTimeSeconds = dataTimer->totalElapsedTime() / numMeasurements; + + cout << "Point count: " << setw(charWidth) << pointCount << endl; + cout << "Time (sum - data): " << setw(charWidth) << std::setprecision(2) << scientific << dataElapsedTimeSeconds << endl; + + dataTimer->reset(); + + auto viewTimer = Teuchos::TimeMonitor::getNewTimer("View sum"); + auto view1 = allocateView(pointCount); + auto view2 = allocateView(pointCount); + auto resultView = allocateView(pointCount); + + fillView(caseChoice1, view1, baseValue1); + 
fillView(caseChoice2, view2, baseValue2); + + DeviceType::execution_space().fence(); + viewTimer->start(); + sumViews(resultView, view1, view2); + DeviceType::execution_space().fence(); + viewTimer->stop(); + double viewElapsedTimeSeconds = viewTimer->totalElapsedTime(); + cout << "Time (sum - view): " << setw(charWidth) << std::setprecision(2) << scientific << viewElapsedTimeSeconds << endl; + + viewTimer->reset(); + + const double maxSpeedup = std::min(idealSpeedup(caseChoice1, pointCount),idealSpeedup(caseChoice2, pointCount)); + const double actualSpeedup = viewElapsedTimeSeconds / dataElapsedTimeSeconds; + const double percentage = actualSpeedup / maxSpeedup * 100.0; + cout << "Ideal speedup: " << setw(charWidth) << std::setprecision(2) << scientific << maxSpeedup << endl; + cout << "Actual speedup: " << setw(charWidth) << std::setprecision(2) << scientific << actualSpeedup << endl; + cout << "Percentage of ideal: " << setw(charWidth) << std::setprecision(2) << fixed << percentage << "%" << endl; + cout << endl; + + // to optimize for the case where the test passes, we output to a Teuchos::oblackholestream first. + // if the test fails, we repeat the comparison to std::cout. + Teuchos::oblackholestream outNothing; + Teuchos::basic_FancyOStream out(Teuchos::rcp(&outNothing,false)); + bool localSuccess = true; + testFloatingEquality2(resultView, result, relTol, absTol, out, localSuccess); + + if (!localSuccess) + { + cout << "Error: results do not match. 
Comparison details:\n"; + + Teuchos::oblackholestream outNothing; + Teuchos::basic_FancyOStream out(Teuchos::rcp(&outNothing,false)); + + Teuchos::basic_FancyOStream std_out(Teuchos::rcp(&std::cout,false)); + testFloatingEquality2(resultView, result, relTol, absTol, std_out, localSuccess); + + success = false; + } + } + } + } + } + + if (success) + return 0; + else + return -1; +} diff --git a/packages/kokkos-kernels/CHANGELOG.md b/packages/kokkos-kernels/CHANGELOG.md index 911bb3219754..4326f3ee5f19 100644 --- a/packages/kokkos-kernels/CHANGELOG.md +++ b/packages/kokkos-kernels/CHANGELOG.md @@ -1,39 +1,50 @@ # Change Log +## [3.4.01](https://github.com/kokkos/kokkos-kernels/tree/3.4.01) (2021-05-19) +[Full Changelog](https://github.com/kokkos/kokkos-kernels/compare/3.4.00...3.4.01) + +**Fixed Bugs:** +- Windows: Fixes for Windows [\#981](https://github.com/kokkos/kokkos-kernels/pull/981) +- Sycl: ArithTraits fixes for Sycl [\#959](https://github.com/kokkos/kokkos-kernels/pull/959) +- Sparse: Added code to allow KokkosKernels coloring to accept partial colorings [\#938](https://github.com/kokkos/kokkos-kernels/pull/938) +- Sparse: Include sorting within spiluk [\#972](https://github.com/kokkos/kokkos-kernels/pull/972) +- Sparse: Fix CrsMatrix raw pointer constructor [\#971](https://github.com/kokkos/kokkos-kernels/pull/971) +- Sparse: Fix spmv Serial beta==-1 code path [\#947](https://github.com/kokkos/kokkos-kernels/pull/947) + ## [3.4.00](https://github.com/kokkos/kokkos-kernels/tree/3.4.00) (2021-04-25) [Full Changelog](https://github.com/kokkos/kokkos-kernels/compare/3.3.01...3.4.00) **Features:** -- SYCL: adding ETI and CMake logic for SYCL backend [\#924](https://github.com/kokkos/kokkos/pull/924) +- SYCL: adding ETI and CMake logic for SYCL backend [\#924](https://github.com/kokkos/kokkos-kernels/pull/924) **Implemented enhancements Algorithms and Archs:** -- Two-stage GS: add damping factors [\#921](https://github.com/kokkos/kokkos/pull/921) -- Supernodal 
SpTRSV, improve symbolic performance [\#899](https://github.com/kokkos/kokkos/pull/899) -- Add MKL SpMV wrapper [\#895](https://github.com/kokkos/kokkos/pull/895) -- Serial code path for spmv [\#893](https://github.com/kokkos/kokkos/pull/893) +- Two-stage GS: add damping factors [\#921](https://github.com/kokkos/kokkos-kernels/pull/921) +- Supernodal SpTRSV, improve symbolic performance [\#899](https://github.com/kokkos/kokkos-kernels/pull/899) +- Add MKL SpMV wrapper [\#895](https://github.com/kokkos/kokkos-kernels/pull/895) +- Serial code path for spmv [\#893](https://github.com/kokkos/kokkos-kernels/pull/893) **Implemented enhancements BuildSystem:** -- Cmake: Update ArmPL support [\#901](https://github.com/kokkos/kokkos/pull/901) -- Cmake: Add ARMPL TPL support [\#880](https://github.com/kokkos/kokkos/pull/880) -- IntelClang guarding __assume_aligned with !defined(__clang__) [\#878](https://github.com/kokkos/kokkos/pull/878) +- Cmake: Update ArmPL support [\#901](https://github.com/kokkos/kokkos-kernels/pull/901) +- Cmake: Add ARMPL TPL support [\#880](https://github.com/kokkos/kokkos-kernels/pull/880) +- IntelClang guarding __assume_aligned with !defined(__clang__) [\#878](https://github.com/kokkos/kokkos-kernels/pull/878) **Implemented enhancements Other:** -- Add static_assert/throw in batched eigendecomp [\#931](https://github.com/kokkos/kokkos/pull/931) -- Workaround using new/delete in kernel code [\#925](https://github.com/kokkos/kokkos/pull/925) -- Blas perf_test updates [\#892](https://github.com/kokkos/kokkos/pull/892) +- Add static_assert/throw in batched eigendecomp [\#931](https://github.com/kokkos/kokkos-kernels/pull/931) +- Workaround using new/delete in kernel code [\#925](https://github.com/kokkos/kokkos-kernels/pull/925) +- Blas perf_test updates [\#892](https://github.com/kokkos/kokkos-kernels/pull/892) **Fixed bugs:** -- Fix ctor CrsMat mirror with CrsGraph mirror [\#918](https://github.com/kokkos/kokkos/pull/918) -- Fix nrm1, removed cublas 
nrminf, improved blas tests [\#915](https://github.com/kokkos/kokkos/pull/915) -- Fix and testing coverage mainly in graph coarsening [\#910](https://github.com/kokkos/kokkos/pull/910) -- Fix KokkosSparse for nightly test failure [\#898](https://github.com/kokkos/kokkos/pull/898) -- Fix view types across ternary operator [\#894](https://github.com/kokkos/kokkos/pull/894) -- Make work_view_t typedef consistent [\#885](https://github.com/kokkos/kokkos/pull/885) -- Fix supernodal SpTRSV build with serial+openmp+cuda [\#884](https://github.com/kokkos/kokkos/pull/884) -- Construct SpGEMM C with correct ncols [\#883](https://github.com/kokkos/kokkos/pull/883) -- Matrix Converter: fixing issue with deallocation after Kokkos::fininalize [\#882](https://github.com/kokkos/kokkos/pull/882) -- Fix >1024 team size error in sort_crs_* [\#872](https://github.com/kokkos/kokkos/pull/872) -- Fixing seg fault with empty matrix in kspiluk [\#871](https://github.com/kokkos/kokkos/pull/871) +- Fix ctor CrsMat mirror with CrsGraph mirror [\#918](https://github.com/kokkos/kokkos-kernels/pull/918) +- Fix nrm1, removed cublas nrminf, improved blas tests [\#915](https://github.com/kokkos/kokkos-kernels/pull/915) +- Fix and testing coverage mainly in graph coarsening [\#910](https://github.com/kokkos/kokkos-kernels/pull/910) +- Fix KokkosSparse for nightly test failure [\#898](https://github.com/kokkos/kokkos-kernels/pull/898) +- Fix view types across ternary operator [\#894](https://github.com/kokkos/kokkos-kernels/pull/894) +- Make work_view_t typedef consistent [\#885](https://github.com/kokkos/kokkos-kernels/pull/885) +- Fix supernodal SpTRSV build with serial+openmp+cuda [\#884](https://github.com/kokkos/kokkos-kernels/pull/884) +- Construct SpGEMM C with correct ncols [\#883](https://github.com/kokkos/kokkos-kernels/pull/883) +- Matrix Converter: fixing issue with deallocation after Kokkos::fininalize [\#882](https://github.com/kokkos/kokkos-kernels/pull/882) +- Fix >1024 team size 
error in sort_crs_* [\#872](https://github.com/kokkos/kokkos-kernels/pull/872) +- Fixing seg fault with empty matrix in kspiluk [\#871](https://github.com/kokkos/kokkos-kernels/pull/871) ## [3.3.01](https://github.com/kokkos/kokkos-kernels/tree/3.3.01) (2021-01-18) [Full Changelog](https://github.com/kokkos/kokkos-kernels/compare/3.3.00...3.3.01) diff --git a/packages/kokkos-kernels/CMakeLists.txt b/packages/kokkos-kernels/CMakeLists.txt index 1f698db6683a..88292bdd0c06 100644 --- a/packages/kokkos-kernels/CMakeLists.txt +++ b/packages/kokkos-kernels/CMakeLists.txt @@ -25,7 +25,7 @@ IF(NOT KOKKOSKERNELS_HAS_TRILINOS) ENDIF() SET(KokkosKernels_VERSION_MAJOR 3) SET(KokkosKernels_VERSION_MINOR 4) - SET(KokkosKernels_VERSION_PATCH 0) + SET(KokkosKernels_VERSION_PATCH 01) ENDIF() IF(${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.12.0") diff --git a/packages/kokkos-kernels/master_history.txt b/packages/kokkos-kernels/master_history.txt index a113e3619f79..5c63ba453d97 100644 --- a/packages/kokkos-kernels/master_history.txt +++ b/packages/kokkos-kernels/master_history.txt @@ -12,3 +12,5 @@ tag: 3.1.01 date: 05/04/2020 master: 43773523 release: 6fce7502 tag: 3.2.00 date: 08/19/2020 master: 07a60bcc release: ea3f2b77 tag: 3.3.00 date: 12/16/2020 master: 42defc56 release: e5279e55 tag: 3.3.01 date: 01/18/2021 master: f64b1c57 release: 4e1cc00b +tag: 3.4.00 date: 04/26/2021 master: fe439b21 release: d3c33910 +tag: 3.4.01 date: 05/20/2021 master: 564dccb3 release: 4c62eb86 diff --git a/packages/kokkos-kernels/src/Kokkos_ArithTraits.hpp b/packages/kokkos-kernels/src/Kokkos_ArithTraits.hpp index f96ffc49c39c..17d3f568fe10 100644 --- a/packages/kokkos-kernels/src/Kokkos_ArithTraits.hpp +++ b/packages/kokkos-kernels/src/Kokkos_ArithTraits.hpp @@ -729,7 +729,13 @@ class ArithTraits { return Kokkos::Experimental::cast_to_half(::sqrt (Kokkos::Experimental::cast_from_half(x))); } static KOKKOS_FORCEINLINE_FUNCTION val_type cbrt (const val_type x) { - return 
Kokkos::Experimental::cast_to_half(::cbrt (Kokkos::Experimental::cast_from_half(x))); + return Kokkos::Experimental::cast_to_half( +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL + sycl::cbrt(Kokkos::Experimental::cast_from_half(x)) +#else + ::cbrt(Kokkos::Experimental::cast_from_half(x)) +#endif + ); } static KOKKOS_FORCEINLINE_FUNCTION val_type exp (const val_type x) { return Kokkos::Experimental::cast_to_half(::exp (Kokkos::Experimental::cast_from_half(x))); @@ -762,10 +768,22 @@ class ArithTraits { return Kokkos::Experimental::cast_to_half(::asin (Kokkos::Experimental::cast_from_half(x))); } static KOKKOS_FORCEINLINE_FUNCTION val_type acos (const val_type x) { - return Kokkos::Experimental::cast_to_half(::acos (Kokkos::Experimental::cast_from_half(x))); + return Kokkos::Experimental::cast_to_half( +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL + sycl::acos(Kokkos::Experimental::cast_from_half(x)) +#else + ::acos(Kokkos::Experimental::cast_from_half(x)) +#endif + ); } static KOKKOS_FORCEINLINE_FUNCTION val_type atan (const val_type x) { - return Kokkos::Experimental::cast_to_half(::atan (Kokkos::Experimental::cast_from_half(x))); + return Kokkos::Experimental::cast_to_half( +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL + sycl::atan(Kokkos::Experimental::cast_from_half(x)) +#else + ::atan(Kokkos::Experimental::cast_from_half(x)) +#endif + ); } static KOKKOS_FORCEINLINE_FUNCTION mag_type epsilon () { //return ::pow(2, -KOKKOSKERNELS_IMPL_FP16_SIGNIFICAND_BITS); @@ -858,16 +876,16 @@ class ArithTraits { static KOKKOS_FORCEINLINE_FUNCTION bool isInf (const float x) { #ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST using std::isinf; -#elif KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - using sycl::isinf +#elif defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL) + using sycl::isinf; #endif return isinf (x); } static KOKKOS_FORCEINLINE_FUNCTION bool isNan (const float x) { #ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST using std::isnan; -#elif 
KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - using sycl::isnan +#elif defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL) + using sycl::isnan; #endif return isnan (x); } @@ -899,10 +917,18 @@ class ArithTraits { return ::pow (x, y); } static KOKKOS_FORCEINLINE_FUNCTION float sqrt (const float x) { - return ::sqrt (x); +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL + return sycl::sqrt(x); +#else + return ::sqrt(x); +#endif } static KOKKOS_FORCEINLINE_FUNCTION float cbrt (const float x) { - return ::cbrt (x); +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL + return sycl::cbrt(x); +#else + return ::cbrt(x); +#endif } static KOKKOS_FORCEINLINE_FUNCTION float exp (const float x) { return ::exp (x); @@ -938,7 +964,11 @@ class ArithTraits { return ::acos (x); } static KOKKOS_FORCEINLINE_FUNCTION float atan (const float x) { - return ::atan (x); +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL + return sycl::atan(x); +#else + return ::atan(x); +#endif } static KOKKOS_FORCEINLINE_FUNCTION mag_type epsilon () { return FLT_EPSILON; @@ -1039,8 +1069,8 @@ class ArithTraits > { static bool isInf(const std::complex& x) { #ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST using std::isinf; -#elif KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - using sycl::isinf +#elif defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL) + using sycl::isinf; #endif return isinf (real (x)) || isinf (imag (x)); } @@ -1062,8 +1092,8 @@ class ArithTraits > { static bool isNan(const std::complex& x) { #ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST using std::isnan; -#elif KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - using sycl::isnan +#elif defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL) + using sycl::isnan; #endif return isnan (real (x)) || isnan (imag (x)); } @@ -1130,7 +1160,11 @@ class ArithTraits > { return std::sqrt (x); } static std::complex cbrt (const std::complex& x) { - return std::cbrt (x); +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL + return sycl::cbrt(x); +#else + return ::cbrt(x); 
+#endif } static std::complex exp (const std::complex& x) { return std::exp (x); @@ -1166,7 +1200,12 @@ class ArithTraits > { return std::acos (x); } static std::complex atan (const std::complex& x) { - return std::atan (x); +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL + using sycl::atan; +#else + using std::atan; +#endif + return atan(x); } static std::complex nan () { const mag_type mag_nan = ArithTraits::nan (); @@ -1251,17 +1290,17 @@ class ArithTraits { static KOKKOS_FORCEINLINE_FUNCTION bool isInf (const val_type x) { #ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST using std::isinf; - #elif KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL +#elif defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL) using sycl::isinf; - #endif +#endif return isinf (x); } static KOKKOS_FORCEINLINE_FUNCTION bool isNan (const val_type x) { #ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST using std::isnan; - #elif KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL +#elif defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL) using sycl::isnan; - #endif +#endif return isnan (x); } static KOKKOS_FORCEINLINE_FUNCTION mag_type abs (const val_type x) { @@ -1292,10 +1331,18 @@ class ArithTraits { return ::pow (x, y); } static KOKKOS_FORCEINLINE_FUNCTION val_type sqrt (const val_type x) { - return ::sqrt (x); +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL + return sycl::sqrt(x); +#else + return ::sqrt(x); +#endif } static KOKKOS_FORCEINLINE_FUNCTION val_type cbrt (const val_type x) { - return ::cbrt (x); +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL + return sycl::cbrt(x); +#else + return ::cbrt(x); +#endif } static KOKKOS_FORCEINLINE_FUNCTION val_type exp (const val_type x) { return ::exp (x); @@ -1331,7 +1378,11 @@ class ArithTraits { return ::acos (x); } static KOKKOS_FORCEINLINE_FUNCTION val_type atan (const val_type x) { - return ::atan (x); +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL + return sycl::atan(x); +#else + return ::atan(x); +#endif } static KOKKOS_FORCEINLINE_FUNCTION 
val_type nan () { #if defined(__CUDA_ARCH__) @@ -2224,10 +2275,22 @@ class ArithTraits { // some reasonable value (like 0), though this might be more // expensive than the absolute value interpreted using the ternary // operator. - return static_cast ( ::sqrt (static_cast (abs (x)))); + return static_cast( +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL + sycl::sqrt(static_cast(abs(x))) +#else + ::sqrt(static_cast(abs(x))) +#endif + ); } static KOKKOS_FORCEINLINE_FUNCTION val_type cbrt (const val_type x) { - return static_cast ( ::cbrt (static_cast (abs (x)))); + return static_cast( +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL + sycl::cbrt(static_cast(abs(x))) +#else + ::cbrt(static_cast(abs(x))) +#endif + ); } static KOKKOS_FORCEINLINE_FUNCTION val_type exp (const val_type x) { return static_cast ( ::exp (static_cast (abs (x)))); @@ -2346,10 +2409,22 @@ class ArithTraits { return intPowSigned (x, y); } static KOKKOS_FORCEINLINE_FUNCTION val_type sqrt (const val_type x) { - return static_cast ( ::sqrt (static_cast (abs (x)))); + return static_cast( +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL + sycl::sqrt(static_cast(abs(x))) +#else + ::sqrt(static_cast(abs(x))) +#endif + ); } static KOKKOS_FORCEINLINE_FUNCTION val_type cbrt (const val_type x) { - return static_cast ( ::cbrt (static_cast (abs (x)))); + return static_cast( +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL + sycl::cbrt(static_cast(abs(x))) +#else + ::cbrt(static_cast(abs(x))) +#endif + ); } static KOKKOS_FORCEINLINE_FUNCTION val_type exp (const val_type x) { return static_cast ( ::exp (static_cast (abs (x)))); @@ -2471,10 +2546,22 @@ class ArithTraits { // This will result in no loss of accuracy, though it might be // more expensive than it should, if we were clever about using // bit operations. 
- return static_cast ( ::sqrt (static_cast (x))); + return static_cast( +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL + sycl::sqrt(static_cast(abs(x))) +#else + ::sqrt(static_cast(abs(x))) +#endif + ); } static KOKKOS_FORCEINLINE_FUNCTION val_type cbrt (const val_type x) { - return static_cast ( ::cbrt (static_cast (x))); + return static_cast( +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL + sycl::cbrt(static_cast(abs(x))) +#else + ::cbrt(static_cast(abs(x))) +#endif + ); } static KOKKOS_FORCEINLINE_FUNCTION val_type exp (const val_type x) { return static_cast ( ::exp (static_cast (x))); @@ -2604,10 +2691,22 @@ class ArithTraits { // This will result in no loss of accuracy, though it might be // more expensive than it should, if we were clever about using // bit operations. - return static_cast ( ::sqrt (static_cast (abs (x)))); + return static_cast( +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL + sycl::sqrt(static_cast(abs(x))) +#else + ::sqrt(static_cast(abs(x))) +#endif + ); } static KOKKOS_FORCEINLINE_FUNCTION val_type cbrt (const val_type x) { - return static_cast ( ::cbrt (static_cast (abs (x)))); + return static_cast( +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL + sycl::cbrt(static_cast(abs(x))) +#else + ::cbrt(static_cast(abs(x))) +#endif + ); } static KOKKOS_FORCEINLINE_FUNCTION val_type exp (const val_type x) { return static_cast ( ::exp (static_cast (abs (x)))); @@ -2735,10 +2834,22 @@ class ArithTraits { // This will result in no loss of accuracy, though it might be // more expensive than it should, if we were clever about using // bit operations. 
- return static_cast ( ::sqrt (static_cast (x))); + return static_cast( +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL + sycl::sqrt(static_cast(abs(x))) +#else + ::sqrt(static_cast(abs(x))) +#endif + ); } static KOKKOS_FORCEINLINE_FUNCTION val_type cbrt (const val_type x) { - return static_cast ( ::cbrt (static_cast (x))); + return static_cast( +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL + sycl::cbrt(static_cast(abs(x))) +#else + ::cbrt(static_cast(abs(x))) +#endif + ); } static KOKKOS_FORCEINLINE_FUNCTION val_type exp (const val_type x) { return static_cast ( ::exp (static_cast (x))); @@ -2874,10 +2985,22 @@ class ArithTraits { // This will result in no loss of accuracy, though it might be // more expensive than it should, if we were clever about using // bit operations. - return static_cast ( ::sqrt (static_cast (abs (x)))); + return static_cast( +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL + sycl::sqrt(static_cast(abs(x))) +#else + ::sqrt(static_cast(abs(x))) +#endif + ); } static KOKKOS_FORCEINLINE_FUNCTION val_type cbrt (const val_type x) { - return static_cast ( ::cbrt (static_cast (abs (x)))); + return static_cast( +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL + sycl::cbrt(static_cast(abs(x))) +#else + ::cbrt(static_cast(abs(x))) +#endif + ); } static KOKKOS_FORCEINLINE_FUNCTION val_type exp (const val_type x) { return static_cast ( ::exp (static_cast (abs (x)))); @@ -3005,10 +3128,22 @@ class ArithTraits { // This will result in no loss of accuracy, though it might be // more expensive than it should, if we were clever about using // bit operations. 
- return static_cast ( ::sqrt (static_cast (x))); + return static_cast( +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL + sycl::sqrt(static_cast(abs(x))) +#else + ::sqrt(static_cast(abs(x))) +#endif + ); } static KOKKOS_FORCEINLINE_FUNCTION val_type cbrt (const val_type x) { - return static_cast ( ::cbrt (static_cast (x))); + return static_cast( +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL + sycl::cbrt(static_cast(abs(x))) +#else + ::cbrt(static_cast(abs(x))) +#endif + ); } static KOKKOS_FORCEINLINE_FUNCTION val_type exp (const val_type x) { return static_cast ( ::exp (static_cast (x))); @@ -3272,7 +3407,13 @@ class ArithTraits { using std::cbrtl; return static_cast ( ::cbrtl (static_cast (x))); #else - return static_cast ( ::cbrt (static_cast (x))); + return static_cast( +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL + sycl::cbrt(static_cast(abs(x))) +#else + ::cbrt(static_cast(abs(x))) +#endif + ); #endif } static KOKKOS_FORCEINLINE_FUNCTION val_type exp (const val_type x) { @@ -3406,7 +3547,7 @@ class ArithTraits { // 64-bit integer type exactly. However, CUDA does not implement // long double for device functions. return static_cast ( sqrt (static_cast (abs (x)))); -#else +#elif defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL) // Casting from a 64-bit integer type to double does result in a // loss of accuracy. However, it gives us a good first // approximation. For very large numbers, we may lose some @@ -3417,6 +3558,8 @@ class ArithTraits { // which it has to be, so we don't have to check) to ensure // correctness. It actually should suffice to check numbers // within 1 of the result. 
+ return static_cast(sycl::sqrt(static_cast(abs(x)))); +#else return static_cast ( ::sqrt (static_cast (abs (x)))); #endif } @@ -3425,6 +3568,8 @@ class ArithTraits { using std::cbrtl; using std::abs; return static_cast ( cbrtl (static_cast (abs (x)))); +#elif defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL) + return static_cast(sycl::cbrt(static_cast(abs(x)))); #else return static_cast ( ::cbrt (static_cast (abs (x)))); #endif @@ -3555,6 +3700,8 @@ class ArithTraits { #ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST using std::sqrt; return static_cast ( sqrt (static_cast (x))); +#elif defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL) + return static_cast(sycl::sqrt(static_cast(x))); #else return static_cast ( ::sqrt (static_cast (x))); #endif @@ -3563,6 +3710,8 @@ class ArithTraits { #ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST using std::cbrtl; return static_cast ( cbrtl (static_cast (x))); +#elif defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL) + return static_cast(sycl::cbrt(static_cast(x))); #else return static_cast ( ::cbrt (static_cast (x))); #endif @@ -3700,10 +3849,18 @@ struct ArithTraits return ::pow(x,y); } static inline val_type sqrt (const val_type& x) { - return ::sqrt (x); +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL + return sycl::sqrt(x); +#else + return ::sqrt(x); +#endif } static inline val_type cbrt (const val_type& x) { - return ::cbrt (x); +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL + return sycl::cbrt(x); +#else + return ::cbrt(x); +#endif } static inline val_type exp (const val_type& x) { return ::exp (x); @@ -3740,7 +3897,11 @@ struct ArithTraits return ::acos (x); } static KOKKOS_FORCEINLINE_FUNCTION val_type atan (const val_type x) { - return ::atan (x); +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL + return sycl::atan(x); +#else + return ::atan(x); +#endif } static inline val_type nan () { return val_type::_nan; @@ -3801,7 +3962,11 @@ struct ArithTraits } static std::string name () { return "dd_real"; } static 
val_type squareroot (const val_type& x) { - return ::sqrt (x); +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL + return sycl::sqrt(x); +#else + return ::sqrt(x); +#endif } }; @@ -3852,10 +4017,18 @@ struct ArithTraits return ::pow (x, y); } static inline val_type sqrt (const val_type& x) { - return ::sqrt (x); +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL + return sycl::sqrt(x); +#else + return ::sqrt(x); +#endif } static inline val_type cbrt (const val_type& x) { - return ::cbrt (x); +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL + return sycl::cbrt(x); +#else + return ::cbrt(x); +#endif } static inline val_type exp (const val_type& x) { return ::exp (x); @@ -3892,7 +4065,11 @@ struct ArithTraits return ::acos (x); } static KOKKOS_FORCEINLINE_FUNCTION val_type atan (const val_type x) { - return ::atan (x); +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL + return sycl::atan(x); +#else + return ::atan(x); +#endif } static inline val_type nan () { return val_type::_nan; @@ -3957,7 +4134,11 @@ struct ArithTraits } static std::string name () { return "qd_real"; } static val_type squareroot (const val_type& x) { - return ::sqrt (x); +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL + return sycl::sqrt(x); +#else + return ::sqrt(x); +#endif } }; #endif // HAVE_KOKKOS_QD diff --git a/packages/kokkos-kernels/src/blas/KokkosBlas3_gemm.hpp b/packages/kokkos-kernels/src/blas/KokkosBlas3_gemm.hpp index 9005c3a6b549..d06c714e19b9 100644 --- a/packages/kokkos-kernels/src/blas/KokkosBlas3_gemm.hpp +++ b/packages/kokkos-kernels/src/blas/KokkosBlas3_gemm.hpp @@ -48,73 +48,12 @@ #include #include -#include #include #include #include namespace KokkosBlas { -namespace Impl { - // Special codepath for when B/C have 1 column: use GEMV (matrix-vector) instead. - // GEMV performs better than tiled GEMM in this case. - // - // Returns true if the criteria are met and GEMV was run, false otherwise. 
- // - // This case must be intercepted here rather than impl in order to call TPL - // GEMV instead of TPL GEMM. This codepath was measured to be profitable with cuBLAS. - template - bool - gemv_based_gemm - (const char transA[], - const char transB[], - typename AViewType::const_value_type& alpha, - const AViewType& A, - const BViewType& B, - typename CViewType::const_value_type& beta, - const CViewType& C, - typename std::enable_if< - !std::is_same::value && - !std::is_same::value>::type* = nullptr) - { - if(toupper(transA[0]) == 'N' && toupper(transB[0]) == 'N' && B.extent(1) == size_t(1)) - { - // since B/C both have a single column and are not LayoutStride, - // can create a raw contiguous rank-1 vector from them rather than using subview. - Kokkos::View> Bvec(B.data(), B.extent(0)); - Kokkos::View> Cvec(C.data(), C.extent(0)); - KokkosBlas::gemv("N", alpha, A, Bvec, beta, Cvec); - return true; - } - return false; - } - - // Don't attempt to call GEMV with LayoutStride vectors. - // GEMV is not ETI'd for this case, so there would be undefined symbol errors in tests. - template - bool - gemv_based_gemm - (const char transA[], - const char transB[], - typename AViewType::const_value_type& alpha, - const AViewType& A, - const BViewType& B, - typename CViewType::const_value_type& beta, - const CViewType& C, - typename std::enable_if< - std::is_same::value || - std::is_same::value>::type* = nullptr) - { - return false; - } -} - /// \brief Dense matrix-matrix multiply: C = beta*C + alpha*op(A)*op(B). /// /// \tparam AViewType Input matrix, as a 2-D Kokkos::View @@ -203,10 +142,6 @@ gemm (const char transA[], if((A.extent(0) == 0) || (A.extent(1) == 0) || (C.extent(1) == 0)) return; - // Check if gemv code path is allowed and profitable, and if so run it. 
- if(Impl::gemv_based_gemm(transA, transB, alpha, A, B, beta, C)) - return; - // Minimize the number of Impl::GEMV instantiations, by // standardizing on particular View specializations for its template // parameters. diff --git a/packages/kokkos-kernels/src/blas/impl/KokkosBlas2_gemv_impl.hpp b/packages/kokkos-kernels/src/blas/impl/KokkosBlas2_gemv_impl.hpp index 4fa19959cdc1..db5bc9fbca33 100644 --- a/packages/kokkos-kernels/src/blas/impl/KokkosBlas2_gemv_impl.hpp +++ b/packages/kokkos-kernels/src/blas/impl/KokkosBlas2_gemv_impl.hpp @@ -46,7 +46,6 @@ #include "KokkosKernels_config.h" #include "Kokkos_Core.hpp" -#include "KokkosKernels_ExecSpaceUtils.hpp" #include "Kokkos_ArithTraits.hpp" namespace KokkosBlas { @@ -96,17 +95,17 @@ struct SingleLevelNontransposeGEMV { KOKKOS_INLINE_FUNCTION void operator () (const IndexType& i) const { - using y_value_type = typename YViewType::non_const_value_type; + using y_value_type = typename std::decay::type; y_value_type y_i; if (betaPreset == 0) { - y_i = Kokkos::ArithTraits::zero (); + y_i = Kokkos::Details::ArithTraits::zero (); } else if (betaPreset == 1) { - y_i = y_(i); + y_i = y_[i]; } else { // beta_ != 0 and beta != 1 - y_i = beta_ * y_(i); + y_i = beta_ * y_[i]; } const IndexType numCols = A_.extent(1); @@ -124,7 +123,7 @@ struct SingleLevelNontransposeGEMV { } } - y_(i) = y_i; + y_[i] = y_i; } private: @@ -212,10 +211,11 @@ struct SingleLevelTransposeGEMV { for (IndexType j = 0; j < value_count; ++j) { // Sum into initial y_ values; use beta as a pre-multiplier if nonzero. - if(betaPreset == 0) - y_(j) = y_result[j]; - else - y_(j) = beta_ * y_(j) + y_result[j]; + const y_value_type y_j = + beta_ == ArithTraits::zero () ? 
+ ArithTraits::zero () : + beta_ * y_[j]; + y_[j] = y_j + y_result[j]; } } @@ -480,136 +480,6 @@ singleLevelGemv (const char trans[], } } -struct TwoLevelGEMV_LayoutLeftTag {}; -struct TwoLevelGEMV_LayoutRightTag {}; - -// --------------------------------------------------------------------------------------------- -// Functor for a two-level parallel_reduce version of GEMV (non-transpose), -// designed for performance on GPU. Kernel depends on the layout of A. -template -struct TwoLevelGEMV { - using y_value_type = typename YViewType::non_const_value_type; - using AlphaCoeffType = typename AViewType::non_const_value_type; - using BetaCoeffType = typename YViewType::non_const_value_type; - - - using execution_space = typename AViewType::execution_space; - using policy_type = Kokkos::TeamPolicy; - using member_type = typename policy_type::member_type; - - TwoLevelGEMV (const AlphaCoeffType& alpha, - const AViewType& A, - const XViewType& x, - const BetaCoeffType& beta, - const YViewType& y) : - alpha_ (alpha), A_ (A), x_ (x), beta_ (beta), y_ (y) - { - static_assert (Kokkos::Impl::is_view::value, - "AViewType must be a Kokkos::View."); - static_assert (Kokkos::Impl::is_view::value, - "XViewType must be a Kokkos::View."); - static_assert (Kokkos::Impl::is_view::value, - "YViewType must be a Kokkos::View."); - static_assert (static_cast (AViewType::rank) == 2, - "AViewType must have rank 2."); - static_assert (static_cast (XViewType::rank) == 1, - "XViewType must have rank 1."); - static_assert (static_cast (YViewType::rank) == 1, - "YViewType must have rank 1."); - static_assert (std::is_integral::value, - "IndexType must be an integer."); - } - -public: - //LayoutLeft version: 32xK blocks. - // -Each team handles block rows. - // -Groups of 32 threads handle N/teamsize columns sequentially, placing results into shared. - // -Then individual thread results are combined with parallel_reduce. 
- KOKKOS_INLINE_FUNCTION void - operator () (TwoLevelGEMV_LayoutLeftTag, const member_type& team) const - { - using Kokkos::Details::ArithTraits; - using Scalar = typename YViewType::non_const_value_type; - using KAT = ArithTraits; - //Allocate a Scalar in shared for each thread - Scalar* blockResult = (Scalar*) team.team_shmem().get_shmem(32 * sizeof(Scalar)); - Kokkos::parallel_for(Kokkos::TeamThreadRange(team, 32), - [&](int i) - { - blockResult[i] = KAT::zero(); - }); - team.team_barrier(); - //Which block this thread will work on - int block = team.team_rank() / 32; - //Which row in the block this thread will work on - IndexType row = team.league_rank() * 32 + team.team_rank() % 32; - IndexType blockColStart = columnsPerThread * block; - Scalar localSum = KAT::zero(); - //compute local sum - if(row < (IndexType) A_.extent(0)) - { - for(IndexType col = blockColStart; col < blockColStart + columnsPerThread && col < A_.extent(1); col++) - { - //A access is coalesced, x access is a broadcast - localSum += A_(row, col) * x_(col); - } - } - //atomically combine local result into shared - Kokkos::atomic_add(&blockResult[team.team_rank() % 32], localSum); - team.team_barrier(); - Kokkos::parallel_for(Kokkos::TeamThreadRange(team, 32), - [&](int i) - { - IndexType yrow = team.league_rank() * 32 + i; - if(yrow < (IndexType) A_.extent(0)) - { - if(beta_ == KAT::zero()) - y_(yrow) = alpha_ * blockResult[i]; - else - y_(yrow) = beta_ * y_(yrow) + alpha_ * blockResult[i]; - } - }); - } - - //LayoutRight version: one team per row - KOKKOS_INLINE_FUNCTION void - operator () (TwoLevelGEMV_LayoutRightTag, const member_type& team) const - { - using Kokkos::Details::ArithTraits; - using KAT = ArithTraits; - - const IndexType N = A_.extent(1); - const int i = team.league_rank(); // batch id - - // parallel-reduce to compute val += A(:,j)' * x - y_value_type val = KAT::zero(); - Kokkos::parallel_reduce( Kokkos::TeamThreadRange( team, N ), [&] ( const int j, y_value_type &update ) { 
- update += A_(i, j) * x_(j); - }, val); - - // compute yj = beta*yj + alpha*val - Kokkos::single(Kokkos::PerTeam(team), - [=]() - { - if(beta_ == KAT::zero()) - y_(i) = alpha_ * val; - else - y_(i) = beta_ * y_(i) + alpha_ * val; - }); - } - - IndexType columnsPerThread; -private: - AlphaCoeffType alpha_; - typename AViewType::const_type A_; - typename XViewType::const_type x_; - BetaCoeffType beta_; - YViewType y_; -}; - // --------------------------------------------------------------------------------------------- // Functor for a two-level parallel_reduce version of (conjugate) @@ -659,29 +529,23 @@ struct TwoLevelTransposeGEMV { operator () (const member_type & team) const { using Kokkos::Details::ArithTraits; - using KAT_A = ArithTraits; - using KAT_Y = ArithTraits; + using KAT = ArithTraits; const IndexType M = A_.extent(0); const int j = team.league_rank(); // batch id // parallel-reduce to compute val += A(:,j)' * x - y_value_type val = KAT_Y::zero(); + y_value_type val = KAT:: zero(); Kokkos::parallel_reduce( Kokkos::TeamThreadRange( team, M ), [&] ( const int i, y_value_type &update ) { const auto x_i = x_(i); - const auto A_ij = conj ? KAT_A::conj (A_(i,j)) : A_(i,j); + const auto A_ij = conj ? 
KAT::conj (A_(i,j)) : A_(i,j); update += A_ij * x_i; }, val); // compute yj = beta*yj + alpha*val - Kokkos::single(Kokkos::PerTeam(team), - [&]() - { - if(beta_ == KAT_Y::zero()) - y_(j) = alpha_ * val; - else - y_(j) = beta_ * y_(j) + alpha_ * val; - }); + if (team.team_rank() == 0) { + y_[j] = beta_*y_[j] + alpha_ * val; + } } private: @@ -725,68 +589,38 @@ twoLevelGemv (const char trans[], using team_policy_type = Kokkos::TeamPolicy; using range_policy_type = Kokkos::RangePolicy; + using BetaCoeffType = typename YViewType::non_const_value_type; + using Kokkos::Details::ArithTraits; using KAT = ArithTraits; - using YKAT = ArithTraits; - const char tr = toupper(trans[0]); + const char tr = trans[0]; // The transpose and conjugate transpose cases where A has zero rows // need special handling. These are equivalent to y := beta*y. We // could implement this using KokkosBlas::scal, but we don't want to // depend on that or its implementation details. Instead, we reuse // an instantiation of the non-transpose case for alpha=0. - if (y.extent(0) == 0) - { - //no entries to update - return; - } - else if (x.extent(0) == 0) - { - if (beta == YKAT::zero ()) { + if (A.extent(0) == 0 && (tr != 'N' && tr != 'n')) { + if (beta == KAT::zero ()) { Kokkos::deep_copy (y, KAT::zero ()); } - else if (beta != YKAT::one ()) { + else if (beta != Kokkos::Details::ArithTraits::one ()) { // "Fake out" a scal() by using the non-transpose alpha=0, // general beta case. This assumes that the functor doesn't // check dimensions. 
using functor_type = SingleLevelNontransposeGEMV; functor_type functor (alpha, A, x, beta, y); - Kokkos::parallel_for ("KokkosBlas::gemv[SingleLevel]",range_policy_type (0, y.extent(0)), functor); + Kokkos::parallel_for ("KokkosBlas::gemv[SingleLevel]",range_policy_type (0, A.extent(1)), functor); } return; } - if (tr == 'N') { - constexpr bool isLayoutLeft = std::is_same::value; - using layout_tag = typename std::conditional::type; - using tagged_policy = Kokkos::TeamPolicy; - using functor_type = TwoLevelGEMV; - functor_type functor (alpha, A, x, beta, y); - tagged_policy team; - if(isLayoutLeft) - { - size_t sharedPerTeam = 32 * sizeof(y_value_type); - IndexType numTeams = (A.extent(0) + 31) / 32; - tagged_policy temp(1, 1); - int teamSize = temp.team_size_max(functor, Kokkos::ParallelForTag()); - //make sure teamSize is a multiple of 32 - teamSize -= teamSize % 32; - //don't make teamSize larger than what's useful - if((size_t) teamSize > 32 * A.extent(1)) - teamSize = 32 * A.extent(1); - int numBlocks = teamSize / 32; - functor.columnsPerThread = (A.extent(1) + numBlocks - 1) / numBlocks; - team = tagged_policy(numTeams, teamSize).set_scratch_size(0, Kokkos::PerTeam(sharedPerTeam)); - } - else - { - //LayoutRight: one team per row - team = tagged_policy(A.extent(0), Kokkos::AUTO); - } - Kokkos::parallel_for ("KokkosBlas::gemv[twoLevel]", team, functor); + if (tr == 'N' || tr == 'n') { + // NOTE: not implemented, so just call single-level version + singleLevelGemv + (trans, alpha, A, x, beta, y); } else { if (alpha == KAT::zero () && beta == KAT::zero ()) { @@ -796,7 +630,7 @@ twoLevelGemv (const char trans[], else if (alpha == KAT::zero () && beta == KAT::one ()) { // Do nothing (y := 1 * y) } - else if (tr == 'T') { + else if (tr == 'T' || tr == 't') { // transpose, and not conj transpose team_policy_type team (A.extent(1), Kokkos::AUTO); using functor_type = TwoLevelTransposeGEMV()>::type* = nullptr> -void -generalGemvImpl (const char trans[], - typename 
AViewType::const_value_type& alpha, - const AViewType& A, - const XViewType& x, - typename YViewType::const_value_type& beta, - const YViewType& y) -{ - singleLevelGemv (trans, alpha, A, x, beta, y); -} - -template()>::type* = nullptr> -void -generalGemvImpl (const char trans[], - typename AViewType::const_value_type& alpha, - const AViewType& A, - const XViewType& x, - typename YViewType::const_value_type& beta, - const YViewType& y) -{ - twoLevelGemv (trans, alpha, A, x, beta, y); -} - } // namespace Impl } // namespace KokkosBlas diff --git a/packages/kokkos-kernels/src/blas/impl/KokkosBlas2_gemv_spec.hpp b/packages/kokkos-kernels/src/blas/impl/KokkosBlas2_gemv_spec.hpp index da7983b07a79..76d98c65bc16 100644 --- a/packages/kokkos-kernels/src/blas/impl/KokkosBlas2_gemv_spec.hpp +++ b/packages/kokkos-kernels/src/blas/impl/KokkosBlas2_gemv_spec.hpp @@ -136,12 +136,22 @@ struct GEMV { // Prefer int as the index type, but use a larger type if needed. if (numRows < static_cast (INT_MAX) && numCols < static_cast (INT_MAX)) { - generalGemvImpl + #if 1 + twoLevelGemv (trans, alpha, A, x, beta, y); + #else + singleLevelGemv + (trans, alpha, A, x, beta, y); + #endif } else { - generalGemvImpl + #if 1 + twoLevelGemv + (trans, alpha, A, x, beta, y); + #else + singleLevelGemv (trans, alpha, A, x, beta, y); + #endif } Kokkos::Profiling::popRegion(); } diff --git a/packages/kokkos-kernels/src/common/KokkosKernels_BitUtils.hpp b/packages/kokkos-kernels/src/common/KokkosKernels_BitUtils.hpp index c845e37c5318..7c343ff5a458 100644 --- a/packages/kokkos-kernels/src/common/KokkosKernels_BitUtils.hpp +++ b/packages/kokkos-kernels/src/common/KokkosKernels_BitUtils.hpp @@ -46,6 +46,10 @@ #define _KOKKOSKERNELS_BITUTILS_HPP #include "Kokkos_Core.hpp" +#if defined (KOKKOS_COMPILER_MSVC) +#include +#endif + namespace KokkosKernels{ namespace Impl{ @@ -203,6 +207,36 @@ int pop_count( long long i ){ return __popcnt8(i); } +#elif defined (KOKKOS_COMPILER_MSVC) +KOKKOS_FORCEINLINE_FUNCTION 
+int pop_count( unsigned i ){ + return __popcnt(i); +} +KOKKOS_FORCEINLINE_FUNCTION +int pop_count( unsigned long i ){ + return __popcnt(i); +} + +KOKKOS_FORCEINLINE_FUNCTION +int pop_count( unsigned long long i ){ + return __popcnt64(i); +} + +KOKKOS_FORCEINLINE_FUNCTION +int pop_count(int i ){ + return __popcnt(i); +} + +KOKKOS_FORCEINLINE_FUNCTION +int pop_count( long i ){ + return __popcnt(i); +} + +KOKKOS_FORCEINLINE_FUNCTION +int pop_count( long long i ){ + return __popcnt64(i); +} + #else #error "Popcount function is not defined for this compiler. Please report this with the compiler you are using to KokkosKernels." #endif @@ -328,6 +362,35 @@ int least_set_bit( long long i ){ return __builtin_ffsll(i); } +#elif defined (KOKKOS_COMPILER_MSVC) +KOKKOS_FORCEINLINE_FUNCTION +int least_set_bit( unsigned i ){ + return __lzcnt(i); +} +KOKKOS_FORCEINLINE_FUNCTION +int least_set_bit( unsigned long i ){ + return __lzcnt(i); +} + +KOKKOS_FORCEINLINE_FUNCTION +int least_set_bit( unsigned long long i ){ + return __lzcnt64(i); +} + +KOKKOS_FORCEINLINE_FUNCTION +int least_set_bit( int i ){ + return __lzcnt(i); +} +KOKKOS_FORCEINLINE_FUNCTION +int least_set_bit( long i ){ + return __lzcnt(i); +} + +KOKKOS_FORCEINLINE_FUNCTION +int least_set_bit( long long i ){ + return __lzcnt64(i); +} + #else #error "least_set_bit function is not defined for this compiler. Please report this with the compiler you are using to KokkosKernels." 
#endif diff --git a/packages/kokkos-kernels/src/common/KokkosKernels_Handle.hpp b/packages/kokkos-kernels/src/common/KokkosKernels_Handle.hpp index 39ac62267c2b..fb557c6f5192 100644 --- a/packages/kokkos-kernels/src/common/KokkosKernels_Handle.hpp +++ b/packages/kokkos-kernels/src/common/KokkosKernels_Handle.hpp @@ -635,9 +635,6 @@ class KokkosKernelsHandle } void destroy_gs_handle(){ if (is_owner_of_the_gs_handle && this->gsHandle != NULL){ - if (this->gsHandle->is_owner_of_coloring()){ - this->destroy_graph_coloring_handle(); - } delete this->gsHandle; this->gsHandle = NULL; } diff --git a/packages/kokkos-kernels/src/common/KokkosKernels_IOUtils.hpp b/packages/kokkos-kernels/src/common/KokkosKernels_IOUtils.hpp index b74834db5fcf..351480c3a3e9 100644 --- a/packages/kokkos-kernels/src/common/KokkosKernels_IOUtils.hpp +++ b/packages/kokkos-kernels/src/common/KokkosKernels_IOUtils.hpp @@ -422,11 +422,13 @@ struct Edge{ //////////////////////////////////////////////////////////////////////////////// inline size_t kk_get_file_size(const char* file) { - struct stat stat_buf; + // struct stat stat_buf; #ifdef _WIN32 + struct _stat stat_buf; int retval = _stat(file, &stat_buf); #else + struct stat stat_buf; int retval = stat(file, &stat_buf); #endif diff --git a/packages/kokkos-kernels/src/graph/KokkosGraph_Distance1Color.hpp b/packages/kokkos-kernels/src/graph/KokkosGraph_Distance1Color.hpp index b3dcc411660e..f33d6b757f38 100644 --- a/packages/kokkos-kernels/src/graph/KokkosGraph_Distance1Color.hpp +++ b/packages/kokkos-kernels/src/graph/KokkosGraph_Distance1Color.hpp @@ -73,7 +73,7 @@ void graph_color_symbolic( gch->set_tictoc(handle->get_verbose()); - color_view_type colors_out; //= color_view_type("Graph Colors", num_rows); + color_view_type colors_out; if(gch->get_vertex_colors().use_count() > 0){ colors_out = gch->get_vertex_colors(); } else { diff --git a/packages/kokkos-kernels/src/graph/KokkosGraph_Distance1ColorHandle.hpp 
b/packages/kokkos-kernels/src/graph/KokkosGraph_Distance1ColorHandle.hpp index d75b359b961d..826d0da962f0 100644 --- a/packages/kokkos-kernels/src/graph/KokkosGraph_Distance1ColorHandle.hpp +++ b/packages/kokkos-kernels/src/graph/KokkosGraph_Distance1ColorHandle.hpp @@ -192,8 +192,8 @@ class GraphColoringHandle overall_coloring_time_phase4(0), overall_coloring_time_phase5(0), coloring_time(0), - num_phases(0), size_of_edge_list(0), lower_triangle_src(), lower_triangle_dst(), use_vtx_list(false), - vertex_colors(), is_coloring_called_before(false), num_colors(0) + num_phases(0), size_of_edge_list(0), lower_triangle_src(), lower_triangle_dst(), + use_vtx_list(false), vertex_colors(), is_coloring_called_before(false), num_colors(0) { this->choose_default_algorithm(); this->set_defaults(this->coloring_algorithm_type); @@ -651,9 +651,9 @@ class GraphColoringHandle int get_num_phases() const { return this->num_phases;} color_view_t get_vertex_colors() const {return this->vertex_colors;} bool is_coloring_called() const {return this->is_coloring_called_before;} - bool get_use_vtx_list() const{return this->use_vtx_list;} + bool get_use_vtx_list() const {return this->use_vtx_list;} nnz_lno_temp_work_view_t get_vertex_list() const {return this->vertex_list;} - size_type get_vertex_list_size() const{return this->vertex_list_size;} + size_type get_vertex_list_size() const {return this->vertex_list_size;} //setters void set_vertex_list(nnz_lno_temp_work_view_t vertex_list_, size_type vertex_list_size_){ this->vertex_list = vertex_list_; diff --git a/packages/kokkos-kernels/src/graph/KokkosGraph_Distance2Color.hpp b/packages/kokkos-kernels/src/graph/KokkosGraph_Distance2Color.hpp index 53f2b4a26b2e..59a4f474393a 100644 --- a/packages/kokkos-kernels/src/graph/KokkosGraph_Distance2Color.hpp +++ b/packages/kokkos-kernels/src/graph/KokkosGraph_Distance2Color.hpp @@ -90,7 +90,7 @@ void graph_color_distance2( InternalEntries rowentries_internal(row_entries.data(), nnz); auto gch_d2 = 
handle->get_distance2_graph_coloring_handle(); //note: last template argument 'false' means do distance-2, not bipartite - Impl::GraphColorDistance2 + KokkosGraph::Impl::GraphColorDistance2 gc(num_verts, num_verts, rowmap_internal, rowentries_internal, rowmap_internal, rowentries_internal, gch_d2); gc.compute_distance2_color(); @@ -174,7 +174,7 @@ void bipartite_color_rows( } auto gch_d2 = handle->get_distance2_graph_coloring_handle(); //note: last template argument 'true' means do bipartite one-sided - Impl::GraphColorDistance2 + KokkosGraph::Impl::GraphColorDistance2 gc(num_rows, num_columns, rowmap_internal, rowentries_internal, colmap_internal, colentries_internal, gch_d2); gc.compute_distance2_color(); @@ -237,7 +237,7 @@ void bipartite_color_columns( InternalEntries rowentries_internal(row_entries.data(), nnz); auto gch_d2 = handle->get_distance2_graph_coloring_handle(); //note: last template argument 'true' means do bipartite one-sided - Impl::GraphColorDistance2 + KokkosGraph::Impl::GraphColorDistance2 gc(num_columns, num_rows, colmap_internal, colentries_internal, rowmap_internal, rowentries_internal, gch_d2); gc.compute_distance2_color(); diff --git a/packages/kokkos-kernels/src/graph/KokkosGraph_Distance2ColorHandle.hpp b/packages/kokkos-kernels/src/graph/KokkosGraph_Distance2ColorHandle.hpp index 95b46e87079c..35402a72ffb6 100644 --- a/packages/kokkos-kernels/src/graph/KokkosGraph_Distance2ColorHandle.hpp +++ b/packages/kokkos-kernels/src/graph/KokkosGraph_Distance2ColorHandle.hpp @@ -122,7 +122,7 @@ class GraphColorDistance2Handle bool use_vtx_list; nnz_lno_temp_work_view_type vertex_list; - size_type vertex_list_size; + size_type vertex_list_size; int num_phases; // Number of phases used by the coloring algorithm @@ -148,7 +148,7 @@ class GraphColorDistance2Handle , overall_coloring_time_phase4(0) , overall_coloring_time_phase5(0) , coloring_time(0) - , use_vtx_list(false) + , use_vtx_list(false) , num_phases(0) , vertex_colors() , 
is_coloring_called_before(false) @@ -287,9 +287,9 @@ class GraphColorDistance2Handle bool is_coloring_called() const { return this->is_coloring_called_before; } - bool get_use_vtx_list() const {return this->use_vtx_list;} - nnz_lno_temp_work_view_type get_vertex_list() const {return this->vertex_list;} - size_type get_vertex_list_size() const {return this->vertex_list_size;} + bool get_use_vtx_list() const { return this->use_vtx_list; } + nnz_lno_temp_work_view_type get_vertex_list() const { return this->vertex_list; } + size_type get_vertex_list_size() const { return this->vertex_list_size; } // setters void set_vertex_list(nnz_lno_temp_work_view_type vertex_list_, size_type vertex_list_size_){ diff --git a/packages/kokkos-kernels/src/graph/impl/KokkosGraph_Distance1Color_impl.hpp b/packages/kokkos-kernels/src/graph/impl/KokkosGraph_Distance1Color_impl.hpp index 340bc3fc2f07..22ca44cc11d5 100644 --- a/packages/kokkos-kernels/src/graph/impl/KokkosGraph_Distance1Color_impl.hpp +++ b/packages/kokkos-kernels/src/graph/impl/KokkosGraph_Distance1Color_impl.hpp @@ -368,12 +368,13 @@ class GraphColor_VB:public GraphColor nv); nnz_lno_t current_vertexListLength = this->nv; - - //init vertexList sequentially. + if(this->cp->get_use_vtx_list()){ + //get the vertexList from the color handle, if it exists. current_vertexList = this->cp->get_vertex_list(); current_vertexListLength = this->cp->get_vertex_list_size(); } else { + //init vertexList sequentially. 
Kokkos::parallel_for("KokkosGraph::GraphColoring::InitList", my_exec_space(0, this->nv), functorInitList (current_vertexList)); } @@ -2526,6 +2527,7 @@ class GraphColor_EB:public GraphColor 0){ color_t colorsize = sizeof(color_t) * 8 - 1; - color_set(ii) = (kokcolors(ii) - 1) / colorsize; - kokcolors(ii) = 1 << ((kokcolors(ii) - 1) % colorsize); + color_set(ii) = (kokcolors(ii) - 1) / colorsize; + kokcolors(ii) = 1 << ((kokcolors(ii) - 1) % colorsize); } color_ban(ii) = color_ban_init_val; } diff --git a/packages/kokkos-kernels/src/graph/impl/KokkosGraph_Distance2Color_impl.hpp b/packages/kokkos-kernels/src/graph/impl/KokkosGraph_Distance2Color_impl.hpp index 70760943213d..72a617dc4b3a 100644 --- a/packages/kokkos-kernels/src/graph/impl/KokkosGraph_Distance2Color_impl.hpp +++ b/packages/kokkos-kernels/src/graph/impl/KokkosGraph_Distance2Color_impl.hpp @@ -191,13 +191,12 @@ class GraphColorDistance2 { //Delegate to different coloring functions, depending on algorithm using_edge_filtering = false; - //color_view_type colors_out("Graph Colors", this->nr); - color_view_type colors_out; - if(gc_handle->get_vertex_colors().use_count() > 0){ - colors_out = gc_handle->get_vertex_colors(); - } else { - colors_out = color_view_type("Graph Colors", this->nr); - } + color_view_type colors_out; + if(gc_handle->get_vertex_colors().use_count() > 0){ + colors_out = gc_handle->get_vertex_colors(); + } else { + colors_out = color_view_type("Graph Colors", this->nr); + } switch(this->gc_handle->get_coloring_algo_type()) { case COLORING_D2_VB_BIT_EF: @@ -251,15 +250,15 @@ class GraphColorDistance2 Kokkos::ViewAllocateWithoutInitializing("vertexList"), this->nr); lno_t current_vertexListLength = this->nr; - - // init conflictlist sequentially. 
+ if(this->gc_handle->get_use_vtx_list()){ - current_vertexList = this->gc_handle->get_vertex_list(); - current_vertexListLength = this->gc_handle->get_vertex_list_size(); - } else { - Kokkos::parallel_for("InitList", range_policy_type(0, this->nr), functorInitList(current_vertexList)); - } - + //init conflict list from coloring handle + current_vertexList = this->gc_handle->get_vertex_list(); + current_vertexListLength = this->gc_handle->get_vertex_list_size(); + } else { + // init conflictlist sequentially. + Kokkos::parallel_for("InitList", range_policy_type(0, this->nr), functorInitList(current_vertexList)); + } // Next iteratons's conflictList lno_view_t next_iteration_recolorList(Kokkos::ViewAllocateWithoutInitializing("recolorList"), this->nr); @@ -457,7 +456,6 @@ class GraphColorDistance2 break; } } - //make sure vertices with a valid color do not get recolored if(color && (colors(v) == 0 || colors(v) == CONFLICTED || colors(v) == UNCOLORABLE)) { //Color v @@ -732,7 +730,7 @@ class GraphColorDistance2 } const lno_t numVerts = this->nr; const lno_t numCols = this->nc; - //note: initializing forbidden to account for previously-colored vertices + //note: relying on forbidden and colors_out being initialized to 0 forbidden_view forbidden("Forbidden", batch * numCols); int iter = 0; Kokkos::Impl::Timer timer; @@ -750,8 +748,9 @@ class GraphColorDistance2 lno_t vertsPerThread = 1; lno_t workBatches = (currentWork + vertsPerThread - 1) / vertsPerThread; timer.reset(); - //refresh forbidden before coloring, to ensure previously-colored vertices do not get recolored unnecessarily. - //This avoids using too many colors, by relying on forbidden from before conflict resolution (which is now stale). + //if still using this color set, refresh forbidden. + //This avoids using too many colors, by relying on forbidden from before previous conflict resolution (which is now stale). 
+ //Refreshing forbidden before conflict resolution ensures that previously-colored vertices do not get recolored. switch(batch) { case 1: @@ -772,8 +771,8 @@ class GraphColorDistance2 break; default:; } - forbiddenTime += timer.seconds(); - timer.reset(); + forbiddenTime += timer.seconds(); + timer.reset(); switch(batch) { case 1: diff --git a/packages/kokkos-kernels/src/sparse/KokkosSparse_CrsMatrix.hpp b/packages/kokkos-kernels/src/sparse/KokkosSparse_CrsMatrix.hpp index d734d9ac3ac5..3ce574602cf2 100644 --- a/packages/kokkos-kernels/src/sparse/KokkosSparse_CrsMatrix.hpp +++ b/packages/kokkos-kernels/src/sparse/KokkosSparse_CrsMatrix.hpp @@ -554,7 +554,37 @@ class CrsMatrix { OrdinalType* rowmap, OrdinalType* cols) { - ctor_impl (label, nrows, ncols, annz, val, rowmap, cols); + using Kokkos::Unmanaged; + using HostRowmap = Kokkos::View; + using UnmanagedRowmap = Kokkos::View>; + using UnmanagedEntries = Kokkos::View>; + using UnmanagedValues = Kokkos::View>; + //Allocate device rowmap, entries, values views + typename row_map_type::non_const_type rowmapDevice(Kokkos::ViewAllocateWithoutInitializing("rowmap"), nrows + 1); + index_type entriesDevice(Kokkos::ViewAllocateWithoutInitializing("entries"), annz); + //given rowmap in ordinal_type, so may need to convert to size_type explicitly + HostRowmap rowmapConverted; + UnmanagedRowmap rowmapRaw; + if(!std::is_same::value) + { + rowmapConverted = HostRowmap(Kokkos::ViewAllocateWithoutInitializing("rowmap raw"), nrows + 1); + for(OrdinalType i = 0; i <= nrows; i++) + rowmapConverted(i) = rowmap[i]; + rowmapRaw = rowmapConverted; + } + else + { + rowmapRaw = UnmanagedRowmap((const SizeType*) rowmap, nrows + 1); + } + Kokkos::deep_copy(rowmapDevice, rowmapRaw); + UnmanagedEntries entriesRaw(cols, annz); + Kokkos::deep_copy(entriesDevice, entriesRaw); + //Construct graph and populate all members + this->numCols_ = ncols; + this->graph = StaticCrsGraphType(entriesDevice, rowmapDevice); + this->values = 
values_type(Kokkos::ViewAllocateWithoutInitializing("values"), annz); + UnmanagedValues valuesRaw(val, annz); + Kokkos::deep_copy(this->values, valuesRaw); // FIXME (mfh 09 Aug 2013) Specialize this on the Device type. // Only use cuSPARSE for the Cuda Device. @@ -646,15 +676,6 @@ class CrsMatrix { #endif // KOKKOS_USE_CUSPARSE } - void - ctor_impl (const std::string &label, - const OrdinalType nrows, - const OrdinalType ncols, - const size_type annz, - ScalarType* val, - OrdinalType* rows, - OrdinalType* cols); - KOKKOS_INLINE_FUNCTION OrdinalType sumIntoValues (const OrdinalType rowi, @@ -883,50 +904,5 @@ class CrsMatrix { ordinal_type numCols_; }; -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -template< typename ScalarType , typename OrdinalType, class Device, class MemoryTraits, typename SizeType > -void -CrsMatrix:: -ctor_impl (const std::string &label, - const OrdinalType nrows, - const OrdinalType ncols, - const size_type annz, - ScalarType* val, - OrdinalType* rows, - OrdinalType* cols) -{ - std::string str = label; - values = values_type (str.append (".values"), annz); - - numCols_ = ncols; - - // FIXME (09 Aug 2013) CrsArray only takes std::vector for now. - // We'll need to fix that. - std::vector row_lengths (nrows, 0); - - // FIXME (mfh 21 Jun 2013) This calls for a parallel_for kernel. - for (OrdinalType i = 0; i < nrows; ++i) { - row_lengths[i] = rows[i + 1] - rows[i]; - } - - graph = Kokkos::create_staticcrsgraph (str.append (".graph"), row_lengths); - typename values_type::HostMirror h_values = Kokkos::create_mirror_view (values); - typename index_type::HostMirror h_entries = Kokkos::create_mirror_view (graph.entries); - - // FIXME (mfh 21 Jun 2013) This needs to be a parallel copy. - // Furthermore, why are the arrays copied twice? -- once here, to a - // host view, and once below, in the deep copy? 
- for (size_type i = 0; i < annz; ++i) { - if (val) { - h_values(i) = val[i]; - } - h_entries(i) = cols[i]; - } - - Kokkos::deep_copy (values, h_values); - Kokkos::deep_copy (graph.entries, h_entries); -} } #endif diff --git a/packages/kokkos-kernels/src/sparse/KokkosSparse_gauss_seidel_handle.hpp b/packages/kokkos-kernels/src/sparse/KokkosSparse_gauss_seidel_handle.hpp index 917680911556..efb9e0c62e70 100644 --- a/packages/kokkos-kernels/src/sparse/KokkosSparse_gauss_seidel_handle.hpp +++ b/packages/kokkos-kernels/src/sparse/KokkosSparse_gauss_seidel_handle.hpp @@ -135,8 +135,6 @@ namespace KokkosSparse{ //getters GSAlgorithm get_algorithm_type() const {return this->algorithm_type;} - virtual bool is_owner_of_coloring() const {return false;} - nnz_lno_persistent_work_host_view_t get_color_xadj() const { return this->color_xadj; } @@ -245,11 +243,18 @@ namespace KokkosSparse{ scalar_persistent_work_view_t permuted_inverse_diagonal; nnz_lno_t block_size; //this is for block sgs - nnz_lno_t max_nnz_input_row; - nnz_lno_t num_values_in_l1, num_values_in_l2, num_big_rows; size_t level_1_mem, level_2_mem; - bool owner_of_coloring; + + //Option set by user: rows with at least this many nonzeros are handled by a separate kernel + nnz_lno_t long_row_threshold; + //Number of long rows per color set. They are all grouped at the end of each color set. + nnz_lno_persistent_work_host_view_t long_rows_per_color; + //Maximum row length in each color set. + nnz_lno_persistent_work_host_view_t max_row_length_per_color; + //Temporary space for matvec over long rows - size is only max num long rows in a color. 
+ scalar_persistent_work_view_t long_row_x; + public: /** @@ -260,17 +265,13 @@ namespace KokkosSparse{ permuted_xadj(), permuted_adj(), permuted_adj_vals(), old_to_new_map(), permuted_y_vector(), permuted_x_vector(), permuted_inverse_diagonal(), block_size(1), - max_nnz_input_row(-1), num_values_in_l1(-1), num_values_in_l2(-1),num_big_rows(0), level_1_mem(0), level_2_mem(0), - owner_of_coloring(false) + long_row_threshold(0) { if (gs == GS_DEFAULT) this->choose_default_algorithm(); } - bool is_owner_of_coloring() const override {return this->owner_of_coloring;} - void set_owner_of_coloring(bool owner = true) {this->owner_of_coloring = owner;} - void set_block_size(nnz_lno_t bs){this->block_size = bs; } nnz_lno_t get_block_size() const {return this->block_size;} @@ -363,14 +364,44 @@ namespace KokkosSparse{ return this->num_big_rows; } - nnz_lno_t get_max_nnz() const { - if(max_nnz_input_row == static_cast(-1)) - throw std::runtime_error("Requested max nnz per input row, but this has not been set in the PointGS handle."); - return this->max_nnz_input_row; + nnz_lno_t get_long_row_threshold() const + { + return long_row_threshold; + } + + void set_long_row_threshold(nnz_lno_t lrt) + { + long_row_threshold = lrt; + } + + nnz_lno_persistent_work_host_view_t get_long_rows_per_color() const + { + return long_rows_per_color; + } + + void set_long_rows_per_color(const nnz_lno_persistent_work_host_view_t& long_rows_per_color_) + { + long_rows_per_color = long_rows_per_color_; } - void set_max_nnz(nnz_lno_t num_result_nnz_) { - this->max_nnz_input_row = num_result_nnz_; + nnz_lno_persistent_work_host_view_t get_max_row_length_per_color() const + { + return max_row_length_per_color; + } + + void set_max_row_length_per_color(const nnz_lno_persistent_work_host_view_t& max_row_length_per_color_) + { + max_row_length_per_color = max_row_length_per_color_; + } + + scalar_persistent_work_view_t get_long_row_x() const + { + return long_row_x; + } + + void set_long_row_x(const 
scalar_persistent_work_view_t& long_row_x_) + { + long_row_x = long_row_x_; } void allocate_x_y_vectors(nnz_lno_t num_rows, nnz_lno_t num_cols, nnz_lno_t num_vecs){ @@ -514,7 +545,7 @@ namespace KokkosSparse{ throw std::runtime_error("inverse diagonal does not exist until after numeric setup."); return inverse_diagonal; } - + bool use_teams() const { return KokkosKernels::Impl::kk_is_gpu_exec_space(); diff --git a/packages/kokkos-kernels/src/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp b/packages/kokkos-kernels/src/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp index d5c111862fba..a4d74614d741 100644 --- a/packages/kokkos-kernels/src/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp +++ b/packages/kokkos-kernels/src/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp @@ -50,12 +50,12 @@ #include #include #include -#include #include #include "KokkosGraph_Distance1Color.hpp" #include "KokkosKernels_Uniform_Initialized_MemoryPool.hpp" #include "KokkosKernels_BitUtils.hpp" #include "KokkosKernels_SimpleUtils.hpp" +#include "KokkosKernels_Sorting.hpp" //FOR DEBUGGING #include "KokkosBlas1_nrm2.hpp" @@ -103,9 +103,9 @@ namespace KokkosSparse{ typedef typename HandleType::scalar_persistent_work_view2d_t scalar_persistent_work_view2d_t; typedef typename HandleType::scalar_persistent_work_view_t scalar_persistent_work_view_t; - typedef Kokkos::RangePolicy my_exec_space; - typedef nnz_lno_t color_t; - typedef Kokkos::View color_view_t; + typedef Kokkos::RangePolicy range_pol; + typedef typename HandleType::GraphColoringHandleType::color_view_t color_view_t; + typedef typename HandleType::GraphColoringHandleType::color_t color_t; typedef Kokkos::Bitset bitset_t; typedef Kokkos::ConstBitset const_bitset_t; @@ -114,9 +114,12 @@ namespace KokkosSparse{ struct BlockTag{}; struct BigBlockTag{}; + struct LongRowTag{}; - typedef Kokkos::TeamPolicy block_team_fill_policy_t ; - typedef Kokkos::TeamPolicy bigblock_team_fill_policy_t ; + typedef Kokkos::TeamPolicy block_apply_team_policy_t ; 
+ typedef Kokkos::TeamPolicy bigblock_apply_team_policy_t ; + typedef Kokkos::RangePolicy longrow_apply_range_policy_t ; + typedef Kokkos::TeamPolicy longrow_apply_team_policy_t ; typedef KokkosKernels::Impl::UniformMemoryPool< MyTempMemorySpace, nnz_scalar_t> pool_memory_space; private: @@ -145,7 +148,7 @@ namespace KokkosSparse{ bool is_symmetric; //Batch size for column applies. Used as a stack array size, so must be a compile-time constant. - static constexpr nnz_lno_t apply_batch_size = 16; + static constexpr nnz_lno_t apply_batch_size = 8; public: @@ -161,6 +164,11 @@ namespace KokkosSparse{ nnz_scalar_t omega; + scalar_persistent_work_view_t _long_row_x; //Results of simple Ax matvec over long rows. + nnz_lno_t _long_row_col; //Which X/Y column is now being processed for long rows. + nnz_lno_t _color_set_begin; //(only used for long rows): where the current set of rows begins + nnz_lno_t _long_row_par; + PSGS(row_lno_persistent_work_view_t xadj_, nnz_lno_persistent_work_view_t adj_, scalar_persistent_work_view_t adj_vals_, scalar_persistent_work_view2d_t Xvector_, scalar_persistent_work_view2d_t Yvector_, nnz_lno_persistent_work_view_t /* color_adj_ */, nnz_scalar_t omega_, @@ -197,6 +205,25 @@ namespace KokkosSparse{ _Xvector(ii, batch_start + i) += omega * sum[i] * invDiagonalVal; } } + + KOKKOS_INLINE_FUNCTION + void operator()(const LongRowTag&, const nnz_lno_t i) const { + nnz_lno_t row = _color_set_begin + i / _long_row_par; + nnz_lno_t chunk = i % _long_row_par; + size_type row_begin = _xadj(row); + size_type row_end = _xadj(row + 1); + size_type chunk_begin = row_begin + (row_end - row_begin) * chunk / _long_row_par; + size_type chunk_end = row_begin + (row_end - row_begin) * (chunk + 1) / _long_row_par; + if(chunk_end > row_end) + chunk_end = row_end; + nnz_scalar_t localSum{}; + for(size_type j = chunk_begin; j < chunk_end; j++) + { + nnz_lno_t colIndex = _adj(j); + localSum += _adj_vals(j) * _Xvector(colIndex, _long_row_col); + } + 
Kokkos::atomic_add(&_long_row_x(row - _color_set_begin), localSum); + } }; struct Team_PSGS{ @@ -226,6 +253,11 @@ namespace KokkosSparse{ typedef typename KokkosKernels::Impl::array_sum_reduce batch_sum; + nnz_lno_persistent_work_view_t _long_rows; + scalar_persistent_work_view_t _long_row_x; + nnz_lno_t _long_row_col; //Which X/Y column is now being processed for long rows. + nnz_lno_t _long_row_par; + Team_PSGS(row_lno_persistent_work_view_t xadj_, nnz_lno_persistent_work_view_t adj_, scalar_persistent_work_view_t adj_vals_, scalar_persistent_work_view2d_t Xvector_, scalar_persistent_work_view2d_t Yvector_, nnz_lno_t color_set_begin, nnz_lno_t color_set_end, @@ -579,11 +611,118 @@ namespace KokkosSparse{ }); } + KOKKOS_INLINE_FUNCTION + void operator()(const LongRowTag&, const team_member_t& teamMember) const { + nnz_lno_t row = _color_set_begin + teamMember.league_rank() / _long_row_par; + nnz_lno_t chunk = teamMember.league_rank() % _long_row_par; + size_type row_begin = _xadj(row); + size_type row_end = _xadj(row + 1); + size_type chunk_begin = row_begin + (row_end - row_begin) * chunk / _long_row_par; + size_type chunk_end = row_begin + (row_end - row_begin) * (chunk + 1) / _long_row_par; + if(chunk_end > row_end) + chunk_end = row_end; + nnz_scalar_t localSum; + Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, chunk_begin, chunk_end), + [&](size_type j, nnz_scalar_t& lsum) + { + nnz_lno_t colIndex = _adj(j); + lsum += _adj_vals(j) * _Xvector(colIndex, _long_row_col); + }, localSum); + Kokkos::single(Kokkos::PerTeam(teamMember), + [&]() + { + Kokkos::atomic_add(&_long_row_x(row - _color_set_begin), localSum); + }); + } + size_t team_shmem_size (int /* team_size */) const { return shared_memory_size; } }; + struct LongRowComparator + { + KOKKOS_DEFAULTED_FUNCTION LongRowComparator() = default; + KOKKOS_INLINE_FUNCTION LongRowComparator(const in_lno_row_view_t& xadj_, nnz_lno_t longRowThreshold_) + : xadj(xadj_), longRowThreshold(longRowThreshold_) + 
{} + + KOKKOS_INLINE_FUNCTION bool operator()(nnz_lno_t lhs, nnz_lno_t rhs) const + { + int lhsLong = xadj(lhs + 1) - xadj(lhs) >= longRowThreshold; + int rhsLong = xadj(rhs + 1) - xadj(rhs) >= longRowThreshold; + if(lhsLong < rhsLong) + return true; + else if(lhsLong > rhsLong) + return false; + //Either both long or both short, just order ascending by ID + return lhs < rhs; + } + + in_lno_row_view_t xadj; + size_type longRowThreshold; + }; + + //Functor to sort each color set - first by whether 'long row', second by ID. + //Also populates long_rows_per_color. + struct SortIntoLongRowsFunctor + { + SortIntoLongRowsFunctor( + const in_lno_row_view_t& xadj_, nnz_lno_t longRowThreshold_, + const nnz_lno_persistent_work_view_t& color_xadj_, const nnz_lno_persistent_work_view_t& color_adj_, + const nnz_lno_persistent_work_view_t& long_rows_per_color_, const nnz_lno_persistent_work_view_t& max_row_length_per_color_) + : xadj(xadj_), longRowThreshold(longRowThreshold_), color_xadj(color_xadj_), color_adj(color_adj_), + long_rows_per_color(long_rows_per_color_), max_row_length_per_color(max_row_length_per_color_) + {} + + KOKKOS_INLINE_FUNCTION void operator()(const team_member_t& t, nnz_lno_t& lmostPerColor) const + { + LongRowComparator comp(xadj, longRowThreshold); + nnz_lno_t color = t.league_rank(); + nnz_lno_t colorBegin = color_xadj(color); + nnz_lno_t colorLen = color_xadj(color + 1) - colorBegin; + KokkosKernels::Impl::TeamBitonicSort(color_adj.data() + colorBegin, colorLen, t, comp); + t.team_barrier(); + //Now that the color set is sorted, count how many long rows there were + nnz_lno_t numLongRows; + Kokkos::parallel_reduce(Kokkos::TeamThreadRange(t, colorBegin, colorBegin + colorLen), + [&](nnz_lno_t i, nnz_lno_t& lnumLongRows) + { + nnz_lno_t row = color_adj(i); + if(xadj(row + 1) - xadj(row) >= longRowThreshold) + lnumLongRows++; + }, numLongRows); + Kokkos::single(Kokkos::PerTeam(t), + [&]() + { + long_rows_per_color(color) = numLongRows; + if(numLongRows 
> lmostPerColor) + lmostPerColor = numLongRows; + }); + nnz_lno_t max_row_length = 0; + Kokkos::parallel_reduce(Kokkos::TeamThreadRange(t, colorBegin, colorBegin + colorLen), + [&](nnz_lno_t i, nnz_lno_t& lmaxLength) + { + nnz_lno_t row = color_adj(i); + nnz_lno_t len = xadj(row + 1) - xadj(row); + if(len > lmaxLength) + lmaxLength = len; + }, Kokkos::Max(max_row_length)); + Kokkos::single(Kokkos::PerTeam(t), + [&]() + { + max_row_length_per_color(color) = max_row_length; + }); + } + + in_lno_row_view_t xadj; + size_type longRowThreshold; + nnz_lno_persistent_work_view_t color_xadj; + nnz_lno_persistent_work_view_t color_adj; + nnz_lno_persistent_work_view_t long_rows_per_color; + nnz_lno_persistent_work_view_t max_row_length_per_color; + }; + /** * \brief constructor */ @@ -615,7 +754,6 @@ namespace KokkosSparse{ is_symmetric(is_symmetric_){} - /** * \brief constructor */ @@ -651,14 +789,11 @@ namespace KokkosSparse{ void initialize_symbolic() { auto gsHandle = get_gs_handle(); - typename HandleType::GraphColoringHandleType *gchandle = this->handle->get_graph_coloring_handle(); + const size_type longRowThreshold = gsHandle->get_long_row_threshold(); - if (gchandle == NULL) - { - this->handle->create_graph_coloring_handle(); - gsHandle->set_owner_of_coloring(true); - gchandle = this->handle->get_graph_coloring_handle(); - } + //Validate settings + if(gsHandle->get_block_size() > 1 && longRowThreshold > 0) + throw std::runtime_error("Can't use MTGS long row algorithm with blocks."); const_lno_row_view_t xadj = this->row_map; const_lno_nnz_view_t adj = this->entries; @@ -669,31 +804,36 @@ namespace KokkosSparse{ #endif typename HandleType::GraphColoringHandleType::color_view_t colors; color_t numColors; - if (!is_symmetric) { - if (gchandle->get_coloring_algo_type() == KokkosGraph::COLORING_EB) { - - gchandle->symmetrize_and_calculate_lower_diagonal_edge_list(num_rows, xadj, adj); - KokkosGraph::Experimental::graph_color_symbolic - (this->handle, num_rows, num_rows, 
xadj, adj); + { + HandleType coloringHandle; + coloringHandle.create_graph_coloring_handle(); + auto gchandle = coloringHandle.get_graph_coloring_handle(); + if (!is_symmetric) { + if (gchandle->get_coloring_algo_type() == KokkosGraph::COLORING_EB) { + + gchandle->symmetrize_and_calculate_lower_diagonal_edge_list(num_rows, xadj, adj); + KokkosGraph::Experimental::graph_color_symbolic + (&coloringHandle, num_rows, num_rows, xadj, adj); + } + else { + row_lno_temp_work_view_t tmp_xadj; + nnz_lno_temp_work_view_t tmp_adj; + KokkosKernels::Impl::symmetrize_graph_symbolic_hashmap + < const_lno_row_view_t, const_lno_nnz_view_t, + row_lno_temp_work_view_t, nnz_lno_temp_work_view_t, + MyExecSpace> + (num_rows, xadj, adj, tmp_xadj, tmp_adj); + KokkosGraph::Experimental::graph_color_symbolic + (&coloringHandle, num_rows, num_rows, tmp_xadj, tmp_adj); + } } else { - row_lno_temp_work_view_t tmp_xadj; - nnz_lno_temp_work_view_t tmp_adj; - KokkosKernels::Impl::symmetrize_graph_symbolic_hashmap - < const_lno_row_view_t, const_lno_nnz_view_t, - row_lno_temp_work_view_t, nnz_lno_temp_work_view_t, - MyExecSpace> - (num_rows, xadj, adj, tmp_xadj, tmp_adj); - KokkosGraph::Experimental::graph_color_symbolic - (this->handle, num_rows, num_rows, tmp_xadj, tmp_adj); + KokkosGraph::Experimental::graph_color_symbolic + (&coloringHandle, num_rows, num_rows, xadj, adj); } + colors = gchandle->get_vertex_colors(); + numColors = gchandle->get_num_colors(); } - else { - KokkosGraph::Experimental::graph_color_symbolic - (this->handle, num_rows, num_rows, xadj, adj); - } - colors = gchandle->get_vertex_colors(); - numColors = gchandle->get_num_colors(); #ifdef KOKKOSSPARSE_IMPL_TIME_REVERSE std::cout << "COLORING_TIME:" << timer.seconds() << std::endl; timer.reset(); @@ -718,48 +858,64 @@ namespace KokkosSparse{ (num_rows, numColors, colors, color_xadj, color_adj); - MyExecSpace().fence(); #ifdef KOKKOSSPARSE_IMPL_TIME_REVERSE + MyExecSpace().fence(); std::cout << "CREATE_REVERSE_MAP:" << 
timer.seconds() << std::endl; timer.reset(); #endif nnz_lno_persistent_work_host_view_t h_color_xadj = Kokkos::create_mirror_view (color_xadj); Kokkos::deep_copy (h_color_xadj , color_xadj); - MyExecSpace().fence(); - #ifdef KOKKOSSPARSE_IMPL_TIME_REVERSE + MyExecSpace().fence(); std::cout << "DEEP_COPY:" << timer.seconds() << std::endl; timer.reset(); #endif - - // TODO BMK: Why are the vertices in each color set only being sorted on GPU? - // Wouldn't it have a locality benefit on CPU too? - if(KokkosKernels::Impl::kk_is_gpu_exec_space()) { + if(longRowThreshold > 0) + { + //Count long rows per color set, and sort color sets so that long rows come after regular rows + nnz_lno_persistent_work_view_t long_rows_per_color(Kokkos::ViewAllocateWithoutInitializing("long_rows_per_color"), numColors); + nnz_lno_persistent_work_view_t max_row_length_per_color(Kokkos::ViewAllocateWithoutInitializing("max_row_length_per_color"), numColors); + nnz_lno_t mostLongRowsInColor = 0; + Kokkos::parallel_reduce(team_policy_t(numColors, Kokkos::AUTO()), + SortIntoLongRowsFunctor(xadj, longRowThreshold, color_xadj, color_adj, long_rows_per_color, max_row_length_per_color), + Kokkos::Max(mostLongRowsInColor)); + auto host_long_rows_per_color = Kokkos::create_mirror_view(long_rows_per_color); + Kokkos::deep_copy(host_long_rows_per_color, long_rows_per_color); + gsHandle->set_long_rows_per_color(host_long_rows_per_color); + auto host_max_row_length_per_color = Kokkos::create_mirror_view(max_row_length_per_color); + Kokkos::deep_copy(host_max_row_length_per_color, max_row_length_per_color); + gsHandle->set_max_row_length_per_color(host_max_row_length_per_color); + scalar_persistent_work_view_t long_row_x(Kokkos::ViewAllocateWithoutInitializing("long_row_x"), mostLongRowsInColor); + gsHandle->set_long_row_x(long_row_x); + } + else + { + //Just sort rows by ID. 
KokkosKernels::Impl::sort_crs_graph(color_xadj, color_adj); - MyExecSpace().fence(); + } #ifdef KOKKOSSPARSE_IMPL_TIME_REVERSE - std::cout << "SORT_TIME:" << timer.seconds() << std::endl; - timer.reset(); + MyExecSpace().fence(); + std::cout << "SORT_TIME:" << timer.seconds() << std::endl; + timer.reset(); #endif - } row_lno_persistent_work_view_t permuted_xadj ("new xadj", num_rows + 1); nnz_lno_persistent_work_view_t old_to_new_map ("old_to_new_index_", num_rows ); nnz_lno_persistent_work_view_t permuted_adj ("newadj_", nnz ); - Kokkos::parallel_for( "KokkosSparse::PointGaussSeidel::create_permuted_xadj", my_exec_space(0,num_rows), + Kokkos::parallel_for( "KokkosSparse::PointGaussSeidel::create_permuted_xadj", range_pol(0,num_rows), create_permuted_xadj( color_adj, xadj, permuted_xadj, old_to_new_map)); //std::cout << "create_permuted_xadj" << std::endl; - MyExecSpace().fence(); #ifdef KOKKOSSPARSE_IMPL_TIME_REVERSE + MyExecSpace().fence(); std::cout << "CREATE_PERMUTED_XADJ:" << timer.seconds() << std::endl; timer.reset(); @@ -768,15 +924,15 @@ namespace KokkosSparse{ KokkosKernels::Impl::inclusive_parallel_prefix_sum (num_rows + 1, permuted_xadj); - MyExecSpace().fence(); #ifdef KOKKOSSPARSE_IMPL_TIME_REVERSE + MyExecSpace().fence(); std::cout << "INCLUSIVE_PPS:" << timer.seconds() << std::endl; timer.reset(); #endif - Kokkos::parallel_for( "KokkosSparse::PointGaussSeidel::fill_matrix_symbolic",my_exec_space(0,num_rows), + Kokkos::parallel_for( "KokkosSparse::PointGaussSeidel::fill_matrix_symbolic",range_pol(0,num_rows), fill_matrix_symbolic( num_rows, color_adj, @@ -787,9 +943,9 @@ namespace KokkosSparse{ permuted_adj, //newvals_, old_to_new_map)); - MyExecSpace().fence(); #ifdef KOKKOSSPARSE_IMPL_TIME_REVERSE + MyExecSpace().fence(); std::cout << "SYMBOLIC_FILL:" << timer.seconds() << std::endl; timer.reset(); #endif @@ -803,7 +959,6 @@ namespace KokkosSparse{ //first calculate max row size. 
size_type max_row_size = 0; KokkosKernels::Impl::kk_view_reduce_max_row_size(num_rows, permuted_xadj.data(), permuted_xadj.data() + 1, max_row_size); - gsHandle->set_max_nnz(max_row_size); nnz_lno_t brows = permuted_xadj.extent(0) - 1; size_type bnnz = permuted_adj.extent(0) * block_size * block_size; @@ -884,7 +1039,6 @@ namespace KokkosSparse{ } } - gsHandle->set_max_nnz(max_row_size); gsHandle->set_level_1_mem(level_1_mem); gsHandle->set_level_2_mem(level_2_mem); @@ -899,10 +1053,6 @@ namespace KokkosSparse{ gsHandle->set_new_xadj(permuted_xadj); gsHandle->set_new_adj(permuted_adj); gsHandle->set_old_to_new_map(old_to_new_map); - if(gsHandle->is_owner_of_coloring()) { - this->handle->destroy_graph_coloring_handle(); - gsHandle->set_owner_of_coloring(false); - } gsHandle->set_call_symbolic(true); #ifdef KOKKOSSPARSE_IMPL_TIME_REVERSE std::cout << "ALLOC:" << timer.seconds() << std::endl; @@ -1147,7 +1297,7 @@ namespace KokkosSparse{ if (KokkosKernels::Impl::kk_is_gpu_exec_space()) { Kokkos::parallel_for( "KokkosSparse::GaussSeidel::Team_fill_matrix_numeric", - team_policy_t(num_rows / rows_per_team + 1 , suggested_team_size, suggested_vector_size), + team_policy_t((num_rows + rows_per_team - 1) / rows_per_team, suggested_team_size, suggested_vector_size), fill_matrix_numeric( color_adj, xadj, @@ -1163,7 +1313,7 @@ namespace KokkosSparse{ )); } else { - Kokkos::parallel_for( "KokkosSparse::GaussSeidel::fill_matrix_numeric",my_exec_space(0,num_rows), + Kokkos::parallel_for( "KokkosSparse::GaussSeidel::fill_matrix_numeric",range_pol(0,num_rows), fill_matrix_numeric( color_adj, xadj, @@ -1178,7 +1328,6 @@ namespace KokkosSparse{ block_matrix_size )); } - MyExecSpace().fence(); gsHandle->set_new_adj_val(permuted_adj_vals); scalar_persistent_work_view_t permuted_inverse_diagonal (Kokkos::ViewAllocateWithoutInitializing("permuted_inverse_diagonal"), num_rows * block_size ); @@ -1196,7 +1345,7 @@ namespace KokkosSparse{ } else { 
Kokkos::parallel_for("KokkosSparse::GaussSeidel::get_matrix_diagonals", - my_exec_space(0,num_rows), + range_pol(0,num_rows), gmd ); } @@ -1225,15 +1374,43 @@ namespace KokkosSparse{ } - MyExecSpace().fence(); gsHandle->set_permuted_inverse_diagonal(permuted_inverse_diagonal); gsHandle->set_call_numeric(true); } #ifdef KOKKOSSPARSE_IMPL_TIME_REVERSE + MyExecSpace().fence(); std::cout << "NUMERIC:" << timer.seconds() << std::endl; #endif } + //Functor to update unknown entries corresponding to long rows (in the permuted x) + template + struct LongRowUpdateFunctor + { + LongRowUpdateFunctor( + const x_value_array_type& permuted_x_, + const y_value_array_type& permuted_y_, + const scalar_persistent_work_view_t& long_row_x_, + const scalar_persistent_work_view_t& permuted_inverse_diagonal_, + nnz_scalar_t omega_, + nnz_lno_t long_row_begin_) + : permuted_x(permuted_x_), permuted_y(permuted_y_), long_row_x(long_row_x_), + permuted_inverse_diagonal(permuted_inverse_diagonal_), omega(omega_), long_row_begin(long_row_begin_) + {} + + KOKKOS_INLINE_FUNCTION void operator()(nnz_lno_t i) const + { + permuted_x(i) += omega * permuted_inverse_diagonal(i) * (permuted_y(i) - long_row_x(i - long_row_begin)); + } + + x_value_array_type permuted_x; + y_value_array_type permuted_y; + scalar_persistent_work_view_t long_row_x; + scalar_persistent_work_view_t permuted_inverse_diagonal; + nnz_scalar_t omega; + nnz_lno_t long_row_begin; + }; + template void block_apply( x_value_array_type x_lhs_output_vec, @@ -1275,7 +1452,6 @@ namespace KokkosSparse{ Permuted_Yvector ); } - MyExecSpace().fence(); if(init_zero_x_vector) { KokkosKernels::Impl::zero_vector(num_cols * block_size, Permuted_Xvector); } @@ -1288,7 +1464,6 @@ namespace KokkosSparse{ Permuted_Xvector ); } - MyExecSpace().fence(); #if KOKKOSSPARSE_IMPL_PRINTDEBUG std::cout << "Y:"; @@ -1346,7 +1521,7 @@ namespace KokkosSparse{ apply_backward); - //Kokkos::parallel_for( my_exec_space(0,nr), PermuteVector(x_lhs_output_vec, 
Permuted_Xvector, color_adj)); + //Kokkos::parallel_for( range_pol(0,nr), PermuteVector(x_lhs_output_vec, Permuted_Xvector, color_adj)); KokkosKernels::Impl::permute_block_vector @@ -1356,8 +1531,6 @@ namespace KokkosSparse{ Permuted_Xvector, x_lhs_output_vec ); - MyExecSpace().fence(); - #if KOKKOSSPARSE_IMPL_PRINTDEBUG std::cout << "After X:"; KokkosKernels::Impl::print_1Dview(Permuted_Xvector); @@ -1404,7 +1577,6 @@ namespace KokkosSparse{ Permuted_Yvector ); } - MyExecSpace().fence(); if(init_zero_x_vector) { KokkosKernels::Impl::zero_vector(num_cols, Permuted_Xvector); } @@ -1417,9 +1589,6 @@ namespace KokkosSparse{ Permuted_Xvector ); } - MyExecSpace().fence(); - - nnz_lno_persistent_work_host_view_t h_color_xadj = gsHandle->get_color_xadj(); #if KOKKOSSPARSE_IMPL_PRINTDEBUG std::cout << "--point Before X:"; @@ -1428,6 +1597,7 @@ namespace KokkosSparse{ KokkosKernels::Impl::print_1Dview(Permuted_Yvector,true); #endif + nnz_lno_persistent_work_host_view_t h_color_xadj = gsHandle->get_color_xadj(); if(gsHandle->get_algorithm_type() == GS_PERMUTED) { PSGS gs(newxadj, newadj, newadj_vals, Permuted_Xvector, Permuted_Yvector, color_adj, omega, permuted_inverse_diagonal); @@ -1454,7 +1624,7 @@ namespace KokkosSparse{ apply_backward); } - //Kokkos::parallel_for( my_exec_space(0,nr), PermuteVector(x_lhs_output_vec, Permuted_Xvector, color_adj)); + //Kokkos::parallel_for( range_pol(0,nr), PermuteVector(x_lhs_output_vec, Permuted_Xvector, color_adj)); KokkosKernels::Impl::permute_vector ( @@ -1463,7 +1633,6 @@ namespace KokkosSparse{ Permuted_Xvector, x_lhs_output_vec ); - MyExecSpace().fence(); #if KOKKOSSPARSE_IMPL_PRINTDEBUG std::cout << "--point After X:"; KokkosKernels::Impl::print_1Dview(Permuted_Xvector); @@ -1516,79 +1685,95 @@ namespace KokkosSparse{ nnz_lno_persistent_work_host_view_t h_color_xadj, int num_iteration, bool apply_forward, - bool apply_backward){ - - for (int i = 0; i < num_iteration; ++i){ - this->DoPSGS(gs, numColors, h_color_xadj, 
apply_forward, apply_backward); + bool apply_backward) + { + auto gsHandle = this->get_gs_handle(); + nnz_lno_persistent_work_host_view_t long_rows_per_color; + nnz_lno_persistent_work_host_view_t max_row_length_per_color; + scalar_persistent_work_view_t long_row_x; + bool haveLongRows = false; + int longRowTeamSize = 1; + if(gsHandle->get_long_row_threshold() > 0) + { + long_rows_per_color = gsHandle->get_long_rows_per_color(); + max_row_length_per_color = gsHandle->get_max_row_length_per_color(); + long_row_x = gsHandle->get_long_row_x(); + haveLongRows = true; + longrow_apply_team_policy_t tempPolicy(1, 1); + longRowTeamSize = tempPolicy.team_size_recommended(gs, Kokkos::ParallelForTag()); } - } - - void DoPSGS(Team_PSGS &gs, color_t numColors, nnz_lno_persistent_work_host_view_t h_color_xadj, - bool apply_forward, - bool apply_backward){ - - nnz_lno_t suggested_team_size = gs.suggested_team_size; - nnz_lno_t team_row_chunk_size = gs.team_work_size; - int vector_size = gs.vector_size; - nnz_lno_t block_size = get_gs_handle()->get_block_size(); - if (apply_forward){ - gs.is_backward = false; - - for (color_t i = 0; i < numColors; ++i){ - nnz_lno_t color_index_begin = h_color_xadj(i); - nnz_lno_t color_index_end = h_color_xadj(i + 1); - int overall_work = color_index_end - color_index_begin;// /256 + 1; - gs._color_set_begin = color_index_begin; - gs._color_set_end = color_index_end; - - if (block_size == 1){ - Kokkos::parallel_for("KokkosSparse::GaussSeidel::Team_PSGS::forward", - team_policy_t(overall_work / team_row_chunk_size + 1 , suggested_team_size, vector_size), - gs ); - } else if (gs.num_max_vals_in_l2 == 0){ - Kokkos::parallel_for("KokkosSparse::GaussSeidel::BLOCK_Team_PSGS::forward", - block_team_fill_policy_t(overall_work / team_row_chunk_size + 1 , suggested_team_size, vector_size), - gs ); - } - else { - Kokkos::parallel_for("KokkosSparse::GaussSeidel::BIGBLOCK_Team_PSGS::forward", - bigblock_team_fill_policy_t(overall_work / team_row_chunk_size + 1 
, suggested_team_size, vector_size), - gs ); - } + for (int iter = 0; iter < num_iteration; ++iter){ + nnz_lno_t suggested_team_size = gs.suggested_team_size; + nnz_lno_t team_row_chunk_size = gs.team_work_size; + int vector_size = gs.vector_size; + nnz_lno_t block_size = gsHandle->get_block_size(); - MyExecSpace().fence(); - } - } - if (apply_backward){ - gs.is_backward = true; - if (numColors > 0) - for (color_t i = numColors - 1; ; --i){ + for (int doingBackward = 0; doingBackward < 2; doingBackward++) { + const char* labelRegular = doingBackward ? "KokkosSparse::GaussSeidel::Team_PSGS::backward" : + "KokkosSparse::GaussSeidel::Team_PSGS::forward"; + const char* labelBlock = doingBackward ? "KokkosSparse::GaussSeidel::BLOCK_Team_PSGS::backward" : + "KokkosSparse::GaussSeidel::BLOCK_Team_PSGS::forward"; + const char* labelBigBlock = doingBackward ? "KokkosSparse::GaussSeidel::BIGBLOCK_Team_PSGS::backward" : + "KokkosSparse::GaussSeidel::BIGBLOCK_Team_PSGS::forward"; + const char* labelLong = doingBackward ? "KokkosSparse::GaussSeidel::Team_PSGS::backwardLongRows" : + "KokkosSparse::GaussSeidel::Team_PSGS::forwardLongRows"; + + if(!doingBackward && !apply_forward) + continue; + if(doingBackward && !apply_backward) + continue; + gs.is_backward = doingBackward; + + for (color_t colorIter = 0; colorIter < numColors; ++colorIter){ + //i is just the color set now being processed + color_t i = doingBackward ? (numColors - colorIter - 1) : colorIter; nnz_lno_t color_index_begin = h_color_xadj(i); nnz_lno_t color_index_end = h_color_xadj(i + 1); - nnz_lno_t numberOfTeams = color_index_end - color_index_begin;// /256 + 1; + nnz_lno_t numLongRows = haveLongRows ? 
long_rows_per_color(i) : 0; + nnz_lno_t numRegularRows = color_index_end - color_index_begin - numLongRows; + gs._color_set_begin = color_index_begin; - gs._color_set_end = color_index_end; - if (block_size == 1){ - Kokkos::parallel_for("KokkosSparse::GaussSeidel::Team_PSGS::backward", - team_policy_t(numberOfTeams / team_row_chunk_size + 1 , suggested_team_size, vector_size), - gs ); - } - else if ( gs.num_max_vals_in_l2 == 0){ - Kokkos::parallel_for("KokkosSparse::GaussSeidel::BLOCK_Team_PSGS::backward", - block_team_fill_policy_t(numberOfTeams / team_row_chunk_size + 1 , suggested_team_size, vector_size), - gs ); - } - else { - Kokkos::parallel_for("KokkosSparse::GaussSeidel::BIGBLOCK_Team_PSGS::backward", - bigblock_team_fill_policy_t(numberOfTeams / team_row_chunk_size + 1 , suggested_team_size, vector_size), - gs ); + gs._color_set_end = color_index_end - numLongRows; + + if (numRegularRows) { + if (block_size == 1){ + Kokkos::parallel_for(labelRegular, + team_policy_t((numRegularRows + team_row_chunk_size - 1) / team_row_chunk_size, suggested_team_size, vector_size), + gs ); + } else if (gs.num_max_vals_in_l2 == 0){ + Kokkos::parallel_for(labelBlock, + block_apply_team_policy_t((numRegularRows + team_row_chunk_size - 1) / team_row_chunk_size, suggested_team_size, vector_size), + gs ); + } + else { + Kokkos::parallel_for(labelBigBlock, + bigblock_apply_team_policy_t((numRegularRows + team_row_chunk_size - 1) / team_row_chunk_size, suggested_team_size, vector_size), + gs ); + } } - MyExecSpace().fence(); - if (i == 0){ - break; + if (numLongRows) { + gs._color_set_begin = color_index_end - numLongRows; + gs._color_set_end = color_index_end; + gs._long_row_x = long_row_x; + nnz_lno_t max_par = max_row_length_per_color(i); + nnz_lno_t teams_per_row = ((max_par + 3) / 4 + longRowTeamSize - 1) / longRowTeamSize; + gs._long_row_par = teams_per_row; + for(nnz_lno_t long_row_col = 0; long_row_col < gs._Xvector.extent_int(1); long_row_col++) { + auto Xcol = 
Kokkos::subview(gs._Xvector, Kokkos::ALL(), long_row_col); + auto Ycol = Kokkos::subview(gs._Yvector, Kokkos::ALL(), long_row_col); + gs._long_row_col = long_row_col; + Kokkos::deep_copy(long_row_x, nnz_scalar_t()); + Kokkos::parallel_for(labelLong, + longrow_apply_team_policy_t(numLongRows * teams_per_row, longRowTeamSize), gs); + Kokkos::parallel_for("KokkosSparse::GaussSeidel::LongRows::x_update", + range_pol(color_index_end - numLongRows, color_index_end), + LongRowUpdateFunctor + (Xcol, Ycol, long_row_x, gs._permuted_inverse_diagonal, gs.omega, color_index_end - numLongRows)); + } } } + } } } @@ -1598,34 +1783,60 @@ namespace KokkosSparse{ nnz_lno_persistent_work_host_view_t h_color_xadj, int num_iteration, bool apply_forward, - bool apply_backward){ - - for (int i = 0; i < num_iteration; ++i){ - this->DoPSGS(gs, numColors, h_color_xadj, apply_forward, apply_backward); + bool apply_backward) + { + auto gsHandle = this->get_gs_handle(); + nnz_lno_persistent_work_host_view_t long_rows_per_color; + nnz_lno_persistent_work_host_view_t max_row_length_per_color; + scalar_persistent_work_view_t long_row_x; + bool haveLongRows = false; + if(gsHandle->get_long_row_threshold() > 0) + { + long_rows_per_color = gsHandle->get_long_rows_per_color(); + max_row_length_per_color = gsHandle->get_max_row_length_per_color(); + long_row_x = gsHandle->get_long_row_x(); + gs._long_row_x = long_row_x; + haveLongRows = true; } - } - void DoPSGS(PSGS &gs, color_t numColors, nnz_lno_persistent_work_host_view_t h_color_xadj, - bool apply_forward, - bool apply_backward){ - if (apply_forward){ - for (color_t i = 0; i < numColors; ++i){ - nnz_lno_t color_index_begin = h_color_xadj(i); - nnz_lno_t color_index_end = h_color_xadj(i + 1); - Kokkos::parallel_for ("KokkosSparse::GaussSeidel::PSGS::forward", - my_exec_space (color_index_begin, color_index_end) , gs); - MyExecSpace().fence(); - } - } - if (apply_backward && numColors){ - for (size_type i = numColors - 1; ; --i){ - nnz_lno_t 
color_index_begin = h_color_xadj(i); - nnz_lno_t color_index_end = h_color_xadj(i + 1); - Kokkos::parallel_for ("KokkosSparse::GaussSeidel::PSGS::backward", - my_exec_space (color_index_begin, color_index_end), gs); - MyExecSpace().fence(); - if (i == 0){ - break; + for (int iter = 0; iter < num_iteration; ++iter) { + for (int doingBackward = 0; doingBackward < 2; doingBackward++) { + if(!doingBackward && !apply_forward) + continue; + if(doingBackward && !apply_backward) + continue; + + for (color_t colorIter = 0; colorIter < numColors; ++colorIter) { + //i is just the color set now being processed + color_t i = doingBackward ? (numColors - colorIter - 1) : colorIter; + const char* labelShort = doingBackward ? "KokkosSparse::GaussSeidel::PSGS::backward" : + "KokkosSparse::GaussSeidel::PSGS::forward"; + const char* labelLong = doingBackward ? "KokkosSparse::GaussSeidel::PSGS::backwardLongRows" : + "KokkosSparse::GaussSeidel::PSGS::forwardLongRows"; + nnz_lno_t color_index_begin = h_color_xadj(i); + nnz_lno_t color_index_end = h_color_xadj(i + 1); + nnz_lno_t numLongRows = haveLongRows ? 
long_rows_per_color(i) : 0; + nnz_lno_t numRegularRows = color_index_end - color_index_begin - numLongRows; + if(numRegularRows) { + Kokkos::parallel_for (labelShort, range_pol (color_index_begin, color_index_end - numLongRows) , gs); + } + if(numLongRows) { + gs._color_set_begin = color_index_end - numLongRows; + nnz_lno_t max_par = max_row_length_per_color(i); + nnz_lno_t par_per_row = (max_par + 1023) / 1024; + gs._long_row_par = par_per_row; + for(nnz_lno_t long_row_col = 0; long_row_col < gs._Xvector.extent_int(1); long_row_col++) { + auto Xcol = Kokkos::subview(gs._Xvector, Kokkos::ALL(), long_row_col); + auto Ycol = Kokkos::subview(gs._Yvector, Kokkos::ALL(), long_row_col); + gs._long_row_col = long_row_col; + Kokkos::deep_copy(long_row_x, nnz_scalar_t()); + Kokkos::parallel_for (labelLong, Kokkos::RangePolicy(0, numLongRows * par_per_row), gs); + Kokkos::parallel_for("KokkosSparse::GaussSeidel::LongRows::x_update", + range_pol(color_index_end - numLongRows, color_index_end), + LongRowUpdateFunctor + (Xcol, Ycol, long_row_x, gs._permuted_inverse_diagonal, gs.omega, color_index_end - numLongRows)); + } + } } } } diff --git a/packages/kokkos-kernels/test_common/KokkosKernels_TestUtils.hpp b/packages/kokkos-kernels/test_common/KokkosKernels_TestUtils.hpp index f9936cc4d439..fa3d7194abfd 100644 --- a/packages/kokkos-kernels/test_common/KokkosKernels_TestUtils.hpp +++ b/packages/kokkos-kernels/test_common/KokkosKernels_TestUtils.hpp @@ -47,6 +47,8 @@ #include "KokkosKernels_Utils.hpp" #include "Kokkos_ArithTraits.hpp" +#include "KokkosSparse_spmv.hpp" +#include "gtest/gtest.h" //for EXPECT_** namespace Test { template::value> @@ -214,46 +216,6 @@ namespace Test { } }; - template - void vanillaGEMV(char mode, - typename ViewTypeA::non_const_value_type alpha, const ViewTypeA& A, const ViewTypeX& x, - typename ViewTypeY::non_const_value_type beta, const ViewTypeY& y) - { - using ScalarY = typename ViewTypeY::non_const_value_type; - using KAT_A = Kokkos::ArithTraits; 
- using KAT_Y = Kokkos::ArithTraits; - int M = A.extent(0); - int N = A.extent(1); - if(beta == KAT_Y::zero()) - Kokkos::deep_copy(y, KAT_Y::zero()); - if(mode == 'N') { - for(int i = 0; i < M; i++) { - ScalarY y_i = beta * y(i); - for(int j = 0; j < N; j++) { - y_i += alpha * A(i,j) * x(j); - } - y(i) = y_i; - } - } else if(mode == 'T') { - for(int j = 0; j < N; j++) { - ScalarY y_j = beta * y(j); - for(int i = 0; i < M; i++) { - y_j += alpha * A(i,j) * x(i); - } - y(j) = y_j; - } - } else if(mode == 'C') { - for(int j = 0; j < N; j++) { - ScalarY y_j = beta * y(j); - for(int i = 0; i < M; i++) { - y_j += alpha * KAT_A::conj (A(i,j)) * x(i); - } - y(j) = y_j; - } - } - } - - template class epsilon { public: @@ -292,5 +254,103 @@ namespace Test { start = Kokkos::complex(-mag, -mag); end = Kokkos::complex(mag, mag); } + + template + crsMat_t symmetrize(crsMat_t A) + { + typedef typename crsMat_t::StaticCrsGraphType graph_t; + typedef typename crsMat_t::values_type::non_const_type scalar_view_t; + typedef typename graph_t::row_map_type::non_const_type lno_view_t; + typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t; + auto host_rowmap = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A.graph.row_map); + auto host_entries = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A.graph.entries); + auto host_values = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A.values); + lno_t numRows = A.numRows(); + //symmetrize as input_mat + input_mat^T, to still have a diagonally dominant matrix + typedef std::map Row; + std::vector symRows(numRows); + for(lno_t r = 0; r < numRows; r++) + { + auto& row = symRows[r]; + for(size_type i = host_rowmap(r); i < host_rowmap(r + 1); i++) + { + lno_t c = host_entries(i); + auto& col = symRows[c]; + auto it = row.find(c); + if(it == row.end()) + row[c] = host_values(i); + else + row[c] += host_values(i); + it = col.find(r); + if(it == col.end()) + col[r] = host_values(i); + else + col[r] += 
host_values(i); + } + } + //Count entries + Kokkos::View new_host_rowmap("Rowmap", numRows + 1); + size_t accum = 0; + for(lno_t r = 0; r <= numRows; r++) + { + new_host_rowmap(r) = accum; + if(r < numRows) + accum += symRows[r].size(); + } + //Allocate new entries/values + Kokkos::View new_host_entries("Entries", accum); + Kokkos::View new_host_values("Values", accum); + for(lno_t r = 0; r < numRows; r++) + { + auto rowIt = symRows[r].begin(); + for(size_type i = new_host_rowmap(r); i < new_host_rowmap(r + 1); i++) + { + new_host_entries(i) = rowIt->first; + new_host_values(i) = rowIt->second; + rowIt++; + } + } + lno_view_t new_rowmap("Rowmap", numRows + 1); + lno_nnz_view_t new_entries("Entries", accum); + scalar_view_t new_values("Values", accum); + Kokkos::deep_copy(new_rowmap, new_host_rowmap); + Kokkos::deep_copy(new_entries, new_host_entries); + Kokkos::deep_copy(new_values, new_host_values); + return crsMat_t("SymA", numRows, numRows, accum, new_values, new_rowmap, new_entries); + } + + //create_random_x_vector and create_random_y_vector can be used together to generate a random + //linear system Ax = y. 
+ template + vec_t create_random_x_vector(vec_t& kok_x, double max_value = 10.0) { + typedef typename vec_t::value_type scalar_t; + auto h_x = Kokkos::create_mirror_view (kok_x); + for (size_t j = 0; j < h_x.extent(1); ++j){ + for (size_t i = 0; i < h_x.extent(0); ++i){ + scalar_t r = + static_cast (rand()) / + static_cast (RAND_MAX / max_value); + h_x.access(i, j) = r; + } + } + Kokkos::deep_copy (kok_x, h_x); + return kok_x; + } + + template + vector_t create_random_y_vector(crsMat_t crsMat, vector_t x_vector){ + vector_t y_vector (Kokkos::ViewAllocateWithoutInitializing("Y VECTOR"), + crsMat.numRows()); + KokkosSparse::spmv("N", 1, crsMat, x_vector, 0, y_vector); + return y_vector; + } + + template + vector_t create_random_y_vector_mv(crsMat_t crsMat, vector_t x_vector){ + vector_t y_vector (Kokkos::ViewAllocateWithoutInitializing("Y VECTOR"), + crsMat.numRows(), x_vector.extent(1)); + KokkosSparse::spmv("N", 1, crsMat, x_vector, 0, y_vector); + return y_vector; + } } #endif diff --git a/packages/kokkos-kernels/unit_test/blas/Test_Blas2_gemv.hpp b/packages/kokkos-kernels/unit_test/blas/Test_Blas2_gemv.hpp index c9c01761244c..9ae63b5f8f17 100644 --- a/packages/kokkos-kernels/unit_test/blas/Test_Blas2_gemv.hpp +++ b/packages/kokkos-kernels/unit_test/blas/Test_Blas2_gemv.hpp @@ -12,7 +12,6 @@ namespace Test { typedef typename ViewTypeA::value_type ScalarA; typedef typename ViewTypeX::value_type ScalarX; typedef typename ViewTypeY::value_type ScalarY; - typedef Kokkos::ArithTraits KAT_Y; typedef multivector_layout_adapter vfA_type; typedef Kokkos::View::value ? 1e-3 : 5e-10); + ScalarX beta = 5; + double eps = (std::is_same::mag_type, float>::value ? 
1e-3 : 1e-10); int ldx; int ldy; @@ -43,6 +42,7 @@ namespace Test { BaseTypeY b_y("Y", ldy); BaseTypeY b_org_y("Org_Y", ldy); + ViewTypeA A = vfA_type::view(b_A); ViewTypeX x = Kokkos::subview(b_x,Kokkos::ALL(),0); ViewTypeY y = Kokkos::subview(b_y,Kokkos::ALL(),0); @@ -85,56 +85,56 @@ namespace Test { Kokkos::deep_copy(h_b_y,b_y); Kokkos::deep_copy(h_b_A,b_A); + typedef Kokkos::Details::ArithTraits KAT; Kokkos::View expected("expected aAx+by", ldy); - Kokkos::deep_copy(expected, h_org_y); - vanillaGEMV(mode[0], alpha, h_A, h_x, beta, expected); + if(mode[0] == 'N') { + for(int i = 0; i < M; i++) { + ScalarY y_i = beta * h_org_y(i); + for(int j = 0; j < N; j++) { + y_i += alpha * h_A(i,j) * h_x(j); + } + expected(i) = y_i; + } + } else if(mode[0] == 'T') { + for(int j = 0; j < N; j++) { + ScalarY y_j = beta * h_org_y(j); + for(int i = 0; i < M; i++) { + y_j += alpha * h_A(i,j) * h_x(i); + } + expected(j) = y_j; + } + } else if(mode[0] == 'C') { + for(int j = 0; j < N; j++) { + ScalarY y_j = beta * h_org_y(j); + for(int i = 0; i < M; i++) { + y_j += alpha * KAT::conj (h_A(i,j)) * h_x(i); + } + expected(j) = y_j; + } + } KokkosBlas::gemv(mode, alpha, A, x, beta, y); Kokkos::deep_copy(h_b_y, b_y); - int numErrors = 0; for(int i = 0; i < ldy; i++) { - if(KAT_Y::abs(expected(i) - h_y(i)) > KAT_Y::abs(eps * expected(i))) - numErrors++; + EXPECT_NEAR_KK(expected(i), h_y(i), eps * expected(i)); } - EXPECT_EQ(numErrors, 0) << "Nonconst input, " << M << 'x' << N << ", alpha = " << alpha << ", beta = " << beta << ", mode " << mode << ": gemv incorrect"; Kokkos::deep_copy(b_y, b_org_y); KokkosBlas::gemv(mode, alpha,A ,c_x, beta, y); Kokkos::deep_copy(h_b_y, b_y); - numErrors = 0; for(int i = 0; i < ldy; i++) { - if(KAT_Y::abs(expected(i) - h_y(i)) > KAT_Y::abs(eps * expected(i))) - numErrors++; + EXPECT_NEAR_KK(expected(i), h_y(i), eps); } - EXPECT_EQ(numErrors, 0) << "Const vector input, " << M << 'x' << N << ", alpha = " << alpha << ", beta = " << beta << ", mode " << mode 
<< ": gemv incorrect"; Kokkos::deep_copy(b_y, b_org_y); KokkosBlas::gemv(mode, alpha, c_A, c_x, beta, y); Kokkos::deep_copy(h_b_y, b_y); - numErrors = 0; - for(int i = 0; i < ldy; i++) - { - if(KAT_Y::abs(expected(i) - h_y(i)) > KAT_Y::abs(eps * expected(i))) - numErrors++; - } - EXPECT_EQ(numErrors, 0) << "Const matrix/vector input, " << M << 'x' << N << ", alpha = " << alpha << ", beta = " << beta << ", mode " << mode << ": gemv incorrect"; - //Test once with beta = 0, but with y initially filled with NaN. - //This should overwrite the NaNs with the correct result. - beta = KAT_Y::zero(); - //beta changed, so update the correct answer - vanillaGEMV(mode[0], alpha, h_A, h_x, beta, expected); - Kokkos::deep_copy(b_y, KAT_Y::nan()); - KokkosBlas::gemv(mode, alpha, A, x, beta, y); - Kokkos::deep_copy(h_b_y, b_y); - numErrors = 0; for(int i = 0; i < ldy; i++) { - if(KAT_Y::isNan(h_y(i)) || KAT_Y::abs(expected(i) - h_y(i)) > KAT_Y::abs(eps * expected(i))) - numErrors++; + EXPECT_NEAR_KK(expected(i), h_y(i), eps); } - EXPECT_EQ(numErrors, 0) << "beta = 0, input contains NaN, A is " << M << 'x' << N << ", mode " << mode << ": gemv incorrect"; } } @@ -156,12 +156,8 @@ int test_gemv(const char* mode) { Test::impl_test_gemv(mode,200,10); #endif Test::impl_test_gemv(mode,0,1024); - Test::impl_test_gemv(mode,1024,0); - Test::impl_test_gemv(mode,13,13); Test::impl_test_gemv(mode,13,1024); - Test::impl_test_gemv(mode,50,40); Test::impl_test_gemv(mode,1024,1024); - Test::impl_test_gemv(mode,4321,4321); //Test::impl_test_gemv(mode,132231,1024); #endif @@ -170,12 +166,8 @@ int test_gemv(const char* mode) { typedef Kokkos::View view_type_b_lr; typedef Kokkos::View view_type_c_lr; Test::impl_test_gemv(mode,0,1024); - Test::impl_test_gemv(mode,1024,0); - Test::impl_test_gemv(mode,13,13); Test::impl_test_gemv(mode,13,1024); - Test::impl_test_gemv(mode,50,40); Test::impl_test_gemv(mode,1024,1024); - Test::impl_test_gemv(mode,4321,4321); //Test::impl_test_gemv(mode,132231,1024); #endif 
@@ -184,12 +176,8 @@ int test_gemv(const char* mode) { typedef Kokkos::View view_type_b_ls; typedef Kokkos::View view_type_c_ls; Test::impl_test_gemv(mode,0,1024); - Test::impl_test_gemv(mode,1024,0); - Test::impl_test_gemv(mode,13,13); Test::impl_test_gemv(mode,13,1024); - Test::impl_test_gemv(mode,50,40); Test::impl_test_gemv(mode,1024,1024); - Test::impl_test_gemv(mode,4321,4321); //Test::impl_test_gemv(mode,132231,1024); #endif diff --git a/packages/kokkos-kernels/unit_test/blas/Test_Blas3_gemm.hpp b/packages/kokkos-kernels/unit_test/blas/Test_Blas3_gemm.hpp index 4ef8dea47781..580de25397e4 100644 --- a/packages/kokkos-kernels/unit_test/blas/Test_Blas3_gemm.hpp +++ b/packages/kokkos-kernels/unit_test/blas/Test_Blas3_gemm.hpp @@ -7,7 +7,7 @@ namespace Test { template - struct gemm_VanillaGEMM { + struct VanillaGEMM { bool A_t, B_t, A_c, B_c; int N,K; ViewTypeA A; @@ -114,9 +114,8 @@ namespace Test { // Kokkos::fill_random(C,rand_pool,ScalarC(10)); Kokkos::deep_copy(C2,C); - Kokkos::fence(); - - struct gemm_VanillaGEMM vgemm; + + struct VanillaGEMM vgemm; vgemm.A_t = A_t; vgemm.B_t = B_t; vgemm.A_c = A_c; vgemm.B_c = B_c; vgemm.N = N; vgemm.K = K; @@ -125,7 +124,7 @@ namespace Test { vgemm.alpha = alpha; vgemm.beta = beta; - Kokkos::parallel_for("KokkosBlas::Test::gemm_VanillaGEMM", Kokkos::TeamPolicy(M,Kokkos::AUTO,16), vgemm); + Kokkos::parallel_for("KokkosBlas::Test::VanillaGEMM", Kokkos::TeamPolicy(M,Kokkos::AUTO,16), vgemm); KokkosBlas::gemm(TA,TB,alpha,A,B,beta,C); @@ -152,49 +151,67 @@ namespace Test { } } -template -void test_gemm() -{ - typedef Kokkos::View view_type_a; - typedef Kokkos::View view_type_b; - typedef Kokkos::View view_type_c; - std::vector modes = {"N", "T"}; - if(std::is_same>::value || std::is_same>::value) - modes.push_back("C"); - Scalar alpha = 4.5; - std::vector betas = {0.0, 3.0}; - for(Scalar beta : betas) - { - for(auto amode : modes) - { - for(auto bmode : modes) - { - Test::impl_test_gemm(amode,bmode,0,0,0,alpha,beta); - //BMK: 
N = 1 exercises the special GEMV code path in GEMM (currently, only for modes N/N) - Test::impl_test_gemm(amode,bmode,50,1,40,alpha,beta); - Test::impl_test_gemm(amode,bmode,13,15,17,alpha,beta); - Test::impl_test_gemm(amode,bmode,179,15,211,alpha,beta); - Test::impl_test_gemm(amode,bmode,12,3071,517,alpha,beta); - } - } - } -} -template -void test_gemm_enabled_layouts() -{ + +template +int test_gemm(const char* mode, ScalarA alpha, ScalarB beta) { + #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) - test_gemm(); + typedef Kokkos::View view_type_a_ll; + typedef Kokkos::View view_type_b_ll; + typedef Kokkos::View view_type_c_ll; + Test::impl_test_gemm(&mode[0],&mode[1],0,0,0,alpha,beta); + Test::impl_test_gemm(&mode[0],&mode[1],13,15,17,alpha,beta); + Test::impl_test_gemm(&mode[0],&mode[1],179,15,211,alpha,beta); + Test::impl_test_gemm(&mode[0],&mode[1],12,3071,517,alpha,beta); + //Test::impl_test_gemm(&mode[0],&mode[1],1024,1024,2048,alpha,beta); #endif + #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) - test_gemm(); + typedef Kokkos::View view_type_a_lr; + typedef Kokkos::View view_type_b_lr; + typedef Kokkos::View view_type_c_lr; + Test::impl_test_gemm(&mode[0],&mode[1],0,0,0,alpha,beta); + Test::impl_test_gemm(&mode[0],&mode[1],13,15,17,alpha,beta); + Test::impl_test_gemm(&mode[0],&mode[1],179,15,211,alpha,beta); + Test::impl_test_gemm(&mode[0],&mode[1],12,3071,517,alpha,beta); + //Test::impl_test_gemm(&mode[0],&mode[1],1024,1024,2048,alpha,beta); +#endif +/* +#if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + typedef Kokkos::View view_type_a_ls; + typedef Kokkos::View view_type_b_ls; + typedef Kokkos::View view_type_c_ls; + Test::impl_test_gemv(mode,0,1024); + Test::impl_test_gemv(mode,13,1024); + 
Test::impl_test_gemv(mode,1024,1024); + Test::impl_test_gemv(mode,132231,1024); +#endif + +#if !defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) + Test::impl_test_gemv(mode,1024,1024); + Test::impl_test_gemv(mode,1024,1024); #endif +*/ + return 1; } #if defined(KOKKOSKERNELS_INST_FLOAT) || (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F( TestCategory, gemm_float ) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::gemm_float"); - test_gemm_enabled_layouts(); + float alpha = 5.0f; + float beta = 3.0f; + test_gemm ("NN",alpha,beta); + test_gemm ("TN",alpha,beta); + test_gemm ("NT",alpha,beta); + test_gemm ("TT",alpha,beta); + + alpha = 4.5f; + beta = 0.0f; + test_gemm ("NN",alpha,beta); + test_gemm ("TN",alpha,beta); + test_gemm ("NT",alpha,beta); + test_gemm ("TT",alpha,beta); Kokkos::Profiling::popRegion(); } #endif @@ -202,7 +219,19 @@ TEST_F( TestCategory, gemm_float ) { #if defined(KOKKOSKERNELS_INST_DOUBLE) || (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F( TestCategory, gemm_double ) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::gemm_double"); - test_gemm_enabled_layouts(); + double alpha = 5.0; + double beta = 3.0; + test_gemm ("NN",alpha,beta); + test_gemm ("TN",alpha,beta); + test_gemm ("NT",alpha,beta); + test_gemm ("TT",alpha,beta); + + alpha = 4.5; + beta = 0.0; + test_gemm ("NN",alpha,beta); + test_gemm ("TN",alpha,beta); + test_gemm ("NT",alpha,beta); + test_gemm ("TT",alpha,beta); Kokkos::Profiling::popRegion(); } #endif @@ -210,7 +239,19 @@ TEST_F( TestCategory, gemm_double ) { #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) || (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F( TestCategory, gemm_complex_double ) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::gemm_complex_double"); - test_gemm_enabled_layouts>(); + Kokkos::complex alpha = 5.0; + Kokkos::complex beta = 3.0; + 
test_gemm,Kokkos::complex,Kokkos::complex,TestExecSpace> ("NN",alpha,beta); + test_gemm,Kokkos::complex,Kokkos::complex,TestExecSpace> ("CN",alpha,beta); + test_gemm,Kokkos::complex,Kokkos::complex,TestExecSpace> ("NC",alpha,beta); + test_gemm,Kokkos::complex,Kokkos::complex,TestExecSpace> ("CC",alpha,beta); + + alpha = Kokkos::complex(4.5,0.0); + beta = 0.0; + test_gemm,Kokkos::complex,Kokkos::complex,TestExecSpace> ("NN",alpha,beta); + test_gemm,Kokkos::complex,Kokkos::complex,TestExecSpace> ("CN",alpha,beta); + test_gemm,Kokkos::complex,Kokkos::complex,TestExecSpace> ("NC",alpha,beta); + test_gemm,Kokkos::complex,Kokkos::complex,TestExecSpace> ("CC",alpha,beta); Kokkos::Profiling::popRegion(); } #endif @@ -218,8 +259,33 @@ TEST_F( TestCategory, gemm_complex_double ) { #if defined(KOKKOSKERNELS_INST_COMPLEX_FLOAT) || (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F( TestCategory, gemm_complex_float ) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::gemm_complex_float"); - test_gemm_enabled_layouts>(); + Kokkos::complex alpha = 5.0f; + Kokkos::complex beta = 3.0f; + test_gemm,Kokkos::complex,Kokkos::complex,TestExecSpace> ("NN",alpha,beta); + test_gemm,Kokkos::complex,Kokkos::complex,TestExecSpace> ("CN",alpha,beta); + test_gemm,Kokkos::complex,Kokkos::complex,TestExecSpace> ("NC",alpha,beta); + test_gemm,Kokkos::complex,Kokkos::complex,TestExecSpace> ("CC",alpha,beta); + + alpha = Kokkos::complex(4.5f,0.0f); + beta = 0.0; + test_gemm,Kokkos::complex,Kokkos::complex,TestExecSpace> ("NN",alpha,beta); + test_gemm,Kokkos::complex,Kokkos::complex,TestExecSpace> ("CN",alpha,beta); + test_gemm,Kokkos::complex,Kokkos::complex,TestExecSpace> ("NC",alpha,beta); + test_gemm,Kokkos::complex,Kokkos::complex,TestExecSpace> ("CC",alpha,beta); Kokkos::Profiling::popRegion(); } #endif +/* +#if defined(KOKKOSKERNELS_INST_INT) || (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +TEST_F( TestCategory, 
gemm_int ) { + test_gemm ("N"); +} +#endif + +#if !defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) +TEST_F( TestCategory, gemm_double_int ) { + test_gemm ("N"); +} +#endif +*/ diff --git a/packages/kokkos-kernels/unit_test/sparse/Test_Sparse_CrsMatrix.hpp b/packages/kokkos-kernels/unit_test/sparse/Test_Sparse_CrsMatrix.hpp index 85b427d445d6..6caa9d96a150 100644 --- a/packages/kokkos-kernels/unit_test/sparse/Test_Sparse_CrsMatrix.hpp +++ b/packages/kokkos-kernels/unit_test/sparse/Test_Sparse_CrsMatrix.hpp @@ -47,11 +47,15 @@ #include #include #include "KokkosSparse_CrsMatrix.hpp" +#include "Kokkos_ArithTraits.hpp" -#ifndef kokkos_complex_double -#define kokkos_complex_double Kokkos::complex -#define kokkos_complex_float Kokkos::complex -#endif +// #ifndef kokkos_complex_double +// #define kokkos_complex_double Kokkos::complex +// #define kokkos_complex_float Kokkos::complex +// #endif + +typedef Kokkos::complex kokkos_complex_double; +typedef Kokkos::complex kokkos_complex_float; namespace Test{ // anonymous @@ -189,6 +193,40 @@ testCrsMatrix () //printf ("A is %d by %d\n", A.numRows (), A.numCols ()); } +template +void +testCrsMatrixRawConstructor() +{ + int nrows = 5; + //note: last 2 columns will be empty. + //This makes sure the ncols provided to constructor is preserved. + int ncols = 7; + int nnz = 9; + //NOTE: this is not a mistake, the raw ptr constructor takes rowmap as ordinal. 
+ std::vector rowmap = {0, 0, 2, 5, 6, 9}; + std::vector entries = {3, 4, 0, 1, 2, 2, 0, 3, 4}; + std::vector values; + for(int i = 0; i < nnz; i++) + values.push_back(Kokkos::ArithTraits::one() * (1.0 * rand() / RAND_MAX)); + KokkosSparse::CrsMatrix A( + "A", nrows, ncols, nnz, values.data(), rowmap.data(), entries.data()); + EXPECT_EQ(A.numRows(), nrows); + EXPECT_EQ(A.numCols(), ncols); + EXPECT_EQ(A.nnz(), nnz); + //verify rowmap, entries, values: should all be identical to original raw arrays + //(except the rowmap elements are now size_type) + auto checkRowmap = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A.graph.row_map); + auto checkEntries = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A.graph.entries); + auto checkValues = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A.values); + for(int i = 0; i < nrows + 1; i++) + EXPECT_EQ(checkRowmap(i), (size_type) rowmap[i]); + for(int i = 0; i < nnz; i++) + { + EXPECT_EQ(checkEntries(i), entries[i]); + EXPECT_EQ(checkValues(i), values[i]); + } +} + template void testCrsMatrixHostMirror () @@ -226,6 +264,7 @@ testCrsMatrixHostMirror () #define EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ TEST_F( TestCategory, sparse ## _ ## crsmatrix ## _ ## SCALAR ## _ ## ORDINAL ## _ ## OFFSET ## _ ## DEVICE ) { \ testCrsMatrix (); \ + testCrsMatrixRawConstructor (); \ } \ TEST_F( TestCategory, sparse ## _ ## crsmatrix_host_mirror ## _ ## SCALAR ## _ ## ORDINAL ## _ ## OFFSET ## _ ## DEVICE ) { \ testCrsMatrixHostMirror (); \ @@ -329,4 +368,4 @@ TEST_F( TestCategory, sparse ## _ ## crsmatrix_host_mirror ## _ ## SCALAR ## _ # EXECUTE_TEST(kokkos_complex_float, int64_t, size_t, TestExecSpace) #endif - +#undef EXECUTE_TEST diff --git a/packages/kokkos-kernels/unit_test/sparse/Test_Sparse_block_gauss_seidel.hpp b/packages/kokkos-kernels/unit_test/sparse/Test_Sparse_block_gauss_seidel.hpp index 421e1a08889e..14187d3243bd 100644 --- 
a/packages/kokkos-kernels/unit_test/sparse/Test_Sparse_block_gauss_seidel.hpp +++ b/packages/kokkos-kernels/unit_test/sparse/Test_Sparse_block_gauss_seidel.hpp @@ -46,6 +46,7 @@ #include #include +#include "KokkosKernels_TestUtils.hpp" #include "KokkosKernels_Handle.hpp" #include "KokkosKernels_IOUtils.hpp" //#include @@ -58,10 +59,13 @@ #include #include "KokkosSparse_gauss_seidel.hpp" -#ifndef kokkos_complex_double -#define kokkos_complex_double Kokkos::complex -#define kokkos_complex_float Kokkos::complex -#endif +// #ifndef kokkos_complex_double +// #define kokkos_complex_double Kokkos::complex +// #define kokkos_complex_float Kokkos::complex +// #endif + +typedef Kokkos::complex kokkos_complex_double; +typedef Kokkos::complex kokkos_complex_float; using namespace KokkosKernels; using namespace KokkosKernels::Experimental; @@ -139,37 +143,6 @@ int run_block_gauss_seidel_1( return 0; } -template -vec_t create_x_vector(vec_t& kok_x, double max_value = 10.0) { - typedef typename vec_t::value_type scalar_t; - auto h_x = Kokkos::create_mirror_view (kok_x); - for (size_t j = 0; j < h_x.extent(1); ++j){ - for (size_t i = 0; i < h_x.extent(0); ++i){ - scalar_t r = - static_cast (rand()) / - static_cast (RAND_MAX / max_value); - h_x.access(i, j) = r; - } - } - Kokkos::deep_copy (kok_x, h_x); - return kok_x; -} - -template -vector_t create_y_vector(crsMat_t crsMat, vector_t x_vector){ - vector_t y_vector (Kokkos::ViewAllocateWithoutInitializing("Y VECTOR"), - crsMat.numRows()); - KokkosSparse::spmv("N", 1, crsMat, x_vector, 0, y_vector); - return y_vector; -} - -template -vector_t create_y_vector_mv(crsMat_t crsMat, vector_t x_vector){ - vector_t y_vector (Kokkos::ViewAllocateWithoutInitializing("Y VECTOR"), - crsMat.numRows(), x_vector.extent(1)); - KokkosSparse::spmv("N", 1, crsMat, x_vector, 0, y_vector); - return y_vector; -} } template @@ -226,10 +199,10 @@ void test_block_gauss_seidel_rank1(lno_t numRows, size_type nnz, lno_t bandwidth lno_t nv = 
((crsmat2.numRows() + block_size - 1) / block_size) * block_size; const scalar_view_t solution_x(Kokkos::ViewAllocateWithoutInitializing("X"), nv); - //create_x_vector operates on host mirror, then copies to device. But create_y does everything on device. - create_x_vector(solution_x); + //create_random_x_vector operates on host mirror, then copies to device. But create_y does everything on device. + create_random_x_vector(solution_x); exec_space().fence(); - scalar_view_t y_vector = create_y_vector(crsmat2, solution_x); + scalar_view_t y_vector = create_random_y_vector(crsmat2, solution_x); mag_t initial_norm_res = KokkosBlas::nrm2(solution_x); #ifdef gauss_seidel_testmore GSAlgorithm gs_algorithms[] ={GS_DEFAULT, GS_TEAM, GS_PERMUTED}; @@ -252,7 +225,7 @@ void test_block_gauss_seidel_rank1(lno_t numRows, size_type nnz, lno_t bandwidth bool is_symmetric_graph = true; size_t shmem_size = 32128; - + for(int i = 0; i < 2; ++i) { if (i == 1) shmem_size = 2008; //make the shmem small on gpus so that it will test 2 level algorithm. @@ -335,8 +308,8 @@ void test_block_gauss_seidel_rank2(lno_t numRows, size_type nnz, lno_t bandwidth constexpr lno_t numVecs = 2; scalar_view2d_t solution_x(Kokkos::ViewAllocateWithoutInitializing("X"), nv, numVecs); - create_x_vector(solution_x); - scalar_view2d_t y_vector = create_y_vector_mv(crsmat2, solution_x); + create_random_x_vector(solution_x); + scalar_view2d_t y_vector = create_random_y_vector_mv(crsmat2, solution_x); exec_space().fence(); auto solution_host = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), solution_x); //Need to fence before reading from solution_host @@ -375,7 +348,7 @@ void test_block_gauss_seidel_rank2(lno_t numRows, size_type nnz, lno_t bandwidth scalar_view_t res_norms("Residuals", numVecs); auto h_res_norms = Kokkos::create_mirror_view(res_norms); - + for(int i = 0; i < 2; ++i) { if (i == 1) shmem_size = 2008; //make the shmem small on gpus so that it will test 2 level algorithm. 
@@ -518,6 +491,6 @@ TEST_F( TestCategory, sparse ## _ ## block_gauss_seidel_rank2 ## _ ## SCALAR ## EXECUTE_TEST(kokkos_complex_float, int64_t, size_t, TestExecSpace) #endif - +#undef EXECUTE_TEST diff --git a/packages/kokkos-kernels/unit_test/sparse/Test_Sparse_gauss_seidel.hpp b/packages/kokkos-kernels/unit_test/sparse/Test_Sparse_gauss_seidel.hpp index cbdb673bb168..83823dc14153 100644 --- a/packages/kokkos-kernels/unit_test/sparse/Test_Sparse_gauss_seidel.hpp +++ b/packages/kokkos-kernels/unit_test/sparse/Test_Sparse_gauss_seidel.hpp @@ -60,20 +60,68 @@ #include "KokkosSparse_gauss_seidel.hpp" #include "KokkosSparse_partitioning_impl.hpp" #include "KokkosSparse_sor_sequential_impl.hpp" +#include "KokkosKernels_Sorting.hpp" +#include "KokkosKernels_TestUtils.hpp" -#ifndef kokkos_complex_double -#define kokkos_complex_double Kokkos::complex -#define kokkos_complex_float Kokkos::complex -#endif +// #ifndef kokkos_complex_double +// #define kokkos_complex_double Kokkos::complex +// #define kokkos_complex_float Kokkos::complex +// #endif + +typedef Kokkos::complex kokkos_complex_double; +typedef Kokkos::complex kokkos_complex_float; using namespace KokkosKernels; using namespace KokkosKernels::Experimental; using namespace KokkosSparse; using namespace KokkosSparse::Experimental; + + namespace Test { -template -int run_gauss_seidel( +//Run GS on the given vectors, where the handle is already set up. +template +void run_gauss_seidel( + Handle& kh, + crsMat_t input_mat, + vec_t x_vector, + vec_t y_vector, + bool is_symmetric_graph, + typename crsMat_t::value_type omega, + int apply_type = 0 // 0 for symmetric, 1 for forward, 2 for backward. 
+ ) +{ + const size_t num_rows = input_mat.numRows(); + const size_t num_cols = input_mat.numCols(); + const int apply_count = 2; + + gauss_seidel_symbolic + (&kh, num_rows, num_cols, input_mat.graph.row_map, input_mat.graph.entries, is_symmetric_graph); + gauss_seidel_numeric + (&kh, num_rows, num_cols, input_mat.graph.row_map, input_mat.graph.entries, input_mat.values, is_symmetric_graph); + + switch (apply_type){ + case 0: + symmetric_gauss_seidel_apply + (&kh, num_rows, num_cols, input_mat.graph.row_map, input_mat.graph.entries, input_mat.values, x_vector, y_vector, false, true, omega, apply_count); + break; + case 1: + forward_sweep_gauss_seidel_apply + (&kh, num_rows, num_cols, input_mat.graph.row_map, input_mat.graph.entries, input_mat.values, x_vector, y_vector, false, true, omega, apply_count); + break; + case 2: + backward_sweep_gauss_seidel_apply + (&kh, num_rows, num_cols, input_mat.graph.row_map, input_mat.graph.entries, input_mat.values, x_vector, y_vector, false, true, omega, apply_count); + break; + default: + symmetric_gauss_seidel_apply + (&kh, num_rows, num_cols, input_mat.graph.row_map, input_mat.graph.entries, input_mat.values, x_vector, y_vector, false, true, omega, apply_count); + break; + } +} + +template +void run_gauss_seidel( crsMat_t input_mat, GSAlgorithm gs_algorithm, vec_t x_vector, @@ -82,16 +130,12 @@ int run_gauss_seidel( int apply_type = 0, // 0 for symmetric, 1 for forward, 2 for backward. 
int cluster_size = 1, bool classic = false, // only with two-stage, true for sptrsv instead of richardson - ClusteringAlgorithm clusterAlgo = CLUSTER_DEFAULT) + ClusteringAlgorithm clusterAlgo = CLUSTER_DEFAULT) { - typedef typename crsMat_t::StaticCrsGraphType graph_t; - typedef typename graph_t::row_map_type lno_view_t; - typedef typename graph_t::entries_type lno_nnz_view_t; - typedef typename crsMat_t::values_type::non_const_type scalar_view_t; - - typedef typename lno_view_t::value_type size_type; - typedef typename lno_nnz_view_t::value_type lno_t; - typedef typename scalar_view_t::value_type scalar_t; + using size_type = typename crsMat_t::size_type; + using lno_t = typename crsMat_t::ordinal_type; + using scalar_t = typename crsMat_t::value_type; + using device = typename crsMat_t::device_type; typedef KokkosKernelsHandle ::one (); - omega = one; + omega = Kokkos::ArithTraits::one (); } } else kh.create_gs_handle(GS_DEFAULT); - const size_t num_rows_1 = input_mat.numRows(); - const size_t num_cols_1 = input_mat.numCols(); - //const int apply_count = 100; - const int apply_count = 1; - - gauss_seidel_symbolic - (&kh, num_rows_1, num_cols_1, input_mat.graph.row_map, input_mat.graph.entries, is_symmetric_graph); - gauss_seidel_numeric - (&kh, num_rows_1, num_cols_1, input_mat.graph.row_map, input_mat.graph.entries, input_mat.values, is_symmetric_graph); + run_gauss_seidel(kh, input_mat, x_vector, y_vector, is_symmetric_graph, omega, apply_type); - switch (apply_type){ - case 0: - symmetric_gauss_seidel_apply - (&kh, num_rows_1, num_cols_1, input_mat.graph.row_map, input_mat.graph.entries, input_mat.values, x_vector, y_vector, false, true, omega, apply_count); - break; - case 1: - forward_sweep_gauss_seidel_apply - (&kh, num_rows_1, num_cols_1, input_mat.graph.row_map, input_mat.graph.entries, input_mat.values, x_vector, y_vector, false, true, omega, apply_count); - break; - case 2: - backward_sweep_gauss_seidel_apply - (&kh, num_rows_1, num_cols_1, 
input_mat.graph.row_map, input_mat.graph.entries, input_mat.values, x_vector, y_vector, false, true, omega, apply_count); - break; - default: - symmetric_gauss_seidel_apply - (&kh, num_rows_1, num_cols_1, input_mat.graph.row_map, input_mat.graph.entries, input_mat.values, x_vector, y_vector, false, true, omega, apply_count); - break; - } kh.destroy_gs_handle(); - return 0; -} - -template -vec_t create_x_vector(vec_t& kok_x, double max_value = 10.0) { - typedef typename vec_t::value_type scalar_t; - auto h_x = Kokkos::create_mirror_view (kok_x); - for (size_t j = 0; j < h_x.extent(1); ++j){ - for (size_t i = 0; i < h_x.extent(0); ++i){ - scalar_t r = - static_cast (rand()) / - static_cast (RAND_MAX / max_value); - h_x.access(i, j) = r; - } - } - Kokkos::deep_copy (kok_x, h_x); - return kok_x; } -template -vector_t create_y_vector(crsMat_t crsMat, vector_t x_vector){ - vector_t y_vector (Kokkos::ViewAllocateWithoutInitializing("Y VECTOR"), - crsMat.numRows()); - KokkosSparse::spmv("N", 1, crsMat, x_vector, 0, y_vector); - return y_vector; -} - -template -vector_t create_y_vector_mv(crsMat_t crsMat, vector_t x_vector){ - vector_t y_vector (Kokkos::ViewAllocateWithoutInitializing("Y VECTOR"), - crsMat.numRows(), x_vector.extent(1)); - KokkosSparse::spmv("N", 1, crsMat, x_vector, 0, y_vector); - return y_vector; -} -} - -template -crsMat_t symmetrize(crsMat_t A) -{ - typedef typename crsMat_t::StaticCrsGraphType graph_t; - typedef typename crsMat_t::values_type::non_const_type scalar_view_t; - typedef typename graph_t::row_map_type::non_const_type lno_view_t; - typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t; - auto host_rowmap = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A.graph.row_map); - auto host_entries = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A.graph.entries); - auto host_values = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A.values); - lno_t numRows = A.numRows(); - //symmetrize as input_mat + 
input_mat^T, to still have a diagonally dominant matrix - typedef std::map Row; - std::vector symRows(numRows); - for(lno_t r = 0; r < numRows; r++) - { - auto& row = symRows[r]; - for(size_type i = host_rowmap(r); i < host_rowmap(r + 1); i++) - { - lno_t c = host_entries(i); - auto& col = symRows[c]; - auto it = row.find(c); - if(it == row.end()) - row[c] = host_values(i); - else - row[c] += host_values(i); - it = col.find(r); - if(it == col.end()) - col[r] = host_values(i); - else - col[r] += host_values(i); - } - } - //Count entries - Kokkos::View new_host_rowmap("Rowmap", numRows + 1); - size_t accum = 0; - for(lno_t r = 0; r <= numRows; r++) - { - new_host_rowmap(r) = accum; - if(r < numRows) - accum += symRows[r].size(); - } - //Allocate new entries/values - Kokkos::View new_host_entries("Entries", accum); - Kokkos::View new_host_values("Values", accum); - for(lno_t r = 0; r < numRows; r++) - { - auto rowIt = symRows[r].begin(); - for(size_type i = new_host_rowmap(r); i < new_host_rowmap(r + 1); i++) - { - new_host_entries(i) = rowIt->first; - new_host_values(i) = rowIt->second; - rowIt++; - } - } - lno_view_t new_rowmap("Rowmap", numRows + 1); - lno_nnz_view_t new_entries("Entries", accum); - scalar_view_t new_values("Values", accum); - Kokkos::deep_copy(new_rowmap, new_host_rowmap); - Kokkos::deep_copy(new_entries, new_host_entries); - Kokkos::deep_copy(new_values, new_host_values); - return crsMat_t("SymA", numRows, numRows, accum, new_values, new_rowmap, new_entries); -} +} // namespace Test template void test_gauss_seidel_rank1(lno_t numRows, size_type nnz, lno_t bandwidth, lno_t row_size_variance, bool symmetric) @@ -259,13 +178,13 @@ void test_gauss_seidel_rank1(lno_t numRows, size_type nnz, lno_t bandwidth, lno_ if(symmetric) { //Symmetrize on host, rather than relying on the parallel versions (those can be tested for symmetric=false) - input_mat = symmetrize(input_mat); + input_mat = Test::symmetrize(input_mat); } lno_t nv = input_mat.numRows(); 
scalar_view_t solution_x(Kokkos::ViewAllocateWithoutInitializing("X (correct)"), nv); - create_x_vector(solution_x); + create_random_x_vector(solution_x); mag_t initial_norm_res = KokkosBlas::nrm2(solution_x); - scalar_view_t y_vector = create_y_vector(input_mat, solution_x); + scalar_view_t y_vector = create_random_y_vector(input_mat, solution_x); //GS_DEFAULT is GS_TEAM on CUDA and GS_PERMUTED on other spaces, and the behavior //of each algorithm _should be_ the same on every execution space, which is why //we just test GS_DEFAULT. @@ -278,7 +197,7 @@ void test_gauss_seidel_rank1(lno_t numRows, size_type nnz, lno_t bandwidth, lno_ { Kokkos::Impl::Timer timer1; Kokkos::deep_copy(x_vector, zero); - run_gauss_seidel(input_mat, GS_DEFAULT, x_vector, y_vector, symmetric, apply_type); + run_gauss_seidel(input_mat, GS_DEFAULT, x_vector, y_vector, symmetric, apply_type); //double gs = timer1.seconds(); //KokkosKernels::Impl::print_1Dview(x_vector); KokkosBlas::axpby(one, solution_x, -one, x_vector); @@ -297,7 +216,7 @@ void test_gauss_seidel_rank1(lno_t numRows, size_type nnz, lno_t bandwidth, lno_ Kokkos::Impl::Timer timer1; //Zero out X before solving Kokkos::deep_copy(x_vector, zero); - run_gauss_seidel( + run_gauss_seidel( input_mat, GS_CLUSTER, x_vector, y_vector, symmetric, apply_type, clusterSizes[csize], false, clusterAlgo); KokkosBlas::axpby(one, solution_x, -one, x_vector); mag_t result_norm_res = KokkosBlas::nrm2(x_vector); @@ -309,8 +228,7 @@ void test_gauss_seidel_rank1(lno_t numRows, size_type nnz, lno_t bandwidth, lno_ for (int apply_type = 0; apply_type < apply_count; ++apply_type) { Kokkos::deep_copy(x_vector, zero); - run_gauss_seidel - (input_mat, GS_TWOSTAGE, x_vector, y_vector, symmetric, apply_type); + run_gauss_seidel(input_mat, GS_TWOSTAGE, x_vector, y_vector, symmetric, apply_type); KokkosBlas::axpby(one, solution_x, -one, x_vector); mag_t result_norm_res = KokkosBlas::nrm2(x_vector); EXPECT_LT(result_norm_res, initial_norm_res); @@ -319,8 +237,7 
@@ void test_gauss_seidel_rank1(lno_t numRows, size_type nnz, lno_t bandwidth, lno_ for (int apply_type = 0; apply_type < apply_count; ++apply_type) { Kokkos::deep_copy(x_vector, zero); - run_gauss_seidel - (input_mat, GS_TWOSTAGE, x_vector, y_vector, symmetric, apply_type, 0, true); + run_gauss_seidel(input_mat, GS_TWOSTAGE, x_vector, y_vector, symmetric, apply_type, 0, true); KokkosBlas::axpby(one, solution_x, -one, x_vector); mag_t result_norm_res = KokkosBlas::nrm2(x_vector); EXPECT_LT(result_norm_res, initial_norm_res); @@ -342,14 +259,14 @@ void test_gauss_seidel_rank2(lno_t numRows, size_type nnz, lno_t bandwidth, lno_ if(symmetric) { //Symmetrize on host, rather than relying on the parallel versions (those can be tested for symmetric=false) - input_mat = symmetrize(input_mat); + input_mat = Test::symmetrize(input_mat); } lno_t nv = input_mat.numRows(); host_scalar_view2d_t solution_x(Kokkos::ViewAllocateWithoutInitializing("X (correct)"), nv, numVecs); - create_x_vector(solution_x); + create_random_x_vector(solution_x); scalar_view2d_t x_vector(Kokkos::ViewAllocateWithoutInitializing("X"), nv, numVecs); Kokkos::deep_copy(x_vector, solution_x); - scalar_view2d_t y_vector = create_y_vector_mv(input_mat, x_vector); + scalar_view2d_t y_vector = create_random_y_vector_mv(input_mat, x_vector); auto x_host = Kokkos::create_mirror_view(x_vector); std::vector initial_norms(numVecs); for(lno_t i = 0; i < numVecs; i++) @@ -370,8 +287,7 @@ void test_gauss_seidel_rank2(lno_t numRows, size_type nnz, lno_t bandwidth, lno_ Kokkos::Impl::Timer timer1; //Zero out X before solving Kokkos::deep_copy(x_vector, zero); - run_gauss_seidel( - input_mat, GS_DEFAULT, x_vector, y_vector, symmetric, apply_type); + run_gauss_seidel(input_mat, GS_DEFAULT, x_vector, y_vector, symmetric, apply_type); Kokkos::deep_copy(x_host, x_vector); for(lno_t i = 0; i < numVecs; i++) { @@ -397,7 +313,7 @@ void test_gauss_seidel_rank2(lno_t numRows, size_type nnz, lno_t bandwidth, lno_ 
Kokkos::Impl::Timer timer1; //Zero out X before solving Kokkos::deep_copy(x_vector, zero); - run_gauss_seidel( + run_gauss_seidel( input_mat, GS_CLUSTER, x_vector, y_vector, symmetric, apply_type, clusterSizes[csize], (ClusteringAlgorithm) algo); Kokkos::deep_copy(x_host, x_vector); for(lno_t i = 0; i < numVecs; i++) @@ -420,8 +336,7 @@ void test_gauss_seidel_rank2(lno_t numRows, size_type nnz, lno_t bandwidth, lno_ { //Zero out X before solving Kokkos::deep_copy(x_vector, zero); - run_gauss_seidel - (input_mat, GS_TWOSTAGE, x_vector, y_vector, symmetric, apply_type); + run_gauss_seidel(input_mat, GS_TWOSTAGE, x_vector, y_vector, symmetric, apply_type); Kokkos::deep_copy(x_host, x_vector); for(lno_t i = 0; i < numVecs; i++) { @@ -441,8 +356,7 @@ void test_gauss_seidel_rank2(lno_t numRows, size_type nnz, lno_t bandwidth, lno_ { //Zero out X before solving Kokkos::deep_copy(x_vector, zero); - run_gauss_seidel - (input_mat, GS_TWOSTAGE, x_vector, y_vector, symmetric, apply_type, 0, true); + run_gauss_seidel(input_mat, GS_TWOSTAGE, x_vector, y_vector, symmetric, apply_type, 0, true); Kokkos::deep_copy(x_host, x_vector); for(lno_t i = 0; i < numVecs; i++) { @@ -484,7 +398,7 @@ void test_sequential_sor(lno_t numRows, size_type nnz, lno_t bandwidth, lno_t ro //record the correct solution, to compare against at the end vector_t xgold("X gold", numRows); Kokkos::deep_copy(xgold, x); - vector_t y = Test::create_y_vector(input_mat, x); + vector_t y = Test::create_random_y_vector(input_mat, x); exec_space().fence(); auto y_host = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), y); //initial solution is zero @@ -610,6 +524,96 @@ void test_sgs_zero_rows() } } +template +void test_gauss_seidel_long_rows(lno_t numRows, lno_t numLongRows, lno_t nnzPerShortRow, bool symmetric) +{ + using namespace Test; + typedef typename KokkosSparse::CrsMatrix crsMat_t; + typedef typename crsMat_t::values_type::non_const_type scalar_view_t; + typedef typename 
crsMat_t::index_type::non_const_type entries_view_t; + typedef typename crsMat_t::row_map_type::non_const_type rowmap_view_t; + typedef typename Kokkos::Details::ArithTraits::mag_type mag_t; + const scalar_t one = Kokkos::ArithTraits::one(); + srand(245); + std::vector rowmap = {0}; + std::vector entries; + std::vector values; + std::vector rowLengths; + for(lno_t i = 0; i < numRows; i++) + { + if(i < numLongRows) + rowLengths.push_back(numRows); + else + rowLengths.push_back(nnzPerShortRow); + } + std::random_shuffle(rowLengths.begin(), rowLengths.end()); + size_type totalEntries = 0; + int randSteps = 1000000; + scalar_t offDiagBase; + { + scalar_t unused; + Test::getRandomBounds(0.6, unused, offDiagBase); + } + for(lno_t i = 0; i < numRows; i++) + { + for(lno_t ent = 0; ent < rowLengths[i]; ent++) + { + if(ent == 0) + { + entries.push_back(i); + values.push_back(2.5 * one); + } + else + { + entries.push_back(rand() % numRows); + values.push_back((-0.3 + (0.6 * (rand() % randSteps) / randSteps)) * offDiagBase); + } + } + totalEntries += rowLengths[i]; + rowmap.push_back(totalEntries); + } + scalar_view_t valuesView(Kokkos::ViewAllocateWithoutInitializing("Values"), totalEntries); + entries_view_t entriesView(Kokkos::ViewAllocateWithoutInitializing("Entries"), totalEntries); + rowmap_view_t rowmapView(Kokkos::ViewAllocateWithoutInitializing("Rowmap"), numRows + 1); + Kokkos::deep_copy(valuesView, Kokkos::View(values.data(), totalEntries)); + Kokkos::deep_copy(entriesView, Kokkos::View(entries.data(), totalEntries)); + Kokkos::deep_copy(rowmapView, Kokkos::View(rowmap.data(), numRows + 1)); + crsMat_t input_mat("A", numRows, numRows, totalEntries, valuesView, rowmapView, entriesView); + input_mat = KokkosKernels::Impl::sort_and_merge_matrix(input_mat); + if(symmetric) + { + //Symmetrize on host, rather than relying on the parallel versions (those can be tested for symmetric=false) + input_mat = Test::symmetrize(input_mat); + } + lno_t nv = input_mat.numRows(); + 
scalar_view_t solution_x(Kokkos::ViewAllocateWithoutInitializing("X (correct)"), nv); + create_random_x_vector(solution_x); + mag_t initial_norm_res = KokkosBlas::nrm2(solution_x); + scalar_view_t y_vector = create_random_y_vector(input_mat, solution_x); + //GS_DEFAULT is GS_TEAM on CUDA and GS_PERMUTED on other spaces, and the behavior + //of each algorithm _should be_ the same on every execution space, which is why + //we just test GS_DEFAULT. + int apply_count = 1; //only apply_type 0 is run here, not all of symmetric, forward, backward + scalar_view_t x_vector(Kokkos::ViewAllocateWithoutInitializing("x vector"), nv); + for (int apply_type = 0; apply_type < apply_count; ++apply_type) + { + typedef KokkosKernelsHandle + KernelHandle; + + KernelHandle kh; + kh.create_gs_handle(GS_DEFAULT); + auto gsHandle = kh.get_point_gs_handle(); + gsHandle->set_long_row_threshold(3 * nnzPerShortRow); + //Reset x vector to 0 + Kokkos::deep_copy(x_vector, scalar_t()); + run_gauss_seidel(kh, input_mat, x_vector, y_vector, symmetric, 0.9, apply_type); + KokkosBlas::axpby(one, solution_x, -one, x_vector); + mag_t result_norm_res = KokkosBlas::nrm2(x_vector); + EXPECT_LT(result_norm_res, 0.25 * initial_norm_res); + } +} + #define EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ TEST_F( TestCategory, sparse ## _ ## gauss_seidel_asymmetric_rank1 ## _ ## SCALAR ## _ ## ORDINAL ## _ ## OFFSET ## _ ## DEVICE ) { \ test_gauss_seidel_rank1(2000, 2000 * 20, 200, 10, false); \ @@ -631,7 +635,10 @@ TEST_F( TestCategory, sparse ## _ ## balloon_clustering ## _ ## SCALAR ## _ ## O } \ TEST_F( TestCategory, sparse ## _ ## sequential_sor ## _ ## SCALAR ## _ ## ORDINAL ## _ ## OFFSET ## _ ## DEVICE ) { \ test_sequential_sor(1000, 1000 * 15, 50, 10); \ -} +} \ +TEST_F( TestCategory, sparse ## _ ## gauss_seidel_long_rows ## _ ## SCALAR ## _ ## ORDINAL ## _ ## OFFSET ## _ ## DEVICE ) { \ + test_gauss_seidel_long_rows(500, 10, 20, true); \ +} \ #if (defined (KOKKOSKERNELS_INST_DOUBLE) \ && defined (KOKKOSKERNELS_INST_ORDINAL_INT) \ @@
-730,3 +737,4 @@ TEST_F( TestCategory, sparse ## _ ## sequential_sor ## _ ## SCALAR ## _ ## ORDIN EXECUTE_TEST(kokkos_complex_float, int64_t, size_t, TestExecSpace) #endif +#undef EXECUTE_TEST diff --git a/packages/kokkos-kernels/unit_test/sparse/Test_Sparse_spmv.hpp b/packages/kokkos-kernels/unit_test/sparse/Test_Sparse_spmv.hpp index 5a033fdf344d..5d1b6305614a 100644 --- a/packages/kokkos-kernels/unit_test/sparse/Test_Sparse_spmv.hpp +++ b/packages/kokkos-kernels/unit_test/sparse/Test_Sparse_spmv.hpp @@ -359,7 +359,7 @@ Kokkos::complex randomUpperBound>(int mag) } template -void test_spmv(lno_t numRows,size_type nnz, lno_t bandwidth, lno_t row_size_variance){ +void test_spmv(lno_t numRows,size_type nnz, lno_t bandwidth, lno_t row_size_variance, bool heavy){ typedef typename KokkosSparse::CrsMatrix crsMat_t; typedef typename crsMat_t::values_type::non_const_type scalar_view_t; @@ -390,24 +390,40 @@ void test_spmv(lno_t numRows,size_type nnz, lno_t bandwidth, lno_t row_size_vari Kokkos::fill_random(input_xt,rand_pool,randomUpperBound(10)); Kokkos::fill_random(output_yt,rand_pool,randomUpperBound(10)); - std::vector nonTransModes = {'N', 'C'}; - std::vector transModes = {'T', 'H'}; + std::vector nonTransModes = {'N'}; + std::vector transModes = {'T'}; + std::vector testAlphaBeta = {0.0, 1.0}; + if(heavy) + { + nonTransModes.push_back('C'); + transModes.push_back('H'); + testAlphaBeta.push_back(-1.0); + testAlphaBeta.push_back(2.5); + } for(auto mode : nonTransModes) { - Test::check_spmv(input_mat, input_x, output_y, 1.0, 0.0, mode); - Test::check_spmv(input_mat, input_x, output_y, 0.0, 1.0, mode); - Test::check_spmv(input_mat, input_x, output_y, 1.0, 1.0, mode); + for(double alpha : testAlphaBeta) + { + for(double beta : testAlphaBeta) + { + Test::check_spmv(input_mat, input_x, output_y, alpha, beta, mode); + } + } } for(auto mode : transModes) { - Test::check_spmv(input_mat, input_xt, output_yt, 1.0, 0.0, mode); - Test::check_spmv(input_mat, input_xt, output_yt, 
0.0, 1.0, mode); - Test::check_spmv(input_mat, input_xt, output_yt, 1.0, 1.0, mode); + for(double alpha : testAlphaBeta) + { + for(double beta : testAlphaBeta) + { + Test::check_spmv(input_mat, input_xt, output_yt, alpha, beta, mode); + } + } } } template -void test_spmv_mv(lno_t numRows,size_type nnz, lno_t bandwidth, lno_t row_size_variance, int numMV){ +void test_spmv_mv(lno_t numRows,size_type nnz, lno_t bandwidth, lno_t row_size_variance, bool heavy, int numMV){ lno_t numCols = numRows; typedef typename KokkosSparse::CrsMatrix crsMat_t; @@ -435,19 +451,35 @@ void test_spmv_mv(lno_t numRows,size_type nnz, lno_t bandwidth, lno_t row_size_v Kokkos::deep_copy(b_y_copy, b_y); Kokkos::deep_copy(b_yt_copy, b_yt); - std::vector nonTransModes = {'N', 'C'}; - std::vector transModes = {'T', 'H'}; + std::vector nonTransModes = {'N'}; + std::vector transModes = {'T'}; + std::vector testAlphaBeta = {0.0, 1.0}; + if(heavy) + { + nonTransModes.push_back('C'); + transModes.push_back('H'); + testAlphaBeta.push_back(-1.0); + testAlphaBeta.push_back(2.5); + } for(auto mode : nonTransModes) { - Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 1.0, 0.0, numMV, mode); - Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 0.0, 1.0, numMV, mode); - Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 1.0, 1.0, numMV, mode); + for(double alpha : testAlphaBeta) + { + for(double beta : testAlphaBeta) + { + Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, alpha, beta, numMV, mode); + } + } } for(auto mode : transModes) { - Test::check_spmv_mv(input_mat, b_xt, b_yt, b_yt_copy, 1.0, 0.0, numMV, mode); - Test::check_spmv_mv(input_mat, b_xt, b_yt, b_yt_copy, 0.0, 1.0, numMV, mode); - Test::check_spmv_mv(input_mat, b_xt, b_yt, b_yt_copy, 1.0, 1.0, numMV, mode); + for(double alpha : testAlphaBeta) + { + for(double beta : testAlphaBeta) + { + Test::check_spmv_mv(input_mat, b_xt, b_yt, b_yt_copy, alpha, beta, numMV, mode); + } + } } } @@ -477,7 +509,19 @@ void test_spmv_mv_heavy(lno_t 
numRows,size_type nnz, lno_t bandwidth, lno_t row_ Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 1.0, 1.0, nv, 'N'); Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 1.0, 0.0, nv, 'T'); Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 0.0, 1.0, nv, 'T'); - Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 1.0, 1.0, nv, 'T'); + //Testing all modes together, since matrix is square + std::vector modes = {'N', 'C', 'T', 'H'}; + std::vector testAlphaBeta = {0.0, 1.0, -1.0, 2.5}; + for(auto mode : modes) + { + for(double alpha : testAlphaBeta) + { + for(double beta : testAlphaBeta) + { + Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, alpha, beta, nv, mode); + } + } + } } } @@ -836,17 +880,23 @@ TEST_F( TestCategory,sparse ## _ ## spmv_issue_101 ## _ ## OFFSET ## _ ## DEVICE #define EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ TEST_F( TestCategory,sparse ## _ ## spmv ## _ ## SCALAR ## _ ## ORDINAL ## _ ## OFFSET ## _ ## DEVICE ) { \ - test_spmv (50000, 50000 * 30, 200, 10); \ - test_spmv (50000, 50000 * 30, 100, 10); \ - test_spmv (10000, 10000 * 20, 100, 5); \ + test_spmv (1000, 1000 * 30, 200, 10, true); \ + test_spmv (1000, 1000 * 30, 100, 10, true); \ + test_spmv (1000, 1000 * 20, 100, 5, true); \ + test_spmv (50000, 50000 * 30, 200, 10, false); \ + test_spmv (50000, 50000 * 30, 100, 10, false); \ + test_spmv (10000, 10000 * 20, 100, 5, false); \ test_spmv_controls (10000, 10000 * 20, 100, 5); \ } #define EXECUTE_TEST_MV(SCALAR, ORDINAL, OFFSET, LAYOUT, DEVICE) \ TEST_F( TestCategory,sparse ## _ ## spmv_mv ## _ ## SCALAR ## _ ## ORDINAL ## _ ## OFFSET ## _ ## LAYOUT ## _ ## DEVICE ) { \ - test_spmv_mv (50000, 50000 * 30, 100, 10, 5); \ - test_spmv_mv (50000, 50000 * 30, 200, 10, 1); \ - test_spmv_mv (10000, 10000 * 20, 100, 5, 10); \ + test_spmv_mv (1000, 1000 * 30, 200, 10, true, 1); \ + test_spmv_mv (1000, 1000 * 30, 100, 10, true, 5); \ + test_spmv_mv (1000, 1000 * 20, 100, 5, true, 10); \ + test_spmv_mv (50000, 50000 * 30, 200, 10, false, 1); \ + 
test_spmv_mv (50000, 50000 * 30, 100, 10, false, 5); \ + test_spmv_mv (10000, 10000 * 20, 100, 5, false, 10); \ test_spmv_mv_heavy (200, 200 * 10, 60, 4, 30); \ } diff --git a/packages/kokkos/BUILD.md b/packages/kokkos/BUILD.md index e1f0e3e47276..bb1a31f266ec 100644 --- a/packages/kokkos/BUILD.md +++ b/packages/kokkos/BUILD.md @@ -262,6 +262,9 @@ Architecture-specific optimizations can be enabled by specifying `-DKokkos_ARCH_ * Kokkos_ARCH_ZEN2 * Whether to optimize for the Zen2 architecture * BOOL Default: OFF +* Kokkos_ARCH_ZEN3 + * Whether to optimize for the Zen3 architecture + * BOOL Default: OFF * Kokkos_ARCH_HSW * Whether to optimize for the HSW architecture * BOOL Default: OFF diff --git a/packages/kokkos/CHANGELOG.md b/packages/kokkos/CHANGELOG.md index 5859fe32c432..7bb6de4cd924 100644 --- a/packages/kokkos/CHANGELOG.md +++ b/packages/kokkos/CHANGELOG.md @@ -1,8 +1,31 @@ # Change Log +## [3.4.01](https://github.com/kokkos/kokkos/tree/3.4.01) (2021-05-19) +[Full Changelog](https://github.com/kokkos/kokkos/compare/3.4.00...3.4.01) + +**Bug Fixes:** +- Windows: Remove atomic_compare_exchange_strong overload conflicts with Windows [\#4024](https://github.com/kokkos/kokkos/pull/4024) +- OpenMPTarget: Fixup allocation headers with OpenMPTarget backend [\#4020](https://github.com/kokkos/kokkos/pull/4020) +- OpenMPTarget: Add missing specialization for OMPT to Kokkos Random [\#4022](https://github.com/kokkos/kokkos/pull/4022) +- AMD: Add support for AMD Zen3 CPU architecture [\#4021](https://github.com/kokkos/kokkos/pull/4021) +- SYCL: Implement SYCL::print_configuration [\#4012](https://github.com/kokkos/kokkos/pull/4012) +- Containers: staticcsrgraph: use device type instead of execution space to construct views [\#3998](https://github.com/kokkos/kokkos/pull/3998) +- nvcc_wrapper: fix errors in argument handling, suppress duplicates of GPU architecture and RDC flags [\#4006](https://github.com/kokkos/kokkos/pull/4006) +- CI: Add icpx testing to intel container
[\#4004](https://github.com/kokkos/kokkos/pull/4004) +- CMake/TRIBITS: Keep quoted compiler flags when passing to Trilinos [\#4007](https://github.com/kokkos/kokkos/pull/4007) +- CMake: Rename IntelClang to IntelLLVM [\#3945](https://github.com/kokkos/kokkos/pull/3945) + ## [3.4.00](https://github.com/kokkos/kokkos/tree/3.4.00) (2021-04-25) [Full Changelog](https://github.com/kokkos/kokkos/compare/3.3.01...3.4.00) +**Highlights:** +- SYCL Backend Almost Feature Complete +- OpenMPTarget Backend Almost Feature Complete +- Performance Improvements for HIP backend +- Require CMake 3.16 or newer +- Tool Callback Interface Enhancements +- cmath wrapper functions available now in Kokkos::Experimental + **Features:** - Implement parallel_scan with ThreadVectorRange and Reducer [\#3861](https://github.com/kokkos/kokkos/pull/3861) - Implement SYCL Random [\#3849](https://github.com/kokkos/kokkos/pull/3849) @@ -48,7 +71,6 @@ - Change SYCLInternal::m_queue std::unique_ptr -> std::optional [\#3677](https://github.com/kokkos/kokkos/pull/3677) - Use alternative SYCL parallel_reduce implementation [\#3671](https://github.com/kokkos/kokkos/pull/3671) - Use runtime values in KokkosExp_MDRangePolicy.hpp [\#3626](https://github.com/kokkos/kokkos/pull/3626) -- Introduce KOKKOS_PRINTF [\#3615](https://github.com/kokkos/kokkos/pull/3615) - Clean up AnalyzePolicy [\#3564](https://github.com/kokkos/kokkos/pull/3564) - Changes for indirect launch of SYCL parallel reduce [\#3511](https://github.com/kokkos/kokkos/pull/3511) diff --git a/packages/kokkos/CMakeLists.txt b/packages/kokkos/CMakeLists.txt index 6fc1bf7d2f7f..d154aebc289f 100644 --- a/packages/kokkos/CMakeLists.txt +++ b/packages/kokkos/CMakeLists.txt @@ -112,7 +112,7 @@ ENDIF() set(Kokkos_VERSION_MAJOR 3) set(Kokkos_VERSION_MINOR 4) -set(Kokkos_VERSION_PATCH 00) +set(Kokkos_VERSION_PATCH 01) set(Kokkos_VERSION "${Kokkos_VERSION_MAJOR}.${Kokkos_VERSION_MINOR}.${Kokkos_VERSION_PATCH}") math(EXPR KOKKOS_VERSION 
"${Kokkos_VERSION_MAJOR} * 10000 + ${Kokkos_VERSION_MINOR} * 100 + ${Kokkos_VERSION_PATCH}") @@ -206,8 +206,17 @@ ENDIF() IF (KOKKOS_HAS_TRILINOS) # Overwrite the old flags at the top-level # Because Tribits doesn't use lists, it uses spaces for the list of CXX flags - # we have to match the annoying behavior - STRING(REPLACE ";" " " KOKKOSCORE_COMPILE_OPTIONS "${KOKKOS_COMPILE_OPTIONS}") + # we have to match the annoying behavior, also we have to preserve quotes + # which needs another workaround. + IF (KOKKOS_ENABLE_SYCL) + SET(KOKKOS_COMPILE_OPTIONS_TMP) + FOREACH(OPTION ${KOKKOS_COMPILE_OPTIONS}) + LIST(APPEND KOKKOS_COMPILE_OPTIONS_TMP \"${OPTION}\") + ENDFOREACH() + STRING(REPLACE ";" " " KOKKOSCORE_COMPILE_OPTIONS "${KOKKOS_COMPILE_OPTIONS_TMP}") + ELSE() + STRING(REPLACE ";" " " KOKKOSCORE_COMPILE_OPTIONS "${KOKKOS_COMPILE_OPTIONS}") + ENDIF() LIST(APPEND KOKKOS_ALL_COMPILE_OPTIONS ${KOKKOS_COMPILE_OPTIONS}) IF (KOKKOS_ENABLE_CUDA) LIST(APPEND KOKKOS_ALL_COMPILE_OPTIONS ${KOKKOS_CUDA_OPTIONS}) diff --git a/packages/kokkos/Makefile.kokkos b/packages/kokkos/Makefile.kokkos index 2599121d70ad..bda8572073a3 100644 --- a/packages/kokkos/Makefile.kokkos +++ b/packages/kokkos/Makefile.kokkos @@ -2,7 +2,7 @@ KOKKOS_VERSION_MAJOR = 3 KOKKOS_VERSION_MINOR = 4 -KOKKOS_VERSION_PATCH = 00 +KOKKOS_VERSION_PATCH = 01 KOKKOS_VERSION = $(shell echo $(KOKKOS_VERSION_MAJOR)*10000+$(KOKKOS_VERSION_MINOR)*100+$(KOKKOS_VERSION_PATCH) | bc) # Options: Cuda,HIP,OpenMP,Pthread,Serial @@ -14,7 +14,7 @@ KOKKOS_DEVICES ?= "Pthread" # ARM: ARMv80,ARMv81,ARMv8-ThunderX,ARMv8-TX2,A64FX # IBM: BGQ,Power7,Power8,Power9 # AMD-GPUS: Vega900,Vega906,Vega908 -# AMD-CPUS: AMDAVX,Zen,Zen2 +# AMD-CPUS: AMDAVX,Zen,Zen2,Zen3 KOKKOS_ARCH ?= "" # Options: yes,no KOKKOS_DEBUG ?= "no" @@ -372,6 +372,7 @@ KOKKOS_INTERNAL_USE_ARCH_IBM := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_ # AMD based. 
KOKKOS_INTERNAL_USE_ARCH_AMDAVX := $(call kokkos_has_string,$(KOKKOS_ARCH),AMDAVX) +KOKKOS_INTERNAL_USE_ARCH_ZEN3 := $(call kokkos_has_string,$(KOKKOS_ARCH),Zen3) KOKKOS_INTERNAL_USE_ARCH_ZEN2 := $(call kokkos_has_string,$(KOKKOS_ARCH),Zen2) KOKKOS_INTERNAL_USE_ARCH_ZEN := $(call kokkos_has_string,$(KOKKOS_ARCH),Zen) KOKKOS_INTERNAL_USE_ARCH_VEGA900 := $(call kokkos_has_string,$(KOKKOS_ARCH),Vega900) @@ -381,12 +382,12 @@ KOKKOS_INTERNAL_USE_ARCH_VEGA908 := $(call kokkos_has_string,$(KOKKOS_ARCH),Vega # Any AVX? KOKKOS_INTERNAL_USE_ARCH_SSE42 := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_WSM)) KOKKOS_INTERNAL_USE_ARCH_AVX := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_SNB) + $(KOKKOS_INTERNAL_USE_ARCH_AMDAVX)) -KOKKOS_INTERNAL_USE_ARCH_AVX2 := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_HSW) + $(KOKKOS_INTERNAL_USE_ARCH_BDW) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN2)) +KOKKOS_INTERNAL_USE_ARCH_AVX2 := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_HSW) + $(KOKKOS_INTERNAL_USE_ARCH_BDW) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN2) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN3)) KOKKOS_INTERNAL_USE_ARCH_AVX512MIC := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_KNL)) KOKKOS_INTERNAL_USE_ARCH_AVX512XEON := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_SKX)) # Decide what ISA level we are able to support.
-KOKKOS_INTERNAL_USE_ISA_X86_64 := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_WSM) + $(KOKKOS_INTERNAL_USE_ARCH_SNB) + $(KOKKOS_INTERNAL_USE_ARCH_HSW) + $(KOKKOS_INTERNAL_USE_ARCH_BDW) + $(KOKKOS_INTERNAL_USE_ARCH_KNL) + $(KOKKOS_INTERNAL_USE_ARCH_SKX) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN2)) +KOKKOS_INTERNAL_USE_ISA_X86_64 := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_WSM) + $(KOKKOS_INTERNAL_USE_ARCH_SNB) + $(KOKKOS_INTERNAL_USE_ARCH_HSW) + $(KOKKOS_INTERNAL_USE_ARCH_BDW) + $(KOKKOS_INTERNAL_USE_ARCH_KNL) + $(KOKKOS_INTERNAL_USE_ARCH_SKX) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN2) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN3)) KOKKOS_INTERNAL_USE_ISA_KNC := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_KNC)) KOKKOS_INTERNAL_USE_ISA_POWERPCLE := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_POWER8) + $(KOKKOS_INTERNAL_USE_ARCH_POWER9)) KOKKOS_INTERNAL_USE_ISA_POWERPCBE := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_POWER7)) @@ -780,6 +781,19 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ZEN2), 1) endif endif +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ZEN3), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_ZEN3") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_AVX2") + + ifeq ($(KOKKOS_INTERNAL_COMPILER_INTEL), 1) + KOKKOS_CXXFLAGS += -mavx2 + KOKKOS_LDFLAGS += -mavx2 + else + KOKKOS_CXXFLAGS += -march=znver3 -mtune=znver3 + KOKKOS_LDFLAGS += -march=znver3 -mtune=znver3 + endif +endif + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ARMV8_THUNDERX), 1) tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_ARMV80") tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_ARMV8_THUNDERX") diff --git a/packages/kokkos/algorithms/CMakeLists.txt b/packages/kokkos/algorithms/CMakeLists.txt index fd099054ba45..4df76a1dbbd1 100644 --- a/packages/kokkos/algorithms/CMakeLists.txt +++ b/packages/kokkos/algorithms/CMakeLists.txt @@ -5,10 +5,12 @@ KOKKOS_SUBPACKAGE(Algorithms) IF (NOT Kokkos_INSTALL_TESTING) ADD_SUBDIRECTORY(src) ENDIF() -
-KOKKOS_ADD_TEST_DIRECTORIES(unit_tests) +IF(NOT (KOKKOS_ENABLE_OPENMPTARGET + AND (KOKKOS_CXX_COMPILER_ID STREQUAL PGI OR + KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC))) + KOKKOS_ADD_TEST_DIRECTORIES(unit_tests) +ENDIF() KOKKOS_SUBPACKAGE_POSTPROCESS() - diff --git a/packages/kokkos/algorithms/src/Kokkos_Random.hpp b/packages/kokkos/algorithms/src/Kokkos_Random.hpp index 904cf5ccb967..55ce19971faf 100644 --- a/packages/kokkos/algorithms/src/Kokkos_Random.hpp +++ b/packages/kokkos/algorithms/src/Kokkos_Random.hpp @@ -687,6 +687,24 @@ struct Random_UniqueIndex { }; #endif +#ifdef KOKKOS_ENABLE_OPENMPTARGET +template <> +struct Random_UniqueIndex { + using locks_view_type = View; + KOKKOS_FUNCTION + static int get_state_idx(const locks_view_type& locks) { + const int team_size = omp_get_num_threads(); + int i = omp_get_team_num() * team_size + omp_get_thread_num(); + const int lock_size = locks.extent_int(0); + + while (Kokkos::atomic_compare_exchange(&locks(i), 0, 1)) { + i = (i + 1) % lock_size; + } + return i; + } +}; +#endif + } // namespace Impl template diff --git a/packages/kokkos/algorithms/unit_tests/CMakeLists.txt b/packages/kokkos/algorithms/unit_tests/CMakeLists.txt index 9109837985a9..50f8f0a332a6 100644 --- a/packages/kokkos/algorithms/unit_tests/CMakeLists.txt +++ b/packages/kokkos/algorithms/unit_tests/CMakeLists.txt @@ -44,7 +44,7 @@ IF(Kokkos_ENABLE_OPENMP) ) ENDIF() -foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;HIP;SYCL) +foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;HIP;SYCL;OpenMPTarget) # Because there is always an exception to the rule if(Tag STREQUAL "Threads") set(DEVICE "PTHREAD") diff --git a/packages/kokkos/algorithms/unit_tests/TestRandom.hpp b/packages/kokkos/algorithms/unit_tests/TestRandom.hpp index 1f14875096dd..c37e779c9927 100644 --- a/packages/kokkos/algorithms/unit_tests/TestRandom.hpp +++ b/packages/kokkos/algorithms/unit_tests/TestRandom.hpp @@ -109,6 +109,16 @@ struct RandomProperties { } }; +// FIXME_OPENMPTARGET: Need this for 
OpenMPTarget because contrary to the standard +// llvm requires the binary operator to be defined, not just += +KOKKOS_INLINE_FUNCTION +RandomProperties operator+(const RandomProperties& org, + const RandomProperties& add) { + RandomProperties val = org; + val += add; + return val; +} + template struct test_random_functor { using rnd_type = typename GeneratorPool::generator_type; diff --git a/packages/kokkos/algorithms/unit_tests/TestSort.hpp b/packages/kokkos/algorithms/unit_tests/TestSort.hpp index a3c362ec201b..9c6308c84347 100644 --- a/packages/kokkos/algorithms/unit_tests/TestSort.hpp +++ b/packages/kokkos/algorithms/unit_tests/TestSort.hpp @@ -370,7 +370,10 @@ template void test_sort(unsigned int N) { test_1D_sort(N); test_3D_sort(N); +// FIXME_OPENMPTARGET: OpenMPTarget doesn't support DynamicView yet. +#ifndef KOKKOS_ENABLE_OPENMPTARGET test_dynamic_view_sort(N); +#endif test_issue_1160_sort(); } } // namespace Impl diff --git a/packages/kokkos/bin/nvcc_wrapper b/packages/kokkos/bin/nvcc_wrapper index 5556e888e34b..4e52e4d09f4f 100755 --- a/packages/kokkos/bin/nvcc_wrapper +++ b/packages/kokkos/bin/nvcc_wrapper @@ -67,6 +67,11 @@ shared_versioned_libraries="" # Does the User set the architecture arch_set=0 +arch_flag="" + +# Does the user set RDC?
+rdc_set=0 +rdc_flag="" # Does the user overwrite the host compiler ccbin_set=0 @@ -190,8 +195,34 @@ do host_only_args="$host_only_args $1 $2" shift ;; + # Handle nvcc args controlling whether to generate relocatable device code + --relocatable-device-code=*|-rdc=*) + if [ "$rdc_set" -eq 0 ]; then + rdc_set=1 + rdc_flag="$1" + cuda_args="$cuda_args $rdc_flag" + elif [ "$rdc_flag" != "$1" ]; then + echo "RDC is being set twice with different flags, which is not handled" + echo "$rdc_flag" + echo "$1" + exit 1 + fi + ;; + -rdc) + if [ "$rdc_set" -eq 0 ]; then + rdc_set=1 + rdc_flag="$1 $2" + cuda_args="$cuda_args $rdc_flag" + elif [ "$rdc_flag" != "$1 $2" ]; then + echo "RDC is being set twice with different flags, which is not handled" + echo "$rdc_flag" + echo "$1 $2" + exit 1 + fi + shift + ;; #Handle known nvcc args - --dryrun|--verbose|--keep|--keep-dir*|-G|--relocatable-device-code*|-lineinfo|-expt-extended-lambda|-expt-relaxed-constexpr|--resource-usage|-Xptxas*|--fmad*|--use_fast_math|--Wext-lambda-captures-this|-Wext-lambda-captures-this) + --dryrun|--verbose|--keep|--keep-dir*|-G|-lineinfo|-expt-extended-lambda|-expt-relaxed-constexpr|--resource-usage|-Xptxas*|--fmad=*|--use_fast_math|--Wext-lambda-captures-this|-Wext-lambda-captures-this) cuda_args="$cuda_args $1" ;; #Handle more known nvcc args @@ -199,13 +230,13 @@ do cuda_args="$cuda_args $1" ;; #Handle known nvcc args that have an argument - -rdc|-maxrregcount|--default-stream|-Xnvlink|--fmad|-cudart|--cudart|-include) + -maxrregcount=*|--maxrregcount=*) + cuda_args="$cuda_args $1" + ;; + -maxrregcount|--default-stream|-Xnvlink|--fmad|-cudart|--cudart|-include) cuda_args="$cuda_args $1 $2" shift ;; - -rdc=*|-maxrregcount*|--maxrregcount*) - cuda_args="$cuda_args $1" - ;; #Handle unsupported standard flags --std=c++1y|-std=c++1y|--std=gnu++1y|-std=gnu++1y|--std=c++1z|-std=c++1z|--std=gnu++1z|-std=gnu++1z|--std=c++2a|-std=c++2a) fallback_std_flag="-std=c++14" @@ -323,20 +354,36 @@ do ;; #Handle -arch
argument (if its not set use a default) this is the version with = sign - -arch*|-gencode*) - cuda_args="$cuda_args $1" - arch_set=1 + -arch=*|-gencode=*) + if [ "$arch_set" -eq 0 ]; then + arch_set=1 + arch_flag="$1" + cuda_args="$cuda_args $arch_flag" + elif [ "$arch_flag" != "$1" ]; then + echo "ARCH is being set twice with different flags, which is not handled" + echo "$arch_flag" + echo "$1" + exit 1 + fi + ;; + #Handle -arch argument (if its not set use a default) this is the version without = sign + -arch|-gencode) + if [ "$arch_set" -eq 0 ]; then + arch_set=1 + arch_flag="$1 $2" + cuda_args="$cuda_args $arch_flag" + elif [ "$arch_flag" != "$1 $2" ]; then + echo "ARCH is being set twice with different flags, which is not handled" + echo "$arch_flag" + echo "$1 $2" + exit 1 + fi + shift ;; #Handle -code argument (if its not set use a default) this is the version with = sign -code*) cuda_args="$cuda_args $1" ;; - #Handle -arch argument (if its not set use a default) this is the version without = sign - -arch|-gencode) - cuda_args="$cuda_args $1 $2" - arch_set=1 - shift - ;; #Handle -code argument (if its not set use a default) this is the version without = sign -code) cuda_args="$cuda_args $1 $2" diff --git a/packages/kokkos/cmake/KokkosCore_config.h.in b/packages/kokkos/cmake/KokkosCore_config.h.in index fbfae3711ec1..3455b0cb42e7 100644 --- a/packages/kokkos/cmake/KokkosCore_config.h.in +++ b/packages/kokkos/cmake/KokkosCore_config.h.in @@ -99,5 +99,6 @@ #cmakedefine KOKKOS_ARCH_AMPERE86 #cmakedefine KOKKOS_ARCH_AMD_ZEN #cmakedefine KOKKOS_ARCH_AMD_ZEN2 +#cmakedefine KOKKOS_ARCH_AMD_ZEN3 #cmakedefine KOKKOS_IMPL_DISABLE_SYCL_DEVICE_PRINTF diff --git a/packages/kokkos/cmake/kokkos_arch.cmake b/packages/kokkos/cmake/kokkos_arch.cmake index ec18e70a36a3..e8b85542c633 100644 --- a/packages/kokkos/cmake/kokkos_arch.cmake +++ b/packages/kokkos/cmake/kokkos_arch.cmake @@ -63,6 +63,7 @@ KOKKOS_ARCH_OPTION(AMPERE80 GPU "NVIDIA Ampere generation CC 8.0")
KOKKOS_ARCH_OPTION(AMPERE86 GPU "NVIDIA Ampere generation CC 8.6") KOKKOS_ARCH_OPTION(ZEN HOST "AMD Zen architecture") KOKKOS_ARCH_OPTION(ZEN2 HOST "AMD Zen2 architecture") +KOKKOS_ARCH_OPTION(ZEN3 HOST "AMD Zen3 architecture") KOKKOS_ARCH_OPTION(VEGA900 GPU "AMD GPU MI25 GFX900") KOKKOS_ARCH_OPTION(VEGA906 GPU "AMD GPU MI50/MI60 GFX906") KOKKOS_ARCH_OPTION(VEGA908 GPU "AMD GPU MI100 GFX908") @@ -215,6 +216,15 @@ IF (KOKKOS_ARCH_ZEN2) SET(KOKKOS_ARCH_AMD_AVX2 ON) ENDIF() +IF (KOKKOS_ARCH_ZEN3) + COMPILER_SPECIFIC_FLAGS( + Intel -mavx2 + DEFAULT -march=znver3 -mtune=znver3 + ) + SET(KOKKOS_ARCH_AMD_ZEN3 ON) + SET(KOKKOS_ARCH_AMD_AVX2 ON) +ENDIF() + IF (KOKKOS_ARCH_WSM) COMPILER_SPECIFIC_FLAGS( Intel -xSSE4.2 @@ -284,7 +294,7 @@ IF (KOKKOS_ARCH_SKX) ) ENDIF() -IF (KOKKOS_ARCH_WSM OR KOKKOS_ARCH_SNB OR KOKKOS_ARCH_HSW OR KOKKOS_ARCH_BDW OR KOKKOS_ARCH_KNL OR KOKKOS_ARCH_SKX OR KOKKOS_ARCH_ZEN OR KOKKOS_ARCH_ZEN2) +IF (KOKKOS_ARCH_WSM OR KOKKOS_ARCH_SNB OR KOKKOS_ARCH_HSW OR KOKKOS_ARCH_BDW OR KOKKOS_ARCH_KNL OR KOKKOS_ARCH_SKX OR KOKKOS_ARCH_ZEN OR KOKKOS_ARCH_ZEN2 OR KOKKOS_ARCH_ZEN3) SET(KOKKOS_USE_ISA_X86_64 ON) ENDIF() @@ -457,7 +467,7 @@ IF (KOKKOS_ENABLE_OPENMPTARGET) ENDIF() IF (KOKKOS_ARCH_INTEL_GEN) COMPILER_SPECIFIC_FLAGS( - IntelClang -fopenmp-targets=spir64 -D__STRICT_ANSI__ + IntelLLVM -fopenmp-targets=spir64 -D__STRICT_ANSI__ ) ENDIF() ENDIF() diff --git a/packages/kokkos/cmake/kokkos_compiler_id.cmake b/packages/kokkos/cmake/kokkos_compiler_id.cmake index 4434d6928f46..23847263a952 100644 --- a/packages/kokkos/cmake/kokkos_compiler_id.cmake +++ b/packages/kokkos/cmake/kokkos_compiler_id.cmake @@ -101,7 +101,7 @@ IF(KOKKOS_CXX_COMPILER_ID STREQUAL Clang) OUTPUT_STRIP_TRAILING_WHITESPACE) IF (INTERNAL_HAVE_INTEL_COMPILER) #not actually Clang SET(KOKKOS_CLANG_IS_INTEL TRUE) - SET(KOKKOS_CXX_COMPILER_ID IntelClang CACHE STRING INTERNAL FORCE) + SET(KOKKOS_CXX_COMPILER_ID IntelLLVM CACHE STRING INTERNAL FORCE) ENDIF() ENDIF() diff --git 
a/packages/kokkos/cmake/kokkos_enable_devices.cmake b/packages/kokkos/cmake/kokkos_enable_devices.cmake index 445dad47ce56..d7f83ddbdf87 100644 --- a/packages/kokkos/cmake/kokkos_enable_devices.cmake +++ b/packages/kokkos/cmake/kokkos_enable_devices.cmake @@ -61,7 +61,7 @@ IF(KOKKOS_ENABLE_OPENMP) COMPILER_SPECIFIC_FLAGS( COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID Clang -Xcompiler ${ClangOpenMPFlag} - IntelClang -Xcompiler -fiopenmp + IntelLLVM -Xcompiler -fiopenmp PGI -Xcompiler -mp Cray NO-VALUE-SPECIFIED XL -Xcompiler -qsmp=omp @@ -70,7 +70,7 @@ IF(KOKKOS_ENABLE_OPENMP) ELSE() COMPILER_SPECIFIC_FLAGS( Clang ${ClangOpenMPFlag} - IntelClang -fiopenmp + IntelLLVM -fiopenmp AppleClang -Xpreprocessor -fopenmp PGI -mp Cray NO-VALUE-SPECIFIED @@ -92,7 +92,7 @@ IF (KOKKOS_ENABLE_OPENMPTARGET) COMPILER_SPECIFIC_FLAGS( Clang ${ClangOpenMPFlag} -Wno-openmp-mapping - IntelClang -fiopenmp -Wno-openmp-mapping + IntelLLVM -fiopenmp -Wno-openmp-mapping XL -qsmp=omp -qoffload -qnoeh PGI -mp=gpu DEFAULT -fopenmp diff --git a/packages/kokkos/cmake/kokkos_functions.cmake b/packages/kokkos/cmake/kokkos_functions.cmake index 858322394d7a..e1a3e5f8bd00 100644 --- a/packages/kokkos/cmake/kokkos_functions.cmake +++ b/packages/kokkos/cmake/kokkos_functions.cmake @@ -773,7 +773,7 @@ FUNCTION(kokkos_link_tpl TARGET) ENDFUNCTION() FUNCTION(COMPILER_SPECIFIC_OPTIONS_HELPER) - SET(COMPILERS NVIDIA PGI XL DEFAULT Cray Intel Clang AppleClang IntelClang GNU HIPCC Fujitsu) + SET(COMPILERS NVIDIA PGI XL DEFAULT Cray Intel Clang AppleClang IntelLLVM GNU HIPCC Fujitsu) CMAKE_PARSE_ARGUMENTS( PARSE "LINK_OPTIONS;COMPILE_OPTIONS;COMPILE_DEFINITIONS;LINK_LIBRARIES" diff --git a/packages/kokkos/containers/src/impl/Kokkos_StaticCrsGraph_factory.hpp b/packages/kokkos/containers/src/impl/Kokkos_StaticCrsGraph_factory.hpp index f22e5d1eca92..00d3eafd231e 100644 --- a/packages/kokkos/containers/src/impl/Kokkos_StaticCrsGraph_factory.hpp +++ b/packages/kokkos/containers/src/impl/Kokkos_StaticCrsGraph_factory.hpp 
@@ -114,15 +114,11 @@ namespace Kokkos { template inline typename StaticCrsGraphType::staticcrsgraph_type create_staticcrsgraph( const std::string& label, const std::vector& input) { - using output_type = StaticCrsGraphType; - // using input_type = std::vector; // unused - + using output_type = StaticCrsGraphType; using entries_type = typename output_type::entries_type; - - using work_type = View; + using work_type = View< + typename output_type::size_type[], typename output_type::array_layout, + typename output_type::device_type, typename output_type::memory_traits>; output_type output; @@ -161,10 +157,9 @@ inline typename StaticCrsGraphType::staticcrsgraph_type create_staticcrsgraph( static_assert(entries_type::rank == 1, "Graph entries view must be rank one"); - using work_type = View; + using work_type = View< + typename output_type::size_type[], typename output_type::array_layout, + typename output_type::device_type, typename output_type::memory_traits>; output_type output; diff --git a/packages/kokkos/core/src/Kokkos_OpenMPTargetSpace.hpp b/packages/kokkos/core/src/Kokkos_OpenMPTargetSpace.hpp index dc5e0194ab0a..58d723ac110a 100644 --- a/packages/kokkos/core/src/Kokkos_OpenMPTargetSpace.hpp +++ b/packages/kokkos/core/src/Kokkos_OpenMPTargetSpace.hpp @@ -179,8 +179,6 @@ class SharedAllocationRecord const RecordBase::function_type arg_dealloc = &deallocate); public: - std::string get_label() const; - KOKKOS_INLINE_FUNCTION static SharedAllocationRecord* allocate( const Kokkos::Experimental::OpenMPTargetSpace& arg_space, const std::string& arg_label, const size_t arg_alloc_size) { @@ -190,10 +188,6 @@ class SharedAllocationRecord return nullptr; #endif } - - /**\brief Reallocate tracked memory in the space */ - static void* reallocate_tracked(void* const arg_alloc_ptr, - const size_t arg_alloc_size); }; } // namespace Impl diff --git a/packages/kokkos/core/src/Kokkos_SYCL.hpp b/packages/kokkos/core/src/Kokkos_SYCL.hpp index aa720371df73..8ee76b43862f 100644 --- 
a/packages/kokkos/core/src/Kokkos_SYCL.hpp +++ b/packages/kokkos/core/src/Kokkos_SYCL.hpp @@ -113,7 +113,7 @@ class SYCL { void fence() const; /// \brief Print configuration information to the given output stream. - static void print_configuration(std::ostream&, const bool detail = false); + void print_configuration(std::ostream&, const bool detail = false); /// \brief Free any resources being consumed by the device. static void impl_finalize(); @@ -131,12 +131,10 @@ class SYCL { sycl::device get_device() const; friend std::ostream& operator<<(std::ostream& os, const SYCLDevice& that) { - return that.info(os); + return SYCL::impl_sycl_info(os, that.m_device); } private: - std::ostream& info(std::ostream& os) const; - sycl::device m_device; }; @@ -154,6 +152,9 @@ class SYCL { } private: + static std::ostream& impl_sycl_info(std::ostream& os, + const sycl::device& device); + Kokkos::Impl::HostSharedPtr m_space_instance; }; diff --git a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.cpp b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.cpp index 6fbb4245b8fb..b99b0017ca17 100644 --- a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.cpp +++ b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.cpp @@ -107,12 +107,6 @@ SharedAllocationRecord::m_alloc_size); } -// TODO: Implement deep copy back see CudaSpace -std::string SharedAllocationRecord::get_label() const { - return std::string("OpenMPTargetAllocation"); -} - SharedAllocationRecord:: SharedAllocationRecord( const Kokkos::Experimental::OpenMPTargetSpace &arg_space, @@ -141,23 +135,6 @@ SharedAllocationRecord:: //---------------------------------------------------------------------------- -void *SharedAllocationRecord:: - reallocate_tracked(void *const arg_alloc_ptr, const size_t arg_alloc_size) { - SharedAllocationRecord *const r_old = get_record(arg_alloc_ptr); - SharedAllocationRecord *const r_new = - allocate(r_old->m_space, r_old->get_label(), 
arg_alloc_size); - - // Kokkos::Impl::DeepCopy( r_new->data() - // , r_old->data() - // , std::min( r_old->size() , - // r_new->size() ) ); - - RecordBase::increment(r_new); - RecordBase::decrement(r_old); - - return r_new->data(); -} - } // namespace Impl } // namespace Kokkos diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL.cpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL.cpp index 9c29eb190d17..3a09ee919540 100644 --- a/packages/kokkos/core/src/SYCL/Kokkos_SYCL.cpp +++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL.cpp @@ -105,6 +105,12 @@ bool SYCL::impl_is_initialized() { void SYCL::impl_finalize() { Impl::SYCLInternal::singleton().finalize(); } +void SYCL::print_configuration(std::ostream& s, const bool detailed) { + s << "macro KOKKOS_ENABLE_SYCL : defined" << '\n'; + if (detailed) + SYCL::impl_sycl_info(s, m_space_instance->m_queue->get_device()); +} + void SYCL::fence() const { Impl::SYCLInternal::fence(*m_space_instance->m_queue); } @@ -143,119 +149,118 @@ void SYCL::impl_initialize(SYCL::SYCLDevice d) { Impl::SYCLInternal::singleton().initialize(d.get_device()); } -std::ostream& SYCL::SYCLDevice::info(std::ostream& os) const { +std::ostream& SYCL::impl_sycl_info(std::ostream& os, + const sycl::device& device) { using namespace sycl::info; - return os << "Name: " << m_device.get_info() - << "\nDriver Version: " - << m_device.get_info() - << "\nIs Host: " << m_device.is_host() - << "\nIs CPU: " << m_device.is_cpu() - << "\nIs GPU: " << m_device.is_gpu() - << "\nIs Accelerator: " << m_device.is_accelerator() - << "\nVendor Id: " << m_device.get_info() + return os << "Name: " << device.get_info() + << "\nDriver Version: " << device.get_info() + << "\nIs Host: " << device.is_host() + << "\nIs CPU: " << device.is_cpu() + << "\nIs GPU: " << device.is_gpu() + << "\nIs Accelerator: " << device.is_accelerator() + << "\nVendor Id: " << device.get_info() << "\nMax Compute Units: " - << m_device.get_info() + << device.get_info() << "\nMax Work Item Dimensions: " - 
<< m_device.get_info() + << device.get_info() << "\nMax Work Group Size: " - << m_device.get_info() + << device.get_info() << "\nPreferred Vector Width Char: " - << m_device.get_info() + << device.get_info() << "\nPreferred Vector Width Short: " - << m_device.get_info() + << device.get_info() << "\nPreferred Vector Width Int: " - << m_device.get_info() + << device.get_info() << "\nPreferred Vector Width Long: " - << m_device.get_info() + << device.get_info() << "\nPreferred Vector Width Float: " - << m_device.get_info() + << device.get_info() << "\nPreferred Vector Width Double: " - << m_device.get_info() + << device.get_info() << "\nPreferred Vector Width Half: " - << m_device.get_info() + << device.get_info() << "\nNative Vector Width Char: " - << m_device.get_info() + << device.get_info() << "\nNative Vector Width Short: " - << m_device.get_info() + << device.get_info() << "\nNative Vector Width Int: " - << m_device.get_info() + << device.get_info() << "\nNative Vector Width Long: " - << m_device.get_info() + << device.get_info() << "\nNative Vector Width Float: " - << m_device.get_info() + << device.get_info() << "\nNative Vector Width Double: " - << m_device.get_info() + << device.get_info() << "\nNative Vector Width Half: " - << m_device.get_info() - << "\nAddress Bits: " << m_device.get_info() - << "\nImage Support: " << m_device.get_info() + << device.get_info() + << "\nAddress Bits: " << device.get_info() + << "\nImage Support: " << device.get_info() << "\nMax Mem Alloc Size: " - << m_device.get_info() + << device.get_info() << "\nMax Read Image Args: " - << m_device.get_info() + << device.get_info() << "\nImage2d Max Width: " - << m_device.get_info() + << device.get_info() << "\nImage2d Max Height: " - << m_device.get_info() + << device.get_info() << "\nImage3d Max Width: " - << m_device.get_info() + << device.get_info() << "\nImage3d Max Height: " - << m_device.get_info() + << device.get_info() << "\nImage3d Max Depth: " - << m_device.get_info() + << 
device.get_info() << "\nImage Max Buffer Size: " - << m_device.get_info() + << device.get_info() << "\nImage Max Array Size: " - << m_device.get_info() - << "\nMax Samplers: " << m_device.get_info() + << device.get_info() + << "\nMax Samplers: " << device.get_info() << "\nMax Parameter Size: " - << m_device.get_info() + << device.get_info() << "\nMem Base Addr Align: " - << m_device.get_info() + << device.get_info() << "\nGlobal Cache Mem Line Size: " - << m_device.get_info() + << device.get_info() << "\nGlobal Mem Cache Size: " - << m_device.get_info() + << device.get_info() << "\nGlobal Mem Size: " - << m_device.get_info() + << device.get_info() << "\nMax Constant Buffer Size: " - << m_device.get_info() + << device.get_info() << "\nMax Constant Args: " - << m_device.get_info() - << "\nLocal Mem Size: " - << m_device.get_info() + << device.get_info() + << "\nLocal Mem Size: " << device.get_info() << "\nError Correction Support: " - << m_device.get_info() + << device.get_info() << "\nHost Unified Memory: " - << m_device.get_info() + << device.get_info() << "\nProfiling Timer Resolution: " - << m_device.get_info() + << device.get_info() << "\nIs Endian Little: " - << m_device.get_info() - << "\nIs Available: " << m_device.get_info() + << device.get_info() + << "\nIs Available: " << device.get_info() << "\nIs Compiler Available: " - << m_device.get_info() + << device.get_info() << "\nIs Linker Available: " - << m_device.get_info() + << device.get_info() << "\nQueue Profiling: " - << m_device.get_info() + << device.get_info() << "\nBuilt In Kernels: " << Container>( - m_device.get_info()) - << "\nVendor: " << m_device.get_info() - << "\nProfile: " << m_device.get_info() - << "\nVersion: " << m_device.get_info() + device.get_info()) + << "\nVendor: " << device.get_info() + << "\nProfile: " << device.get_info() + << "\nVersion: " << device.get_info() << "\nExtensions: " << Container>( - m_device.get_info()) + device.get_info()) << "\nPrintf Buffer Size: " - << 
m_device.get_info() + << device.get_info() << "\nPreferred Interop User Sync: " - << m_device.get_info() + << device.get_info() << "\nPartition Max Sub Devices: " - << m_device.get_info() + << device.get_info() << "\nReference Count: " - << m_device.get_info() << '\n'; + << device.get_info() << '\n'; } namespace Impl { @@ -293,15 +298,13 @@ void SYCLSpaceInitializer::fence() { } void SYCLSpaceInitializer::print_configuration(std::ostream& msg, - const bool /*detail*/) { + const bool detail) { msg << "Devices:" << std::endl; msg << " KOKKOS_ENABLE_SYCL: "; msg << "yes" << std::endl; msg << "\nRuntime Configuration:" << std::endl; - // FIXME_SYCL not implemented - std::abort(); - // Experimental::SYCL::print_configuration(msg, detail); + Experimental::SYCL{}.print_configuration(msg, detail); } } // namespace Impl diff --git a/packages/kokkos/core/src/impl/Kokkos_Atomic_Windows.hpp b/packages/kokkos/core/src/impl/Kokkos_Atomic_Windows.hpp index 3f2e8914ea93..2f824566b804 100644 --- a/packages/kokkos/core/src/impl/Kokkos_Atomic_Windows.hpp +++ b/packages/kokkos/core/src/impl/Kokkos_Atomic_Windows.hpp @@ -152,12 +152,6 @@ inline T atomic_compare_exchange( ((LONGLONG*)&compare_and_result)); return compare_and_result; } - -template -inline T atomic_compare_exchange_strong(volatile T* const dest, - const T& compare, const T& val) { - return atomic_compare_exchange(dest, compare, val); -} #endif } // namespace Kokkos diff --git a/packages/kokkos/core/unit_test/configuration/test-code/test_config_arch_list.bash b/packages/kokkos/core/unit_test/configuration/test-code/test_config_arch_list.bash index 5ff781b96fc0..8fe8e2b5ecea 100755 --- a/packages/kokkos/core/unit_test/configuration/test-code/test_config_arch_list.bash +++ b/packages/kokkos/core/unit_test/configuration/test-code/test_config_arch_list.bash @@ -4,7 +4,7 @@ HostArch=(SNB HSW SKX KNL) DeviceArch=(Kepler35 Kepler37 Pascal60 Pascal61 Volta70) if [ ! 
-z "$KOKKOS_HOST_ARCH_TEST" ]; then export KOKKOS_ARCH_TEST=1 - HostArch=(WSM SNB HSW SKX WSM AMDAVX ARMv80 ARMv81 BDW KNC KNL BGQ Power7 Power8 Power9 Zen Zen2 ARMv8_ThunderX ARMv8_ThunderX2) + HostArch=(WSM SNB HSW SKX WSM AMDAVX ARMv80 ARMv81 BDW KNC KNL BGQ Power7 Power8 Power9 Zen Zen2 Zen3 ARMv8_ThunderX ARMv8_ThunderX2) DeviceArch=() fi diff --git a/packages/kokkos/generate_makefile.bash b/packages/kokkos/generate_makefile.bash index e9871b436971..c601e0ee161f 100755 --- a/packages/kokkos/generate_makefile.bash +++ b/packages/kokkos/generate_makefile.bash @@ -157,6 +157,7 @@ display_help_text() { echo " AMDAVX = AMD CPU" echo " ZEN = AMD Zen-Core CPU" echo " ZEN2 = AMD Zen2-Core CPU" + echo " ZEN3 = AMD Zen3-Core CPU" echo " [AMD: GPU]" echo " VEGA900 = AMD GPU MI25 GFX900" echo " VEGA906 = AMD GPU MI50/MI60 GFX906" diff --git a/packages/kokkos/gnu_generate_makefile.bash b/packages/kokkos/gnu_generate_makefile.bash index ea509669f068..8a463270c855 100755 --- a/packages/kokkos/gnu_generate_makefile.bash +++ b/packages/kokkos/gnu_generate_makefile.bash @@ -137,6 +137,7 @@ do echo " AMDAVX = AMD CPU" echo " ZEN = AMD Zen-Core CPU" echo " ZEN2 = AMD Zen2-Core CPU" + echo " ZEN3 = AMD Zen3-Core CPU" echo " [ARM]" echo " ARMv80 = ARMv8.0 Compatible CPU" echo " ARMv81 = ARMv8.1 Compatible CPU" diff --git a/packages/kokkos/master_history.txt b/packages/kokkos/master_history.txt index e746bd7d0103..be8a5e7da5f4 100644 --- a/packages/kokkos/master_history.txt +++ b/packages/kokkos/master_history.txt @@ -23,3 +23,5 @@ tag: 3.1.01 date: 05:04:2020 master: 785d19f2 release: 2be028bc tag: 3.2.00 date: 08:19:2020 master: 3b2fdc7e release: 5dc6d303 tag: 3.3.00 date: 12:16:2020 master: 734f577a release: 1535ba5c tag: 3.3.01 date: 01:06:2021 master: 6d65b5a3 release: 4d23839c +tag: 3.4.00 date: 04:26:2021 master: 1fb0c284 release: 5d7738d6 +tag: 3.4.01 date: 05:20:2021 master: 4b97a22f release: 410b15c8 diff --git 
a/packages/kokkos/scripts/testing_scripts/generate_makefile.bash b/packages/kokkos/scripts/testing_scripts/generate_makefile.bash index f21124ed6e71..ff9620efa689 100755 --- a/packages/kokkos/scripts/testing_scripts/generate_makefile.bash +++ b/packages/kokkos/scripts/testing_scripts/generate_makefile.bash @@ -129,6 +129,7 @@ do echo " AMDAVX = AMD CPU" echo " ZEN = AMD Zen-Core CPU" echo " ZEN2 = AMD Zen2-Core CPU" + echo " ZEN3 = AMD Zen3-Core CPU" echo " [ARM]" echo " ARMv80 = ARMv8.0 Compatible CPU" echo " ARMv81 = ARMv8.1 Compatible CPU" diff --git a/packages/muelu/adapters/xpetra/MueLu_RefMaxwell_def.hpp b/packages/muelu/adapters/xpetra/MueLu_RefMaxwell_def.hpp index 2c26f75623cd..dbf655c5e2ac 100644 --- a/packages/muelu/adapters/xpetra/MueLu_RefMaxwell_def.hpp +++ b/packages/muelu/adapters/xpetra/MueLu_RefMaxwell_def.hpp @@ -109,7 +109,8 @@ #include // Stratimikos includes #include -#include +#include +#include "Teuchos_AbstractFactoryStd.hpp" // Ifpack2 includes #ifdef HAVE_MUELU_IFPACK2 #include @@ -171,34 +172,6 @@ namespace MueLu { FindNonZeros(myColsToZero->getData(0),dirichletCols); } - - template - void ApplyRowSumCriterion(const Xpetra::Matrix& A, - const typename Teuchos::ScalarTraits::magnitudeType rowSumTol, - Teuchos::ArrayRCP& dirichletRows) - { - typedef Teuchos::ScalarTraits STS; - RCP> rowmap = A.getRowMap(); - for (LocalOrdinal row = 0; row < Teuchos::as(rowmap->getNodeNumElements()); ++row) { - size_t nnz = A.getNumEntriesInLocalRow(row); - ArrayView indices; - ArrayView vals; - A.getLocalRowView(row, indices, vals); - - Scalar rowsum = STS::zero(); - Scalar diagval = STS::zero(); - for (LocalOrdinal colID = 0; colID < Teuchos::as(nnz); colID++) { - LocalOrdinal col = indices[colID]; - if (row == col) - diagval = vals[colID]; - rowsum += vals[colID]; - } - if (STS::real(rowsum) > STS::magnitude(diagval) * rowSumTol) - dirichletRows[row] = true; - } - } - - #ifdef HAVE_MUELU_KOKKOS_REFACTOR template @@ -259,33 +232,6 @@ namespace MueLu { 
FindNonZeros(myColsToZero->getDeviceLocalView(),dirichletCols); } - - template - void ApplyRowSumCriterion(const Xpetra::Matrix& A, - const typename Teuchos::ScalarTraits::magnitudeType rowSumTol, - Kokkos::View & dirichletRows) - { - typedef Teuchos::ScalarTraits STS; - RCP> rowmap = A.getRowMap(); - for (LocalOrdinal row = 0; row < Teuchos::as(rowmap->getNodeNumElements()); ++row) { - size_t nnz = A.getNumEntriesInLocalRow(row); - ArrayView indices; - ArrayView vals; - A.getLocalRowView(row, indices, vals); - - Scalar rowsum = STS::zero(); - Scalar diagval = STS::zero(); - for (LocalOrdinal colID = 0; colID < Teuchos::as(nnz); colID++) { - LocalOrdinal col = indices[colID]; - if (row == col) - diagval = vals[colID]; - rowsum += vals[colID]; - } - if (STS::real(rowsum) > STS::magnitude(diagval) * rowSumTol) - dirichletRows(row) = true; - } - } - #endif template @@ -416,7 +362,7 @@ namespace MueLu { BCrowsKokkos_ = Utilities_kokkos::DetectDirichletRows(*SM_Matrix_,Teuchos::ScalarTraits::eps(),/*count_twos_as_dirichlet=*/true); if (rowSumTol > 0.) - ApplyRowSumCriterion(*SM_Matrix_, rowSumTol, BCrowsKokkos_); + Utilities_kokkos::ApplyRowSumCriterion(*SM_Matrix_, rowSumTol, BCrowsKokkos_); BCcolsKokkos_ = Kokkos::View(Kokkos::ViewAllocateWithoutInitializing("dirichletCols"), D0_Matrix_->getColMap()->getNodeNumElements()); BCdomainKokkos_ = Kokkos::View(Kokkos::ViewAllocateWithoutInitializing("dirichletCols"), D0_Matrix_->getDomainMap()->getNodeNumElements()); @@ -438,7 +384,7 @@ namespace MueLu { BCrows_ = Teuchos::arcp_const_cast(Utilities::DetectDirichletRows(*SM_Matrix_,Teuchos::ScalarTraits::eps(),/*count_twos_as_dirichlet=*/true)); if (rowSumTol > 0.) 
- ApplyRowSumCriterion(*SM_Matrix_, rowSumTol, BCrows_); + Utilities::ApplyRowSumCriterion(*SM_Matrix_, rowSumTol, BCrows_); BCcols_.resize(D0_Matrix_->getColMap()->getNodeNumElements()); BCdomain_.resize(D0_Matrix_->getDomainMap()->getNodeNumElements()); @@ -2408,7 +2354,7 @@ namespace MueLu { AHBCrows[i*dim+k] = BCdomain_[i]; magnitudeType rowSumTol = parameterList_.get("refmaxwell: row sum drop tol (1,1)",-1.0); if (rowSumTol > 0.) - ApplyRowSumCriterion(*AH_, rowSumTol, AHBCrows); + Utilities::ApplyRowSumCriterion(*AH_, rowSumTol, AHBCrows); if (applyBCsToH_) Utilities::ApplyOAZToMatrixRows(AH_, AHBCrows); } @@ -3010,10 +2956,11 @@ namespace MueLu { RCP > thyraA = Xpetra::ThyraUtils::toThyra(Teuchos::rcp_dynamic_cast>(A)->getCrsMatrix()); Stratimikos::DefaultLinearSolverBuilder linearSolverBuilder; - Stratimikos::enableMueLu(linearSolverBuilder); + typedef Thyra::PreconditionerFactoryBase Base; + typedef Thyra::MueLuPreconditionerFactory ImplMueLu; + linearSolverBuilder.setPreconditioningStrategyFactory(Teuchos::abstractFactoryStd(), "MueLu"); #ifdef HAVE_MUELU_IFPACK2 // Register Ifpack2 as a Stratimikos preconditioner strategy. - typedef Thyra::PreconditionerFactoryBase Base; typedef Thyra::Ifpack2PreconditionerFactory > Impl; linearSolverBuilder.setPreconditioningStrategyFactory(Teuchos::abstractFactoryStd(), "Ifpack2"); #endif diff --git a/packages/muelu/doc/UsersGuide/masterList.xml b/packages/muelu/doc/UsersGuide/masterList.xml index a36a7e2ba1f7..efdcf0a2371e 100644 --- a/packages/muelu/doc/UsersGuide/masterList.xml +++ b/packages/muelu/doc/UsersGuide/masterList.xml @@ -411,6 +411,24 @@ parameter not existing in ML + + aggregation: classical scheme + string + "direct" + Prolongator formation option for classical coarsening. + false + parameter not existing in ML + + + + aggregation: row sum drop tol + double + -1.0 + Detection threshold for mass-dominated rows. 
Defaults to -1 (meaning disabled) + false + parameter not existing in ML + + aggregation: block diagonal: interleaved blocksize int @@ -826,11 +844,10 @@ aggregation: coarsening order int 0 - The interpolation order used while constructing these aggregates, this value will be passed to the prolongator factory. + The interpolation order used while constructing these aggregates, this value will be passed to the prolongator factory. There, possible values are 0 for piece-wise constant and 1 for piece-wise linear interpolation to transfer values from coarse points to fine points. parameter not existing in ML - aggregation: pairwise: size int @@ -1177,15 +1194,6 @@ Only used when tentative: calculate qr is set to false. not supported by ML - - interp: interpolation order - int - 1 - Interpolation order used to interpolate values from coarse points to fine points. Possible values are 0 for piece-wise constant interpolation and 1 for piece-wise linear interpolation. This parameter is set to 1 by default. - true - parameter not existing in ML - - interp: build coarse coordinates bool diff --git a/packages/muelu/doc/UsersGuide/options_aggregation.tex b/packages/muelu/doc/UsersGuide/options_aggregation.tex index aac7f46e4474..0f2c5c2a6e34 100644 --- a/packages/muelu/doc/UsersGuide/options_aggregation.tex +++ b/packages/muelu/doc/UsersGuide/options_aggregation.tex @@ -62,5 +62,5 @@ \cbb{aggregation: number of spatial dimensions}{int}{3}{The number of spatial dimensions in the problem.} -\cbb{aggregation: coarsening order}{int}{0}{The interpolation order used while constructing these aggregates, this value will be passed to the prolongator factory.} +\cbb{aggregation: coarsening order}{int}{0}{The interpolation order used while constructing these aggregates, this value will be passed to the prolongator factory. There, possible values are 0 for piece-wise constant and 1 for piece-wise linear interpolation to transfer values from coarse points to fine points. 
} \ No newline at end of file diff --git a/packages/muelu/doc/UsersGuide/options_multigrid.tex b/packages/muelu/doc/UsersGuide/options_multigrid.tex index 9b6382f689bf..fdec2ab25d34 100644 --- a/packages/muelu/doc/UsersGuide/options_multigrid.tex +++ b/packages/muelu/doc/UsersGuide/options_multigrid.tex @@ -9,8 +9,6 @@ \cbb{sa: use filtered matrix}{bool}{true}{Matrix to use for smoothing the tentative prolongator. The two options are: to use the original matrix, and to use the filtered matrix with filtering based on filtered graph used for aggregation.} -\cbb{interp: interpolation order}{int}{1}{Interpolation order used to interpolate values from coarse points to fine points. Possible values are 0 for piece-wise constant interpolation and 1 for piece-wise linear interpolation. This parameter is set to 1 by default.} - \cbb{interp: build coarse coordinates}{bool}{true}{If false, skip the calculation of coarse coordinates.} \cbb{filtered matrix: use lumping}{bool}{true}{Lump (add to diagonal) dropped entries during the construction of a filtered matrix. This allows user to preserve constant nullspace.} diff --git a/packages/muelu/doc/UsersGuide/paramlist.tex b/packages/muelu/doc/UsersGuide/paramlist.tex index 31c1a9baaed3..782a7615a83a 100644 --- a/packages/muelu/doc/UsersGuide/paramlist.tex +++ b/packages/muelu/doc/UsersGuide/paramlist.tex @@ -115,7 +115,7 @@ \cbb{aggregation: number of spatial dimensions}{int}{3}{The number of spatial dimensions in the problem.} -\cbb{aggregation: coarsening order}{int}{0}{The interpolation order used while constructing these aggregates, this value will be passed to the prolongator factory.} +\cbb{aggregation: coarsening order}{int}{0}{The interpolation order used while constructing these aggregates, this value will be passed to the prolongator factory. There, possible values are 0 for piece-wise constant and 1 for piece-wise linear interpolation to transfer values from coarse points to fine points. 
} \cbb{aggregate qualities: check symmetry}{bool}{false}{Whether to check symmetry and use nonsymmetric aggregate quality estimate if necessary.} @@ -168,8 +168,6 @@ \cbb{sa: use filtered matrix}{bool}{true}{Matrix to use for smoothing the tentative prolongator. The two options are: to use the original matrix, and to use the filtered matrix with filtering based on filtered graph used for aggregation.} -\cbb{interp: interpolation order}{int}{1}{Interpolation order used to interpolate values from coarse points to fine points. Possible values are 0 for piece-wise constant interpolation and 1 for piece-wise linear interpolation. This parameter is set to 1 by default.} - \cbb{interp: build coarse coordinates}{bool}{true}{If false, skip the calculation of coarse coordinates.} \cbb{filtered matrix: use lumping}{bool}{true}{Lump (add to diagonal) dropped entries during the construction of a filtered matrix. This allows user to preserve constant nullspace.} diff --git a/packages/muelu/doc/UsersGuide/paramlist_hidden.tex b/packages/muelu/doc/UsersGuide/paramlist_hidden.tex index 4a85d8e27376..5b5c5c8be0bb 100644 --- a/packages/muelu/doc/UsersGuide/paramlist_hidden.tex +++ b/packages/muelu/doc/UsersGuide/paramlist_hidden.tex @@ -70,6 +70,10 @@ aggregation. Possible values: "classical", "distance laplacian", "unsupported vector smoothing"} +\cbb{aggregation: classical scheme}{string}{"direct"}{Prolongator formation option for classical coarsening.} + +\cbb{aggregation: row sum drop tol}{double}{-1.0}{Detection threshold for mass-dominated rows. Defaults to -1 (meaning disabled)} + \cbb{aggregation: block diagonal: interleaved blocksize}{int}{3}{Effective block size to use for block-diagonalization. 
This assumes the PDE is interleaved} @@ -173,7 +177,7 @@ \cbb{aggregation: number of spatial dimensions}{int}{3}{The number of spatial dimensions in the problem.} -\cbb{aggregation: coarsening order}{int}{0}{The interpolation order used while constructing these aggregates, this value will be passed to the prolongator factory.} +\cbb{aggregation: coarsening order}{int}{0}{The interpolation order used while constructing these aggregates, this value will be passed to the prolongator factory. There, possible values are 0 for piece-wise constant and 1 for piece-wise linear interpolation to transfer values from coarse points to fine points. } \cbb{aggregation: pairwise: size}{int}{8}{Target size for pairwise aggregation. The number of pairwise steps used will be log base-2 of this number.} @@ -260,8 +264,6 @@ \cbb{sa: rowsumabs diagonal replacement value}{double}{0.0}{If it's determined that a diagonal entry in prolongator smoothing is too small, replace that entry with this value.} -\cbb{interp: interpolation order}{int}{1}{Interpolation order used to interpolate values from coarse points to fine points. Possible values are 0 for piece-wise constant interpolation and 1 for piece-wise linear interpolation. 
This parameter is set to 1 by default.} - \cbb{interp: build coarse coordinates}{bool}{true}{If false, skip the calculation of coarse coordinates.} \cba{transfer: params}{\parameterlist}{Sublist of options for use by transfer.} diff --git a/packages/muelu/research/regionMG/example/elasticity_3d.xml b/packages/muelu/research/regionMG/example/elasticity_3d.xml index 8f4323366bef..2cea9bc61167 100644 --- a/packages/muelu/research/regionMG/example/elasticity_3d.xml +++ b/packages/muelu/research/regionMG/example/elasticity_3d.xml @@ -39,7 +39,7 @@ - + diff --git a/packages/muelu/research/regionMG/example/poisson_3d.xml b/packages/muelu/research/regionMG/example/poisson_3d.xml index 2d552bd84b62..8bf2429c8350 100644 --- a/packages/muelu/research/regionMG/example/poisson_3d.xml +++ b/packages/muelu/research/regionMG/example/poisson_3d.xml @@ -37,7 +37,7 @@ - + diff --git a/packages/muelu/research/regionMG/test/structured/structured_1dof-complex.xml b/packages/muelu/research/regionMG/test/structured/structured_1dof-complex.xml index 02a6e9ef291d..dbc1d0b713ea 100644 --- a/packages/muelu/research/regionMG/test/structured/structured_1dof-complex.xml +++ b/packages/muelu/research/regionMG/test/structured/structured_1dof-complex.xml @@ -37,7 +37,7 @@ - + diff --git a/packages/muelu/research/regionMG/test/structured/structured_1dof.xml b/packages/muelu/research/regionMG/test/structured/structured_1dof.xml index 36d028fd48a4..8d36dec5bbfd 100644 --- a/packages/muelu/research/regionMG/test/structured/structured_1dof.xml +++ b/packages/muelu/research/regionMG/test/structured/structured_1dof.xml @@ -37,7 +37,7 @@ - + diff --git a/packages/muelu/research/regionMG/test/structured/structured_1dof_3level.xml b/packages/muelu/research/regionMG/test/structured/structured_1dof_3level.xml index d8e25a82f5cb..ce720e370686 100644 --- a/packages/muelu/research/regionMG/test/structured/structured_1dof_3level.xml +++ b/packages/muelu/research/regionMG/test/structured/structured_1dof_3level.xml 
@@ -37,7 +37,7 @@ - + diff --git a/packages/muelu/research/regionMG/test/structured/structured_linear_1dof.xml b/packages/muelu/research/regionMG/test/structured/structured_linear_1dof.xml index 1dacffa64316..ff6e61c0f0ee 100644 --- a/packages/muelu/research/regionMG/test/structured/structured_linear_1dof.xml +++ b/packages/muelu/research/regionMG/test/structured/structured_linear_1dof.xml @@ -37,7 +37,7 @@ - + diff --git a/packages/muelu/research/regionMG/test/structured/structured_linear_1dof_comp.xml b/packages/muelu/research/regionMG/test/structured/structured_linear_1dof_comp.xml index 57a77564b2e7..b83bcabee52f 100644 --- a/packages/muelu/research/regionMG/test/structured/structured_linear_1dof_comp.xml +++ b/packages/muelu/research/regionMG/test/structured/structured_linear_1dof_comp.xml @@ -37,7 +37,7 @@ - + diff --git a/packages/muelu/research/regionMG/test/structured/structured_linear_3dof.xml b/packages/muelu/research/regionMG/test/structured/structured_linear_3dof.xml index 48f6d321d601..dc58deed4634 100644 --- a/packages/muelu/research/regionMG/test/structured/structured_linear_3dof.xml +++ b/packages/muelu/research/regionMG/test/structured/structured_linear_3dof.xml @@ -37,7 +37,7 @@ - + diff --git a/packages/muelu/src/CMakeLists.txt b/packages/muelu/src/CMakeLists.txt index 359b0ad5b9eb..21cd0c294c10 100644 --- a/packages/muelu/src/CMakeLists.txt +++ b/packages/muelu/src/CMakeLists.txt @@ -44,6 +44,7 @@ INCLUDE_DIRECTORIES(${DIR}/Transfers/BaseClass) INCLUDE_DIRECTORIES(${DIR}/Transfers/BlackBox) INCLUDE_DIRECTORIES(${DIR}/Smoothers/BlockedSmoothers) INCLUDE_DIRECTORIES(${DIR}/Transfers/BlockedTransfers) +INCLUDE_DIRECTORIES(${DIR}/Transfers/Classical) INCLUDE_DIRECTORIES(${DIR}/Transfers/Energy-Minimization) INCLUDE_DIRECTORIES(${DIR}/Transfers/Energy-Minimization/Solvers) INCLUDE_DIRECTORIES(${DIR}/Transfers/GeneralGeometric) @@ -387,6 +388,7 @@ TRILINOS_CREATE_CLIENT_TEMPLATE_HEADERS(${DIR}/Smoothers/BlockedSmoothers NOSIER 
TRILINOS_CREATE_CLIENT_TEMPLATE_HEADERS(${DIR}/Transfers/BaseClass NOSIERRABJAM) TRILINOS_CREATE_CLIENT_TEMPLATE_HEADERS(${DIR}/Transfers/BlackBox NOSIERRABJAM) TRILINOS_CREATE_CLIENT_TEMPLATE_HEADERS(${DIR}/Transfers/BlockedTransfers NOSIERRABJAM) +TRILINOS_CREATE_CLIENT_TEMPLATE_HEADERS(${DIR}/Transfers/Classical NOSIERRABJAM) TRILINOS_CREATE_CLIENT_TEMPLATE_HEADERS(${DIR}/Transfers/Generic NOSIERRABJAM) TRILINOS_CREATE_CLIENT_TEMPLATE_HEADERS(${DIR}/Transfers/Energy-Minimization NOSIERRABJAM) TRILINOS_CREATE_CLIENT_TEMPLATE_HEADERS(${DIR}/Transfers/Energy-Minimization/Solvers NOSIERRABJAM) @@ -458,3 +460,6 @@ TRIBITS_ADD_LIBRARY( # touch CMakeLists.txt because a new file was created in Utils/ExplicitInstantiation of Utils/ForwardDeclaration # touch CMakeLists.txt because a new file was created in Utils/ExplicitInstantiation of Utils/ForwardDeclaration # touch CMakeLists.txt because a new file was created in Utils/ExplicitInstantiation of Utils/ForwardDeclaration +# touch CMakeLists.txt because a new file was created in Utils/ExplicitInstantiation of Utils/ForwardDeclaration +# touch CMakeLists.txt because a new file was created in Utils/ExplicitInstantiation of Utils/ForwardDeclaration +# touch CMakeLists.txt because a new file was created in Utils/ExplicitInstantiation of Utils/ForwardDeclaration diff --git a/packages/muelu/src/Graph/Containers/MueLu_GraphBase.hpp b/packages/muelu/src/Graph/Containers/MueLu_GraphBase.hpp index 1193faa8e4c8..425cda7da535 100644 --- a/packages/muelu/src/Graph/Containers/MueLu_GraphBase.hpp +++ b/packages/muelu/src/Graph/Containers/MueLu_GraphBase.hpp @@ -70,6 +70,10 @@ namespace MueLu { #include "MueLu_UseShortNamesOrdinal.hpp" public: + // For Zoltan2 compatibility + using lno_t = LocalOrdinal; + using gno_t = GlobalOrdinal; + using node_t = Node; //! @name Constructors/Destructors. 
//@{ diff --git a/packages/muelu/src/Graph/Containers/MueLu_Graph_decl.hpp b/packages/muelu/src/Graph/Containers/MueLu_Graph_decl.hpp index ab9999fefaef..09609adcf60a 100644 --- a/packages/muelu/src/Graph/Containers/MueLu_Graph_decl.hpp +++ b/packages/muelu/src/Graph/Containers/MueLu_Graph_decl.hpp @@ -94,6 +94,8 @@ namespace MueLu { //! Returns overlapping import map (nodes). const RCP GetImportMap() const { return graph_->getColMap(); } + const RCP GetGraph() const {return graph_;} + //! Set map with local ids of boundary nodes. void SetBoundaryNodeMap(const ArrayRCP& localDirichletNodes) { localDirichletNodes_ = localDirichletNodes; } diff --git a/packages/muelu/src/Graph/Containers/MueLu_LWGraph_decl.hpp b/packages/muelu/src/Graph/Containers/MueLu_LWGraph_decl.hpp index 64706667ba53..09f55a70193e 100644 --- a/packages/muelu/src/Graph/Containers/MueLu_LWGraph_decl.hpp +++ b/packages/muelu/src/Graph/Containers/MueLu_LWGraph_decl.hpp @@ -139,7 +139,16 @@ namespace MueLu { /// Return a simple one-line description of the Graph. std::string description() const { return "MueLu.description()"; } //FIXME use object's label + //! Return the row pointers of the local graph + const ArrayRCP getRowPtrs() const { + return rows_; + } + //! Return the list entries in the local graph + const ArrayRCP getEntries() const { + return columns_; + } + //! Print the Graph with some verbosity level to an FancyOStream object. 
//using MueLu::Describable::describe; // overloading, not hiding //void describe(Teuchos::FancyOStream &out, const VerbLevel verbLevel = Default) const;; diff --git a/packages/muelu/src/Graph/Containers/MueLu_Zoltan2GraphAdapter.hpp b/packages/muelu/src/Graph/Containers/MueLu_Zoltan2GraphAdapter.hpp new file mode 100644 index 000000000000..7014efcc3281 --- /dev/null +++ b/packages/muelu/src/Graph/Containers/MueLu_Zoltan2GraphAdapter.hpp @@ -0,0 +1,459 @@ +// @HEADER +// +// *********************************************************************** +// +// MueLu: A package for multigrid based preconditioning +// Copyright 2012 Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact +// Jonathan Hu (jhu@sandia.gov) +// Andrey Prokopenko (aprokop@sandia.gov) +// Ray Tuminaro (rstumin@sandia.gov) +// +// *********************************************************************** +// +// @HEADER + +#ifndef MUELU_ZOLTAN2GRAPHADAPTER_HPP_ +#define MUELU_ZOLTAN2GRAPHADAPTER_HPP_ + +#include "MueLu_ConfigDefs.hpp" + +#if defined(HAVE_MUELU_ZOLTAN2) + +#include +#include +#include +#include +#include +#include +#include +#include +#include "MueLu_GraphBase.hpp" + + + +// Zoltab2 InputTraits for MueLu Graph objects +namespace Zoltan2 { + +template +struct InputTraits > +{ + typedef Zoltan2::default_scalar_t scalar_t; + typedef LocalOrdinal lno_t; + typedef GlobalOrdinal gno_t; + typedef size_t offset_t; + typedef Zoltan2::default_part_t part_t; + typedef Node node_t; + static inline std::string name() {return "MueLu::Graph";} + + Z2_STATIC_ASSERT_TYPES // validate the types +}; +}//end namespace Zoltan2 + + +namespace MueLu { + +template +class MueLuGraphBaseAdapter : public Zoltan2::GraphAdapter { +public: + +#ifndef DOXYGEN_SHOULD_SKIP_THIS + typedef typename Zoltan2::InputTraits::scalar_t scalar_t; + typedef typename Zoltan2::InputTraits::offset_t offset_t; + typedef typename Zoltan2::InputTraits::lno_t lno_t; + typedef typename Zoltan2::InputTraits::gno_t gno_t; + typedef typename Zoltan2::InputTraits::part_t part_t; + typedef typename Zoltan2::InputTraits::node_t node_t; + typedef User 
xgraph_t; + typedef User user_t; + typedef UserCoord userCoord_t; +#endif + + //! MueLu::GraphBase Compatibility Layer + const Teuchos::RCP< const Teuchos::Comm< int > > getComm() const { return graph_->GetComm();} + const Teuchos::RCP< const Xpetra::Map > getRowMap() const { return graph_->GetDomainMap();} + const RCP< const Xpetra::Map > getColMap() const { + // For some GraphBases' this is a ColMap, in others it is a seperate map that is + // only non-null in parallel. + Teuchos::RCP > map = graph_->GetImportMap(); + if(map.is_null()) map = graph_->GetDomainMap(); + return map; + } + size_t getNodeNumEntries() const { return graph_->GetNodeNumEdges();} + size_t getNodeNumRows() const { return getRowMap()->getNodeNumElements();} + size_t getNodeNumCols() const { return getColMap()->getNodeNumElements();} + + void getLocalRowView(lno_t LocalRow, Teuchos::ArrayView< const lno_t > &indices) const { + indices = graph_->getNeighborVertices(LocalRow); + } + + + + /*! \brief Destructor + */ + ~MueLuGraphBaseAdapter() { } + + /*! \brief Constructor for graph with no weights or coordinates. + * \param ingraph the Epetra_CrsGraph, Tpetra::CrsGraph or Xpetra::CrsGraph + * \param numVtxWeights the number of weights per vertex (default = 0) + * \param numEdgeWeights the number of weights per edge (default = 0) + * + * Most adapters do not have RCPs in their interface. This + * one does because the user is obviously a Trilinos user. + */ + + MueLuGraphBaseAdapter(const RCP &ingraph, + int nVtxWeights=0, int nEdgeWeights=0); + + /*! \brief Provide a pointer to weights for the primary entity type. + * \param val A pointer to the weights for index \c idx. + * \param stride A stride for the \c val array. If \stride is + * \c k, then val[n * k] is the weight for the + * \c n th entity for index \idx. + * \param idx A number from 0 to one less than + * weight idx specified in the constructor. 
+ * + * The order of the weights should match the order that + * entities appear in the input data structure. + */ + + void setWeights(const scalar_t *val, int stride, int idx); + + /*! \brief Provide a pointer to vertex weights. + * \param val A pointer to the weights for index \c idx. + * \param stride A stride for the \c val array. If \stride is + * \c k, then val[n * k] is the weight for the + * \c n th vertex for index \idx. + * \param idx A number from 0 to one less than + * number of vertex weights specified in the constructor. + * + * The order of the vertex weights should match the order that + * vertices appear in the input data structure. + * \code + * TheGraph->getRowMap()->getNodeElementList() + * \endcode + */ + + void setVertexWeights(const scalar_t *val, int stride, int idx); + + /*! \brief Specify an index for which the weight should be + the degree of the entity + * \param idx Zoltan2 will use the entity's + * degree as the entity weight for index \c idx. + */ + void setWeightIsDegree(int idx); + + /*! \brief Specify an index for which the vertex weight should be + the degree of the vertex + * \param idx Zoltan2 will use the vertex's + * degree as the vertex weight for index \c idx. + */ + void setVertexWeightIsDegree(int idx); + + /*! \brief Provide a pointer to edge weights. + * \param val A pointer to the weights for index \c idx. + * \param stride A stride for the \c val array. If \stride is + * \c k, then val[n * k] is the weight for the + * \c n th edge for index \idx. + * \param dim A number from 0 to one less than the number + * of edge weights specified in the constructor. + * + * The order of the edge weights should follow the order that the + * the vertices and edges appear in the input data structure. 
+ * + * By vertex: + * \code + * TheGraph->getRowMap()->getNodeElementList() + * \endcode + * + * Then by vertex neighbor: + * \code + * TheGraph->getLocalRowView(vertexNum, neighborList); + * \endcode + */ + + void setEdgeWeights(const scalar_t *val, int stride, int idx); + + /*! \brief Access to Xpetra-wrapped user's graph. + */ + RCP getXpetraGraph() const { return graph_; } + + /*! \brief Access to user's graph + */ + RCP getUserGraph() const { return ingraph_; } + + //////////////////////////////////////////////////// + // The Adapter interface. + //////////////////////////////////////////////////// + + //////////////////////////////////////////////////// + // The GraphAdapter interface. + //////////////////////////////////////////////////// + + // TODO: Assuming rows == objects; + // TODO: Need to add option for columns or nonzeros? + size_t getLocalNumVertices() const { return getNodeNumRows(); } + + void getVertexIDsView(const gno_t *&ids) const + { + ids = NULL; + if (getLocalNumVertices()) + ids = getRowMap()->getNodeElementList().getRawPtr(); + } + + size_t getLocalNumEdges() const { return getNodeNumEntries(); } + + void getEdgesView(const offset_t *&offsets, const gno_t *&adjIds) const + { + offsets = offs_.getRawPtr(); + adjIds = (getLocalNumEdges() ? 
adjids_.getRawPtr() : NULL); + } + + int getNumWeightsPerVertex() const { return nWeightsPerVertex_;} + + void getVertexWeightsView(const scalar_t *&weights, int &stride, + int idx) const + { + if(idx<0 || idx >= nWeightsPerVertex_) + { + std::ostringstream emsg; + emsg << __FILE__ << ":" << __LINE__ + << " Invalid vertex weight index " << idx << std::endl; + throw std::runtime_error(emsg.str()); + } + + + size_t length; + vertexWeights_[idx].getStridedList(length, weights, stride); + } + + bool useDegreeAsVertexWeight(int idx) const {return vertexDegreeWeight_[idx];} + + int getNumWeightsPerEdge() const { return nWeightsPerEdge_;} + + void getEdgeWeightsView(const scalar_t *&weights, int &stride, int idx) const + { + if(idx<0 || idx >= nWeightsPerEdge_) + { + std::ostringstream emsg; + emsg << __FILE__ << ":" << __LINE__ + << " Invalid edge weight index " << idx << std::endl; + throw std::runtime_error(emsg.str()); + } + + + size_t length; + edgeWeights_[idx].getStridedList(length, weights, stride); + } + + + template + void applyPartitioningSolution(const User &in, User *&out, + const Zoltan2::PartitioningSolution &solution) const { + TEUCHOS_TEST_FOR_EXCEPTION(1, std::invalid_argument,"applyPartitionlingSolution not implemeneted"); +} + + template + void applyPartitioningSolution(const User &in, RCP &out, + const Zoltan2::PartitioningSolution &solution) const { + TEUCHOS_TEST_FOR_EXCEPTION(1, std::invalid_argument,"applyPartitionlingSolution not implemeneted"); + } + + +private: + + RCP ingraph_; + RCP graph_; + RCP > comm_; + + ArrayRCP offs_; + ArrayRCP adjids_; + + int nWeightsPerVertex_; + ArrayRCP > vertexWeights_; + ArrayRCP vertexDegreeWeight_; + + int nWeightsPerEdge_; + ArrayRCP > edgeWeights_; + + int coordinateDim_; + ArrayRCP > coords_; + +}; + + +///////////////////////////////////////////////////////////////// +// Definitions +///////////////////////////////////////////////////////////////// + +template + 
MueLuGraphBaseAdapter::MueLuGraphBaseAdapter( + const RCP &ingraph, int nVtxWgts, int nEdgeWgts): + ingraph_(ingraph), graph_(), comm_() , offs_(), adjids_(), + nWeightsPerVertex_(nVtxWgts), vertexWeights_(), vertexDegreeWeight_(), + nWeightsPerEdge_(nEdgeWgts), edgeWeights_(), + coordinateDim_(0), coords_() +{ + typedef Zoltan2::StridedData input_t; + graph_ = ingraph; + + comm_ = getRowMap()->getComm(); + size_t nvtx = getNodeNumRows(); + size_t nedges = getNodeNumEntries(); + + // Unfortunately we have to copy the offsets and edge Ids + // because edge Ids are not usually stored in vertex id order. + size_t n = nvtx + 1; + offs_.resize(n); + offset_t* offs = const_cast(offs_.getRawPtr()); + gno_t* adjids=0; + if(nedges > 0) { + adjids_.resize(nedges); + adjids = const_cast(adjids_.getRawPtr()); + } + + offs[0] = 0; + for (size_t v=0; v < nvtx; v++){ + ArrayView nbors; + getLocalRowView(v, nbors); + offs[v+1] = offs[v] + nbors.size(); + for (offset_t e=offs[v], i=0; e < offs[v+1]; e++) { + adjids[e] = getColMap()->getGlobalElement(nbors[i++]); + } + } + + if (nWeightsPerVertex_ > 0) { + vertexWeights_ = + arcp(new input_t[nWeightsPerVertex_], 0, nWeightsPerVertex_, true); + vertexDegreeWeight_ = + arcp(new bool[nWeightsPerVertex_], 0, nWeightsPerVertex_, true); + for (int i=0; i < nWeightsPerVertex_; i++) + vertexDegreeWeight_[i] = false; + } + + +} + +//////////////////////////////////////////////////////////////////////////// +template + void MueLuGraphBaseAdapter::setWeights( + const scalar_t *weightVal, int stride, int idx) +{ + if (this->getPrimaryEntityType() == Zoltan2::GRAPH_VERTEX) + setVertexWeights(weightVal, stride, idx); + else + setEdgeWeights(weightVal, stride, idx); +} + +//////////////////////////////////////////////////////////////////////////// +template + void MueLuGraphBaseAdapter::setVertexWeights( + const scalar_t *weightVal, int stride, int idx) +{ + typedef Zoltan2::StridedData input_t; + + if(idx<0 || idx >= nWeightsPerVertex_) + { + 
std::ostringstream emsg; + emsg << __FILE__ << ":" << __LINE__ + << " Invalid vertex weight index " << idx << std::endl; + throw std::runtime_error(emsg.str()); + } + + size_t nvtx = getLocalNumVertices(); + ArrayRCP weightV(weightVal, 0, nvtx*stride, false); + vertexWeights_[idx] = input_t(weightV, stride); +} + +//////////////////////////////////////////////////////////////////////////// +template + void MueLuGraphBaseAdapter::setWeightIsDegree( + int idx) +{ + if (this->getPrimaryEntityType() == Zoltan2::GRAPH_VERTEX) + setVertexWeightIsDegree(idx); + else { + std::ostringstream emsg; + emsg << __FILE__ << "," << __LINE__ + << " error: setWeightIsNumberOfNonZeros is supported only for" + << " vertices" << std::endl; + throw std::runtime_error(emsg.str()); + } +} + +//////////////////////////////////////////////////////////////////////////// +template + void MueLuGraphBaseAdapter::setVertexWeightIsDegree( + int idx) +{ + if(idx<0 || idx >= nWeightsPerVertex_) + { + std::ostringstream emsg; + emsg << __FILE__ << ":" << __LINE__ + << " Invalid vertex weight index " << idx << std::endl; + throw std::runtime_error(emsg.str()); + } + + vertexDegreeWeight_[idx] = true; +} + +//////////////////////////////////////////////////////////////////////////// +template + void MueLuGraphBaseAdapter::setEdgeWeights( + const scalar_t *weightVal, int stride, int idx) +{ + typedef Zoltan2::StridedData input_t; + + if(idx<0 || idx >= nWeightsPerEdge_) + { + std::ostringstream emsg; + emsg << __FILE__ << ":" << __LINE__ + << " Invalid edge weight index " << idx << std::endl; + throw std::runtime_error(emsg.str()); + } + + size_t nedges = getLocalNumEdges(); + ArrayRCP weightV(weightVal, 0, nedges*stride, false); + edgeWeights_[idx] = input_t(weightV, stride); +} + + +} //namespace MueLu + + +#endif// MUELU_HAVE_ZOLTAN2 + +#endif diff --git a/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_def.hpp 
b/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_def.hpp index 937601632abd..4a2d2ca2e38f 100644 --- a/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_def.hpp +++ b/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_def.hpp @@ -118,13 +118,14 @@ namespace MueLu { #define SET_VALID_ENTRY(name) validParamList->setEntry(name, MasterList::getEntry(name)) SET_VALID_ENTRY("aggregation: drop tol"); SET_VALID_ENTRY("aggregation: Dirichlet threshold"); + SET_VALID_ENTRY("aggregation: row sum drop tol"); SET_VALID_ENTRY("aggregation: drop scheme"); SET_VALID_ENTRY("aggregation: block diagonal: interleaved blocksize"); SET_VALID_ENTRY("aggregation: distance laplacian directional weights"); { typedef Teuchos::StringToIntegralParameterEntryValidator validatorType; - validParamList->getEntry("aggregation: drop scheme").setValidator(rcp(new validatorType(Teuchos::tuple("classical", "distance laplacian","block diagonal","block diagonal classical","block diagonal distance laplacian"), "aggregation: drop scheme"))); + validParamList->getEntry("aggregation: drop scheme").setValidator(rcp(new validatorType(Teuchos::tuple("classical", "distance laplacian","signed classical","block diagonal","block diagonal classical","block diagonal distance laplacian","block diagonal signed classical"), "aggregation: drop scheme"))); } SET_VALID_ENTRY("aggregation: distance laplacian algo"); @@ -135,7 +136,7 @@ namespace MueLu { validParamList->set< RCP >("A", Teuchos::null, "Generating factory of the matrix A"); validParamList->set< RCP >("UnAmalgamationInfo", Teuchos::null, "Generating factory for UnAmalgamationInfo"); validParamList->set< RCP >("Coordinates", Teuchos::null, "Generating factory for Coordinates"); - validParamList->set< RCP >("BlockNumber", Teuchos::null, "Generating factory for Coordinates"); + validParamList->set< RCP >("BlockNumber", Teuchos::null, "Generating factory for BlockNUmber"); return validParamList; } 
@@ -154,7 +155,8 @@ namespace MueLu { if (algo == "distance laplacian" || algo == "block diagonal distance laplacian") { Input(currentLevel, "Coordinates"); } - if (algo == "block diagonal classical" || algo == "block diagonal distance laplacian" || algo == "block diagonal") { + if (algo == "block diagonal classical" || algo == "block diagonal distance laplacian" + || algo == "block diagonal" || algo == "block diagonal signed classical") { Input(currentLevel, "BlockNumber"); } } @@ -188,22 +190,32 @@ namespace MueLu { bool use_block_algorithm=false; LO interleaved_blocksize = as(pL.get("aggregation: block diagonal: interleaved blocksize")); + bool useSignedClassical = false; + + // NOTE: If we're doing blockDiagonal, we'll not want to do rowSum twice (we'll do it + // in the block diagonalizaiton). So we'll clobber the rowSumTol with -1.0 in this case + typename STS::magnitudeType rowSumTol = as(pL.get("aggregation: row sum drop tol")); + if(algo == "distance laplacian" ) { // Grab the coordinates for distance laplacian Coords = Get< RCP >(currentLevel, "Coordinates"); A = realA; } + else if(algo == "signed classical") { + useSignedClassical = true; + algo = "classical"; + A = realA; + } else if(algo == "block diagonal") { // Handle the "block diagonal" filtering and then leave BlockDiagonalize(currentLevel,realA,false); return; } - else if (algo == "block diagonal classical" || algo == "block diagonal distance laplacian") { + else if (algo == "block diagonal classical" || algo == "block diagonal distance laplacian" || algo == "block diagonal signed classical") { // Handle the "block diagonal" filtering, and then continue onward use_block_algorithm = true; RCP filteredMatrix = BlockDiagonalize(currentLevel,realA,true); - if(algo == "block diagonal") return; - else if(algo == "block diagonal distance laplacian") { + if(algo == "block diagonal distance laplacian") { // We now need to expand the coordinates by the interleaved blocksize RCP OldCoords = Get< RCP 
>(currentLevel, "Coordinates"); if (OldCoords->getLocalLength() != realA->getNodeNumRows()) { @@ -227,8 +239,13 @@ namespace MueLu { else if(algo == "block diagonal classical") { algo = "classical"; } - // Both cases + else if(algo == "block diagonal signed classical") { + algo = "classical"; + useSignedClassical = true; + } + // All cases A = filteredMatrix; + rowSumTol = -1.0; } else { A = realA; @@ -275,12 +292,13 @@ namespace MueLu { if (doExperimentalWrap) { TEUCHOS_TEST_FOR_EXCEPTION(predrop_ != null && algo != "classical", Exceptions::RuntimeError, "Dropping function must not be provided for \"" << algo << "\" algorithm"); - TEUCHOS_TEST_FOR_EXCEPTION(algo != "classical" && algo != "distance laplacian", Exceptions::RuntimeError, "\"algorithm\" must be one of (classical|distance laplacian)"); + TEUCHOS_TEST_FOR_EXCEPTION(algo != "classical" && algo != "distance laplacian" && algo != "signed classical", Exceptions::RuntimeError, "\"algorithm\" must be one of (classical|distance laplacian|signed classical)"); SC threshold = as(pL.get("aggregation: drop tol")); std::string distanceLaplacianAlgoStr = pL.get("aggregation: distance laplacian algo"); std::string classicalAlgoStr = pL.get("aggregation: classical algo"); real_type realThreshold = STS::magnitude(threshold);// CMS: Rename this to "magnitude threshold" sometime + //////////////////////////////////////////////////// // Remove this bit once we are confident that cut-based dropping works. 
#ifdef HAVE_MUELU_DEBUG @@ -337,6 +355,11 @@ namespace MueLu { const typename STS::magnitudeType dirichletThreshold = STS::magnitude(as(pL.get("aggregation: Dirichlet threshold"))); + + // NOTE: We don't support signed classical with cut drop at present + TEUCHOS_TEST_FOR_EXCEPTION(useSignedClassical && classicalAlgo != defaultAlgo, Exceptions::RuntimeError, "\"aggregation: classical algo\" != default is not supported for scalled classical aggregation"); + + GO numDropped = 0, numTotal = 0; std::string graphType = "unamalgamated"; //for description purposes only if (algo == "classical") { @@ -359,12 +382,14 @@ namespace MueLu { // At this points we either have // (predrop_ != null) // Therefore, it is sufficient to check only threshold - if (A->GetFixedBlockSize() == 1 && threshold == STS::zero() && A->hasCrsGraph()) { + if (A->GetFixedBlockSize() == 1 && threshold == STS::zero() && !useSignedClassical && A->hasCrsGraph()) { // Case 1: scalar problem, no dropping => just use matrix graph RCP graph = rcp(new Graph(A->getCrsGraph(), "graph of A")); // Detect and record rows that correspond to Dirichlet boundary conditions - ArrayRCP boundaryNodes; - boundaryNodes = MueLu::Utilities::DetectDirichletRows(*A, dirichletThreshold); + ArrayRCP boundaryNodes = Teuchos::arcp_const_cast(MueLu::Utilities::DetectDirichletRows(*A, dirichletThreshold)); + if (rowSumTol > 0.) 
+ Utilities::ApplyRowSumCriterion(*A, rowSumTol, boundaryNodes); + graph->SetBoundaryNodeMap(boundaryNodes); numTotal = A->getNodeNumEntries(); @@ -383,7 +408,8 @@ namespace MueLu { Set(currentLevel, "Graph", graph); } else if ( (A->GetFixedBlockSize() == 1 && threshold != STS::zero()) || - (A->GetFixedBlockSize() == 1 && threshold == STS::zero() && !A->hasCrsGraph())) { + (A->GetFixedBlockSize() == 1 && threshold == STS::zero() && !A->hasCrsGraph()) || + (A->GetFixedBlockSize() == 1 && useSignedClassical) ) { // Case 2: scalar problem with dropping => record the column indices of undropped entries, but still use original // graph's map information, e.g., whether index is local // OR a matrix without a CrsGraph @@ -392,9 +418,20 @@ namespace MueLu { ArrayRCP rows (A->getNodeNumRows()+1); ArrayRCP columns(A->getNodeNumEntries()); - RCP ghostedDiag = MueLu::Utilities::GetMatrixOverlappedDiagonal(*A); - const ArrayRCP ghostedDiagVals = ghostedDiag->getData(0); - ArrayRCP boundaryNodes = MueLu::Utilities::DetectDirichletRows(*A, dirichletThreshold); + using MT = typename STS::magnitudeType; + RCP ghostedDiag; + ArrayRCP ghostedDiagVals; + ArrayRCP negMaxOffDiagonal; + if(useSignedClassical) { + negMaxOffDiagonal = MueLu::Utilities::GetMatrixMaxMinusOffDiagonal(*A); + } + else { + ghostedDiag = MueLu::Utilities::GetMatrixOverlappedDiagonal(*A); + ghostedDiagVals = ghostedDiag->getData(0); + } + ArrayRCP boundaryNodes = Teuchos::arcp_const_cast(MueLu::Utilities::DetectDirichletRows(*A, dirichletThreshold)); + if (rowSumTol > 0.) 
+ Utilities::ApplyRowSumCriterion(*A, rowSumTol, boundaryNodes); LO realnnz = 0; rows[0] = 0; @@ -411,20 +448,36 @@ namespace MueLu { //FIXME For now, hardwiring the dropping in here LO rownnz = 0; - for (LO colID = 0; colID < Teuchos::as(nnz); colID++) { - LO col = indices[colID]; - - // we avoid a square root by using squared values - typename STS::magnitudeType aiiajj = STS::magnitude(threshold*threshold * ghostedDiagVals[col]*ghostedDiagVals[row]); // eps^2*|a_ii|*|a_jj| - typename STS::magnitudeType aij = STS::magnitude(vals[colID]*vals[colID]); // |a_ij|^2 - - if (aij > aiiajj || row == col) { + if(useSignedClassical) { + // Signed classical + for (LO colID = 0; colID < Teuchos::as(nnz); colID++) { + LO col = indices[colID]; + MT max_neg_aik = realThreshold * STS::real(negMaxOffDiagonal[row]); + MT neg_aij = - STS::real(vals[colID]); + //printf(" - a_ij = %6.4e >? %6.4e * %6.4e = alpha max(-aik)\n",neg_aij,threshold, negMaxOffDiagonal[row]); + if (neg_aij > max_neg_aik || row == col) { + columns[realnnz++] = col; + rownnz++; + } else + numDropped++; + } + rows[row+1] = realnnz; + } + else { + // Standard abs classical + for (LO colID = 0; colID < Teuchos::as(nnz); colID++) { + LO col = indices[colID]; + MT aiiajj = STS::magnitude(threshold*threshold * ghostedDiagVals[col]*ghostedDiagVals[row]); // eps^2*|a_ii|*|a_jj| + MT aij = STS::magnitude(vals[colID]*vals[colID]); // |a_ij|^2 + + if (aij > aiiajj || row == col) { columns[realnnz++] = col; rownnz++; - } else - numDropped++; + } else + numDropped++; + } + rows[row+1] = realnnz; } - rows[row+1] = realnnz; } else { /* Cut Algorithm */ @@ -581,8 +634,11 @@ namespace MueLu { // TODO If we use ArrayRCP, then we can record boundary nodes as usual. Size // TODO the array one bigger than the number of local rows, and the last entry can // TODO hold the actual number of boundary nodes. Clever, huh? 
- ArrayRCP pointBoundaryNodes; - pointBoundaryNodes = MueLu::Utilities::DetectDirichletRows(*A, dirichletThreshold); + ArrayRCP pointBoundaryNodes; + pointBoundaryNodes = Teuchos::arcp_const_cast(MueLu::Utilities::DetectDirichletRows(*A, dirichletThreshold)); + if (rowSumTol > 0.) + Utilities::ApplyRowSumCriterion(*A, rowSumTol, pointBoundaryNodes); + // extract striding information LO blkSize = A->GetFixedBlockSize(); //< the full block size (number of dofs per node in strided map) @@ -676,7 +732,6 @@ namespace MueLu { // Case 4: Multiple DOF/node problem with dropping const RCP rowMap = A->getRowMap(); const RCP colMap = A->getColMap(); - graphType = "amalgamated"; // build node row map (uniqueMap) and node column map (nonUniqueMap) @@ -701,8 +756,11 @@ namespace MueLu { // TODO If we use ArrayRCP, then we can record boundary nodes as usual. Size // TODO the array one bigger than the number of local rows, and the last entry can // TODO hold the actual number of boundary nodes. Clever, huh? - ArrayRCP pointBoundaryNodes; - pointBoundaryNodes = MueLu::Utilities::DetectDirichletRows(*A, dirichletThreshold); + ArrayRCP pointBoundaryNodes; + pointBoundaryNodes = Teuchos::arcp_const_cast(MueLu::Utilities::DetectDirichletRows(*A, dirichletThreshold)); + if (rowSumTol > 0.) + Utilities::ApplyRowSumCriterion(*A, rowSumTol, pointBoundaryNodes); + // extract striding information LO blkSize = A->GetFixedBlockSize(); //< the full block size (number of dofs per node in strided map) @@ -800,7 +858,6 @@ namespace MueLu { } else if (algo == "distance laplacian") { LO blkSize = A->GetFixedBlockSize(); GO indexBase = A->getRowMap()->getIndexBase(); - // [*0*] : FIXME // ap: somehow, if I move this line to [*1*], Belos throws an error // I'm not sure what's going on. Do we always have to Get data, if we did @@ -811,8 +868,10 @@ namespace MueLu { // TODO If we use ArrayRCP, then we can record boundary nodes as usual. 
Size // TODO the array one bigger than the number of local rows, and the last entry can // TODO hold the actual number of boundary nodes. Clever, huh? - ArrayRCP pointBoundaryNodes; - pointBoundaryNodes = MueLu::Utilities::DetectDirichletRows(*A, dirichletThreshold); + ArrayRCP pointBoundaryNodes; + pointBoundaryNodes = Teuchos::arcp_const_cast(MueLu::Utilities::DetectDirichletRows(*A, dirichletThreshold)); + if (rowSumTol > 0.) + Utilities::ApplyRowSumCriterion(*A, rowSumTol, pointBoundaryNodes); if ( (blkSize == 1) && (threshold == STS::zero()) ) { // Trivial case: scalar problem, no dropping. Can return original graph @@ -1558,6 +1617,7 @@ namespace MueLu { const ParameterList & pL = GetParameterList(); const typename STS::magnitudeType dirichletThreshold = STS::magnitude(as(pL.get("aggregation: Dirichlet threshold"))); + const typename STS::magnitudeType rowSumTol = as(pL.get("aggregation: row sum drop tol")); RCP BlockNumber = Get >(currentLevel, "BlockNumber"); RCP ghostedBlockNumber; @@ -1619,7 +1679,10 @@ namespace MueLu { else rows_graph[row+1] = realnnz; } - ArrayRCP boundaryNodes = MueLu::Utilities::DetectDirichletRows(*A, dirichletThreshold); + ArrayRCP boundaryNodes = Teuchos::arcp_const_cast(MueLu::Utilities::DetectDirichletRows(*A, dirichletThreshold)); + if (rowSumTol > 0.) 
+ Utilities::ApplyRowSumCriterion(*A, rowSumTol, boundaryNodes); + if(!generate_matrix) { // We can't resize an Arrayrcp and pass the checks for setAllValues diff --git a/packages/muelu/src/Graph/StructuredAggregation/MueLu_StructuredAggregationFactory_decl.hpp b/packages/muelu/src/Graph/StructuredAggregation/MueLu_StructuredAggregationFactory_decl.hpp index 926e5df4b554..c43758ee8352 100644 --- a/packages/muelu/src/Graph/StructuredAggregation/MueLu_StructuredAggregationFactory_decl.hpp +++ b/packages/muelu/src/Graph/StructuredAggregation/MueLu_StructuredAggregationFactory_decl.hpp @@ -56,6 +56,7 @@ #include "MueLu_StructuredAggregationFactory_fwd.hpp" #include "MueLu_Level_fwd.hpp" #include "MueLu_Exceptions.hpp" +#include "MueLu_AggregationStructuredAlgorithm_fwd.hpp" namespace MueLu { diff --git a/packages/muelu/src/Graph/StructuredAggregation/MueLu_StructuredAggregationFactory_def.hpp b/packages/muelu/src/Graph/StructuredAggregation/MueLu_StructuredAggregationFactory_def.hpp index 658fd6dd9cfb..3c44a42d6067 100644 --- a/packages/muelu/src/Graph/StructuredAggregation/MueLu_StructuredAggregationFactory_def.hpp +++ b/packages/muelu/src/Graph/StructuredAggregation/MueLu_StructuredAggregationFactory_def.hpp @@ -343,7 +343,7 @@ namespace MueLu { Set(currentLevel, "lCoarseNodesPerDim", geoData->getLocalCoarseNodesPerDir()); Set(currentLevel, "coarseCoordinatesFineMap", coarseCoordinatesFineMap); Set(currentLevel, "coarseCoordinatesMap", coarseCoordinatesMap); - Set(currentLevel, "interpolationOrder", interpolationOrder); + Set(currentLevel, "structuredInterpolationOrder", interpolationOrder); Set(currentLevel, "numDimensions", numDimensions); } // Build() diff --git a/packages/muelu/src/Graph/StructuredAggregation/MueLu_StructuredAggregationFactory_kokkos_def.hpp b/packages/muelu/src/Graph/StructuredAggregation/MueLu_StructuredAggregationFactory_kokkos_def.hpp index d7398aa40001..318913a92799 100644 --- 
a/packages/muelu/src/Graph/StructuredAggregation/MueLu_StructuredAggregationFactory_kokkos_def.hpp +++ b/packages/muelu/src/Graph/StructuredAggregation/MueLu_StructuredAggregationFactory_kokkos_def.hpp @@ -91,7 +91,6 @@ namespace MueLu { "Coarsening rate per spatial dimensions"); validParamList->set ("aggregation: coarsening order", 0, "The interpolation order used to construct grid transfer operators based off these aggregates."); - validParamList->set >("Graph", Teuchos::null, "Graph of the matrix after amalgamation but without dropping."); validParamList->set >("DofsPerNode", Teuchos::null, @@ -248,10 +247,10 @@ namespace MueLu { Set(currentLevel, "prolongatorGraph", myGraph); } - Set(currentLevel, "lCoarseNodesPerDim", geoData->getCoarseNodesPerDirArray()); - Set(currentLevel, "indexManager", geoData); - Set(currentLevel, "interpolationOrder", interpolationOrder); - Set(currentLevel, "numDimensions", numDimensions); + Set(currentLevel, "lCoarseNodesPerDim", geoData->getCoarseNodesPerDirArray()); + Set(currentLevel, "indexManager", geoData); + Set(currentLevel, "structuredInterpolationOrder", interpolationOrder); + Set(currentLevel, "numDimensions", numDimensions); } // Build() diff --git a/packages/muelu/src/Headers/MueLu_UseShortNamesScalar.hpp b/packages/muelu/src/Headers/MueLu_UseShortNamesScalar.hpp index ca10e38a0910..49e7b229e595 100644 --- a/packages/muelu/src/Headers/MueLu_UseShortNamesScalar.hpp +++ b/packages/muelu/src/Headers/MueLu_UseShortNamesScalar.hpp @@ -59,6 +59,12 @@ typedef MueLu::BraessSarazinSmoother Bra #ifdef MUELU_CGSOLVER_SHORT typedef MueLu::CGSolver CGSolver; #endif +#ifdef MUELU_CLASSICALMAPFACTORY_SHORT +typedef MueLu::ClassicalMapFactory ClassicalMapFactory; +#endif +#ifdef MUELU_CLASSICALPFACTORY_SHORT +typedef MueLu::ClassicalPFactory ClassicalPFactory; +#endif #ifdef MUELU_CLONEREPARTITIONINTERFACE_SHORT typedef MueLu::CloneRepartitionInterface CloneRepartitionInterface; #endif diff --git 
a/packages/muelu/src/Interface/MueLu_FactoryFactory_decl.hpp b/packages/muelu/src/Interface/MueLu_FactoryFactory_decl.hpp index efd88fdd6895..af6c4664b5ac 100644 --- a/packages/muelu/src/Interface/MueLu_FactoryFactory_decl.hpp +++ b/packages/muelu/src/Interface/MueLu_FactoryFactory_decl.hpp @@ -80,6 +80,8 @@ #include "MueLu_BlockedRAPFactory.hpp" #include "MueLu_BraessSarazinSmoother.hpp" #include "MueLu_BrickAggregationFactory.hpp" +#include "MueLu_ClassicalMapFactory.hpp" +#include "MueLu_ClassicalPFactory.hpp" #include "MueLu_CloneRepartitionInterface.hpp" #include "MueLu_CoalesceDropFactory.hpp" #include "MueLu_SmooVecCoalesceDropFactory.hpp" @@ -239,6 +241,8 @@ namespace MueLu { if (factoryName == "BlockedCoordinatesTransferFactory") return Build2 (paramList, factoryMapIn, factoryManagersIn); if (factoryName == "BlockedRAPFactory") return BuildRAPFactory (paramList, factoryMapIn, factoryManagersIn); if (factoryName == "BrickAggregationFactory") return Build2 (paramList, factoryMapIn, factoryManagersIn); + if (factoryName == "ClassicalMapFactory") return Build2 (paramList, factoryMapIn, factoryManagersIn); + if (factoryName == "ClassicalPFactory") return Build2 (paramList, factoryMapIn, factoryManagersIn); if (factoryName == "CloneRepartitionInterface") return Build2 (paramList, factoryMapIn, factoryManagersIn); if (factoryName == "CoarseMapFactory") return Build2 (paramList, factoryMapIn, factoryManagersIn); if (factoryName == "CoarseningVisualizationFactory") return Build2 (paramList, factoryMapIn, factoryManagersIn); diff --git a/packages/muelu/src/Interface/MueLu_ParameterListInterpreter_decl.hpp b/packages/muelu/src/Interface/MueLu_ParameterListInterpreter_decl.hpp index b011afc7efef..6047eef91c97 100644 --- a/packages/muelu/src/Interface/MueLu_ParameterListInterpreter_decl.hpp +++ b/packages/muelu/src/Interface/MueLu_ParameterListInterpreter_decl.hpp @@ -56,6 +56,8 @@ #include "MueLu_AggregationExportFactory_fwd.hpp" #include 
"MueLu_BrickAggregationFactory_fwd.hpp" +#include "MueLu_ClassicalMapFactory_fwd.hpp" +#include "MueLu_ClassicalPFactory_fwd.hpp" #include "MueLu_CoalesceDropFactory_fwd.hpp" #include "MueLu_CoarseMapFactory_fwd.hpp" #include "MueLu_ConstraintFactory_fwd.hpp" @@ -231,7 +233,7 @@ namespace MueLu { int levelID, std::vector& keeps, RCP & nullSpaceFactory) const; void UpdateFactoryManager_BlockNumber(Teuchos::ParameterList& paramList, const Teuchos::ParameterList& defaultList, FactoryManager& manager,int levelID, std::vector& keeps) const; - void UpdateFactoryManager_LocalOrdinalTransfer(const std::string VarName, Teuchos::ParameterList& paramList, const Teuchos::ParameterList& defaultList, + void UpdateFactoryManager_LocalOrdinalTransfer(const std::string& VarName, const std::string& multigridAlgo, Teuchos::ParameterList& paramList, const Teuchos::ParameterList& defaultList, FactoryManager& manager,int levelID, std::vector& keeps) const; // Algorithm-specific components for UpdateFactoryManager diff --git a/packages/muelu/src/Interface/MueLu_ParameterListInterpreter_def.hpp b/packages/muelu/src/Interface/MueLu_ParameterListInterpreter_def.hpp index 86628cbcd1f0..accb26c24d50 100644 --- a/packages/muelu/src/Interface/MueLu_ParameterListInterpreter_def.hpp +++ b/packages/muelu/src/Interface/MueLu_ParameterListInterpreter_def.hpp @@ -62,6 +62,8 @@ #include "MueLu_AggregationExportFactory.hpp" #include "MueLu_AggregateQualityEstimateFactory.hpp" #include "MueLu_BrickAggregationFactory.hpp" +#include "MueLu_ClassicalMapFactory.hpp" +#include "MueLu_ClassicalPFactory.hpp" #include "MueLu_CoalesceDropFactory.hpp" #include "MueLu_CoarseMapFactory.hpp" #include "MueLu_ConstraintFactory.hpp" @@ -553,7 +555,7 @@ namespace MueLu { Exceptions::RuntimeError, "Unknown \"reuse: type\" value: \"" << reuseType << "\". 
Please consult User's Guide."); MUELU_SET_VAR_2LIST(paramList, defaultList, "multigrid algorithm", std::string, multigridAlgo); - TEUCHOS_TEST_FOR_EXCEPTION(strings({"unsmoothed", "sa", "pg", "emin", "matlab", "pcoarsen"}).count(multigridAlgo) == 0, + TEUCHOS_TEST_FOR_EXCEPTION(strings({"unsmoothed", "sa", "pg", "emin", "matlab", "pcoarsen","classical"}).count(multigridAlgo) == 0, Exceptions::RuntimeError, "Unknown \"multigrid algorithm\" value: \"" << multigridAlgo << "\". Please consult User's Guide."); #ifndef HAVE_MUELU_MATLAB TEUCHOS_TEST_FOR_EXCEPTION(multigridAlgo == "matlab", Exceptions::RuntimeError, @@ -615,6 +617,10 @@ namespace MueLu { // Unsmoothed aggregation manager.SetFactory("P", manager.GetFactory("Ptent")); + } else if (multigridAlgo == "classical") { + // Classical AMG + manager.SetFactory("P", manager.GetFactory("Ptent")); + } else if (multigridAlgo == "sa") { // Smoothed aggregation UpdateFactoryManager_SA(paramList, defaultList, manager, levelID, keeps); @@ -647,7 +653,7 @@ namespace MueLu { // == BlockNumber Transfer == if(useBlockNumber_) - UpdateFactoryManager_LocalOrdinalTransfer("BlockNumber",paramList,defaultList,manager,levelID,keeps); + UpdateFactoryManager_LocalOrdinalTransfer("BlockNumber",multigridAlgo,paramList,defaultList,manager,levelID,keeps); // === Coordinates === UpdateFactoryManager_Coordinates(paramList, defaultList, manager, levelID, keeps); @@ -1003,6 +1009,7 @@ namespace MueLu { ParameterList dropParams; dropParams.set("lightweight wrap", true); MUELU_TEST_AND_SET_PARAM_2LIST(paramList, defaultList, "aggregation: drop scheme", std::string, dropParams); + MUELU_TEST_AND_SET_PARAM_2LIST(paramList, defaultList, "aggregation: row sum drop tol", double, dropParams); MUELU_TEST_AND_SET_PARAM_2LIST(paramList, defaultList, "aggregation: block diagonal: interleaved blocksize", int, dropParams); MUELU_TEST_AND_SET_PARAM_2LIST(paramList, defaultList, "aggregation: drop tol", double, dropParams); 
MUELU_TEST_AND_SET_PARAM_2LIST(paramList, defaultList, "aggregation: Dirichlet threshold", double, dropParams); @@ -1020,7 +1027,7 @@ namespace MueLu { // Aggregation scheme MUELU_SET_VAR_2LIST(paramList, defaultList, "aggregation: type", std::string, aggType); - TEUCHOS_TEST_FOR_EXCEPTION(!strings({"uncoupled", "coupled", "brick", "matlab","notay"}).count(aggType), + TEUCHOS_TEST_FOR_EXCEPTION(!strings({"uncoupled", "coupled", "brick", "matlab","notay","classical"}).count(aggType), Exceptions::RuntimeError, "Unknown aggregation algorithm: \"" << aggType << "\". Please consult User's Guide."); #ifndef HAVE_MUELU_MATLAB if (aggType == "matlab") @@ -1078,6 +1085,40 @@ namespace MueLu { aggFactory->SetFactory("Coordinates", this->GetFactoryManager(levelID-1)->GetFactory("Coordinates")); } } + else if (aggType == "classical") { + // Map and coloring + RCP mapFact = rcp(new ClassicalMapFactory()); + ParameterList mapParams; + MUELU_TEST_AND_SET_PARAM_2LIST(paramList, defaultList, "aggregation: deterministic", bool, mapParams); + MUELU_TEST_AND_SET_PARAM_2LIST(paramList, defaultList, "aggregation: coloring algorithm", std::string, mapParams); + mapFact->SetParameterList(mapParams); + manager.SetFactory("FC Splitting", mapFact); + manager.SetFactory("CoarseMap", mapFact); + + aggFactory = rcp(new ClassicalPFactory()); + ParameterList aggParams; + MUELU_TEST_AND_SET_PARAM_2LIST(paramList, defaultList, "aggregation: classical scheme", std::string, aggParams); + MUELU_TEST_AND_SET_PARAM_2LIST(paramList, defaultList, "aggregation: drop scheme", std::string, aggParams); + aggFactory->SetParameterList(aggParams); + aggFactory->SetFactory("FC Splitting",manager.GetFactory("FC Splitting")); + aggFactory->SetFactory("CoarseMap",manager.GetFactory("CoarseMap")); + aggFactory->SetFactory("DofsPerNode", manager.GetFactory("Graph")); + aggFactory->SetFactory("Graph", manager.GetFactory("Graph")); + std::string drop_algo = aggParams.get("aggregation: drop scheme"); + if 
(drop_algo.find("block diagonal") != std::string::npos) + aggFactory->SetFactory("BlockNumber", manager.GetFactory("BlockNumber")); + + // Now we short-circuit, because we neither need nor want TentativePFactory here + manager.SetFactory("Ptent", aggFactory); + manager.SetFactory("P Graph", aggFactory); + + + if (reuseType == "tP" && levelID) { + // keeps.push_back(keep_pair("Nullspace", Ptent.get())); + keeps.push_back(keep_pair("Ptent",aggFactory.get())); + } + return; + } #ifdef HAVE_MUELU_KOKKOS_REFACTOR else if (aggType == "notay") { aggFactory = rcp(new NotayAggregationFactory()); @@ -1101,6 +1142,7 @@ namespace MueLu { #endif + manager.SetFactory("Aggregates", aggFactory); // Coarse map @@ -1307,12 +1349,15 @@ namespace MueLu { // ===================================================================================================== template void ParameterListInterpreter:: - UpdateFactoryManager_LocalOrdinalTransfer(const std::string VarName, ParameterList& paramList, const ParameterList& /* defaultList */, + UpdateFactoryManager_LocalOrdinalTransfer(const std::string & VarName, const std::string &multigridAlgo,ParameterList& paramList, const ParameterList& /* defaultList */, FactoryManager& manager, int levelID, std::vector& /* keeps */) const { if(levelID >= 1){ - RCP fact = rcp(new LocalOrdinalTransferFactory(VarName)); - fact->SetFactory("Aggregates", manager.GetFactory("Aggregates")); + RCP fact = rcp(new LocalOrdinalTransferFactory(VarName,multigridAlgo)); + if(multigridAlgo == "classical") + fact->SetFactory("P Graph", manager.GetFactory("P Graph")); + else + fact->SetFactory("Aggregates", manager.GetFactory("Aggregates")); fact->SetFactory("CoarseMap", manager.GetFactory("CoarseMap")); fact->SetFactory(VarName, this->GetFactoryManager(levelID-1)->GetFactory(VarName)); diff --git a/packages/muelu/src/Misc/MueLu_LocalOrdinalTransferFactory_decl.hpp b/packages/muelu/src/Misc/MueLu_LocalOrdinalTransferFactory_decl.hpp index 5f3316c7415a..2b1521434046 
100644 --- a/packages/muelu/src/Misc/MueLu_LocalOrdinalTransferFactory_decl.hpp +++ b/packages/muelu/src/Misc/MueLu_LocalOrdinalTransferFactory_decl.hpp @@ -50,7 +50,7 @@ #include "MueLu_TwoLevelFactoryBase.hpp" #include "Xpetra_MultiVector_fwd.hpp" #include "Xpetra_MultiVectorFactory_fwd.hpp" -#include "Xpetra_Matrix.hpp" +#include "Xpetra_CrsGraph_fwd.hpp" #include "MueLu_CoarseMapFactory_fwd.hpp" #include "MueLu_LocalOrdinalTransferFactory_fwd.hpp" @@ -92,6 +92,9 @@ namespace MueLu { ----------|--------------|------------ | TransferVec | LocalOrdinalTransferFactory | coarse level transfervec */ + + + template @@ -99,7 +102,7 @@ namespace MueLu { #undef MUELU_LOCALORDINALTRANSFERFACTORY_SHORT #include "MueLu_UseShortNamesOrdinal.hpp" - public: + public: //! @name Constructors/Destructors. //@{ @@ -114,7 +117,10 @@ namespace MueLu { The operator associated with projectionName will be applied to the MultiVector associated with vectorName. */ - LocalOrdinalTransferFactory(const std::string TransferVecName): TransferVecName_(TransferVecName) { } + LocalOrdinalTransferFactory(const std::string & TransferVecName, const std::string & mode): TransferVecName_(TransferVecName) { + if(mode == "classical") useAggregatesMode_ = false; + else useAggregatesMode_ = true; + } //! Destructor. virtual ~LocalOrdinalTransferFactory() { } @@ -144,8 +150,13 @@ namespace MueLu { //@} private: - + void BuildAggregates(Level & fineLevel, Level &coarseLevel) const; + + void BuildFC(Level & fineLevel, Level &coarseLevel) const; + + //! Use aggregates mode (as opposed to FC mode) + bool useAggregatesMode_; //! The name for the vector to be transfered. 
This allows us to have multiple factories for different variables std::string TransferVecName_; diff --git a/packages/muelu/src/Misc/MueLu_LocalOrdinalTransferFactory_def.hpp b/packages/muelu/src/Misc/MueLu_LocalOrdinalTransferFactory_def.hpp index 5d6474820ba5..74ce2d67ed7e 100644 --- a/packages/muelu/src/Misc/MueLu_LocalOrdinalTransferFactory_def.hpp +++ b/packages/muelu/src/Misc/MueLu_LocalOrdinalTransferFactory_def.hpp @@ -49,6 +49,8 @@ #include "Xpetra_ImportFactory.hpp" #include "Xpetra_VectorFactory.hpp" #include "Xpetra_MapFactory.hpp" +#include "Xpetra_CrsGraph.hpp" + #include "Xpetra_IO.hpp" #include "MueLu_CoarseMapFactory.hpp" @@ -64,7 +66,8 @@ namespace MueLu { RCP LocalOrdinalTransferFactory::GetValidParameterList() const { RCP validParamList = rcp(new ParameterList()); - validParamList->set >(TransferVecName_, Teuchos::null, "Factory for TransferVec generation"); + validParamList->set >(TransferVecName_, Teuchos::null, "Factory for TransferVec generation"); + validParamList->set >("P Graph", Teuchos::null, "Factory for P generation"); validParamList->set >("Aggregates", Teuchos::null, "Factory for aggregates generation"); validParamList->set >("CoarseMap", Teuchos::null, "Generating factory of the coarse map"); @@ -78,8 +81,13 @@ namespace MueLu { isAvailableXfer = coarseLevel.IsAvailable(TransferVecName_, this); if (isAvailableXfer == false) { Input(fineLevel, TransferVecName_); - Input(fineLevel, "Aggregates"); Input(fineLevel, "CoarseMap"); + + if(useAggregatesMode_) + Input(fineLevel, "Aggregates"); + else { + Input(coarseLevel, "P Graph"); + } } } @@ -87,6 +95,73 @@ namespace MueLu { template void LocalOrdinalTransferFactory::Build(Level & fineLevel, Level &coarseLevel) const { + if(useAggregatesMode_) BuildAggregates(fineLevel,coarseLevel); + else BuildFC(fineLevel,coarseLevel); + } + + template + void LocalOrdinalTransferFactory::BuildFC(Level & fineLevel, Level &coarseLevel) const { + FactoryMonitor m(*this, "Build", coarseLevel); + + 
GetOStream(Runtime0) << "Transferring " <::invalid(); + + if (coarseLevel.IsAvailable(TransferVecName_, this)) { + GetOStream(Runtime0) << "Reusing "< P = Get< RCP >(coarseLevel,"P Graph"); + RCP fineTV = Get< RCP >(fineLevel, TransferVecName_); + RCP coarseMap = Get< RCP > (fineLevel, "CoarseMap"); + RCP uniqueMap = fineTV->getMap(); + ArrayRCP fineData = fineTV->getData(0); + + // FIXME: Handle MPI parallel + // Sanity checks + TEUCHOS_TEST_FOR_EXCEPTION(P->getRowMap()->getComm()->getSize() != 1,Exceptions::RuntimeError,"BuildFC: Only currently supports 1 MPI rank."); + + // Allocate new LO Vector + RCP coarseTV = LocalOrdinalVectorFactory::Build(coarseMap,1); + ArrayRCP coarseData = coarseTV->getDataNonConst(0); + + // Invalidate everything first, to check for errors + for(LO i=0; igetNodeNumRows(); row++) { + LO fineNumber = fineData[row]; + ArrayView indices; + P->getLocalRowView(row,indices); + + // FIXME: MPI parallel + for(LO j=0; j<(LO)indices.size(); j++) { + if(coarseData[indices[j]] == LO_INVALID) + coarseData[indices[j]] = fineNumber; + else if (coarseData[indices[j]] != fineNumber) + error_count++; + } + + } + + // Error checking: All nodes in an aggregate must share a local ordinal + if(error_count > 0) { + std::ostringstream ofs; + ofs << "LocalOrdinalTransferFactory("< >(coarseLevel, TransferVecName_, coarseTV); + + } + + + + template + void LocalOrdinalTransferFactory::BuildAggregates(Level & fineLevel, Level &coarseLevel) const { FactoryMonitor m(*this, "Build", coarseLevel); GetOStream(Runtime0) << "Transferring " <" "" "" + "" + "" "" "" "" @@ -284,7 +286,6 @@ namespace MueLu { "" "" "" - "" "" "" "" @@ -605,6 +606,10 @@ namespace MueLu { ("aggregation: drop scheme","aggregation: drop scheme") + ("aggregation: classical scheme","aggregation: classical scheme") + + ("aggregation: row sum drop tol","aggregation: row sum drop tol") + ("aggregation: block diagonal: interleaved blocksize","aggregation: block diagonal: interleaved blocksize") 
("aggregation: number of random vectors","aggregation: number of random vectors") @@ -769,8 +774,6 @@ namespace MueLu { ("not supported by ML","sa: rowsumabs diagonal replacement value") - ("interp: interpolation order","interp: interpolation order") - ("interp: build coarse coordinates","interp: build coarse coordinates") ("transfer: params","transfer: params") diff --git a/packages/muelu/src/Transfers/Classical/MueLu_ClassicalMapFactory_decl.hpp b/packages/muelu/src/Transfers/Classical/MueLu_ClassicalMapFactory_decl.hpp new file mode 100644 index 000000000000..dd6a34b9a227 --- /dev/null +++ b/packages/muelu/src/Transfers/Classical/MueLu_ClassicalMapFactory_decl.hpp @@ -0,0 +1,147 @@ +// @HEADER +// +// *********************************************************************** +// +// MueLu: A package for multigrid based preconditioning +// Copyright 2012 Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact +// Jonathan Hu (jhu@sandia.gov) +// Andrey Prokopenko (aprokop@sandia.gov) +// Ray Tuminaro (rstumin@sandia.gov) +// +// *********************************************************************** +// +// @HEADER + +#ifndef MUELU_CLASSICALMAPFACTORY_DECL_HPP_ +#define MUELU_CLASSICALMAPFACTORY_DECL_HPP_ + +#include "Xpetra_StridedMapFactory_fwd.hpp" +#include "Xpetra_Import_fwd.hpp" +#include "Xpetra_Vector_fwd.hpp" +#include "Xpetra_VectorFactory_fwd.hpp" + +#include "MueLu_ConfigDefs.hpp" +#include "MueLu_SingleLevelFactoryBase.hpp" +#include "MueLu_ClassicalMapFactory_fwd.hpp" +#include "MueLu_GraphBase_fwd.hpp" +#include "MueLu_Level_fwd.hpp" +#include "MueLu_Exceptions.hpp" +#include "MueLu_Graph_fwd.hpp" +#include "MueLu_LWGraph_fwd.hpp" +#ifdef HAVE_MUELU_KOKKOSCORE +#include "MueLu_LWGraph_kokkos_fwd.hpp" +#endif + +namespace MueLu { + + /*! + @class ClassicalMapFactory class. + @brief Factory for generating F/C-splitting and a coarse level map. Used by ClassicalPFactory. + + @ingroup MueLuTransferClasses + + ## Input/output ## + + ### User parameters of this factory ### + Parameter | type | default | master.xml | validated | requested | description + ----------|------|---------|:----------:|:---------:|:---------:|------------ + Graph | Factory | null | | * | * | Generating factory for graph. + The * in the @c master.xml column denotes that the parameter is defined in the @c master.xml file.
+ The * in the @c validated column means that the parameter is declared in the list of valid input parameters (see @c GetValidParameters() ).
+ The * in the @c requested column states that the data is requested as input with all dependencies (see @c DeclareInput() ). + + + ### Variables provided by this factory ### + + After @c Build() the following data is available (if requested) + + Parameter | generated by | description + ----------|--------------|------------ + | Colors | ClassicalMapFactory | ArrayRCP of colors + | CoarseMap | CoarseMapFactory | Map containing the coarse map used as domain map in the classical prolongator + + */ + + template + class ClassicalMapFactory : public SingleLevelFactoryBase { +#undef MUELU_CLASSICALMAPFACTORY_SHORT +#include "MueLu_UseShortNames.hpp" + + public: + //! F/C/Dirichlet point type + typedef enum {F_PT=-1, UNASSIGNED=0, C_PT=1, DIRICHLET_PT=2} point_type; + + //! @name Input + //@{ + + RCP GetValidParameterList() const override; + + /*! + @brief Specifies the data that this class needs, and the factories that generate that data. + + If the Build method of this class requires some data, but the generating factory is not specified in DeclareInput, + then this class will fall back to the settings in FactoryManager. + */ + void DeclareInput(Level ¤tLevel) const override; + + //@} + + //! @name Build methods. + //@{ + + //! Build an object with this factory. 
+ void Build(Level ¤tLevel) const override; + + //@} + + + protected: + virtual void GenerateCoarseMap(const Map & fineMap, LO num_c_points, Teuchos::RCP & coarseMap) const; + + virtual void DoGraphColoring(const GraphBase & graph, Teuchos::ArrayRCP & myColors, LO & numColors) const; + + virtual void DoMISNaive(const GraphBase & graph, Teuchos::ArrayRCP & myColors, LO & numColors) const; + + virtual void DoDistributedGraphColoring(RCP & graph, Teuchos::ArrayRCP & myColors, LO & numColors) const; + + }; //class ClassicalMapFactory + +} //namespace MueLu + +#define MUELU_CLASSICALMAPFACTORY_SHORT +#endif /* MUELU_CLASSICALMAPFACTORY_DECL_HPP_ */ diff --git a/packages/muelu/src/Transfers/Classical/MueLu_ClassicalMapFactory_def.hpp b/packages/muelu/src/Transfers/Classical/MueLu_ClassicalMapFactory_def.hpp new file mode 100644 index 000000000000..38ef97ae11fa --- /dev/null +++ b/packages/muelu/src/Transfers/Classical/MueLu_ClassicalMapFactory_def.hpp @@ -0,0 +1,498 @@ +// @HEADER +// +// *********************************************************************** +// +// MueLu: A package for multigrid based preconditioning +// Copyright 2012 Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. 
Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact +// Jonathan Hu (jhu@sandia.gov) +// Andrey Prokopenko (aprokop@sandia.gov) +// Ray Tuminaro (rstumin@sandia.gov) +// +// *********************************************************************** +// +// @HEADER + +#ifndef MUELU_CLASSICALMAPFACTORY_DEF_HPP_ +#define MUELU_CLASSICALMAPFACTORY_DEF_HPP_ + + + +#include +#include + + +#ifdef HAVE_MPI +#include +#endif + +#include +#include +#include +#include +#include + +#include "MueLu_ClassicalMapFactory_decl.hpp" +#include "MueLu_Level.hpp" +#include "MueLu_GraphBase.hpp" +#include "MueLu_MasterList.hpp" +#include "MueLu_Monitor.hpp" +#include "MueLu_GraphBase.hpp" +#include "MueLu_Graph.hpp" +#include "MueLu_LWGraph.hpp" + +#ifdef HAVE_MUELU_ZOLTAN2 +#include "MueLu_Zoltan2GraphAdapter.hpp" +#include +#include +#include + +#endif + +// NOTE: We should be checking for KokkosKernels here, but +// MueLu doesn't have a macro for that +#ifdef HAVE_MUELU_KOKKOSCORE +#include "MueLu_LWGraph_kokkos.hpp" +#include +#include +#endif + 
+namespace MueLu { + + template + RCP ClassicalMapFactory::GetValidParameterList() const + { + RCP validParamList = rcp(new ParameterList()); +#define SET_VALID_ENTRY(name) validParamList->setEntry(name, MasterList::getEntry(name)) + SET_VALID_ENTRY("aggregation: deterministic"); + SET_VALID_ENTRY("aggregation: coloring algorithm"); +#undef SET_VALID_ENTRY + validParamList->set< RCP >("A", Teuchos::null, "Generating factory of the matrix A"); + validParamList->set< RCP >("UnAmalgamationInfo", Teuchos::null, "Generating factory of UnAmalgamationInfo"); + validParamList->set< RCP >("Graph", null, "Generating factory of the graph"); + validParamList->set< RCP >("DofsPerNode", null, "Generating factory for variable \'DofsPerNode\', usually the same as for \'Graph\'"); + + return validParamList; + } + + template + void ClassicalMapFactory::DeclareInput(Level ¤tLevel) const + { + Input(currentLevel, "A"); + Input(currentLevel, "UnAmalgamationInfo"); + Input(currentLevel, "Graph"); + } + + template + void ClassicalMapFactory::Build(Level ¤tLevel) const + { + FactoryMonitor m(*this, "Build", currentLevel); + RCP graph = Get >(currentLevel,"Graph"); + RCP A = Get >(currentLevel,"A"); + const ParameterList& pL = GetParameterList(); + /* ============================================================= */ + /* Phase 1 : Compute an initial MIS */ + /* ============================================================= */ + ArrayRCP myColors; + LO numColors=0; + + RCP fc_splitting; + std::string coloringAlgo = pL.get("aggregation: coloring algorithm"); + + // Switch to Zoltan2 if we're parallel and Tpetra (and not file) +#ifdef HAVE_MUELU_ZOLTAN2 + int numProcs = A->getRowMap()->getComm()->getSize(); + if(coloringAlgo!="file" && numProcs && graph->GetDomainMap()->lib() == Xpetra::UseTpetra) + coloringAlgo="Zoltan2"; +#endif + + // Switch to MIS if we're in Epetra (and not file) + if(coloringAlgo!="file" && graph->GetDomainMap()->lib() == Xpetra::UseEpetra) + coloringAlgo="MIS"; + + + 
if(coloringAlgo == "file") { + // Read the CF splitting from disk + // NOTE: For interoperability reasons, this is dependent on the point_type enum not changing + std::string map_file = std::string("map_fcsplitting_") + std::to_string(currentLevel.GetLevelID()) + std::string(".m"); + std::string color_file = std::string("fcsplitting_") + std::to_string(currentLevel.GetLevelID()) + std::string(".m"); + + FILE * mapfile = fopen(map_file.c_str(),"r"); + using real_type = typename Teuchos::ScalarTraits::magnitudeType; + using RealValuedMultiVector = typename Xpetra::MultiVector; + RCP mv; + + + if(mapfile) { + fclose(mapfile); + RCP colorMap = Xpetra::IO::ReadMap(map_file, A->getRowMap()->lib(), A->getRowMap()->getComm()); + TEUCHOS_TEST_FOR_EXCEPTION(!colorMap->isCompatible(*A->getRowMap()),std::invalid_argument,"Coloring on disk has incompatible map with A"); + + mv = Xpetra::IO::ReadMultiVector(color_file,colorMap); + } + else { + // Use A's rowmap and hope it matches + mv = Xpetra::IO::ReadMultiVector(color_file,A->getRowMap()); + } + TEUCHOS_TEST_FOR_EXCEPTION(mv.is_null(),std::invalid_argument,"Coloring on disk cannot be read"); + fc_splitting = LocalOrdinalVectorFactory::Build(A->getRowMap()); + TEUCHOS_TEST_FOR_EXCEPTION(mv->getLocalLength() != fc_splitting->getLocalLength(),std::invalid_argument,"Coloring map mismatch"); + + // Overlay the Dirichlet Points (and copy out the rest) + auto boundaryNodes = graph->GetBoundaryNodeMap(); + ArrayRCP mv_data= mv->getData(0); + ArrayRCP fc_data= fc_splitting->getDataNonConst(0); + for(LO i=0; i<(LO)fc_data.size(); i++) { + if(boundaryNodes[i]) + fc_data[i] = DIRICHLET_PT; + else + fc_data[i] = Teuchos::as(mv_data[i]); + } + } +#ifdef HAVE_MUELU_ZOLTAN2 + else if(coloringAlgo.find("Zoltan2")!=std::string::npos && graph->GetDomainMap()->lib() == Xpetra::UseTpetra) { + SubFactoryMonitor sfm(*this,"DistributedGraphColoring",currentLevel); + DoDistributedGraphColoring(graph,myColors,numColors); + } +#endif + else 
if(coloringAlgo == "MIS" || graph->GetDomainMap()->lib() == Xpetra::UseTpetra) { + SubFactoryMonitor sfm(*this,"MIS",currentLevel) +; TEUCHOS_TEST_FOR_EXCEPTION(A->getRowMap()->getComm()->getSize() != 1, std::invalid_argument,"MIS on more than 1 MPI rank is not supported"); + DoMISNaive(*graph,myColors,numColors); + } +#ifdef HAVE_MUELU_KOKKOSCORE + else { + SubFactoryMonitor sfm(*this,"GraphColoring",currentLevel); + TEUCHOS_TEST_FOR_EXCEPTION(A->getRowMap()->getComm()->getSize() != 1, std::invalid_argument,"KokkosKernels graph coloring on more than 1 MPI rank is not supported"); + DoGraphColoring(*graph,myColors,numColors); + } +#else + else { + TEUCHOS_TEST_FOR_EXCEPTION(true,std::invalid_argument,"Unrecognized distance 1 coloring algorithm"); + } +#endif + + + /* ============================================================= */ + /* Phase 2 : Mark the C-Points */ + /* ============================================================= */ + LO num_c_points = 0, num_d_points=0, num_f_points = 0; + if(fc_splitting.is_null()) { + // We just have a coloring, so we need to generate a splitting + auto boundaryNodes = graph->GetBoundaryNodeMap(); + fc_splitting = LocalOrdinalVectorFactory::Build(A->getRowMap()); + ArrayRCP myPointType = fc_splitting->getDataNonConst(0); + for(LO i=0; i<(LO)myColors.size(); i++) { + if(boundaryNodes[i]) { + myPointType[i] = DIRICHLET_PT; + num_d_points++; + } + else if ((LO)myColors[i] == 1) { + myPointType[i] = C_PT; + num_c_points++; + } + else + myPointType[i] = F_PT; + } + num_f_points = (LO)myColors.size() - num_d_points - num_c_points; + } + else { + // If we read the splitting off disk, we just need to count + ArrayRCP myPointType = fc_splitting->getDataNonConst(0); + + for(LO i=0; i<(LO)myPointType.size(); i++) { + if(myPointType[i] == DIRICHLET_PT) + num_d_points++; + else if (myPointType[i] == C_PT) + num_c_points++; + } + num_f_points = (LO)myPointType.size() - num_d_points - num_c_points; + } + + /* Output statistics on c/f/d 
points */ + if (GetVerbLevel() & Statistics1) { + // NOTE: We batch the communication here + GO l_counts[] = {(GO)num_c_points, (GO) num_f_points, (GO) num_d_points}; + GO g_counts[3]; + + RCP > comm = A->getRowMap()->getComm(); + Teuchos::reduceAll(*comm, Teuchos::REDUCE_SUM, 3, l_counts, g_counts); + GetOStream(Statistics1) << "ClassicalMapFactory: C/F/D = "< coarseMap; + { + SubFactoryMonitor sfm(*this,"Coarse Map",currentLevel); + GenerateCoarseMap(*A->getRowMap(),num_c_points,coarseMap); + } + + Set(currentLevel, "FC Splitting",fc_splitting); + Set(currentLevel, "CoarseMap", coarseMap); + + } + +/* ************************************************************************* */ +template +void ClassicalMapFactory:: +GenerateCoarseMap(const Map & fineMap, LO num_c_points, RCP & coarseMap) const { + + // FIXME: Assumes scalar PDE + std::vector stridingInfo_(1); + stridingInfo_[0]=1; + GO domainGIDOffset = 0; + + coarseMap = StridedMapFactory::Build(fineMap.lib(), + Teuchos::OrdinalTraits::invalid(), + num_c_points, + fineMap.getIndexBase(), + stridingInfo_, + fineMap.getComm(), + domainGIDOffset); +} + + + +/* ************************************************************************* */ +template +void ClassicalMapFactory:: +DoGraphColoring(const GraphBase & graph, ArrayRCP & myColors_out, LO & numColors) const { +#ifdef HAVE_MUELU_KOKKOSCORE + const ParameterList& pL = GetParameterList(); + using graph_t = typename LWGraph_kokkos::local_graph_type; + using KernelHandle = KokkosKernels::Experimental:: + KokkosKernelsHandle; + KernelHandle kh; + + // Leave gc algorithm choice as the default + kh.create_graph_coloring_handle(); + + // Get the distance-1 graph coloring handle + auto coloringHandle = kh.get_graph_coloring_handle(); + + // Set the distance-1 coloring algorithm to use + if(pL.get("aggregation: deterministic") == true) { + coloringHandle->set_algorithm( KokkosGraph::COLORING_SERIAL ); + if(IsPrint(Statistics1)) GetOStream(Statistics1) << " algorithm: 
serial" << std::endl; + } else if(pL.get("aggregation: coloring algorithm") == "serial") { + coloringHandle->set_algorithm( KokkosGraph::COLORING_SERIAL ); + if(IsPrint(Statistics1)) GetOStream(Statistics1) << " algorithm: serial" << std::endl; + } else if(pL.get("aggregation: coloring algorithm") == "vertex based") { + coloringHandle->set_algorithm( KokkosGraph::COLORING_VB ); + if(IsPrint(Statistics1)) GetOStream(Statistics1) << " algorithm: vertex based" << std::endl; + } else if(pL.get("aggregation: coloring algorithm") == "vertex based bit array") { + coloringHandle->set_algorithm( KokkosGraph::COLORING_VBBIT ); + if(IsPrint(Statistics1)) GetOStream(Statistics1) << " algorithm: vertex based bit array" << std::endl; + } else if(pL.get("aggregation: coloring algorithm") == "vertex based color set") { + coloringHandle->set_algorithm( KokkosGraph::COLORING_VBCS ); + if(IsPrint(Statistics1)) GetOStream(Statistics1) << " algorithm: vertex based color set" << std::endl; + } else if(pL.get("aggregation: coloring algorithm") == "vertex based deterministic") { + coloringHandle->set_algorithm( KokkosGraph::COLORING_VBD ); + if(IsPrint(Statistics1)) GetOStream(Statistics1) << " algorithm: vertex based deterministic" << std::endl; + } else if(pL.get("aggregation: coloring algorithm") == "vertex based deterministic bit array") { + coloringHandle->set_algorithm( KokkosGraph::COLORING_VBDBIT ); + if(IsPrint(Statistics1)) GetOStream(Statistics1) << " algorithm: vertex based deterministic bit array" << std::endl; + } else if(pL.get("aggregation: coloring algorithm") == "edge based") { + coloringHandle->set_algorithm( KokkosGraph::COLORING_EB ); + if(IsPrint(Statistics1)) GetOStream(Statistics1) << " algorithm: edge based" << std::endl; + } else { + TEUCHOS_TEST_FOR_EXCEPTION(true,std::invalid_argument,"Unrecognized distance 1 coloring algorithm"); + } + + // Create device views for graph rowptrs/colinds + size_t numRows = graph.GetNodeNumVertices(); + auto graphLWK = 
dynamic_cast(&graph); + auto graphLW = dynamic_cast(&graph); + auto graphG = dynamic_cast(&graph); + TEUCHOS_TEST_FOR_EXCEPTION(!graphLW && !graphLWK && !graphG,std::invalid_argument,"Graph is not a LWGraph or LWGraph_kokkos object"); + // Run d1 graph coloring + // Assume that the graph is symmetric so row map/entries and col map/entries are the same + + if(graphLWK) { + KokkosGraph::Experimental::graph_color(&kh, + numRows, + numRows, // FIXME: This should be the number of columns + graphLWK->getRowPtrs(), + graphLWK->getEntries(), + true); + } + else if(graphLW) { + auto rowptrs = graphLW->getRowPtrs(); + auto entries = graphLW->getEntries(); + // Copy rowptrs to a size_t, because kokkos-kernels doesn't like rowptrs as LO's + Teuchos::Array rowptrs_s(rowptrs.size()); + std::copy(rowptrs.begin(),rowptrs.end(),rowptrs_s.begin()); + Kokkos::View rowptrs_v(rowptrs_s.data(),(size_t)rowptrs.size()); + Kokkos::View entries_v(entries.getRawPtr(),(size_t)entries.size()); + KokkosGraph::Experimental::graph_color(&kh, + numRows, + numRows, // FIXME: This should be the number of columns + rowptrs_v, + entries_v, + true); + } + else if(graphG) { + // FIXME: This is a terrible, terrible hack, based on 0-based local indexing. 
+ RCP graphC = graphG->GetGraph(); + size_t numEntries = graphC->getNodeNumEntries(); + ArrayView indices; + graphC->getLocalRowView(0,indices); + Kokkos::View rowptrs_v("rowptrs_v",graphC->getNodeNumRows()+1); + rowptrs_v[0]=0; // rowptrs_v holds numRows+1 offsets; the loop below fills slots 1..numRows + for(LO i=0; i<(LO)graphC->getNodeNumRows(); i++) // stop at numRows-1: slot i+1 must stay inside rowptrs_v and row i must exist + rowptrs_v[i+1] = rowptrs_v[i] + graphC->getNumEntriesInLocalRow(i); + Kokkos::View entries_v(&indices[0],numEntries); + KokkosGraph::Experimental::graph_color(&kh, + numRows, + numRows, // FIXME: This should be the number of columns + rowptrs_v, + entries_v, + true); + } + + + // Extract the colors and store them in the aggregates + auto myColors_d = coloringHandle->get_vertex_colors(); + numColors = static_cast(coloringHandle->get_num_colors()); + + // Copy back to host + auto myColors_h = Kokkos::create_mirror_view(myColors_d); + myColors_out.resize(myColors_h.size()); + Kokkos::View myColors_v(&myColors_out[0],myColors_h.size()); + Kokkos::deep_copy(myColors_v,myColors_h); + + //clean up coloring handle + kh.destroy_graph_coloring_handle(); +#else + TEUCHOS_TEST_FOR_EXCEPTION(1, Exceptions::RuntimeError,"ClassicalMapFactory: Requires KokkosKernels"); +#endif + +}// end DoGraphColoring + + +/* ************************************************************************* */ +template +void ClassicalMapFactory:: +DoMISNaive(const GraphBase & graph, ArrayRCP & myColors, LO & numColors) const { + // This is a fall-back routine for when we don't have Kokkos or when it isn't initialized + // We just do greedy MIS because this is easy to write. 
+ + LO LO_INVALID = Teuchos::OrdinalTraits::invalid(); + LO MIS = Teuchos::ScalarTraits::one(); + + //FIXME: Not efficient + myColors.resize(0); + myColors.resize(graph.GetNodeNumVertices(),LO_INVALID); + auto boundaryNodes = graph.GetBoundaryNodeMap(); + LO Nrows = (LO)graph.GetNodeNumVertices(); + + + for(LO row=0; row < Nrows; row++) { + if(boundaryNodes[row]) + continue; + ArrayView indices = graph.getNeighborVertices(row); + bool has_colored_neighbor=false; + for(LO j=0; !has_colored_neighbor && j<(LO)indices.size(); j++) { + // FIXME: This does not handle ghosting correctly + if(myColors[indices[j]] == MIS) + has_colored_neighbor=true; + } + if(!has_colored_neighbor) + myColors[row] = MIS; + } + numColors=1; +} + + +/* ************************************************************************* */ +template +void ClassicalMapFactory:: +DoDistributedGraphColoring(RCP & graph, ArrayRCP & myColors_out, LO & numColors) const { +#ifdef HAVE_MUELU_ZOLTAN2 + // const ParameterList& pL = GetParameterList(); + Teuchos::ParameterList params; + params.set("color_choice","FirstFit"); + params.set("color_method","D1"); + // params.set("color_choice", colorMethod); + // params.set("color_method", colorAlg); + // params.set("verbose", verbose); + // params.set("serial_threshold",serialThreshold); + //params.set("recolor_degrees",recolorDegrees); + + // Do the coloring via Zoltan2 + using GraphAdapter = MueLuGraphBaseAdapter; + GraphAdapter z_adapter(graph); + + // We need to provide the MPI Comm, or else we wind up using the default (eep!) 
+ Zoltan2::ColoringProblem problem(&z_adapter,¶ms,graph->GetDomainMap()->getComm()); + problem.solve(); + Zoltan2::ColoringSolution * soln = problem.getSolution(); + ArrayRCP colors = soln->getColorsRCP(); + numColors = (LO)soln->getNumColors(); + + // Assign the Array RCP or Copy Out + // FIXME: This probably won't work if LO!=int + if(std::is_same::value) + myColors_out = colors; + else { + myColors_out.resize(colors.size()); + for(LO i=0; i<(LO)myColors_out.size(); i++) + myColors_out[i] = (LO) colors[i]; + } + + /* + + printf("CMS: numColors = %d\ncolors = ",numColors); + for(int i=0;i +#include +#include +#include +#include +#include +#include + +#include "MueLu_ConfigDefs.hpp" +#include "MueLu_PerfUtils_fwd.hpp" +#include "MueLu_PFactory.hpp" +#include "MueLu_ClassicalPFactory_fwd.hpp" +#include "MueLu_ClassicalMapFactory_fwd.hpp" +#include "MueLu_Utilities_fwd.hpp" +#include "MueLu_CoarseMapFactory_fwd.hpp" +#include "MueLu_AmalgamationInfo_fwd.hpp" +#include "MueLu_GraphBase_fwd.hpp" +#include "MueLu_Level_fwd.hpp" + +namespace MueLu { + + template + class ClassicalPFactory : public PFactory { +#undef MUELU_CLASSICALPFACTORY_SHORT +#include "MueLu_UseShortNames.hpp" + + public: + // Defining types that require the short names included above + using point_type = typename ClassicalMapFactory::point_type; + + //! @name Constructors/Destructors. + //@{ + + //! Constructor + ClassicalPFactory() { } + + //! Destructor. + virtual ~ClassicalPFactory() { } + //@} + + RCP GetValidParameterList() const; + + //! Input + //@{ + + void DeclareInput(Level& fineLevel, Level& coarseLevel) const; + + //@} + + //! @name Build methods. 
+ //@{ + + void Build (Level& fineLevel, Level& coarseLevel) const; + void BuildP(Level& fineLevel, Level& coarseLevel) const; + + private: + + // Utility algorithms + void GenerateStrengthFlags(const Matrix & A,const GraphBase & graph, Teuchos::Array & eis_rowptr, Teuchos::Array & edgeIsStrong) const; + + // Ghosting Algorithms + void GhostCoarseMap(const Matrix &A,const Import & Importer, const ArrayRCP myPointType,const RCP & coarseMap, RCP & coarseColMap) const; + + // Coarsening algorithms + void Coarsen_ClassicalModified(const Matrix & A,const RCP & Aghost, const GraphBase & graph, RCP & coarseColMap, RCP & coarseDomainMap, LO num_c_points, LO num_f_points, const Teuchos::ArrayView & myPointType, const Teuchos::ArrayView & myPointType_ghost, const Teuchos::Array & cpoint2pcol, const Teuchos::Array & pcol2cpoint, Teuchos::Array & eis_rowptr, Teuchos::Array & edgeIsStrong, RCP & BlockNumber, RCP remoteOnlyImporter, RCP & P) const; + void Coarsen_Direct(const Matrix & A,const RCP & Aghost, const GraphBase & graph, RCP & coarseColMap, RCP & coarseDomainMap, LO num_c_points, LO num_f_points, const Teuchos::ArrayView & myPointType, const Teuchos::ArrayView & myPointType_ghost, const Teuchos::Array & cpoint2pcol, const Teuchos::Array & pcol2cpoint, Teuchos::Array & eis_rowptr, Teuchos::Array & edgeIsStrong, RCP & BlockNumber, RCP & P) const; + void Coarsen_Ext_Plus_I(const Matrix & A,const RCP & Aghost, const GraphBase & graph, RCP & coarseColMap, RCP & coarseDomainMap, LO num_c_points, LO num_f_points, const Teuchos::ArrayView & myPointType, const Teuchos::ArrayView & myPointType_ghost, const Teuchos::Array & cpoint2pcol, const Teuchos::Array & pcol2cpoint, Teuchos::Array & eis_rowptr, Teuchos::Array & edgeIsStrong, RCP & BlockNumber, RCP & P) const; + + //@} + + }; //class ClassicalPFactory + +} //namespace MueLu + +#define MUELU_CLASSICALPFACTORY_SHORT +#endif // MUELU_CLASSICALPFACTORY_DECL_HPP diff --git 
a/packages/muelu/src/Transfers/Classical/MueLu_ClassicalPFactory_def.hpp b/packages/muelu/src/Transfers/Classical/MueLu_ClassicalPFactory_def.hpp new file mode 100644 index 000000000000..4e4860af590e --- /dev/null +++ b/packages/muelu/src/Transfers/Classical/MueLu_ClassicalPFactory_def.hpp @@ -0,0 +1,818 @@ +// @HEADER +// +// *********************************************************************** +// +// MueLu: A package for multigrid based preconditioning +// Copyright 2012 Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact +// Jonathan Hu (jhu@sandia.gov) +// Andrey Prokopenko (aprokop@sandia.gov) +// Ray Tuminaro (rstumin@sandia.gov) +// +// *********************************************************************** +// +// @HEADER +#ifndef MUELU_CLASSICALPFACTORY_DEF_HPP +#define MUELU_CLASSICALPFACTORY_DEF_HPP + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "MueLu_MasterList.hpp" +#include "MueLu_Monitor.hpp" +#include "MueLu_PerfUtils.hpp" +#include "MueLu_ClassicalPFactory_decl.hpp" +#include "MueLu_ClassicalMapFactory.hpp" +#include "MueLu_Utilities.hpp" +#include "MueLu_AmalgamationInfo.hpp" +#include "MueLu_GraphBase.hpp" + + +//#define CMS_DEBUG +//#define CMS_DUMP + +namespace { + +template +int Sign(SC val) { + using STS = typename Teuchos::ScalarTraits; + typename STS::magnitudeType MT_ZERO = Teuchos::ScalarTraits::zero(); + if(STS::real(val) > MT_ZERO) return 1; + else if(STS::real(val) < MT_ZERO) return -1; + else return 0; +} + +}// anonymous namepsace + +namespace MueLu { + + + + + template + RCP ClassicalPFactory::GetValidParameterList() const { + RCP validParamList = rcp(new ParameterList()); +#define SET_VALID_ENTRY(name) validParamList->setEntry(name, MasterList::getEntry(name)) + SET_VALID_ENTRY("aggregation: deterministic"); + SET_VALID_ENTRY("aggregation: coloring algorithm"); + SET_VALID_ENTRY("aggregation: 
classical scheme"); + + // To know if we need BlockNumber + SET_VALID_ENTRY("aggregation: drop scheme"); + { + typedef Teuchos::StringToIntegralParameterEntryValidator validatorType; + validParamList->getEntry("aggregation: classical scheme").setValidator(rcp(new validatorType(Teuchos::tuple("direct","ext+i","classical modified"), "aggregation: classical scheme"))); + + } + +#undef SET_VALID_ENTRY + validParamList->set< RCP >("A", Teuchos::null, "Generating factory of the matrix A"); + validParamList->set< RCP >("UnAmalgamationInfo", Teuchos::null, "Generating factory of UnAmalgamationInfo"); + validParamList->set< RCP >("Graph", null, "Generating factory of the graph"); + validParamList->set< RCP >("DofsPerNode", null, "Generating factory for variable \'DofsPerNode\', usually the same as for \'Graph\'"); + validParamList->set< RCP >("CoarseMap", Teuchos::null, "Generating factory of the CoarseMap"); + validParamList->set< RCP >("FC Splitting", Teuchos::null, "Generating factory of the FC Splitting"); + validParamList->set< RCP >("BlockNumber", Teuchos::null, "Generating factory for Block Number"); + // validParamList->set< RCP >("Nullspace", Teuchos::null, "Generating factory of the nullspace"); + + return validParamList; + } + + template + void ClassicalPFactory::DeclareInput(Level& fineLevel, Level& /* coarseLevel */) const { + Input(fineLevel, "A"); + Input(fineLevel, "Graph"); + Input(fineLevel, "DofsPerNode"); + Input(fineLevel, "UnAmalgamationInfo"); + Input(fineLevel, "DofsPerNode"); + Input(fineLevel, "CoarseMap"); + Input(fineLevel, "FC Splitting"); + + const ParameterList& pL = GetParameterList(); + std::string drop_algo = pL.get("aggregation: drop scheme"); + if (drop_algo.find("block diagonal") != std::string::npos) { + Input(fineLevel, "BlockNumber"); + } + + } + + template + void ClassicalPFactory::Build(Level& fineLevel, Level& coarseLevel) const { + return BuildP(fineLevel, coarseLevel); + } + + template + void ClassicalPFactory::BuildP(Level& 
fineLevel, Level& coarseLevel) const { + FactoryMonitor m(*this, "Build", coarseLevel); + using STS = Teuchos::ScalarTraits; + + // We start by assuming that someone did a reasonable strength of connection + // algorithm before we start to get our Graph, DofsPerNode and UnAmalgamationInfo + + // We begin by getting a MIS (from a graph coloring) and then at that point we need + // to start generating entries for the prolongator. + RCP A = Get< RCP >(fineLevel, "A"); + RCP ownedCoarseMap = Get >(fineLevel,"CoarseMap"); + RCP owned_fc_splitting = Get >(fineLevel,"FC Splitting"); + RCP graph = Get< RCP >(fineLevel, "Graph"); + // LO nDofsPerNode = Get(fineLevel, "DofsPerNode"); + RCP amalgInfo = Get< RCP > (fineLevel, "UnAmalgamationInfo"); + RCP Importer = A->getCrsGraph()->getImporter(); + Xpetra::UnderlyingLib lib = ownedCoarseMap->lib(); + + // RCP fineNullspace = Get< RCP > (fineLevel, "Nullspace"); + RCP P; + // SC SC_ZERO = STS::zero(); + LO LO_INVALID = Teuchos::OrdinalTraits::invalid(); + const point_type C_PT = ClassicalMapFactory::C_PT; + const point_type F_PT = ClassicalMapFactory::F_PT; + const ParameterList& pL = GetParameterList(); + + // FIXME: This guy doesn't work right now for NumPDEs != 1 + TEUCHOS_TEST_FOR_EXCEPTION(A->GetFixedBlockSize() != 1, Exceptions::RuntimeError,"ClassicalPFactory: Multiple PDEs per node not supported yet"); + + // FIXME: This does not work in parallel yet +// TEUCHOS_TEST_FOR_EXCEPTION(A->getRowMap()->getComm()->getSize() != 1,Exceptions::RuntimeError,"ClassicalPFactory: MPI Ranks > 1 not supported yet"); + + // NOTE: Let's hope we never need to deal with this case (row map of A differs from its domain map) + TEUCHOS_TEST_FOR_EXCEPTION(!A->getRowMap()->isSameAs(*A->getDomainMap()),Exceptions::RuntimeError,"ClassicalPFactory: Row map and domain map of A must be the same"); + + + // Do we need ghosts rows of A and myPointType? 
+ std::string scheme = pL.get("aggregation: classical scheme"); + bool need_ghost_rows =false; + if(scheme == "ext+i") + need_ghost_rows=true; + else if(scheme == "direct") + need_ghost_rows=false; + else if(scheme == "classical modified") + need_ghost_rows=true; + // NOTE: ParameterList validator will check this guy so we don't really need an "else" here + + + // Ghost the FC splitting and grab the data (if needed) + RCP fc_splitting; + ArrayRCP myPointType; + if(Importer.is_null()) { + fc_splitting = owned_fc_splitting; + } + else { + RCP fc_splitting_nonconst = LocalOrdinalVectorFactory::Build(A->getCrsGraph()->getColMap()); + fc_splitting_nonconst->doImport(*owned_fc_splitting,*Importer,Xpetra::INSERT); + fc_splitting = fc_splitting_nonconst; + } + myPointType = fc_splitting->getData(0); + + + /* Ghost A (if needed) */ + RCP Aghost; + RCP fc_splitting_ghost; + ArrayRCP myPointType_ghost; + RCP remoteOnlyImporter; + if(need_ghost_rows && !Importer.is_null()){ + ArrayView remoteLIDs = Importer->getRemoteLIDs(); + size_t numRemote = Importer->getNumRemoteIDs(); + Array remoteRows(numRemote); + for (size_t i = 0; i < numRemote; i++) + remoteRows[i] = Importer->getTargetMap()->getGlobalElement(remoteLIDs[i]); + + RCP remoteRowMap = MapFactory::Build(lib,Teuchos::OrdinalTraits::invalid(), remoteRows(), + A->getDomainMap()->getIndexBase(), A->getDomainMap()->getComm()); + + remoteOnlyImporter = Importer->createRemoteOnlyImport(remoteRowMap); + RCP Acrs = rcp_dynamic_cast(A)->getCrsMatrix(); + RCP Aghost_crs = CrsMatrixFactory::Build(Acrs,*remoteOnlyImporter,A->getDomainMap(),remoteOnlyImporter->getTargetMap()); + Aghost = rcp(new CrsMatrixWrap(Aghost_crs)); + // We also may need need to ghost myPointType for Aghos + RCP Importer2 = Aghost->getCrsGraph()->getImporter(); + if(Importer2.is_null()) { + RCP fc_splitting_ghost_nonconst = LocalOrdinalVectorFactory::Build(Aghost->getColMap()); + 
fc_splitting_ghost_nonconst->doImport(*owned_fc_splitting,*Importer,Xpetra::INSERT); + fc_splitting_ghost = fc_splitting_ghost_nonconst; + myPointType_ghost = fc_splitting_ghost->getData(0); + } + /* +#if OLD_AND_BUSTED + if(lib == Xpetra::UseEpetra) { +#ifdef HAVE_MUELU_EPETRA + RCP Ecrs = rcp(new EpetraCrsMatrix(Acrs,*remoteOnlyImporter,A->getDomainMap(),remoteOnlyImporter->getTargetMap())); + Aghost = rcp(new CrsMatrixWrap(Ecrs)); + RCP Importer2 = Ecrs->getCrsGraph()->getImporter(); + if(Importer2.is_null()) { + RCP fc_splitting_ghost_nonconst = LocalOrdinalVectorFactory::Build(Ecrs->getColMap()); + fc_splitting_ghost_nonconst->doImport(*owned_fc_splitting,*Importer,Xpetra::INSERT); + fc_splitting_ghost = fc_splitting_ghost_nonconst; + myPointType_ghost = fc_splitting_ghost->getData(0); + } +#endif + } + else { +#ifdef HAVE_MUELU_TPETRA + RCP Tcrs = rcp(new TpetraCrsMatrix(Acrs,*remoteOnlyImporter,A->getDomainMap(),remoteOnlyImporter->getTargetMap())); + Aghost = rcp(new CrsMatrixWrap(Tcrs)); + // We also need to ghost myPointType for Aghost, if we've created an Aghost + RCP Importer2 = Tcrs->getCrsGraph()->getImporter(); + if(Importer2.is_null()) { + RCP fc_splitting_ghost_nonconst = LocalOrdinalVectorFactory::Build(Tcrs->getColMap()); + fc_splitting_ghost_nonconst->doImport(*owned_fc_splitting,*Importer,Xpetra::INSERT); + fc_splitting_ghost = fc_splitting_ghost_nonconst; + myPointType_ghost = fc_splitting_ghost->getData(0); + } +#endif +#endif + } + */ + } + + + + /* Generate the ghosted Coarse map using the "Tuminaro maneuver" (if needed)*/ + RCP coarseMap; + if(Importer.is_null()) + coarseMap = ownedCoarseMap; + else { + // Generate a domain vector with the coarse ID's as entries for C points + GhostCoarseMap(*A,*Importer,myPointType,ownedCoarseMap,coarseMap); + } + + + // Get the block number, if we need it (and ghost it) + RCP BlockNumber; + std::string drop_algo = pL.get("aggregation: drop scheme"); + if (drop_algo.find("block diagonal") != 
std::string::npos) { + RCP OwnedBlockNumber; + OwnedBlockNumber = Get >(fineLevel, "BlockNumber"); + if(Importer.is_null()) + BlockNumber = OwnedBlockNumber; + else{ // ghosted copy must live on A's column map (the target of Importer), same as the ghosted fc_splitting + BlockNumber = LocalOrdinalVectorFactory::Build(A->getCrsGraph()->getColMap()); + BlockNumber->doImport(*OwnedBlockNumber,*Importer,Xpetra::INSERT); + } + } + +#if defined(CMS_DEBUG) || defined(CMS_DUMP) + { + std::ofstream ofs(std::string("dropped_graph_") + std::to_string(fineLevel.GetLevelID()) + std::string(".dat"),std::ofstream::out); + RCP fancy = Teuchos::fancyOStream(Teuchos::rcpFromRef(ofs)); + graph->print(*fancy,Debug); + std::string out_fc = std::string("fc_splitting_") + std::to_string(fineLevel.GetLevelID()) + std::string(".dat"); + + // We don't support writing LO vectors in Xpetra (boo!) so.... + using real_type = typename Teuchos::ScalarTraits::magnitudeType; + using RealValuedMultiVector = typename Xpetra::MultiVector; + typedef Xpetra::MultiVectorFactory RealValuedMultiVectorFactory; + + RCP mv = RealValuedMultiVectorFactory::Build(fc_splitting->getMap(),1); + ArrayRCP mv_data= mv->getDataNonConst(0); + ArrayRCP fc_data= fc_splitting->getData(0); + + for(LO i=0; i<(LO)fc_data.size(); i++) + mv_data[i] = Teuchos::as(fc_data[i]); + Xpetra::IO::Write(out_fc,*mv); + + + } +#endif + + + /* Generate reindexing arrays */ + // Note: cpoint2pcol is ghosted if myPointType is + // NOTE: Since the ghosted coarse column map follows the ordering of + // the fine column map, this *should* work, because it is in local indices. + // FIXME: Add a check for this in debug mode. 
+ Array cpoint2pcol(myPointType.size(),LO_INVALID); + Array pcol2cpoint(coarseMap->getNodeNumElements(),LO_INVALID); + LO num_c_points = 0; + LO num_f_points =0; + for(LO i=0; i<(LO) myPointType.size(); i++) { + if(myPointType[i] == C_PT) { + cpoint2pcol[i] = num_c_points; + num_c_points++; + } + else if (myPointType[i] == F_PT) + num_f_points++; + } + for(LO i=0; i<(LO)cpoint2pcol.size(); i++) { + if(cpoint2pcol[i] != LO_INVALID) + pcol2cpoint[cpoint2pcol[i]] =i; + } + + // Generate edge strength flags (this will make everything easier later) + // These do *not* need to be ghosted (unlike A) + Teuchos::Array eis_rowptr; + Teuchos::Array edgeIsStrong; + { + SubFactoryMonitor sfm(*this,"Strength Flags",coarseLevel); + GenerateStrengthFlags(*A,*graph,eis_rowptr,edgeIsStrong); + } + + // Phase 3: Generate the P matrix + RCP coarseColMap = coarseMap; + RCP coarseDomainMap = ownedCoarseMap; + if(scheme == "ext+i") { + SubFactoryMonitor sfm(*this,"Ext+i Interpolation",coarseLevel); + Coarsen_Ext_Plus_I(*A,Aghost,*graph,coarseColMap,coarseDomainMap,num_c_points,num_f_points,myPointType(),myPointType_ghost(),cpoint2pcol,pcol2cpoint,eis_rowptr,edgeIsStrong,BlockNumber,P); + } + else if(scheme == "direct") { + SubFactoryMonitor sfm(*this,"Direct Interpolation",coarseLevel); + Coarsen_Direct(*A,Aghost,*graph,coarseColMap,coarseDomainMap,num_c_points,num_f_points,myPointType(),myPointType_ghost(),cpoint2pcol,pcol2cpoint,eis_rowptr,edgeIsStrong,BlockNumber,P); + } + else if(scheme == "classical modified") { + SubFactoryMonitor sfm(*this,"Classical Modified Interpolation",coarseLevel); + Coarsen_ClassicalModified(*A,Aghost,*graph,coarseColMap,coarseDomainMap,num_c_points,num_f_points,myPointType(),myPointType_ghost(),cpoint2pcol,pcol2cpoint,eis_rowptr,edgeIsStrong,BlockNumber,remoteOnlyImporter,P); + } + // NOTE: ParameterList validator will check this guy so we don't really need an "else" here + +#ifdef CMS_DEBUG + Xpetra::IO::Write("classical_p.mat", *P); +#endif + + // Save 
output + Set(coarseLevel,"P",P); + RCP pg = P->getCrsGraph(); + Set(coarseLevel,"P Graph",pg); + + //RCP coarseNullspace = MultiVectorFactory::Build(coarseMap, fineNullspace->getNumVectors()); + // P->apply(*fineNullspace, *coarseNullspace, Teuchos::TRANS, Teuchos::ScalarTraits::one(), Teuchos::ScalarTraits::zero()); + // Set(coarseLevel, "Nullspace", coarseNullspace); + + if (IsPrint(Statistics1)) { + RCP params = rcp(new ParameterList()); + params->set("printLoadBalancingInfo", true); + GetOStream(Statistics1) << PerfUtils::PrintMatrixInfo(*P, "P", params); + } + } +/* ************************************************************************* */ +template +void ClassicalPFactory:: +Coarsen_ClassicalModified(const Matrix & A,const RCP & Aghost, const GraphBase & graph, RCP & coarseColMap, RCP & coarseDomainMap, LO num_c_points, LO num_f_points, const Teuchos::ArrayView & myPointType, const Teuchos::ArrayView & myPointType_ghost, const Teuchos::Array & cpoint2pcol, const Teuchos::Array & pcol2cpoint, Teuchos::Array & eis_rowptr, Teuchos::Array & edgeIsStrong, RCP & BlockNumber, RCP remoteOnlyImporter,RCP & P) const { + /* ============================================================= */ + /* Phase 3 : Classical Modified Interpolation */ + /* De Sterck, Falgout, Nolting and Yang. "Distance-two */ + /* interpolation for parallel algebraic multigrid", NLAA 2008 */ + /* 15:115-139 */ + /* ============================================================= */ + /* Definitions: */ + /* F = F-points */ + /* C = C-points */ + /* N_i = non-zero neighbors of node i */ + /* S_i = {j\in N_i | j strongly influences i } [strong neighbors of i] */ + /* F_i^s = F \cap S_i [strong F-neighbors of i] */ + /* C_i^s = C \cap S_i [strong C-neighbors of i] */ + + /* N_i^w = N_i\ (F_i^s \cup C_i^s) [weak neighbors of i] */ + /* This guy has a typo. The paper had a \cap instead of \cup */ + /* I would note that this set can contain both F-points and */ + /* C-points. 
They're just weak neighbors of this guy. */ + /* Note that N_i^w \cup F_i^s \cup C_i^s = N_i by construction */ + + + /* \bar{a}_ij = { 0, if sign(a_ij) == sign(a_ii) */ + /* { a_ij, otherwise */ + + /* F_i^s\star = {k\in N_i | C_i^s \cap C_k^s = \emptyset} */ + /* [set of F-neighbors of i that do not share a strong */ + /* C-neighbor with i] */ + + + /* Rewritten Equation (9) on p. 120 */ + /* \tilde{a}_ii = (a_ij + \sum_{k\in{N_i^w \cup F_i^s\star}} a_ik */ + /* */ + /* f_ij = \sum_{k\in{F_i^s\setminusF_i^s*}} \frac{a_ik \bar{a}_kj}{\sum_{m\inC_i^s \bar{a}_km}} */ + /* */ + /* w_ij = \frac{1}{\tilde{a}_ii} ( a_ij + f_ij) for all j in C_i^s */ + + + TEUCHOS_TEST_FOR_EXCEPTION(1,std::runtime_error,"ClassicalPFactory: ClassicalModified not implemented"); + +} + + +/* ************************************************************************* */ +template +void ClassicalPFactory:: +Coarsen_Direct(const Matrix & A,const RCP & Aghost, const GraphBase & graph, RCP & coarseColMap, RCP & coarseDomainMap, LO num_c_points, LO num_f_points, const Teuchos::ArrayView & myPointType, const Teuchos::ArrayView & myPointType_ghost, const Teuchos::Array & cpoint2pcol, const Teuchos::Array & pcol2cpoint, Teuchos::Array & eis_rowptr, Teuchos::Array & edgeIsStrong, RCP & BlockNumber, RCP & P) const { + /* ============================================================= */ + /* Phase 3 : Direct Interpolation */ + /* We do not use De Sterck, Falgout, Nolting and Yang (2008) */ + /* here. Instead we follow: */ + /* Trottenberg, Oosterlee and Schueller, Multigrid, 2001. 
*/ + /* with some modifications inspired by PyAMG */ + /* ============================================================= */ + /* Definitions: */ + /* F = F-points */ + /* C = C-points */ + /* N_i = non-zero neighbors of node i */ + /* S_i = {j\in N_i | j strongly influences i } [strong neighbors of i] */ + /* F_i^s = F \cap S_i [strong F-neighbors of i] */ + /* C_i^s = C \cap S_i [strong C-neighbors of i] */ + /* P_i = Set of interpolatory variables for row i [here = C_i^s] */ + + /* (A.2.17) from p. 426 */ + /* a_ij^- = { a_ij, if a_ij < 0 */ + /* { 0, otherwise */ + /* a_ij^+ = { a_ij, if a_ij > 0 */ + /* { 0, otherwise */ + /* P_i^- = P_i \cap {k | a_ik^- != 0 and a_ik^- = a_ik} */ + /* [strong C-neighbors with negative edges] */ + /* P_i^+ = P_i \cap {k | a_ik^+ != 0 and a_ik^+ = a_ik} */ + /* [strong C-neighbors with positive edges] */ + + + /* de Sterck et al., gives us this: */ + /* Rewritten Equation (6) on p. 119 */ + /* w_ij = - (a_ij / a_ii) \frac{\sum_{k\in N_i} a_ik}{\sum_{k\in C_i^s} a_ik}, j\in C_i^s */ + + /* Trottenberg et al. (A.7.6) and (A.7.7) on p. 479 gives this: */ + /* alpha_i = \frac{ \sum_{j\in N_i} a_ij^- }{ \sum_{k\in P_i} a_ik^- } */ + /* beta_i = \frac{ \sum_{j\in N_i} a_ij^+ }{ \sum_{k\in P_i} a_ik^+ } */ + /* w_ik = { - alpha_i (a_ik / a_ii), if k\in P_i^- */ + /* { - beta_i (a_ik / a_ii), if k\in P_i^+ */ + /* NOTE: The text says to modify, if P_i^+ is zero but it isn't entirely clear how that */ + /* works. We'll follow the PyAMG implementation in a few important ways. */ + + const point_type C_PT = ClassicalMapFactory::C_PT; + const point_type DIRICHLET_PT = ClassicalMapFactory::DIRICHLET_PT; + + // Initial (estimated) allocation + // NOTE: If we only used Tpetra, then we could use these guys as is, but because Epetra, we can't, so there + // needs to be a copy below. 
+ using STS = typename Teuchos::ScalarTraits; + using MT = typename STS::magnitudeType; + using MTS = typename Teuchos::ScalarTraits; + size_t Nrows = A.getNodeNumRows(); + double c_point_density = (double)num_c_points / (num_c_points+num_f_points); + double mean_strong_neighbors_per_row = (double) graph.GetNodeNumEdges() / graph.GetNodeNumVertices(); + // double mean_neighbors_per_row = (double)A.getNodeNumEntries() / Nrows; + double nnz_per_row_est = c_point_density*mean_strong_neighbors_per_row; + + size_t nnz_est = std::max(Nrows,std::min((size_t)A.getNodeNumEntries(),(size_t)(nnz_per_row_est*Nrows))); + SC SC_ZERO = STS::zero(); + MT MT_ZERO = MTS::zero(); + Array tmp_rowptr(Nrows+1); + Array tmp_colind(nnz_est); + + // Algorithm (count+realloc) + // For each row, i, + // - Count the number of elements in \hat{C}_j, aka [C-neighbors and C-neighbors of strong F-neighbors of i] + size_t ct=0; + for(LO row=0; row < (LO) Nrows; row++) { + size_t row_start = eis_rowptr[row]; + ArrayView indices; + ArrayView vals; + std::set C_hat; + if(myPointType[row] == DIRICHLET_PT) { + // Dirichlet points get ignored completely + } + else if(myPointType[row] == C_PT) { + // C-Points get a single 1 in their row + C_hat.insert(cpoint2pcol[row]); + } + else { + // F-Points have a more complicated interpolation + + // C-neighbors of row + A.getLocalRowView(row, indices, vals); + for(LO j=0; j (size_t)tmp_colind.size()) { + tmp_colind.resize(std::max(ct+(size_t)C_hat.size(),(size_t)2*tmp_colind.size())); + } + + // Copy + std::copy(C_hat.begin(), C_hat.end(),tmp_colind.begin()+ct); + ct+=C_hat.size(); + tmp_rowptr[row+1] = tmp_rowptr[row] + C_hat.size(); + } + // Resize down + tmp_colind.resize(tmp_rowptr[Nrows]); + + // Allocate memory & copy + P = rcp(new CrsMatrixWrap(A.getRowMap(), coarseColMap, 0)); + RCP PCrs = rcp_dynamic_cast(P)->getCrsMatrix(); + ArrayRCP P_rowptr; + ArrayRCP P_colind; + ArrayRCP P_values; + +#ifdef CMS_DEBUG +printf("CMS: Allocating P w/ %d 
nonzeros\n",(int)tmp_rowptr[Nrows]); +#endif + PCrs->allocateAllValues(tmp_rowptr[Nrows], P_rowptr, P_colind, P_values); + TEUCHOS_TEST_FOR_EXCEPTION(tmp_rowptr.size() !=P_rowptr.size(), Exceptions::RuntimeError,"ClassicalPFactory: Allocation size error (rowptr)"); + TEUCHOS_TEST_FOR_EXCEPTION(tmp_colind.size() !=P_colind.size(), Exceptions::RuntimeError,"ClassicalPFactory: Allocation size error (colind)"); + // FIXME: This can be short-circuited for Tpetra, if we decide we care + for(LO i=0; i<(LO)Nrows+1; i++) + P_rowptr[i] = tmp_rowptr[i]; + for(LO i=0; i<(LO)tmp_rowptr[Nrows]; i++) + P_colind[i] = tmp_colind[i]; + + + // Algorithm (numeric) + for(LO i=0; i < (LO)Nrows; i++) { + if(myPointType[i] == DIRICHLET_PT) { + // Dirichlet points get ignored completely +#ifdef CMS_DEBUG + // DEBUG + printf("** A(%d,:) is a Dirichlet-Point.\n",i); +#endif + } + else if (myPointType[i] == C_PT) { + // C Points get a single 1 in their row + P_values[P_rowptr[i]] = Teuchos::ScalarTraits::one(); +#ifdef CMS_DEBUG + // DEBUG + printf("** A(%d,:) is a C-Point.\n",i); +#endif + } + else { + /* Trottenberg et al. (A.7.6) and (A.7.7) on p. 
479 gives this: */ + /* alpha_i = \frac{ \sum_{j\in N_i} a_ij^- }{ \sum_{k\in P_i} a_ik^- } */ + /* beta_i = \frac{ \sum_{j\in N_i} a_ij^+ }{ \sum_{k\in P_i} a_ik^+ } */ + /* w_ik = { - alpha_i (a_ik / a_ii), if k\in P_i^- */ + /* { - beta_i (a_ik / a_ii), if k\in P_i^+ */ + ArrayView A_indices_i, A_incides_k; + ArrayView A_vals_i, A_indices_k; + A.getLocalRowView(i, A_indices_i, A_vals_i); + size_t row_start = eis_rowptr[i]; + + ArrayView P_indices_i = P_colind.view(P_rowptr[i],P_rowptr[i+1] - P_rowptr[i]); + ArrayView P_vals_i = P_values.view(P_rowptr[i],P_rowptr[i+1] - P_rowptr[i]); + +#ifdef CMS_DEBUG + // DEBUG + { + char mylabel[5]="FUCD"; + char sw[3]="ws"; + printf("** A(%d,:) = ",i); + for(LO j=0; j<(LO)A_indices_i.size(); j++){ + printf("%6.4e(%d-%c%c) ",A_vals_i[j],A_indices_i[j],mylabel[1+myPointType[A_indices_i[j]]],sw[(int)edgeIsStrong[row_start+j]]); + } + printf("\n"); + } +#endif + + SC a_ii = SC_ZERO; + SC pos_numerator = SC_ZERO, neg_numerator = SC_ZERO; + SC pos_denominator = SC_ZERO, neg_denominator = SC_ZERO; + // Find the diagonal and compute the sum ratio + for(LO j=0; j<(LO)A_indices_i.size(); j++) { + SC a_ik = A_vals_i[j]; + LO k = A_indices_i[j]; + + // Diagonal + if(i == k) { + a_ii = a_ik; + } + // Only strong C-neighbors are in the denomintor + if(myPointType[k] == C_PT && edgeIsStrong[row_start + j]) { + if(STS::real(a_ik) > MT_ZERO) pos_denominator += a_ik; + else neg_denominator += a_ik; + } + + // All neighbors are in the numerator + // NOTE: As per PyAMG, this does not include the diagonal + if(i != k) { + if(STS::real(a_ik) > MT_ZERO) pos_numerator += a_ik; + else neg_numerator += a_ik; + } + } + SC alpha = (neg_denominator == MT_ZERO) ? SC_ZERO : (neg_numerator / neg_denominator); + SC beta = (pos_denominator == MT_ZERO) ? 
SC_ZERO : (pos_numerator / pos_denominator); + alpha /= -a_ii; + beta /= -a_ii; + + // Loop over the entries + for(LO p_j=0; p_j<(LO)P_indices_i.size(); p_j++){ + LO P_col = pcol2cpoint[P_indices_i[p_j]]; + SC a_ij = SC_ZERO; + + // Find A_ij (if it is there) + // FIXME: We can optimize this if we assume sorting + for(LO a_j =0; a_j<(LO)A_indices_i.size(); a_j++) { + if(A_indices_i[a_j] == P_col) { + a_ij = A_vals_i[a_j]; + break; + } + } + SC w_ij = (STS::real(a_ij) < 0 ) ? (alpha * a_ij) : (beta * a_ij); +#ifdef CMS_DEBUG + SC alpha_or_beta = (STS::real(a_ij) < 0 ) ? alpha : beta; + printf("P(%d,%d/%d) = - %6.4e * %6.4e = %6.4e\n",i,P_indices_i[p_j],pcol2cpoint[P_indices_i[p_j]],alpha_or_beta,a_ij,w_ij); +#endif + P_vals_i[p_j] = w_ij; + }//end for A_indices_i + }//end else C_PT + }//end for Numrows + + // Finish up + PCrs->setAllValues(P_rowptr, P_colind, P_values); + PCrs->expertStaticFillComplete(/*domain*/coarseDomainMap, /*range*/A.getDomainMap()); +} + + +/* ************************************************************************* */ +template +void ClassicalPFactory:: +Coarsen_Ext_Plus_I(const Matrix & A,const RCP & Aghost, const GraphBase & graph, RCP & coarseColMap, RCP & coarseDomainMap, LO num_c_points, LO num_f_points, const Teuchos::ArrayView & myPointType, const Teuchos::ArrayView & myPointType_ghost, const Teuchos::Array & cpoint2pcol, const Teuchos::Array & pcol2cpoint, Teuchos::Array & eis_rowptr, Teuchos::Array & edgeIsStrong, RCP & BlockNumber, RCP & P) const { + + /* ============================================================= */ + /* Phase 3 : Extended+i Interpolation */ + /* De Sterck, Falgout, Nolting and Yang. 
"Distance-two */ + /* interpolation for parallel algebraic multigrid", NLAA 2008 */ + /* 15:115-139 */ + /* ============================================================= */ + /* Definitions: */ + /* F = F-points */ + /* C = C-points */ + /* N_i = non-zero neighbors of node i */ + /* S_i = {j\in N_i | j strongly influences i } [strong neighbors of i] */ + /* F_i^s = F \cap S_i [strong F-neighbors of i] */ + /* C_i^s = C \cap S_i [strong C-neighbors of i] */ + /* N_i^w = N_i\ (F_i^s \cup C_i^s) [weak neighbors of i] */ + /* This guy has a typo. The paper had a \cap instead of \cup */ + /* I would note that this set can contain both F-points and */ + /* C-points. They're just weak neighbors of this guy. */ + /* Note that N_i^w \cup F_i^s \cup C_i^s = N_i by construction */ + + /* \hat{C}_i = C_i \cup (\bigcup_{j\inF_i^s} C_j) */ + /* [C-neighbors and C-neighbors of strong F-neighbors of i] */ + /* */ + + /* \bar{a}_ij = { 0, if sign(a_ij) == sign(a_ii) */ + /* { a_ij, otherwise */ + + + /* Rewritten Equation (19) on p. 123 */ + /* f_ik = \frac{\bar{a}_kj}{\sum{l\in \hat{C}_i\cup {i}} \bar{a}_kl */ + /* w_ij = -\tilde{a}_ii^{-1} (a_ij + \sum_{k\inF_i^s} a_ik f_ik */ + /* for j in \hat{C}_i */ + + /* Rewritten Equation (20) on p. 124 [for the lumped diagonal] */ + /* g_ik = \frac{\bar{a}_ki}{\sum{l\in \hat{C}_i\cup {i}} \bar{a}_kl */ + /* \tilde{a}_ii = a_ii + \sum_{n\inN_i^w\setminus \hat{C}_i} a_in + \sum_{k\inF_i^s} a_ik g_ik */ + TEUCHOS_TEST_FOR_EXCEPTION(1,std::runtime_error,"ClassicalPFactory: Ext+i not implemented"); + +} + + + + +/* ************************************************************************* */ +template +void ClassicalPFactory:: +GenerateStrengthFlags(const Matrix & A,const GraphBase & graph, Teuchos::Array & eis_rowptr,Teuchos::Array & edgeIsStrong) const { + // To make this easier, we'll create a bool array equal to the nnz in the matrix + // so we know whether each edge is strong or not. 
This will save us a bunch of + // trying to match the graph and matrix later + size_t Nrows = A.getNodeNumRows(); + eis_rowptr.resize(Nrows+1); + + if(edgeIsStrong.size() == 0) { + // Preferred + edgeIsStrong.resize(A.getNodeNumEntries(),false); + } + else { + edgeIsStrong.resize(A.getNodeNumEntries(),false); + edgeIsStrong.assign(A.getNodeNumEntries(),false); + } + + eis_rowptr[0] = 0; + for (LO i=0; i<(LO)Nrows; i++) { + LO rowstart = eis_rowptr[i]; + ArrayView A_indices; + ArrayView A_values; + A.getLocalRowView(i, A_indices, A_values); + LO A_size = (LO) A_indices.size(); + + ArrayView G_indices = graph.getNeighborVertices(i); + LO G_size = (LO) G_indices.size(); + + // Both of these guys should be in the same (sorted) order, but let's check + bool is_ok=true; + for(LO j=0; j A_indices[j+1]) { is_ok=false; break;} + for(LO j=0; j G_indices[j+1]) { is_ok=false; break;} + TEUCHOS_TEST_FOR_EXCEPTION(!is_ok, Exceptions::RuntimeError,"ClassicalPFactory: Exected A and Graph to be sorted"); + + // Now cycle through and set the flags - if the edge is in G it is strong + for(LO g_idx=0, a_idx=0; g_idx < G_size; g_idx++) { + LO col = G_indices[g_idx]; + while (A_indices[a_idx] != col && a_idx < A_size) a_idx++; + if(a_idx == A_size) {is_ok=false;break;} + edgeIsStrong[rowstart+a_idx] = true; + } + + eis_rowptr[i+1] = eis_rowptr[i] + A_size; + } +} + + +/* ************************************************************************* */ +template +void ClassicalPFactory:: +GhostCoarseMap(const Matrix &A, const Import & Importer, const ArrayRCP myPointType, const RCP & coarseMap, RCP & coarseColMap) const { + const point_type C_PT = ClassicalMapFactory::C_PT; + const GO GO_INVALID = Teuchos::OrdinalTraits::invalid(); + RCP d_coarseIds = GlobalOrdinalVectorFactory::Build(A.getRowMap()); + ArrayRCP d_data = d_coarseIds->getDataNonConst(0); + LO ct=0; + + for(LO i=0; i<(LO)d_data.size(); i++) { + if(myPointType[i] == C_PT) { + d_data[i] = coarseMap->getGlobalElement(ct); + ct++; + 
} + else + d_data[i] = GO_INVALID; + } + + // Ghost this guy + RCP c_coarseIds = GlobalOrdinalVectorFactory::Build(A.getColMap()); + c_coarseIds->doImport(*d_coarseIds,Importer,Xpetra::INSERT); + + // If we assume that A is in Aztec ordering, then any subset of A's unknowns will + // be in Aztec ordering as well, which means we can just condense these guys down + // Overallocate, count and view + ArrayRCP c_data = c_coarseIds->getDataNonConst(0); + + Array c_gids(c_data.size()); + LO count=0; + + for(LO i=0; i<(LO)c_data.size(); i++) { + if(c_data[i] != GO_INVALID) { + c_gids[count] = c_data[i]; + count++; + } + } + // FIXME: Assumes scalar PDE + std::vector stridingInfo_(1); + stridingInfo_[0]=1; + GO domainGIDOffset = 0; + + coarseColMap = StridedMapFactory::Build(coarseMap->lib(), + Teuchos::OrdinalTraits::invalid(), + c_gids.view(0,count), + coarseMap->getIndexBase(), + stridingInfo_, + coarseMap->getComm(), + domainGIDOffset); + +} + + +} //namespace MueLu + + + +#define MUELU_CLASSICALPFACTORY_SHORT +#endif // MUELU_CLASSICALPFACTORY_DEF_HPP + + diff --git a/packages/muelu/src/Transfers/GeneralGeometric/MueLu_GeometricInterpolationPFactory_def.hpp b/packages/muelu/src/Transfers/GeneralGeometric/MueLu_GeometricInterpolationPFactory_def.hpp index e714b5b5fb4f..f35adec7a0bd 100644 --- a/packages/muelu/src/Transfers/GeneralGeometric/MueLu_GeometricInterpolationPFactory_def.hpp +++ b/packages/muelu/src/Transfers/GeneralGeometric/MueLu_GeometricInterpolationPFactory_def.hpp @@ -63,12 +63,11 @@ namespace MueLu { RCP validParamList = rcp(new ParameterList()); #define SET_VALID_ENTRY(name) validParamList->setEntry(name, MasterList::getEntry(name)) - SET_VALID_ENTRY("interp: interpolation order"); SET_VALID_ENTRY("interp: build coarse coordinates"); #undef SET_VALID_ENTRY // general variables needed in GeometricInterpolationPFactory - validParamList->set >("A", Teuchos::null, + validParamList->set >("A", Teuchos::null, "Generating factory of the matrix A"); 
validParamList->set >("Aggregates", Teuchos::null, "Aggregates generated by StructuredAggregationFactory used to construct a piece-constant prolongator."); @@ -85,7 +84,9 @@ namespace MueLu { validParamList->set >("numDimensions", Teuchos::null, "Number of spacial dimensions in the problem."); validParamList->set >("lCoarseNodesPerDim", Teuchos::null, - "Number of nodes per spatial dimension on the coarse grid."); + "Number of nodes per spatial dimension on the coarse grid."); + validParamList->set >("structuredInterpolationOrder", Teuchos::null, + "Interpolation order for constructing the prolongator."); validParamList->set ("keep coarse coords", false, "Flag to keep coordinates for special coarse grid solve"); validParamList->set ("interp: remove small entries", true, "Remove small interpolation coeficient from prolongator to reduce fill-in on coarse level"); @@ -102,9 +103,10 @@ namespace MueLu { Input(fineLevel, "numDimensions"); Input(fineLevel, "prolongatorGraph"); Input(fineLevel, "lCoarseNodesPerDim"); + Input(fineLevel, "structuredInterpolationOrder"); if( pL.get("interp: build coarse coordinates") || - (pL.get("interp: interpolation order") == 1) ) { + Get(fineLevel, "structuredInterpolationOrder") == 1) { Input(fineLevel, "Coordinates"); Input(fineLevel, "coarseCoordinatesFineMap"); Input(fineLevel, "coarseCoordinatesMap"); @@ -138,7 +140,7 @@ namespace MueLu { const ParameterList& pL = GetParameterList(); const bool removeSmallEntries = pL.get("interp: remove small entries"); const bool buildCoarseCoordinates = pL.get("interp: build coarse coordinates"); - const int interpolationOrder = pL.get ("interp: interpolation order"); + const int interpolationOrder = Get(fineLevel, "structuredInterpolationOrder"); const int numDimensions = Get(fineLevel, "numDimensions"); // Declared main input/outputs to be retrieved and placed on the fine resp. 
coarse level diff --git a/packages/muelu/src/Transfers/GeneralGeometric/MueLu_GeometricInterpolationPFactory_kokkos_def.hpp b/packages/muelu/src/Transfers/GeneralGeometric/MueLu_GeometricInterpolationPFactory_kokkos_def.hpp index 5c6e62280938..4730b9ee619a 100644 --- a/packages/muelu/src/Transfers/GeneralGeometric/MueLu_GeometricInterpolationPFactory_kokkos_def.hpp +++ b/packages/muelu/src/Transfers/GeneralGeometric/MueLu_GeometricInterpolationPFactory_kokkos_def.hpp @@ -63,7 +63,6 @@ namespace MueLu { RCP validParamList = rcp(new ParameterList()); #define SET_VALID_ENTRY(name) validParamList->setEntry(name, MasterList::getEntry(name)) - SET_VALID_ENTRY("interp: interpolation order"); SET_VALID_ENTRY("interp: build coarse coordinates"); #undef SET_VALID_ENTRY @@ -82,6 +81,8 @@ namespace MueLu { "Number of nodes per spatial dimension on the coarse grid."); validParamList->set >("indexManager", Teuchos::null, "The index manager associated with the local mesh."); + validParamList->set >("structuredInterpolationOrder", Teuchos::null, + "Interpolation order for constructing the prolongator."); return validParamList; } @@ -96,9 +97,10 @@ namespace MueLu { Input(fineLevel, "numDimensions"); Input(fineLevel, "prolongatorGraph"); Input(fineLevel, "lCoarseNodesPerDim"); + Input(fineLevel, "structuredInterpolationOrder"); if( pL.get("interp: build coarse coordinates") || - (pL.get("interp: interpolation order") == 1) ) { + Get(fineLevel, "structuredInterpolationOrder") == 1) { Input(fineLevel, "Coordinates"); Input(fineLevel, "indexManager"); } @@ -130,7 +132,7 @@ namespace MueLu { // Get inputs from the parameter list const ParameterList& pL = GetParameterList(); const bool buildCoarseCoordinates = pL.get("interp: build coarse coordinates"); - const int interpolationOrder = pL.get ("interp: interpolation order"); + const int interpolationOrder = Get(fineLevel, "structuredInterpolationOrder"); const int numDimensions = Get(fineLevel, "numDimensions"); // Declared main 
input/outputs to be retrieved and placed on the fine resp. coarse level diff --git a/packages/muelu/src/Transfers/PCoarsen/MueLu_IntrepidPCoarsenFactory_def.hpp b/packages/muelu/src/Transfers/PCoarsen/MueLu_IntrepidPCoarsenFactory_def.hpp index 06c548504b52..1154089e7347 100644 --- a/packages/muelu/src/Transfers/PCoarsen/MueLu_IntrepidPCoarsenFactory_def.hpp +++ b/packages/muelu/src/Transfers/PCoarsen/MueLu_IntrepidPCoarsenFactory_def.hpp @@ -104,7 +104,7 @@ namespace MueLu { namespace MueLuIntrepid { inline std::string tolower(const std::string & str) { std::string data(str); - std::transform(data.begin(), data.end(), data.begin(), [](unsigned char c) { return std::tolower(c); }); + std::transform(data.begin(), data.end(), data.begin(), ::tolower); return data; } diff --git a/packages/muelu/src/Utils/ClassList/SC-LO-GO-NO.classList b/packages/muelu/src/Utils/ClassList/SC-LO-GO-NO.classList index f0ab4c0887b3..826d4c8934b5 100644 --- a/packages/muelu/src/Utils/ClassList/SC-LO-GO-NO.classList +++ b/packages/muelu/src/Utils/ClassList/SC-LO-GO-NO.classList @@ -17,6 +17,8 @@ BlockedRAPFactory BrickAggregationFactory BraessSarazinSmoother CGSolver +ClassicalMapFactory +ClassicalPFactory CloneRepartitionInterface CoalesceDropFactory CoalesceDropFactory_kokkos - #if defined(HAVE_MUELU_KOKKOS_REFACTOR) diff --git a/packages/muelu/src/Utils/ExplicitInstantiation/ETI_SC_LO_GO_NO_classes.cmake b/packages/muelu/src/Utils/ExplicitInstantiation/ETI_SC_LO_GO_NO_classes.cmake index 859a82d2f05a..aaca3f4f6077 100644 --- a/packages/muelu/src/Utils/ExplicitInstantiation/ETI_SC_LO_GO_NO_classes.cmake +++ b/packages/muelu/src/Utils/ExplicitInstantiation/ETI_SC_LO_GO_NO_classes.cmake @@ -18,6 +18,8 @@ APPEND_SET(MUELU_SC_LO_GO_NO_ETI_CLASSES MueLu::BlockedRAPFactory ) APPEND_SET(MUELU_SC_LO_GO_NO_ETI_CLASSES MueLu::BrickAggregationFactory ) APPEND_SET(MUELU_SC_LO_GO_NO_ETI_CLASSES MueLu::BraessSarazinSmoother ) APPEND_SET(MUELU_SC_LO_GO_NO_ETI_CLASSES MueLu::CGSolver ) 
+APPEND_SET(MUELU_SC_LO_GO_NO_ETI_CLASSES MueLu::ClassicalMapFactory ) +APPEND_SET(MUELU_SC_LO_GO_NO_ETI_CLASSES MueLu::ClassicalPFactory ) APPEND_SET(MUELU_SC_LO_GO_NO_ETI_CLASSES MueLu::CloneRepartitionInterface ) APPEND_SET(MUELU_SC_LO_GO_NO_ETI_CLASSES MueLu::CoalesceDropFactory ) APPEND_SET(MUELU_SC_LO_GO_NO_ETI_CLASSES MueLu::CoalesceDropFactory_kokkos-.?if.defined[HAVE_MUELU_KOKKOS_REFACTOR] ) diff --git a/packages/muelu/src/Utils/ForwardDeclaration/MueLu_ClassicalMapFactory_fwd.hpp b/packages/muelu/src/Utils/ForwardDeclaration/MueLu_ClassicalMapFactory_fwd.hpp new file mode 100644 index 000000000000..7e675f6a44bb --- /dev/null +++ b/packages/muelu/src/Utils/ForwardDeclaration/MueLu_ClassicalMapFactory_fwd.hpp @@ -0,0 +1,63 @@ +// @HEADER +// +// *********************************************************************** +// +// MueLu: A package for multigrid based preconditioning +// Copyright 2012 Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact +// Jonathan Hu (jhu@sandia.gov) +// Andrey Prokopenko (aprokop@sandia.gov) +// Ray Tuminaro (rstumin@sandia.gov) +// +// *********************************************************************** +// +// @HEADER +#ifndef MUELU_CLASSICALMAPFACTORY_FWD_HPP +#define MUELU_CLASSICALMAPFACTORY_FWD_HPP + + + + +namespace MueLu { + template + class ClassicalMapFactory; +} + +#ifndef MUELU_CLASSICALMAPFACTORY_SHORT +#define MUELU_CLASSICALMAPFACTORY_SHORT +#endif + + + +#endif // MUELU_CLASSICALMAPFACTORY_FWD_HPP diff --git a/packages/muelu/src/Utils/ForwardDeclaration/MueLu_ClassicalPFactory_fwd.hpp b/packages/muelu/src/Utils/ForwardDeclaration/MueLu_ClassicalPFactory_fwd.hpp new file mode 100644 index 000000000000..8e3c7f7a5dcd --- /dev/null +++ b/packages/muelu/src/Utils/ForwardDeclaration/MueLu_ClassicalPFactory_fwd.hpp @@ -0,0 +1,63 @@ +// @HEADER +// +// *********************************************************************** +// +// MueLu: A package for multigrid based preconditioning +// Copyright 2012 Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? 
Contact +// Jonathan Hu (jhu@sandia.gov) +// Andrey Prokopenko (aprokop@sandia.gov) +// Ray Tuminaro (rstumin@sandia.gov) +// +// *********************************************************************** +// +// @HEADER +#ifndef MUELU_CLASSICALPFACTORY_FWD_HPP +#define MUELU_CLASSICALPFACTORY_FWD_HPP + + + + +namespace MueLu { + template + class ClassicalPFactory; +} + +#ifndef MUELU_CLASSICALPFACTORY_SHORT +#define MUELU_CLASSICALPFACTORY_SHORT +#endif + + + +#endif // MUELU_CLASSICALPFACTORY_FWD_HPP diff --git a/packages/muelu/src/Utils/MueLu_UtilitiesBase_decl.hpp b/packages/muelu/src/Utils/MueLu_UtilitiesBase_decl.hpp index 826f2b10428f..45c497b8fd85 100644 --- a/packages/muelu/src/Utils/MueLu_UtilitiesBase_decl.hpp +++ b/packages/muelu/src/Utils/MueLu_UtilitiesBase_decl.hpp @@ -261,6 +261,32 @@ tol = 0.; return diag; } + /*! @brief Return vector containing: max_{i\not=k}(-a_ik), for each for i in the matrix + * + * @param[in] A: input matrix + * @ret: vector containing max_{i\not=k}(-a_ik) + */ + + static Teuchos::ArrayRCP GetMatrixMaxMinusOffDiagonal(const Xpetra::Matrix& A) { + size_t numRows = A.getRowMap()->getNodeNumElements(); + Magnitude ZERO = Teuchos::ScalarTraits::zero(); + Teuchos::ArrayRCP maxvec(numRows); + Teuchos::ArrayView cols; + Teuchos::ArrayView vals; + for (size_t i = 0; i < numRows; ++i) { + A.getLocalRowView(i, cols, vals); + Magnitude mymax = ZERO; + for (LocalOrdinal j=0; j < cols.size(); ++j) { + if (Teuchos::as(cols[j]) != i) { + mymax = std::max(mymax,-Teuchos::ScalarTraits::real(vals[j])); + } + } + maxvec[i] = mymax; + } + return maxvec; + } + + /*! @brief Return vector containing inverse of input vector * * @param[in] v: input vector @@ -721,6 +747,38 @@ tol = 0.; return boundaryNodes; } + /*! @brief Apply Rowsum Criterion + + Flags a row i as dirichlet if: + + \sum_{j\not=i} A_ij > A_ii * tol + + @param[in] A matrix + @param[in] rowSumTol See above + @param[in/out] dirichletRows boolean array. 
The ith entry is true if the above criterion is satisfied (or if it was already set to true) + + */ + static void ApplyRowSumCriterion(const Xpetra::Matrix& A, const Magnitude rowSumTol, Teuchos::ArrayRCP& dirichletRows) { + typedef Teuchos::ScalarTraits STS; + RCP> rowmap = A.getRowMap(); + for (LocalOrdinal row = 0; row < Teuchos::as(rowmap->getNodeNumElements()); ++row) { + size_t nnz = A.getNumEntriesInLocalRow(row); + ArrayView indices; + ArrayView vals; + A.getLocalRowView(row, indices, vals); + + Scalar rowsum = STS::zero(); + Scalar diagval = STS::zero(); + for (LocalOrdinal colID = 0; colID < Teuchos::as(nnz); colID++) { + LocalOrdinal col = indices[colID]; + if (row == col) + diagval = vals[colID]; + rowsum += vals[colID]; + } + if (STS::real(rowsum) > STS::magnitude(diagval) * rowSumTol) + dirichletRows[row] = true; + } + } /*! @brief Detect Dirichlet columns based on Dirichlet rows diff --git a/packages/muelu/src/Utils/MueLu_Utilities_decl.hpp b/packages/muelu/src/Utils/MueLu_Utilities_decl.hpp index 2e43ade55857..8f52cadf9bee 100644 --- a/packages/muelu/src/Utils/MueLu_Utilities_decl.hpp +++ b/packages/muelu/src/Utils/MueLu_Utilities_decl.hpp @@ -211,6 +211,8 @@ namespace MueLu { static Teuchos::RCP > GetLumpedMatrixDiagonal(Xpetra::Matrix const &A, const bool doReciprocal=false, Magnitude tol = Teuchos::ScalarTraits::eps()*100, Scalar tolReplacement = Teuchos::ScalarTraits::zero(), const bool replaceSingleEntryRowWithZero = false) { return MueLu::UtilitiesBase::GetLumpedMatrixDiagonal(A, doReciprocal, tol, tolReplacement, replaceSingleEntryRowWithZero); } static RCP > GetMatrixOverlappedDiagonal(const Xpetra::Matrix& A) { return MueLu::UtilitiesBase::GetMatrixOverlappedDiagonal(A); } + static Teuchos::ArrayRCP GetMatrixMaxMinusOffDiagonal(const Xpetra::Matrix& A) { return MueLu::UtilitiesBase::GetMatrixMaxMinusOffDiagonal(A); } + static Teuchos::RCP > GetInverse(Teuchos::RCP > v, Magnitude tol = Teuchos::ScalarTraits::eps()*100, Scalar tolReplacement 
= Teuchos::ScalarTraits::zero()) { return MueLu::UtilitiesBase::GetInverse(v,tol,tolReplacement); } static Teuchos::Array ResidualNorm(const Xpetra::Operator& Op, const Xpetra::MultiVector& X, const Xpetra::MultiVector& RHS) { return MueLu::UtilitiesBase::ResidualNorm(Op,X,RHS); } static Teuchos::Array ResidualNorm(const Xpetra::Operator& Op, const Xpetra::MultiVector& X, const Xpetra::MultiVector& RHS, Xpetra::MultiVector& Resid) { return MueLu::UtilitiesBase::ResidualNorm(Op,X,RHS,Resid); } @@ -222,6 +224,7 @@ namespace MueLu { static typename Teuchos::ScalarTraits::magnitudeType Distance2(const Teuchos::ArrayView & weight,const Teuchos::Array>& v, LocalOrdinal i0, LocalOrdinal i1) { return MueLu::UtilitiesBase::Distance2(weight,v,i0,i1); } static Teuchos::ArrayRCP DetectDirichletRows(const Xpetra::Matrix& A, const Magnitude& tol = Teuchos::ScalarTraits::magnitude(0.), const bool count_twos_as_dirichlet=false) { return MueLu::UtilitiesBase::DetectDirichletRows(A,tol,count_twos_as_dirichlet); } static Teuchos::ArrayRCP DetectDirichletRowsExt(const Xpetra::Matrix& A, bool & bHasZeroDiagonal, const Magnitude& tol = Teuchos::ScalarTraits::zero()) { return MueLu::UtilitiesBase::DetectDirichletRowsExt(A,bHasZeroDiagonal,tol); } + static void ApplyRowSumCriterion(const Xpetra::Matrix& A, const Magnitude rowSumTol, Teuchos::ArrayRCP& dirichletRows) {return MueLu::UtilitiesBase::ApplyRowSumCriterion(A,rowSumTol,dirichletRows); } static void SetRandomSeed(const Teuchos::Comm &comm) { MueLu::UtilitiesBase::SetRandomSeed(comm); } @@ -587,6 +590,7 @@ namespace MueLu { static Teuchos::RCP > GetLumpedMatrixDiagonal(Xpetra::Matrix const &A, const bool doReciprocal=false, Magnitude tol = Teuchos::ScalarTraits::eps()*100, Scalar tolReplacement = Teuchos::ScalarTraits::zero(), const bool replaceSingleEntryRowWithZero = false) { return MueLu::UtilitiesBase::GetLumpedMatrixDiagonal(A, doReciprocal, tol, tolReplacement, replaceSingleEntryRowWithZero); } static RCP 
GetMatrixOverlappedDiagonal(const Matrix& A) { return MueLu::UtilitiesBase::GetMatrixOverlappedDiagonal(A); } + static Teuchos::ArrayRCP GetMatrixMaxMinusOffDiagonal(const Xpetra::Matrix& A) { return MueLu::UtilitiesBase::GetMatrixMaxMinusOffDiagonal(A); } static RCP GetInverse(Teuchos::RCP v, Magnitude tol = Teuchos::ScalarTraits::eps()*100, Scalar tolReplacement = Teuchos::ScalarTraits::zero()) { return MueLu::UtilitiesBase::GetInverse(v,tol,tolReplacement); } static Teuchos::Array ResidualNorm(const Xpetra::Operator& Op, const Xpetra::MultiVector& X, const Xpetra::MultiVector& RHS) { return MueLu::UtilitiesBase::ResidualNorm(Op,X,RHS); } static Teuchos::Array ResidualNorm(const Xpetra::Operator& Op, const Xpetra::MultiVector& X, const Xpetra::MultiVector& RHS, Xpetra::MultiVector& Resid) { return MueLu::UtilitiesBase::ResidualNorm(Op,X,RHS,Resid); } @@ -598,6 +602,7 @@ namespace MueLu { static Teuchos::ScalarTraits::magnitudeType Distance2(const Teuchos::ArrayView &weight, const Teuchos::Array>& v, LocalOrdinal i0, LocalOrdinal i1) { return MueLu::UtilitiesBase::Distance2(weight,v,i0,i1); } static Teuchos::ArrayRCP DetectDirichletRows(const Matrix& A, const Magnitude& tol = Teuchos::ScalarTraits::zero(), const bool count_twos_as_dirichlet=false) { return MueLu::UtilitiesBase::DetectDirichletRows(A,tol,count_twos_as_dirichlet); } static Teuchos::ArrayRCP DetectDirichletRowsExt(const Matrix& A, bool & bHasZeroDiagonal, const Magnitude& tol = Teuchos::ScalarTraits::zero()) { return MueLu::UtilitiesBase::DetectDirichletRowsExt(A,bHasZeroDiagonal,tol); } + static void ApplyRowSumCriterion(const Xpetra::Matrix& A, const Magnitude rowSumTol, Teuchos::ArrayRCP& dirichletRows) {return MueLu::UtilitiesBase::ApplyRowSumCriterion(A,rowSumTol,dirichletRows); } static void SetRandomSeed(const Teuchos::Comm &comm) { MueLu::UtilitiesBase::SetRandomSeed(comm); } static Scalar PowerMethod(const Matrix& A, bool scaleByDiag = true, diff --git 
a/packages/muelu/src/Utils/MueLu_Utilities_kokkos_decl.hpp b/packages/muelu/src/Utils/MueLu_Utilities_kokkos_decl.hpp index b63abfcdf147..9885e1b9c99e 100644 --- a/packages/muelu/src/Utils/MueLu_Utilities_kokkos_decl.hpp +++ b/packages/muelu/src/Utils/MueLu_Utilities_kokkos_decl.hpp @@ -271,6 +271,10 @@ namespace MueLu { static void ZeroDirichletRows(RCP& X, const Kokkos::View& dirichletRows, SC replaceWith=Teuchos::ScalarTraits::zero()); static void ZeroDirichletCols(RCP& A, const Kokkos::View& dirichletCols, SC replaceWith=Teuchos::ScalarTraits::zero()); + + static void ApplyRowSumCriterion(const Matrix& A, + const typename Teuchos::ScalarTraits::magnitudeType rowSumTol, + Kokkos::View & dirichletRows); static RCP RealValuedToScalarMultiVector(RCP X); @@ -420,6 +424,10 @@ namespace MueLu { static void ZeroDirichletCols(RCP& A, const Kokkos::View& dirichletCols, SC replaceWith=Teuchos::ScalarTraits::zero()); + static void ApplyRowSumCriterion(const Matrix& A, + const typename Teuchos::ScalarTraits::magnitudeType rowSumTol, + Kokkos::View & dirichletRows); + static RCP RealValuedToScalarMultiVector(RCP X); static Scalar PowerMethod(const Matrix& A, bool scaleByDiag = true, LO niters = 10, Magnitude tolerance = 1e-2, bool verbose = false, unsigned int seed = 123) { diff --git a/packages/muelu/src/Utils/MueLu_Utilities_kokkos_def.hpp b/packages/muelu/src/Utils/MueLu_Utilities_kokkos_def.hpp index 9014eb44683b..cc7b3e880312 100644 --- a/packages/muelu/src/Utils/MueLu_Utilities_kokkos_def.hpp +++ b/packages/muelu/src/Utils/MueLu_Utilities_kokkos_def.hpp @@ -620,6 +620,54 @@ namespace MueLu { return MueLu::ZeroDirichletCols(A, dirichletCols, replaceWith); } + // Applies rowsum criterion + template + void ApplyRowSumCriterion(const Xpetra::Matrix& A, + const typename Teuchos::ScalarTraits::magnitudeType rowSumTol, + Kokkos::View & dirichletRows) + { + typedef Teuchos::ScalarTraits STS; + RCP> rowmap = A.getRowMap(); + for (LocalOrdinal row = 0; row < 
Teuchos::as(rowmap->getNodeNumElements()); ++row) { + size_t nnz = A.getNumEntriesInLocalRow(row); + ArrayView indices; + ArrayView vals; + A.getLocalRowView(row, indices, vals); + + Scalar rowsum = STS::zero(); + Scalar diagval = STS::zero(); + for (LocalOrdinal colID = 0; colID < Teuchos::as(nnz); colID++) { + LocalOrdinal col = indices[colID]; + if (row == col) + diagval = vals[colID]; + rowsum += vals[colID]; + } + if (STS::real(rowsum) > STS::magnitude(diagval) * rowSumTol) + dirichletRows(row) = true; + } + } + + template + void + Utilities_kokkos:: + ApplyRowSumCriterion(const Xpetra::Matrix& A, + const typename Teuchos::ScalarTraits::magnitudeType rowSumTol, + Kokkos::View & dirichletRows) + { + MueLu::ApplyRowSumCriterion(A,rowSumTol,dirichletRows); + } + + + template + void + Utilities_kokkos:: + ApplyRowSumCriterion(const Xpetra::Matrix& A, + const typename Teuchos::ScalarTraits::magnitudeType rowSumTol, + Kokkos::View & dirichletRows) + { + MueLu::ApplyRowSumCriterion(A,rowSumTol,dirichletRows); + } + template RCP > diff --git a/packages/muelu/test/scaling/CMakeLists.txt b/packages/muelu/test/scaling/CMakeLists.txt index c12f096e6241..a91a7771e2d3 100644 --- a/packages/muelu/test/scaling/CMakeLists.txt +++ b/packages/muelu/test/scaling/CMakeLists.txt @@ -69,13 +69,13 @@ IF (${PACKAGE_NAME}_HAVE_TPETRA_SOLVER_STACK OR ${PACKAGE_NAME}_HAVE_EPETRA_SOLV TRIBITS_ADD_EXECUTABLE( ImportPerformance - SOURCES ImportPerformance + SOURCES ImportPerformance.cpp COMM mpi ) TRIBITS_ADD_EXECUTABLE( TAFCPerformance - SOURCES TAFCPerformance + SOURCES TAFCPerformance.cpp COMM mpi ) diff --git a/packages/muelu/test/structured/structured_1dof.xml b/packages/muelu/test/structured/structured_1dof.xml index 5623b7cecbca..f052b0d282ee 100644 --- a/packages/muelu/test/structured/structured_1dof.xml +++ b/packages/muelu/test/structured/structured_1dof.xml @@ -37,7 +37,7 @@ - + diff --git a/packages/muelu/test/structured/structured_1dof_kokkos.xml 
b/packages/muelu/test/structured/structured_1dof_kokkos.xml index 1939c9f66aa7..1bdb6908514b 100644 --- a/packages/muelu/test/structured/structured_1dof_kokkos.xml +++ b/packages/muelu/test/structured/structured_1dof_kokkos.xml @@ -32,7 +32,7 @@ - + diff --git a/packages/muelu/test/structured/structured_1dof_shift.xml b/packages/muelu/test/structured/structured_1dof_shift.xml index 96340b3e7189..b40b5e855985 100644 --- a/packages/muelu/test/structured/structured_1dof_shift.xml +++ b/packages/muelu/test/structured/structured_1dof_shift.xml @@ -37,7 +37,7 @@ - + diff --git a/packages/muelu/test/structured/structured_2dof.xml b/packages/muelu/test/structured/structured_2dof.xml index 416bb4e46098..d7ca998d7c83 100644 --- a/packages/muelu/test/structured/structured_2dof.xml +++ b/packages/muelu/test/structured/structured_2dof.xml @@ -37,8 +37,8 @@ - - + + diff --git a/packages/muelu/test/structured/structured_3dof.xml b/packages/muelu/test/structured/structured_3dof.xml index b7b5d7a9398d..0f25311713f7 100644 --- a/packages/muelu/test/structured/structured_3dof.xml +++ b/packages/muelu/test/structured/structured_3dof.xml @@ -38,7 +38,7 @@ - + diff --git a/packages/muelu/test/structured/structured_interp_kokkos.xml b/packages/muelu/test/structured/structured_interp_kokkos.xml index 94617de97252..04376dbafb5c 100644 --- a/packages/muelu/test/structured/structured_interp_kokkos.xml +++ b/packages/muelu/test/structured/structured_interp_kokkos.xml @@ -38,7 +38,7 @@ - + diff --git a/packages/muelu/test/structured/structured_interp_sa_kokkos.xml b/packages/muelu/test/structured/structured_interp_sa_kokkos.xml index 6f606856826f..5998a42599fa 100644 --- a/packages/muelu/test/structured/structured_interp_sa_kokkos.xml +++ b/packages/muelu/test/structured/structured_interp_sa_kokkos.xml @@ -38,7 +38,7 @@ - + diff --git a/packages/muelu/test/structured/structured_scp_1dof.xml b/packages/muelu/test/structured/structured_scp_1dof.xml index fe66d3227288..4a111b9ff6c5 100644 --- 
a/packages/muelu/test/structured/structured_scp_1dof.xml +++ b/packages/muelu/test/structured/structured_scp_1dof.xml @@ -38,7 +38,7 @@ - + diff --git a/packages/muelu/test/structured/structured_sparc_1dof.xml b/packages/muelu/test/structured/structured_sparc_1dof.xml index 175bf5998345..0d7c126aaa2b 100644 --- a/packages/muelu/test/structured/structured_sparc_1dof.xml +++ b/packages/muelu/test/structured/structured_sparc_1dof.xml @@ -38,7 +38,7 @@ - + diff --git a/packages/muelu/test/unit_tests/CMakeLists.txt b/packages/muelu/test/unit_tests/CMakeLists.txt index 53d15dfcfd45..ecd053ff220e 100644 --- a/packages/muelu/test/unit_tests/CMakeLists.txt +++ b/packages/muelu/test/unit_tests/CMakeLists.txt @@ -29,6 +29,7 @@ APPEND_SET(SOURCES BlackBoxPFactory.cpp CoalesceDropFactory.cpp CoarseMapFactory.cpp + ClassicalPFactory.cpp # CoupledAggregationFactory.cpp FineLevelInputDataFactory.cpp GeneralGeometricPFactory.cpp @@ -57,7 +58,7 @@ APPEND_SET(SOURCES TransPFactory.cpp UnsmooshFactory.cpp UserData/CreateXpetraPreconditioner.cpp - Utilities + Utilities.cpp VariableContainer.cpp VariableDofLaplacianFactory.cpp ) diff --git a/packages/muelu/test/unit_tests/ClassicalPFactory.cpp b/packages/muelu/test/unit_tests/ClassicalPFactory.cpp new file mode 100644 index 000000000000..8e92698451e0 --- /dev/null +++ b/packages/muelu/test/unit_tests/ClassicalPFactory.cpp @@ -0,0 +1,298 @@ +// @HEADER +// +// *********************************************************************** +// +// MueLu: A package for multigrid based preconditioning +// Copyright 2012 Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. 
Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? 
Contact +// Jonathan Hu (jhu@sandia.gov) +// Andrey Prokopenko (aprokop@sandia.gov) +// Ray Tuminaro (rstumin@sandia.gov) +// +// *********************************************************************** +// +// @HEADER +#include +#include +#include + +#include "MueLu_TestHelpers.hpp" +#include "MueLu_Version.hpp" + +#include +#include +#include +#include +#include + +#include "MueLu_CoalesceDropFactory.hpp" +#include "MueLu_AmalgamationFactory.hpp" +#include "MueLu_ClassicalMapFactory.hpp" +#include "MueLu_ClassicalPFactory.hpp" +#include "MueLu_Utilities.hpp" + + +namespace MueLuTests { + + TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(ClassicalPFactory, Constructor, Scalar, LocalOrdinal, GlobalOrdinal, Node) + { +# include "MueLu_UseShortNames.hpp" + MUELU_TESTING_SET_OSTREAM; + MUELU_TESTING_LIMIT_SCOPE(Scalar,GlobalOrdinal,Node); + + out << "version: " << MueLu::Version() << std::endl; + + RCP PFact = rcp(new ClassicalPFactory); + TEST_EQUALITY(PFact != Teuchos::null, true); + + } //Constructor + + + + TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(ClassicalPFactory, BuildP_Direct, Scalar, LocalOrdinal, GlobalOrdinal, Node) + { +# include "MueLu_UseShortNames.hpp" + MUELU_TESTING_SET_OSTREAM; + MUELU_TESTING_LIMIT_SCOPE(Scalar,GlobalOrdinal,Node); + + using TST = Teuchos::ScalarTraits; + using magnitude_type = typename TST::magnitudeType; + using TMT = Teuchos::ScalarTraits; + using real = typename TST::coordinateType; + using RealValuedMultiVector = Xpetra::MultiVector; + using test_factory = TestHelpers::TestFactory; + + out << "version: " << MueLu::Version() << std::endl; + + Level fineLevel, coarseLevel; + test_factory::createTwoLevelHierarchy(fineLevel, coarseLevel); + fineLevel.SetFactoryManager(Teuchos::null); // factory manager is not used on this test + coarseLevel.SetFactoryManager(Teuchos::null); + + GO nx = 29; + RCP A = test_factory::Build1DPoisson(nx); + A->SetFixedBlockSize(1); + fineLevel.Set("A", A); + + // This test only works in parallel if we have Zoltan2 & Tpetra 
+#ifndef HAVE_MUELU_ZOLTAN2 + if(A->getRowMap()->getComm()->getSize() > 1) + return; +#else + if(A->getRowMap()->lib() == Xpetra::UseEpetra) + return; +#endif + + Teuchos::ParameterList galeriList; + galeriList.set("nx", nx); + RCP coordinates + = Galeri::Xpetra::Utils::CreateCartesianCoordinates("1D", A->getRowMap(), galeriList); + fineLevel.Set("Coordinates", coordinates); + + LocalOrdinal NSdim = 2; + RCP nullSpace = MultiVectorFactory::Build(A->getRowMap(),NSdim); + nullSpace->randomize(); + fineLevel.Set("Nullspace", nullSpace); + + RCP amalgFact = rcp(new AmalgamationFactory()); + RCP dropFact = rcp(new CoalesceDropFactory()); + dropFact->SetFactory("UnAmalgamationInfo", amalgFact); + + RCP cmFact = rcp(new ClassicalMapFactory()); + cmFact->SetFactory("Graph", dropFact); + cmFact->SetFactory("UnAmalgamationInfo", amalgFact); + + Teuchos::ParameterList cp_params; + cp_params.set("aggregation: classical scheme","direct"); + RCP PFact = rcp(new ClassicalPFactory()); + PFact->SetParameterList(cp_params); + PFact->SetFactory("UnAmalgamationInfo", amalgFact); + PFact->SetFactory("Graph", dropFact); + PFact->SetFactory("DofsPerNode", dropFact); + PFact->SetFactory("FC Splitting", cmFact); + PFact->SetFactory("CoarseMap", cmFact); + + coarseLevel.Request("P",PFact.get()); // request Ptent + coarseLevel.Request(*PFact); + PFact->Build(fineLevel,coarseLevel); + + RCP P; + coarseLevel.Get("P",P,PFact.get()); + + } //BuildP + + + TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(ClassicalPFactory, BuildP_ClassicalModified, Scalar, LocalOrdinal, GlobalOrdinal, Node) + { +# include "MueLu_UseShortNames.hpp" + MUELU_TESTING_SET_OSTREAM; + MUELU_TESTING_LIMIT_SCOPE(Scalar,GlobalOrdinal,Node); + + using TST = Teuchos::ScalarTraits; + using magnitude_type = typename TST::magnitudeType; + using TMT = Teuchos::ScalarTraits; + using real = typename TST::coordinateType; + using RealValuedMultiVector = Xpetra::MultiVector; + using test_factory = TestHelpers::TestFactory; + + out << "version: " << 
MueLu::Version() << std::endl; + + Level fineLevel, coarseLevel; + test_factory::createTwoLevelHierarchy(fineLevel, coarseLevel); + fineLevel.SetFactoryManager(Teuchos::null); // factory manager is not used on this test + coarseLevel.SetFactoryManager(Teuchos::null); + + GO nx = 29; + RCP A = test_factory::Build1DPoisson(nx); + A->SetFixedBlockSize(1); + fineLevel.Set("A", A); + + Teuchos::ParameterList galeriList; + galeriList.set("nx", nx); + RCP coordinates + = Galeri::Xpetra::Utils::CreateCartesianCoordinates("1D", A->getRowMap(), galeriList); + fineLevel.Set("Coordinates", coordinates); + + LocalOrdinal NSdim = 2; + RCP nullSpace = MultiVectorFactory::Build(A->getRowMap(),NSdim); + nullSpace->randomize(); + fineLevel.Set("Nullspace", nullSpace); + + RCP amalgFact = rcp(new AmalgamationFactory()); + RCP dropFact = rcp(new CoalesceDropFactory()); + dropFact->SetFactory("UnAmalgamationInfo", amalgFact); + + RCP cmFact = rcp(new ClassicalMapFactory()); + cmFact->SetFactory("Graph", dropFact); + cmFact->SetFactory("UnAmalgamationInfo", amalgFact); + + + Teuchos::ParameterList cp_params; + cp_params.set("aggregation: classical scheme","classical modified"); + RCP PFact = rcp(new ClassicalPFactory()); + PFact->SetParameterList(cp_params); + PFact->SetFactory("UnAmalgamationInfo", amalgFact); + PFact->SetFactory("Graph", dropFact); + PFact->SetFactory("DofsPerNode", dropFact); + PFact->SetFactory("FC Splitting", cmFact); + PFact->SetFactory("CoarseMap", cmFact); + + coarseLevel.Request("P",PFact.get()); // request Ptent + coarseLevel.Request(*PFact); + PFact->Build(fineLevel,coarseLevel); + + RCP P; + coarseLevel.Get("P",P,PFact.get()); + + } //BuildP + + TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(ClassicalPFactory, BuildP_Ext, Scalar, LocalOrdinal, GlobalOrdinal, Node) + { +# include "MueLu_UseShortNames.hpp" + MUELU_TESTING_SET_OSTREAM; + MUELU_TESTING_LIMIT_SCOPE(Scalar,GlobalOrdinal,Node); + + using TST = Teuchos::ScalarTraits; + using magnitude_type = typename 
TST::magnitudeType; + using TMT = Teuchos::ScalarTraits; + using real = typename TST::coordinateType; + using RealValuedMultiVector = Xpetra::MultiVector; + using test_factory = TestHelpers::TestFactory; + + out << "version: " << MueLu::Version() << std::endl; + + Level fineLevel, coarseLevel; + test_factory::createTwoLevelHierarchy(fineLevel, coarseLevel); + fineLevel.SetFactoryManager(Teuchos::null); // factory manager is not used on this test + coarseLevel.SetFactoryManager(Teuchos::null); + + GO nx = 29; + RCP A = test_factory::Build1DPoisson(nx); + A->SetFixedBlockSize(1); + fineLevel.Set("A", A); + + Teuchos::ParameterList galeriList; + galeriList.set("nx", nx); + RCP coordinates + = Galeri::Xpetra::Utils::CreateCartesianCoordinates("1D", A->getRowMap(), galeriList); + fineLevel.Set("Coordinates", coordinates); + + LocalOrdinal NSdim = 2; + RCP nullSpace = MultiVectorFactory::Build(A->getRowMap(),NSdim); + nullSpace->randomize(); + fineLevel.Set("Nullspace", nullSpace); + + RCP amalgFact = rcp(new AmalgamationFactory()); + RCP dropFact = rcp(new CoalesceDropFactory()); + dropFact->SetFactory("UnAmalgamationInfo", amalgFact); + + RCP cmFact = rcp(new ClassicalMapFactory()); + cmFact->SetFactory("Graph", dropFact); + cmFact->SetFactory("UnAmalgamationInfo", amalgFact); + + + Teuchos::ParameterList cp_params; + cp_params.set("aggregation: classical scheme","ext+i"); + RCP PFact = rcp(new ClassicalPFactory()); + PFact->SetParameterList(cp_params); + PFact->SetFactory("UnAmalgamationInfo", amalgFact); + PFact->SetFactory("Graph", dropFact); + PFact->SetFactory("DofsPerNode", dropFact); + PFact->SetFactory("FC Splitting", cmFact); + PFact->SetFactory("CoarseMap", cmFact); + + coarseLevel.Request("P",PFact.get()); // request Ptent + coarseLevel.Request(*PFact); + PFact->Build(fineLevel,coarseLevel); + + RCP P; + coarseLevel.Get("P",P,PFact.get()); + + } //BuildP + + +# define MUELU_ETI_GROUP(Scalar, LO, GO, Node) \ + 
TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(ClassicalPFactory,Constructor,Scalar,LO,GO,Node) \ + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(ClassicalPFactory,BuildP_Direct,Scalar,LO,GO,Node) + + // Disabled until we actually have code to run these +#if 0 + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(ClassicalPFactory,BuildP_ClassicalModified,Scalar,LO,GO,Node) \ + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(ClassicalPFactory,BuildP_Ext,Scalar,LO,GO,Node) +#endif + +#include + + +} // namespace MueLuTests diff --git a/packages/nox/src-loca/src-tpetra/LOCA_BorderedSolver_TpetraHouseholder.cpp b/packages/nox/src-loca/src-tpetra/LOCA_BorderedSolver_TpetraHouseholder.cpp index 4668b3cfc97d..69fd22ead2eb 100644 --- a/packages/nox/src-loca/src-tpetra/LOCA_BorderedSolver_TpetraHouseholder.cpp +++ b/packages/nox/src-loca/src-tpetra/LOCA_BorderedSolver_TpetraHouseholder.cpp @@ -56,11 +56,14 @@ #include "LOCA_Thyra_Group.H" #include "NOX_TpetraTypedefs.hpp" #include "NOX_Thyra_MultiVector.H" +#include "Thyra_TpetraVectorSpace.hpp" #include "Thyra_TpetraMultiVector.hpp" #include "Thyra_TpetraLinearOp.hpp" +#include "Thyra_DefaultLinearOpSource.hpp" #include "LOCA_Tpetra_LowRankUpdateRowMatrix.hpp" #include "Teuchos_ParameterList.hpp" +#include "Teuchos_StandardParameterEntryValidators.hpp" #include "LOCA_BorderedSolver_LowerTriangularBlockElimination.H" #include "LOCA_BorderedSolver_UpperTriangularBlockElimination.H" #include "LOCA_Abstract_TransposeSolveGroup.H" @@ -71,6 +74,16 @@ // To suppress unreachable return warnings on cuda #include "Teuchos_CompilerCodeTweakMacros.hpp" +// For debugging output +#include + +// Forward declaration needed for ParameterList validation +namespace LOCA { + namespace MultiContinuation { + class ConstraintModelEvaluator; + } +} + // Utility for extracting tpetra vector from nox vector using ST = NOX::Scalar; using LO = NOX::LocalOrdinal; @@ -155,9 +168,23 @@ TpetraHouseholder(const Teuchos::RCP& global_data, isComplex(false), omega(0.0) { - scale_rows = 
solverParams->get("Scale Augmented Rows", true); + Teuchos::ParameterList validParams; + validParams.set("Bordered Solver Method", "Householder"); + validParams.set("Constraint Object",Teuchos::RCP(Teuchos::null)); + validParams.set("Constraint Parameter Names",Teuchos::RCP>(Teuchos::null)); + validParams.set("Scale Augmented Rows", true); + Teuchos::setStringToIntegralParameter("Preconditioner Method", + "Jacobian", + "Matrix to use for Preconditioning", + Teuchos::tuple ("Jacobian","SMW"), + &validParams); + validParams.set("Include UV In Preconditioner", false); + validParams.set("Use P For Preconditioner", false); + solverParams->validateParametersAndSetDefaults(validParams); + + scale_rows = solverParams->get("Scale Augmented Rows"); std::string prec_method = - solverParams->get("Preconditioner Method", "Jacobian"); + solverParams->get("Preconditioner Method"); if (prec_method == "Jacobian") precMethod = JACOBIAN; else if (prec_method == "SMW") @@ -166,10 +193,11 @@ TpetraHouseholder(const Teuchos::RCP& global_data, globalData->locaErrorCheck->throwError( "LOCA::BorderedSolver::TpetraHouseholder::TpetraHouseholder()", "Unknown preconditioner method! Choices are Jacobian, SMW"); + includeUV = - solverParams->get("Include UV In Preconditioner", false); + solverParams->get("Include UV In Preconditioner"); use_P_For_Prec = - solverParams->get("Use P For Preconditioner", false); + solverParams->get("Use P For Preconditioner"); } LOCA::BorderedSolver::TpetraHouseholder::~TpetraHouseholder() @@ -725,14 +753,61 @@ LOCA::BorderedSolver::TpetraHouseholder::solve( // false)); } - // Overwrite J with J + U*V^T if it's a CRS matrix and we aren't - // using P for the preconditioner - Teuchos::RCP jac_crs; + // Allocate a separate matrix for the preconditioner. Don't want to + // corrupt J with U*V^T if not using P for Prec (we can't for + // tpetra). 
Copy J values and add in U*V^T if it's a CRS matrix + // and we aren't using P for the preconditioner if (includeUV && !use_P_For_Prec) { - jac_crs = Teuchos::rcp_dynamic_cast(tpetraOp); - if (jac_crs != Teuchos::null) { - updateJacobianForPreconditioner(*U, *V, *jac_crs); + auto jac_crs = Teuchos::rcp_dynamic_cast(tpetraOp,true); + if (tpetraPrecMatrix.is_null()) { + tpetraPrecMatrix = Teuchos::rcp(new NOX::TCrsMatrix(*jac_crs,Teuchos::Copy)); + Teuchos::RCP<::Thyra::VectorSpaceBase> domain = ::Thyra::tpetraVectorSpace(tpetraPrecMatrix->getDomainMap()); + Teuchos::RCP<::Thyra::VectorSpaceBase> range = ::Thyra::tpetraVectorSpace(tpetraPrecMatrix->getRangeMap()); + auto prec_thyra_op = Teuchos::rcp(new ::Thyra::TpetraLinearOp); + prec_thyra_op->initialize(range,domain,tpetraPrecMatrix); + Teuchos::RCP<::Thyra::LinearOpBase> tmp_for_thyra_ambiguity = prec_thyra_op; + prec_losb = Teuchos::rcp(new ::Thyra::DefaultLinearOpSource(tmp_for_thyra_ambiguity)); + } + + // Copy J values into preconditioner matrix + //*tpetraPrecMatrix = *jac_crs; + { + tpetraPrecMatrix->resumeFill(); + auto jac_view = jac_crs->getLocalMatrix().values; + auto prec_view = tpetraPrecMatrix->getLocalMatrix().values; + Kokkos::deep_copy(prec_view,jac_view); + tpetraPrecMatrix->fillComplete(); + } + + bool print_debug = false; + if (print_debug) { + std::fstream fsj("jac_matrix_before.out",std::fstream::out|std::fstream::trunc); + Teuchos::FancyOStream tfsj(Teuchos::rcpFromRef(fsj)); + jac_crs->describe(tfsj,Teuchos::VERB_EXTREME); + std::fstream fsp("prec_matrix_before.out",std::fstream::out|std::fstream::trunc); + Teuchos::FancyOStream tfsp(Teuchos::rcpFromRef(fsp)); + tpetraPrecMatrix->describe(tfsp,Teuchos::VERB_EXTREME); + std::fstream fsu("u_vector.out",std::fstream::out|std::fstream::trunc); + Teuchos::FancyOStream tfsu(Teuchos::rcpFromRef(fsu)); + tpetra_U->describe(tfsu,Teuchos::VERB_EXTREME); + std::fstream fsv("v_vector.out",std::fstream::out|std::fstream::trunc); + Teuchos::FancyOStream 
tfsv(Teuchos::rcpFromRef(fsv)); + tpetra_V->describe(tfsv,Teuchos::VERB_EXTREME); + } + + // Update locally owned non-zero values for U*V^T + updateCrsMatrixForPreconditioner(*U, *V, *tpetraPrecMatrix); + + if (print_debug) { + std::fstream fsj("jac_matrix_after.out",std::fstream::out|std::fstream::trunc); + Teuchos::FancyOStream tfsj(Teuchos::rcpFromRef(fsj)); + jac_crs->describe(tfsj,Teuchos::VERB_EXTREME); + std::fstream fsp("prec_matrix_after.out",std::fstream::out|std::fstream::trunc); + Teuchos::FancyOStream tfsp(Teuchos::rcpFromRef(fsp)); + tpetraPrecMatrix->describe(tfsp,Teuchos::VERB_EXTREME); } + + grp->setPreconditionerMatrix(prec_losb); } // Set operator in solver to compute preconditioner @@ -1032,13 +1107,58 @@ LOCA::BorderedSolver::TpetraHouseholder::computeUV( } void -LOCA::BorderedSolver::TpetraHouseholder::updateJacobianForPreconditioner( - const NOX::Abstract::MultiVector& UU, - const NOX::Abstract::MultiVector& VV, - NOX::TCrsMatrix& jac) const +LOCA::BorderedSolver::TpetraHouseholder:: +updateCrsMatrixForPreconditioner(const NOX::Abstract::MultiVector& UU, + const NOX::Abstract::MultiVector& VV, + NOX::TCrsMatrix& matrix) const { - TEUCHOS_TEST_FOR_EXCEPTION(true,std::runtime_error, - "ERROR: LOCA::BorderedSolver::TpetraHouseholder::updateJacobianForPreconditioner - NOT IMPLEMENTED YET!"); + matrix.resumeFill(); + + auto& UU_tpetra = NOX::Tpetra::getTpetraMultiVector(UU); + auto& VV_tpetra = NOX::Tpetra::getTpetraMultiVector(VV); + const_cast(UU_tpetra).sync_device(); + const_cast(VV_tpetra).sync_device(); + const auto uu = UU_tpetra.getLocalViewDevice(); + const auto vv = VV_tpetra.getLocalViewDevice(); + + const auto numRows = matrix.getNodeNumRows(); + const auto rowMap = matrix.getRowMap()->getLocalMap(); + const auto colMap = matrix.getColMap()->getLocalMap(); + const auto uMap = UU_tpetra.getMap()->getLocalMap(); + const auto vMap = VV_tpetra.getMap()->getLocalMap(); + auto J_view = matrix.getLocalMatrix(); + auto numConstraintsLocal = 
numConstraints; // for cuda lambda capture + + TEUCHOS_ASSERT(static_cast(matrix.getRowMap()->getNodeNumElements()) == uu.extent(0)); + TEUCHOS_ASSERT(static_cast(matrix.getRowMap()->getNodeNumElements()) == vv.extent(0)); + TEUCHOS_ASSERT(numConstraintsLocal == static_cast(uu.extent(1))); + TEUCHOS_ASSERT(numConstraintsLocal == static_cast(vv.extent(1))); + + Kokkos::parallel_for("Add UV^T to M",Kokkos::RangePolicy(0,numRows),KOKKOS_LAMBDA (const int row) { + const GO row_gid = rowMap.getGlobalElement(row); + const LO u_row_lid = uMap.getLocalElement(row_gid); + auto rowView = J_view.row(row); + + const auto numEntries = rowView.length; + for (int col=0; col::invalid()) { + + // val = sum_{k=0}^m U(i,k)*V(j,k) + ST val = 0.0; + for (int k=0; k class DefaultLinearOpSource; +} namespace LOCA { class GlobalData; namespace Parameter { @@ -421,14 +424,18 @@ namespace LOCA { NOX::Abstract::MultiVector& V, bool use_jac_transpose); + public: /*! * \brief Overwrites the Jacobian \f$J\f$ with \f$J + U V^T\f$ * for computing the preconditioner of \f$P\f$. + * + * NOTE: This should be a protected method, but cuda lambda forces this to be public! */ - void updateJacobianForPreconditioner(const NOX::Abstract::MultiVector& U, - const NOX::Abstract::MultiVector& V, - NOX::TCrsMatrix& jac) const; + void updateCrsMatrixForPreconditioner(const NOX::Abstract::MultiVector& U, + const NOX::Abstract::MultiVector& V, + NOX::TCrsMatrix& mat) const; + protected: Teuchos::RCP createBlockMV(const NOX::Abstract::MultiVector& v) const; @@ -534,7 +541,10 @@ namespace LOCA { Teuchos::RCP tpetraOp; //! Pointer to Tpetra Preconditioner operator - Teuchos::RCP tpetraPrecOp; + mutable Teuchos::RCP tpetraPrecMatrix; + + //! Thyra wrapped preconditioner matrix (tpetraPrecMatrix) for when includeUV is true and use_P_for_Prec is false + mutable Teuchos::RCP<::Thyra::DefaultLinearOpSource> prec_losb; //! Number of constraint equations int numConstraints; @@ -583,7 +593,6 @@ namespace LOCA { //! 
Frequency for complex systems double omega; - }; } // namespace BorderedSolver } // namespace LOCA diff --git a/packages/nox/src-loca/src-tpetra/LOCA_Tpetra_LowRankUpdateRowMatrix.cpp b/packages/nox/src-loca/src-tpetra/LOCA_Tpetra_LowRankUpdateRowMatrix.cpp index fe7e9c861c59..a8ddbb534a48 100644 --- a/packages/nox/src-loca/src-tpetra/LOCA_Tpetra_LowRankUpdateRowMatrix.cpp +++ b/packages/nox/src-loca/src-tpetra/LOCA_Tpetra_LowRankUpdateRowMatrix.cpp @@ -102,6 +102,44 @@ namespace LOCA { bool LowRankUpdateRowMatrix::supportsRowViews() const {return J_rowMatrix->supportsRowViews();} + void + LowRankUpdateRowMatrix::getGlobalRowCopy(NOX::GlobalOrdinal GlobalRow, + NOX::TRowMatrix::nonconst_global_inds_host_view_type &Indices, + NOX::TRowMatrix::nonconst_values_host_view_type &Values, + size_t &NumEntries) const + { + TEUCHOS_TEST_FOR_EXCEPTION(true,std::runtime_error, + "ERROR - LOCA::LowRankRowMatrix::getGlobalRowCopy() - NOT implemented yet!"); + } + + void + LowRankUpdateRowMatrix::getLocalRowCopy (NOX::LocalOrdinal LocalRow, + NOX::TRowMatrix::nonconst_local_inds_host_view_type &Indices, + NOX::TRowMatrix::nonconst_values_host_view_type &Values, + size_t &NumEntries) const + { + TEUCHOS_TEST_FOR_EXCEPTION(true,std::runtime_error, + "ERROR - LOCA::LowRankRowMatrix::getLocalRowCopy() - NOT implemented yet!"); + } + + void + LowRankUpdateRowMatrix::getGlobalRowView (NOX::GlobalOrdinal GlobalRow, + NOX::TRowMatrix::global_inds_host_view_type &indices, + NOX::TRowMatrix::values_host_view_type &values) const + { + TEUCHOS_TEST_FOR_EXCEPTION(true,std::runtime_error, + "ERROR - LOCA::LowRankRowMatrix::getGlobalRowView() - NOT implemented yet!"); + } + + void + LowRankUpdateRowMatrix::getLocalRowView(NOX::LocalOrdinal LocalRow, + NOX::TRowMatrix::local_inds_host_view_type &indices, + NOX::TRowMatrix::values_host_view_type &values) const + { + TEUCHOS_TEST_FOR_EXCEPTION(true,std::runtime_error, + "ERROR - LOCA::LowRankRowMatrix::getLocalRowView() - NOT implemented yet!"); + 
} + void LowRankUpdateRowMatrix::getGlobalRowCopy(NOX::GlobalOrdinal GlobalRow, const Teuchos::ArrayView &Indices, diff --git a/packages/nox/src-loca/src-tpetra/LOCA_Tpetra_LowRankUpdateRowMatrix.hpp b/packages/nox/src-loca/src-tpetra/LOCA_Tpetra_LowRankUpdateRowMatrix.hpp index 28b091979b66..1ccf3984a935 100644 --- a/packages/nox/src-loca/src-tpetra/LOCA_Tpetra_LowRankUpdateRowMatrix.hpp +++ b/packages/nox/src-loca/src-tpetra/LOCA_Tpetra_LowRankUpdateRowMatrix.hpp @@ -91,6 +91,24 @@ namespace LOCA { virtual bool isFillComplete() const override; virtual bool supportsRowViews() const override; virtual void + getGlobalRowCopy (NOX::GlobalOrdinal GlobalRow, + NOX::TRowMatrix::nonconst_global_inds_host_view_type &Indices, + NOX::TRowMatrix::nonconst_values_host_view_type &Values, + size_t &NumEntries) const override; + virtual void + getLocalRowCopy (NOX::LocalOrdinal LocalRow, + NOX::TRowMatrix::nonconst_local_inds_host_view_type &Indices, + NOX::TRowMatrix::nonconst_values_host_view_type &Values, + size_t &NumEntries) const override; + virtual void + getGlobalRowView (NOX::GlobalOrdinal GlobalRow, + NOX::TRowMatrix::global_inds_host_view_type &Indices, + NOX::TRowMatrix::values_host_view_type &Values) const override; + virtual void + getLocalRowView (NOX::LocalOrdinal LocalRow, + NOX::TRowMatrix::local_inds_host_view_type &Indices, + NOX::TRowMatrix::values_host_view_type &Values) const override; + virtual void getGlobalRowCopy (NOX::GlobalOrdinal GlobalRow, const Teuchos::ArrayView &Indices, const Teuchos::ArrayView &Values, @@ -133,13 +151,13 @@ namespace LOCA { //*************************************** // Derived from Tpetra::Operator interface //*************************************** - virtual Teuchos::RCP getDomainMap() const; - virtual Teuchos::RCP getRangeMap() const; + virtual Teuchos::RCP getDomainMap() const override; + virtual Teuchos::RCP getRangeMap() const override; virtual void apply(const NOX::TMultiVector &X, NOX::TMultiVector &Y, Teuchos::ETransp 
mode = Teuchos::NO_TRANS, NOX::Scalar alpha = Teuchos::ScalarTraits::one(), - NOX::Scalar beta = Teuchos::ScalarTraits::zero()) const; + NOX::Scalar beta = Teuchos::ScalarTraits::zero()) const override; protected: diff --git a/packages/nox/src-thyra/NOX_Thyra_Group.C b/packages/nox/src-thyra/NOX_Thyra_Group.C index f8839f7a4a8e..83dedfe0d0d5 100644 --- a/packages/nox/src-thyra/NOX_Thyra_Group.C +++ b/packages/nox/src-thyra/NOX_Thyra_Group.C @@ -436,9 +436,9 @@ void NOX::Thyra::Group::setJacobianOperator(const Teuchos::RCP<::Thyra::LinearOp lop_ = op; } -void NOX::Thyra::Group::setPreconditionerOperator(const Teuchos::RCP<::Thyra::PreconditionerBase>& op) +void NOX::Thyra::Group::setPreconditionerMatrix(const Teuchos::RCP>& op) { - prec_ = op; + losb_ = op; } void NOX::Thyra::Group::setX(const NOX::Abstract::Vector& y) diff --git a/packages/nox/src-thyra/NOX_Thyra_Group.H b/packages/nox/src-thyra/NOX_Thyra_Group.H index 6d8738bd2487..72e047c2fa7c 100644 --- a/packages/nox/src-thyra/NOX_Thyra_Group.H +++ b/packages/nox/src-thyra/NOX_Thyra_Group.H @@ -191,8 +191,8 @@ namespace NOX { /// Dangerous power user function for LOCA Householder bordered algorithm. void setJacobianOperator(const Teuchos::RCP<::Thyra::LinearOpBase>& op); - /// Dangerous power user function for LOCA Householder bordered algorithm. - void setPreconditionerOperator(const Teuchos::RCP<::Thyra::PreconditionerBase>& op); + /// Dangerous power user function for LOCA Householder bordered algorithm. This is the Matrix M that is used to initialize a stratimikos preconditioner. NOTE: this sets the losb_ object used to update prec_! + void setPreconditionerMatrix(const Teuchos::RCP>& op); /** @name "Compute" functions. 
*/ //@{ diff --git a/packages/nox/test/tpetra/CMakeLists.txt b/packages/nox/test/tpetra/CMakeLists.txt index 46dd0b11452f..efb2602b15f6 100644 --- a/packages/nox/test/tpetra/CMakeLists.txt +++ b/packages/nox/test/tpetra/CMakeLists.txt @@ -50,6 +50,11 @@ IF(NOX_ENABLE_ABSTRACT_IMPLEMENTATION_THYRA AND SOURCES ${UNIT_TEST_DRIVER} ME_Tpetra_1DFEM.hpp ME_Tpetra_1DFEM_def.hpp tTpetra_HouseholderBorderedSolve.cpp ) + TRIBITS_ADD_EXECUTABLE_AND_TEST( + Tpetra_HouseholderBorderedSolve_WithUVInPrec + SOURCES ${UNIT_TEST_DRIVER} ME_Tpetra_1DFEM.hpp ME_Tpetra_1DFEM_def.hpp tTpetra_HouseholderBorderedSolve_WithUVInPrec.cpp + ) + TRIBITS_ADD_EXECUTABLE_AND_TEST( Tpetra_ConstraintModelEvaluator SOURCES ${UNIT_TEST_DRIVER} ME_Tpetra_1DFEM.hpp ME_Tpetra_1DFEM_def.hpp tTpetra_ConstraintModelEvaluator.cpp diff --git a/packages/nox/test/tpetra/tTpetra_HouseholderBorderedSolve_WithUVInPrec.cpp b/packages/nox/test/tpetra/tTpetra_HouseholderBorderedSolve_WithUVInPrec.cpp new file mode 100644 index 000000000000..0710b2ce08d2 --- /dev/null +++ b/packages/nox/test/tpetra/tTpetra_HouseholderBorderedSolve_WithUVInPrec.cpp @@ -0,0 +1,310 @@ +//@HEADER +// ************************************************************************ +// +// NOX: An Object-Oriented Nonlinear Solver Package +// Copyright (2002) Sandia Corporation +// +// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive +// license for use of this work by or on behalf of the U.S. Government. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. 
Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Roger Pawlowski (rppawlo@sandia.gov) or +// Eric Phipps (etphipp@sandia.gov), Sandia National Laboratories. 
+// ************************************************************************ +// CVS Information +// $Source$ +// $Author$ +// $Date$ +// $Revision$ +// ************************************************************************ +//@HEADER +#include "Teuchos_ConfigDefs.hpp" +#include "Teuchos_UnitTestHarness.hpp" +#include "Teuchos_StackedTimer.hpp" + +// NOX Objects +#include "NOX.H" +#include "NOX_Thyra.H" + +// Trilinos Objects +#include "Teuchos_Comm.hpp" +#include "Teuchos_ParameterList.hpp" +#include "Teuchos_RCP.hpp" +#include "Teuchos_FancyOStream.hpp" +#include "Teuchos_AbstractFactoryStd.hpp" + +#include "Tpetra_Core.hpp" +#include "Tpetra_Vector.hpp" + +#include "BelosTypes.hpp" +#include "Stratimikos_DefaultLinearSolverBuilder.hpp" +#include "Thyra_LinearOpWithSolveFactoryHelpers.hpp" +#include "Thyra_Ifpack2PreconditionerFactory.hpp" +#include "ME_Tpetra_1DFEM.hpp" + +#include "NOX_Thyra_MatrixFreeJacobianOperator.hpp" +#include "NOX_MatrixFree_ModelEvaluatorDecorator.hpp" +#include "NOX_TpetraTypedefs.hpp" +#include "LOCA_Tpetra_Factory.hpp" +#include "LOCA_Thyra_Group.H" +#include "LOCA_MultiContinuation_ConstrainedGroup.H" +#include "LOCA_Tpetra_ConstraintModelEvaluator.hpp" +#include "LOCA_Parameter_SublistParser.H" +#include "NOX_SolverStats.hpp" + +// For solution io +#include "Thyra_TpetraVector.hpp" +#include +#include + +TEUCHOS_UNIT_TEST(NOX_Tpetra_Householder, BasicSolve) +{ + Teuchos::RCP > comm = Tpetra::getDefaultComm(); + + // Get default Tpetra template types + using Scalar = NOX::Scalar; + using LO = NOX::LocalOrdinal; + using GO = NOX::GlobalOrdinal; + using Node = NOX::NodeType; + + // Create the model evaluator object + Scalar x00 = 0.0; + Scalar x01 = 1.0; + const Tpetra::global_size_t numGlobalElements = 100; + Teuchos::RCP > model = + evaluatorTpetra1DFEM(comm, numGlobalElements, x00, x01); + + // Create the linear solver and register on model evaluator + { + Stratimikos::DefaultLinearSolverBuilder builder; + typedef 
Thyra::PreconditionerFactoryBase Base; + typedef Thyra::Ifpack2PreconditionerFactory > Impl; + builder.setPreconditioningStrategyFactory(Teuchos::abstractFactoryStd(), "Ifpack2"); + + Teuchos::RCP p = Teuchos::parameterList(); + p->set("Linear Solver Type", "Belos"); + Teuchos::ParameterList& belosList = p->sublist("Linear Solver Types").sublist("Belos"); + belosList.set("Solver Type", "Pseudo Block GMRES"); + belosList.sublist("Solver Types").sublist("Pseudo Block GMRES").set("Maximum Iterations", 200); + belosList.sublist("Solver Types").sublist("Pseudo Block GMRES").set("Num Blocks", 200); + belosList.sublist("Solver Types").sublist("Pseudo Block GMRES").set("Verbosity", Belos::Errors+Belos::IterationDetails+Belos::FinalSummary); + belosList.sublist("Solver Types").sublist("Pseudo Block GMRES").set("Output Frequency", 5); + belosList.sublist("VerboseObject").set("Verbosity Level", "medium"); + p->set("Preconditioner Type", "Ifpack2"); + // p->set("Preconditioner Type", "None"); + Teuchos::ParameterList& ifpackList = p->sublist("Preconditioner Types").sublist("Ifpack2"); + ifpackList.set("Prec Type", "ILUT"); + + builder.setParameterList(p); + + Teuchos::RCP > + lowsFactory = builder.createLinearSolveStrategy(""); + + model->set_W_factory(lowsFactory); + } + + // Create the initial guess + Teuchos::RCP > + initial_guess = model->getNominalValues().get_x()->clone_v(); + Thyra::V_S(initial_guess.ptr(),Teuchos::ScalarTraits::one()); + + // Create top level nox/loca solver parameter list + Teuchos::RCP pList = Teuchos::parameterList("Top Level"); + + // Create nox parameter list + auto& nl_params = pList->sublist("NOX"); + nl_params.set("Nonlinear Solver", "Line Search Based"); + nl_params.sublist("Direction").sublist("Newton").sublist("Linear Solver").set("Tolerance", 1.0e-8); + auto& ls_params = nl_params.sublist("Line Search"); + ls_params.set("Method","Full Step"); + auto& output_list = nl_params.sublist("Printing").sublist("Output Information"); + 
output_list.set("Debug",true); + output_list.set("Warning",true); + output_list.set("Error",true); + output_list.set("Test Details",true); + output_list.set("Details",true); + output_list.set("Parameters",true); + output_list.set("Linear Solver Details",true); + output_list.set("Inner Iteration",true); + output_list.set("Outer Iteration",true); + output_list.set("Outer Iteration StatusTest",true); + + // Create the LOCA Group: + // (NOX::Thyra::Group-->LOCA::Thyra::Group-->LOCA::Constrained::Group) + // For Tpetra Householder, we need to actively set the + // preconditioner and preconditioner factory so that it uses the + // precOp separate from the Jacobian operator. Householder replaces + // the Jacobian operator with a matrix-free version that has the + // uv^T tacked on. + auto explicit_jacobian = model->create_W_op(); + auto prec_matrix = Teuchos::rcp(new Thyra::DefaultPreconditioner(Teuchos::null,explicit_jacobian)); + TEST_ASSERT(nonnull(model->get_W_factory()->getPreconditionerFactory())); + Teuchos::RCP nox_group = + Teuchos::rcp(new NOX::Thyra::Group(*initial_guess, + model, + explicit_jacobian, + model->get_W_factory(), + prec_matrix, // Reuse Jac for approx preconditioner + model->get_W_factory()->getPreconditionerFactory(), + Teuchos::null)); + + Teuchos::RCP tpetra_factory = Teuchos::rcp(new LOCA::Tpetra::Factory); + + Teuchos::RCP global_data = LOCA::createGlobalData(pList, tpetra_factory); + + Teuchos::RCP p_vec = Teuchos::rcp(new LOCA::ParameterVector); + p_vec->addParameter("k", 1.0); // Source term multiplier + p_vec->addParameter("T_left", 1.2); // Source term multiplier + + std::vector me_p_indices; + me_p_indices.push_back(2); + me_p_indices.push_back(4); + Teuchos::RCP loca_group = Teuchos::rcp(new LOCA::Thyra::Group(global_data, + *nox_group, + *p_vec, + me_p_indices)); + + auto g_names = Teuchos::rcp(new std::vector); + g_names->push_back("Constraint: T_right=2"); + g_names->push_back("Constraint: 2*T_left=T_right"); + auto x_thyra = 
::Thyra::createMember(model->get_x_space(),"x"); + NOX::Thyra::Vector x(x_thyra); + auto constraints = Teuchos::rcp(new LOCA::MultiContinuation::ConstraintModelEvaluator(model,*p_vec,*g_names,x)); + + // Set initial parameter conditions + constraints->setX(x); + constraints->setParam(0,1.0); + constraints->setParam(1,1.2); + + // Create the constraints list + auto& locaParamsList = pList->sublist("LOCA"); + auto& constraint_list = locaParamsList.sublist("Constraints"); + constraint_list.set("Bordered Solver Method", "Householder"); + constraint_list.set("Constraint Object", constraints); + constraint_list.set("Constraint Parameter Names", g_names); + constraint_list.set("Include UV In Preconditioner",true); + + auto loca_parser = Teuchos::rcp(new LOCA::Parameter::SublistParser(global_data)); + loca_parser->parseSublists(pList); + + std::vector param_ids(2); + param_ids[0] = 0; + param_ids[1] = 1; + auto constraint_list_ptr = Teuchos::rcpFromRef(constraint_list); + Teuchos::RCP loca_constrained_group = + Teuchos::rcp(new LOCA::MultiContinuation::ConstrainedGroup(global_data, + loca_parser, + constraint_list_ptr, + loca_group, + constraints, + param_ids, + false)); + + loca_constrained_group->computeF(); + + // Create the NOX status tests and the solver + // Create the convergence tests + Teuchos::RCP absresid = + Teuchos::rcp(new NOX::StatusTest::NormF(1.0e-8)); + Teuchos::RCP wrms = + Teuchos::rcp(new NOX::StatusTest::NormWRMS(1.0e-2, 1.0e-8)); + Teuchos::RCP converged = + Teuchos::rcp(new NOX::StatusTest::Combo(NOX::StatusTest::Combo::AND)); + converged->addStatusTest(absresid); + converged->addStatusTest(wrms); + Teuchos::RCP maxiters = + Teuchos::rcp(new NOX::StatusTest::MaxIters(10)); + Teuchos::RCP fv = + Teuchos::rcp(new NOX::StatusTest::FiniteValue); + Teuchos::RCP combo = + Teuchos::rcp(new NOX::StatusTest::Combo(NOX::StatusTest::Combo::OR)); + combo->addStatusTest(fv); + combo->addStatusTest(converged); + combo->addStatusTest(maxiters); + + // Create the 
solver + // auto solver = NOX::Solver::buildSolver(nox_group, combo, Teuchos::rcpFromRef(pList->sublist("NOX"))); + // auto solver = NOX::Solver::buildSolver(loca_group, combo, Teuchos::rcpFromRef(pList->sublist("NOX"))); + auto solver = NOX::Solver::buildSolver(loca_constrained_group, combo, Teuchos::rcpFromRef(pList->sublist("NOX"))); + + NOX::StatusTest::StatusType solvStatus = solver->solve(); + + // Output + { + Teuchos::TimeMonitor::getStackedTimer()->stopBaseTimer(); + Teuchos::StackedTimer::OutputOptions options; + options.output_fraction = true; + options.output_minmax = true; + Teuchos::TimeMonitor::getStackedTimer()->report(out,comm,options); + } + + // Write solution to file + const bool printSolution = true; + if (printSolution) { + for (int i=0; i < comm->getSize(); ++i) { + if (comm->getRank() == i) { + std::ofstream file; + if (comm->getRank() == 0) + file.open("householder_solution.txt",std::ios::trunc); + else + file.open("householder_solution.txt",std::ios::app); + + const auto& final_x = solver->getSolutionGroup().getX(); + const auto& final_x_nox = *(dynamic_cast(final_x).getXVec()); + const auto& final_x_thyra = dynamic_cast(final_x_nox).getThyraVector(); + const auto& final_x_tpetra_const = *(dynamic_cast&>(final_x_thyra).getConstTpetraVector()); + auto& final_x_tpetra = const_cast<::Tpetra::Vector&>(final_x_tpetra_const); + final_x_tpetra.sync_host(); + const auto& final_x_view = final_x_tpetra.getLocalViewHost(); + for (size_t j=0; j < final_x_view.extent(0); ++j) + file << final_x_view(j,0) << std::endl; + } + comm->barrier(); + } + } + + TEST_ASSERT(solvStatus == NOX::StatusTest::Converged); + TEST_EQUALITY(solver->getSolverStatistics()->numNonlinearIterations,5); + + // Check final values + { + const auto& group = solver->getSolutionGroup(); + const auto& c_group = dynamic_cast(group); + + out << "\nFinal Parameter Value for \"k\" = " << std::setprecision(10) << c_group.getParam(0) << std::endl; + out << "Final Parameter Value for 
\"T_left\" = " << std::setprecision(10) << c_group.getParam(1) << std::endl; + + const double tol = 1.0e-3; + TEST_FLOATING_EQUALITY(c_group.getParam(0),-0.5993277206,tol); + TEST_FLOATING_EQUALITY(c_group.getParam(1),1.0,tol); + } + + // Breaks RCP cyclic dependency + LOCA::destroyGlobalData(global_data); +} diff --git a/packages/panzer/doc/Doxyfile b/packages/panzer/doc/Doxyfile index 15f4409367e7..5b06862638a5 100644 --- a/packages/panzer/doc/Doxyfile +++ b/packages/panzer/doc/Doxyfile @@ -11,7 +11,7 @@ TAGFILES += \ $(TRILINOS_HOME)/packages/common/tag_files/ifpack.tag=$(TRILINOS_HOME)/packages/ifpack/doc/html \ $(TRILINOS_HOME)/packages/common/tag_files/ml.tag=$(TRILINOS_HOME)/packages/ml/doc/html \ $(TRILINOS_HOME)/packages/common/tag_files/nox.tag=$(TRILINOS_HOME)/packages/nox/doc/html \ - $(TRILINOS_HOME)/packages/common/tag_files/ml.tag=$(TRILINOS_HOME)/packages/phalanx/doc/html + $(TRILINOS_HOME)/packages/common/tag_files/phalanx.tag=$(TRILINOS_HOME)/packages/phalanx/doc/html # # Package options # diff --git a/packages/percept/src/adapt/FixSideSets.cpp b/packages/percept/src/adapt/FixSideSets.cpp index e1156fb68b40..d3d8a81b9656 100644 --- a/packages/percept/src/adapt/FixSideSets.cpp +++ b/packages/percept/src/adapt/FixSideSets.cpp @@ -881,6 +881,11 @@ namespace percept { reduced_mod_end = false; (void)reduced_mod_end; + bool skip_side_part_fixes = false; + if (m_eMesh.getProperty("Refiner_skip_side_part_fixes") == "true") + skip_side_part_fixes = true; + + // loop over all sides that are leaves (not parent or have no family tree), // loop over their nodes and their associated elements, // connect element and side if they share a face @@ -909,6 +914,7 @@ namespace percept { fix_permutation(side_set); end_begin(msg+"moveSides"); - move_sides_to_correct_surfaces(); + if (!skip_side_part_fixes) + move_sides_to_correct_surfaces(); } } diff --git a/packages/percept/src/adapt/SerializeNodeRegistry.hpp b/packages/percept/src/adapt/SerializeNodeRegistry.hpp index 
43123ad13519..02e7b260030d 100644 --- a/packages/percept/src/adapt/SerializeNodeRegistry.hpp +++ b/packages/percept/src/adapt/SerializeNodeRegistry.hpp @@ -322,10 +322,10 @@ { for(YAML::const_iterator iter=doc.begin();iter!=doc.end();++iter) { - const YAML::Node& key = iter->first; + const YAML::Node key = iter->first; PartName part_name = key.as(); - const YAML::Node& valSeq = iter->second; + const YAML::Node valSeq = iter->second; UInt rank_input; TopologyName topo_name; YAML::const_iterator itv=valSeq.begin(); @@ -334,7 +334,7 @@ topo_name = itv->as(); ++itv; stk::mesh::EntityRank rank = static_cast(rank_input); - const YAML::Node& subsetSeq = *itv; + const YAML::Node subsetSeq = *itv; YAML::const_iterator iss; PartSubsets subsets; for (iss = subsetSeq.begin(); iss != subsetSeq.end(); ++iss) @@ -503,10 +503,10 @@ { for(YAML::const_iterator iter=doc.begin();iter!=doc.end();++iter) { - const YAML::Node& key = iter->first; + const YAML::Node key = iter->first; stk::mesh::EntityId id = key.as(); NodeMapValue procs; - const YAML::Node& val = iter->second; + const YAML::Node val = iter->second; procs = val.as(); //std::cout << "readNodeMap id= " << id << " procs= " << procs << std::endl; if (is_local && procs.size() != 1) @@ -1645,7 +1645,7 @@ //if (DEBUG_YAML) std::cout << "it.first().Type() = " << it.first().Type() << " it.first().Tag()= " << it.first().Tag() << std::endl; //if (DEBUG_YAML) std::cout << "it.second().Type() = " << it.second().Type() << " it.second().Tag()= " << it.second().Tag() << std::endl; - const YAML::Node& keySeq = it->first; + const YAML::Node keySeq = it->first; for(YAML::const_iterator itk=keySeq.begin();itk!=keySeq.end();++itk) { key_quantum = itk->as(); if (DEBUG_YAML) std::cout << "s_r key_quantum= " << key_quantum << std::endl; @@ -1667,7 +1667,7 @@ } int iseq=0; - const YAML::Node& valSeq = it->second; + const YAML::Node valSeq = it->second; stk::mesh::EntityRank rank = stk::topology::INVALID_RANK; size_t id; for(YAML::const_iterator 
itv=valSeq.begin();itv!=valSeq.end();++itv,++iseq) { diff --git a/packages/percept/src/adapt/TransitionElementAdapter.hpp b/packages/percept/src/adapt/TransitionElementAdapter.hpp index 4ed706754a81..bc397071a7ff 100644 --- a/packages/percept/src/adapt/TransitionElementAdapter.hpp +++ b/packages/percept/src/adapt/TransitionElementAdapter.hpp @@ -26,7 +26,7 @@ #include "/usr/netpub/valgrind-3.8.1/include/valgrind/callgrind.h" #endif -#define DO_ALT_TIMER 1 +#define DO_ALT_TIMER 0 #define TIMING(code) code #define TIMER(name) stk::diag::Timer timer ## name ( #name, Base::rootTimer()); stk::diag::TimeBlock tbTimer ## name (timer ## name) diff --git a/packages/percept/src/adapt/UniformRefinerPattern.cpp b/packages/percept/src/adapt/UniformRefinerPattern.cpp index c6c9f079d804..808e82278751 100644 --- a/packages/percept/src/adapt/UniformRefinerPattern.cpp +++ b/packages/percept/src/adapt/UniformRefinerPattern.cpp @@ -865,6 +865,9 @@ if (eMesh.get_spatial_dim() == 2) return; + if (eMesh.getProperty("Refiner_skip_side_part_fixes") == "true") + return; + std::vector surfaces = eMesh.get_fem_meta_data()->get_surfaces_in_surface_to_block_map(); for (unsigned isu = 0; isu < surfaces.size(); ++isu) { diff --git a/packages/percept/src/adapt/main/MeshAdapt.cpp b/packages/percept/src/adapt/main/MeshAdapt.cpp index d7eabdaa3d9b..e782d24d20b7 100644 --- a/packages/percept/src/adapt/main/MeshAdapt.cpp +++ b/packages/percept/src/adapt/main/MeshAdapt.cpp @@ -2352,7 +2352,7 @@ void MeshAdapt::initialize_m2g_geometry(std::string input_geometry) bool toDeclare = true; int lowestRank = std::numeric_limits::max(); - std::vector keysToCheck; + std::vector entitiesToCheck; procsSharedTo.clear(); //std::vector procsSharedTo; @@ -2361,11 +2361,10 @@ void MeshAdapt::initialize_m2g_geometry(std::string input_geometry) cur_node = bd->get_entity(stk::topology::NODE_RANK, shellNodeIDs[j]); - stk::mesh::EntityKey key = bd->entity_key(cur_node); - keysToCheck.push_back(key); + 
entitiesToCheck.push_back(cur_node); } - bd->shared_procs_intersection(keysToCheck, procsSharedTo); + bd->shared_procs_intersection(entitiesToCheck, procsSharedTo); procsSharedTo.push_back(THIS_PROC_NUM);//find all processes that either own or have these nodes shared to them for (size_t iii = 0; iii < procsSharedTo.size(); iii++) { if (procsSharedTo[iii] < lowestRank) diff --git a/packages/percept/src/percept/PerceptMesh.cpp b/packages/percept/src/percept/PerceptMesh.cpp index ed9b33802bd6..b1af440fd263 100644 --- a/packages/percept/src/percept/PerceptMesh.cpp +++ b/packages/percept/src/percept/PerceptMesh.cpp @@ -2142,9 +2142,12 @@ void PerceptMesh:: createEntities(stk::mesh::EntityRank entityRank, int count, std::vector& requested_entities) { - std::vector requests( m_metaData->entity_rank_count() , 0 ); - requests[entityRank] = count; - get_bulk_data()->generate_new_entities( requests, requested_entities ); + std::vector requestedIds; + get_bulk_data()->generate_new_ids(entityRank, count, requestedIds); + stk::mesh::PartVector addParts; + requested_entities.clear(); + get_bulk_data()->declare_entities(entityRank, requestedIds, addParts, requested_entities); + if (entityRank == node_rank()) { stk::mesh::Part& nodePart = get_fem_meta_data()->get_topology_root_part(stk::topology::NODE); @@ -7206,8 +7209,8 @@ { std::string K, V; for (YAML::const_iterator i = node.begin(); i != node.end(); ++i) { - const YAML::Node & key = i->first; - const YAML::Node & value = i->second; + const YAML::Node key = i->first; + const YAML::Node value = i->second; K = key.as(); V = value.as(); setProperty(K, V); diff --git a/packages/percept/src/percept/ShardsInterfaceTable.hpp b/packages/percept/src/percept/ShardsInterfaceTable.hpp index 0ff90232842a..08369accd227 100644 --- a/packages/percept/src/percept/ShardsInterfaceTable.hpp +++ b/packages/percept/src/percept/ShardsInterfaceTable.hpp @@ -25,7 +25,6 @@ #include #include -#include #include diff --git 
a/packages/percept/src/percept/YamlUtils.hpp b/packages/percept/src/percept/YamlUtils.hpp index 8ae8c6e16f0b..6d0f60baec2a 100644 --- a/packages/percept/src/percept/YamlUtils.hpp +++ b/packages/percept/src/percept/YamlUtils.hpp @@ -72,8 +72,8 @@ case YAML::NodeType::Map: emout << YAML::BeginMap ; for (YAML::const_iterator i = node.begin(); i != node.end(); ++i) { - const YAML::Node & key = i->first; - const YAML::Node & value = i->second; + const YAML::Node key = i->first; + const YAML::Node value = i->second; out = key.as(); emout << YAML::Key << out; emout << YAML::Value; diff --git a/packages/percept/src/percept/fixtures/BeamFixture.hpp b/packages/percept/src/percept/fixtures/BeamFixture.hpp index 26b213a1a617..97ece636ed53 100644 --- a/packages/percept/src/percept/fixtures/BeamFixture.hpp +++ b/packages/percept/src/percept/fixtures/BeamFixture.hpp @@ -21,7 +21,6 @@ #include #include -#include #include #include #include diff --git a/packages/percept/src/percept/fixtures/HeterogeneousFixture.hpp b/packages/percept/src/percept/fixtures/HeterogeneousFixture.hpp index 27b5136849dd..f4b4b75b8cbb 100644 --- a/packages/percept/src/percept/fixtures/HeterogeneousFixture.hpp +++ b/packages/percept/src/percept/fixtures/HeterogeneousFixture.hpp @@ -21,7 +21,6 @@ #include #include -#include #include #include diff --git a/packages/percept/src/percept/fixtures/PyramidFixture.hpp b/packages/percept/src/percept/fixtures/PyramidFixture.hpp index dbc5bdda5a4c..73d78e54a313 100644 --- a/packages/percept/src/percept/fixtures/PyramidFixture.hpp +++ b/packages/percept/src/percept/fixtures/PyramidFixture.hpp @@ -21,7 +21,6 @@ #include #include -#include #include #include diff --git a/packages/percept/src/percept/fixtures/QuadFixture.hpp b/packages/percept/src/percept/fixtures/QuadFixture.hpp index c048c32f88fb..11583bb60e16 100644 --- a/packages/percept/src/percept/fixtures/QuadFixture.hpp +++ b/packages/percept/src/percept/fixtures/QuadFixture.hpp @@ -32,7 +32,6 @@ #include -#include 
#include #include diff --git a/packages/percept/src/percept/fixtures/SingleTetFixture.hpp b/packages/percept/src/percept/fixtures/SingleTetFixture.hpp index 56ed34a69c7a..554e9ccf58ad 100644 --- a/packages/percept/src/percept/fixtures/SingleTetFixture.hpp +++ b/packages/percept/src/percept/fixtures/SingleTetFixture.hpp @@ -21,7 +21,6 @@ #include #include -#include #include #include diff --git a/packages/percept/src/percept/fixtures/TetWedgeFixture.hpp b/packages/percept/src/percept/fixtures/TetWedgeFixture.hpp index e6b2c7f8c926..f4e07767519a 100644 --- a/packages/percept/src/percept/fixtures/TetWedgeFixture.hpp +++ b/packages/percept/src/percept/fixtures/TetWedgeFixture.hpp @@ -21,7 +21,6 @@ #include #include -#include #include #include diff --git a/packages/percept/src/percept/fixtures/TriQuadSurfaceMesh3D.hpp b/packages/percept/src/percept/fixtures/TriQuadSurfaceMesh3D.hpp index c9df7c41fd89..960d4891258c 100644 --- a/packages/percept/src/percept/fixtures/TriQuadSurfaceMesh3D.hpp +++ b/packages/percept/src/percept/fixtures/TriQuadSurfaceMesh3D.hpp @@ -21,7 +21,6 @@ #include #include -#include #include #include diff --git a/packages/percept/src/percept/fixtures/WedgeFixture.hpp b/packages/percept/src/percept/fixtures/WedgeFixture.hpp index 333f5b5addd6..722f742c2f07 100644 --- a/packages/percept/src/percept/fixtures/WedgeFixture.hpp +++ b/packages/percept/src/percept/fixtures/WedgeFixture.hpp @@ -28,7 +28,6 @@ #include -#include #include #include diff --git a/packages/percept/src/percept/function/MDArray.hpp b/packages/percept/src/percept/function/MDArray.hpp index 2309f429e05b..b17779c3ee68 100644 --- a/packages/percept/src/percept/function/MDArray.hpp +++ b/packages/percept/src/percept/function/MDArray.hpp @@ -18,13 +18,6 @@ namespace percept { - // class MDArray : public FieldContainer - // { - // public: - // typedef FieldContainer base; - // MDArray(std::vector dimensions) : FieldContainer( Teuchos::Array(dimensions.begin(), dimensions.end()) ) {} - // }; - 
typedef Intrepid::FieldContainer MDArray; typedef Intrepid::FieldContainer MDArrayInt; typedef Intrepid::FieldContainer MDArrayUInt; @@ -73,7 +66,6 @@ } - //typedef Intrepid::FieldContainer MDArrayString; class MDArrayString { typedef std::vector VecOfString; diff --git a/packages/percept/src/percept/mesh/gen/SweepMesher.cpp b/packages/percept/src/percept/mesh/gen/SweepMesher.cpp index 2a103b03895d..c5a855ce0fc7 100644 --- a/packages/percept/src/percept/mesh/gen/SweepMesher.cpp +++ b/packages/percept/src/percept/mesh/gen/SweepMesher.cpp @@ -27,7 +27,6 @@ #include #include -#include #include #include diff --git a/packages/percept/src/percept/mesh/gen/SweepMesher.hpp b/packages/percept/src/percept/mesh/gen/SweepMesher.hpp index f3de5c9405af..0f8c308c6fdc 100644 --- a/packages/percept/src/percept/mesh/gen/SweepMesher.hpp +++ b/packages/percept/src/percept/mesh/gen/SweepMesher.hpp @@ -28,7 +28,6 @@ #include #include -#include #include #include diff --git a/packages/percept/src/percept/mesh/geometry/stk_geom/3D/FitGregoryPatches.cpp b/packages/percept/src/percept/mesh/geometry/stk_geom/3D/FitGregoryPatches.cpp index 76144f9e2a3b..b3ec57c70eb1 100644 --- a/packages/percept/src/percept/mesh/geometry/stk_geom/3D/FitGregoryPatches.cpp +++ b/packages/percept/src/percept/mesh/geometry/stk_geom/3D/FitGregoryPatches.cpp @@ -1916,8 +1916,8 @@ namespace percept { VERIFY_OP_ON(y_surface_set.Type(), ==, YAML::NodeType::Map, "bad surface_set data"); for (YAML::const_iterator i = y_surface_set.begin(); i != y_surface_set.end(); ++i) { - const YAML::Node & key = i->first; - const YAML::Node & value = i->second; + const YAML::Node key = i->first; + const YAML::Node value = i->second; std::string v_key; v_key = key.as(); VERIFY_OP_ON(value.Type(), ==, YAML::NodeType::Sequence, "bad surface_set value data in [surfaceSetName: [s1,s2...]]"); @@ -1938,8 +1938,8 @@ namespace percept { VERIFY_OP_ON(y_angle_map.Type(), ==, YAML::NodeType::Map, "bad angle_map data in yaml file"); for 
(YAML::const_iterator i = y_angle_map.begin(); i != y_angle_map.end(); ++i) { - const YAML::Node & key = i->first; - const YAML::Node & value = i->second; + const YAML::Node key = i->first; + const YAML::Node value = i->second; std::string v_key = key.as(); double v_value = value.as(); m_angleMap[v_key] = v_value; diff --git a/packages/percept/src/percept/mesh/mod/smoother/ReferenceMeshSmootherBase.cpp b/packages/percept/src/percept/mesh/mod/smoother/ReferenceMeshSmootherBase.cpp index 4d5353ad1c0c..1b002c752cf6 100644 --- a/packages/percept/src/percept/mesh/mod/smoother/ReferenceMeshSmootherBase.cpp +++ b/packages/percept/src/percept/mesh/mod/smoother/ReferenceMeshSmootherBase.cpp @@ -84,10 +84,7 @@ template ReferenceMeshSmootherBaseImpl:: ~ReferenceMeshSmootherBaseImpl() - { - if(Base::m_eMesh->get_rank() == 0) - myFile.close(); - } + {} template<> void ReferenceMeshSmootherBaseImpl::sync_fields(int iter) diff --git a/packages/percept/src/percept/verifier/mesh/Verifier.hpp b/packages/percept/src/percept/verifier/mesh/Verifier.hpp index 72051339524c..aab26ff527e4 100644 --- a/packages/percept/src/percept/verifier/mesh/Verifier.hpp +++ b/packages/percept/src/percept/verifier/mesh/Verifier.hpp @@ -30,7 +30,6 @@ #include #include -#include #include #include diff --git a/packages/percept/src/percept/xfer/LinInterp.hpp b/packages/percept/src/percept/xfer/LinInterp.hpp index c661c4b2b84e..663e4149901d 100644 --- a/packages/percept/src/percept/xfer/LinInterp.hpp +++ b/packages/percept/src/percept/xfer/LinInterp.hpp @@ -163,7 +163,7 @@ LinInterp::filter_to_nearest ( if (topo.getKey()==shards::Particle::key) { dist = 0.0; for ( unsigned j = 0; j < nDim; ++j ) { - dist += std::pow(cellWorkset(0,0,j) - inputPhysicalPoints(j), 2); + dist += std::pow(cellWorkset(0,0,j) - inputPhysicalPoints(0,j), 2); } dist = std::sqrt(dist); } @@ -177,7 +177,7 @@ LinInterp::filter_to_nearest ( topo, cellOrd); - dist = parametricDistanceToEntity(&outputParametricPoints(0), topo); + dist = 
parametricDistanceToEntity(&outputParametricPoints(0,0), topo); } if ( dist < (1.0 + parametric_tolerance) && dist < best_dist ) { @@ -185,7 +185,7 @@ LinInterp::filter_to_nearest ( best_dist = dist; for ( unsigned j = 0; j < nDim; ++j ) { - isoParCoords[j] = outputParametricPoints(j); + isoParCoords[j] = outputParametricPoints(0,j); } ToPoints.TransferInfo_[thePt] = isoParCoords; @@ -343,7 +343,6 @@ LinInterp::apply_from_nodal_field ( } } - Intrepid::FieldContainer outVals(1, 1); Intrepid::FieldContainer inputParametricPoints(1, nDim); inputParametricPoints.setValues(&isoParCoords[0], nDim); diff --git a/packages/seacas/cmake/FortranSettings.cmake b/packages/seacas/cmake/FortranSettings.cmake index c3447d57fe9e..7a73ce5f3128 100644 --- a/packages/seacas/cmake/FortranSettings.cmake +++ b/packages/seacas/cmake/FortranSettings.cmake @@ -8,6 +8,8 @@ IF ("${CMAKE_Fortran_COMPILER_ID}" MATCHES "GNU") SET(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -fcray-pointer -fdefault-real-8 -fdefault-integer-8 -fno-range-check") ELSEIF ("${CMAKE_Fortran_COMPILER_ID}" MATCHES "XL") SET(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -WF,-D__XLF__ -qintsize=8 -qrealsize=8 -qfixed") +ELSEIF ("${CMAKE_Fortran_COMPILER_ID}" MATCHES "Cray") + SET(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -sdefault64") ELSE() SET(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -r8 -i8") ENDIF() diff --git a/packages/shylu/shylu_dd/frosch/src/CoarseSpaces/FROSch_CoarseSpace_decl.hpp b/packages/shylu/shylu_dd/frosch/src/CoarseSpaces/FROSch_CoarseSpace_decl.hpp index d1c4317d9284..a8a8719341e7 100644 --- a/packages/shylu/shylu_dd/frosch/src/CoarseSpaces/FROSch_CoarseSpace_decl.hpp +++ b/packages/shylu/shylu_dd/frosch/src/CoarseSpaces/FROSch_CoarseSpace_decl.hpp @@ -45,6 +45,8 @@ //#include #include +#include + #include @@ -108,7 +110,7 @@ namespace FROSch { int buildGlobalBasisMatrix(ConstXMapPtr rowMap, ConstXMapPtr rangeMap, ConstXMapPtr repeatedMap, - SC treshold); + SC tresholdDropping); int 
clearCoarseSpace(); diff --git a/packages/shylu/shylu_dd/frosch/src/CoarseSpaces/FROSch_CoarseSpace_def.hpp b/packages/shylu/shylu_dd/frosch/src/CoarseSpaces/FROSch_CoarseSpace_def.hpp index 3e7b05c5da51..ed391fab6a10 100644 --- a/packages/shylu/shylu_dd/frosch/src/CoarseSpaces/FROSch_CoarseSpace_def.hpp +++ b/packages/shylu/shylu_dd/frosch/src/CoarseSpaces/FROSch_CoarseSpace_def.hpp @@ -198,12 +198,12 @@ namespace FROSch { int CoarseSpace::buildGlobalBasisMatrix(ConstXMapPtr rowMap, ConstXMapPtr rangeMap, ConstXMapPtr repeatedMap, - SC treshold) + SC tresholdDropping) { FROSCH_ASSERT(!AssembledBasisMap_.is_null(),"FROSch::CoarseSpace: AssembledBasisMap_.is_null()."); FROSCH_ASSERT(!AssembledBasis_.is_null(),"FROSch::CoarseSpace: AssembledBasis_.is_null()."); - #if defined(HAVE_XPETRA_KOKKOS_REFACTOR) && defined(HAVE_XPETRA_TPETRA) +#if defined(HAVE_XPETRA_KOKKOS_REFACTOR) && defined(HAVE_XPETRA_TPETRA) if (rowMap->lib() == UseTpetra) { UN numRows = AssembledBasis_->getLocalLength(); UN numCols = AssembledBasis_->getNumVectors(); @@ -234,7 +234,7 @@ namespace FROSch { if (lo != -1) { for (UN j=0; j treshold) { + if (fabs(valueTmp) > tresholdDropping) { Rowptr[lo+1] ++; } } @@ -260,7 +260,7 @@ namespace FROSch { UN nnz_i = Rowptr[lo]; for (UN j=0; j treshold) { + if (fabs(valueTmp) > tresholdDropping) { Values[nnz_i] = valueTmp; Indices[nnz_i] = j; @@ -283,7 +283,7 @@ namespace FROSch { AssembledBasisMapUnique_, rangeMap, params); } else - #endif +#endif { if (rowMap->lib()==UseEpetra) { GlobalBasisMatrix_ = MatrixFactory::Build(rowMap,AssembledBasisMap_->getNodeNumElements()); // Nonzeroes abhängig von dim/dofs!!! 
@@ -294,7 +294,7 @@ namespace FROSch { SCVec values; for (UN j=0; jgetNumVectors(); j++) { valueTmp=AssembledBasis_->getData(j)[i]; - if (fabs(valueTmp)>treshold) { + if (fabs(valueTmp)>tresholdDropping) { indices.push_back(AssembledBasisMap_->getGlobalElement(j)); values.push_back(valueTmp); } @@ -314,7 +314,7 @@ namespace FROSch { SCVec values; for (UN j=0; jgetNumVectors(); j++) { valueTmp=AssembledBasis_->getData(j)[i]; - if (fabs(valueTmp)>treshold) { + if (fabs(valueTmp)>tresholdDropping) { indices.push_back(j); values.push_back(valueTmp); } diff --git a/packages/shylu/shylu_dd/frosch/src/SchwarzOperators/FROSch_AlgebraicOverlappingOperator_def.hpp b/packages/shylu/shylu_dd/frosch/src/SchwarzOperators/FROSch_AlgebraicOverlappingOperator_def.hpp index b8fb721c5c76..2a9df6fbb913 100644 --- a/packages/shylu/shylu_dd/frosch/src/SchwarzOperators/FROSch_AlgebraicOverlappingOperator_def.hpp +++ b/packages/shylu/shylu_dd/frosch/src/SchwarzOperators/FROSch_AlgebraicOverlappingOperator_def.hpp @@ -160,8 +160,8 @@ namespace FROSch { this->OverlappingMap_ = repeatedMap; this->OverlappingMatrix_ = this->K_; - GO global; - LO local,sum,minVal,maxVal; + GO global,sum; + LO local,minVal,maxVal; SC avg; if (verbosity==All) { FROSCH_DETAILTIMER_START_LEVELID(printStatisticsTime,"print statistics"); @@ -172,7 +172,7 @@ namespace FROSch { } local = (LO) max((LO) this->OverlappingMap_->getNodeNumElements(),(LO) 0); - reduceAll(*this->MpiComm_,REDUCE_SUM,local,ptr(&sum)); + reduceAll(*this->MpiComm_,REDUCE_SUM,GO(local),ptr(&sum)); avg = max(sum/double(this->MpiComm_->getSize()),0.0); reduceAll(*this->MpiComm_,REDUCE_MIN,local,ptr(&minVal)); reduceAll(*this->MpiComm_,REDUCE_MAX,local,ptr(&maxVal)); @@ -231,7 +231,7 @@ namespace FROSch { if (verbosity==All) { FROSCH_DETAILTIMER_START_LEVELID(printStatisticsTime,"print statistics"); local = (LO) max((LO) this->OverlappingMap_->getNodeNumElements(),(LO) 0); - reduceAll(*this->MpiComm_,REDUCE_SUM,local,ptr(&sum)); + 
reduceAll(*this->MpiComm_,REDUCE_SUM,GO(local),ptr(&sum)); avg = max(sum/double(this->MpiComm_->getSize()),0.0); reduceAll(*this->MpiComm_,REDUCE_MIN,local,ptr(&minVal)); reduceAll(*this->MpiComm_,REDUCE_MAX,local,ptr(&maxVal)); diff --git a/packages/shylu/shylu_dd/frosch/src/SchwarzOperators/FROSch_CoarseOperator_def.hpp b/packages/shylu/shylu_dd/frosch/src/SchwarzOperators/FROSch_CoarseOperator_def.hpp index 243d8e90b1d9..4a0b5c5323a4 100644 --- a/packages/shylu/shylu_dd/frosch/src/SchwarzOperators/FROSch_CoarseOperator_def.hpp +++ b/packages/shylu/shylu_dd/frosch/src/SchwarzOperators/FROSch_CoarseOperator_def.hpp @@ -89,7 +89,7 @@ namespace FROSch { if (CoarseSpace_->hasUnassembledMaps()) { // If there is no unassembled basis, the current Phi_ should already be correct CoarseSpace_->assembleCoarseSpace(); FROSCH_ASSERT(CoarseSpace_->hasAssembledBasis(),"FROSch::CoarseOperator : !CoarseSpace_->hasAssembledBasis()"); - CoarseSpace_->buildGlobalBasisMatrix(this->K_->getRowMap(),this->K_->getRangeMap(),subdomainMap,this->ParameterList_->get("Threshold Phi",1.e-8)); + CoarseSpace_->buildGlobalBasisMatrix(this->K_->getRowMap(),this->K_->getRangeMap(),subdomainMap,this->ParameterList_->get("Phi: Dropping Threshold",1.e-8)); FROSCH_ASSERT(CoarseSpace_->hasGlobalBasisMatrix(),"FROSch::CoarseOperator : !CoarseSpace_->hasGlobalBasisMatrix()"); Phi_ = CoarseSpace_->getGlobalBasisMatrix(); } @@ -106,6 +106,13 @@ namespace FROSch { this->ParameterList_->set("RCP(Phi)", Phi_); } + // Store current Coarse Matrix in ParameterList_ + if ( this->ParameterList_->get("Store Coarse Matrix",false) ) { + FROSCH_NOTIFICATION("FROSch::CoarseOperator",this->Verbose_,"Storing current Coarse Matrix in Parameterlist."); + this->ParameterList_->set("RCP(Coarse Matrix)", CoarseMatrix_); + this->ParameterList_->set("bool(CoarseSolveComm)", OnCoarseSolveComm_); + } + return 0; } @@ -605,8 +612,11 @@ namespace FROSch { #endif LO numProcsGatheringStep = this->MpiComm_->getSize(); - GO 
numGlobalIndices = coarseMapUnique->getMaxAllGlobalIndex()+1; - int numMyRows; + GO numGlobalIndices = coarseMapUnique->getMaxAllGlobalIndex(); + if (coarseMapUnique->lib()==UseEpetra || coarseMapUnique->getGlobalNumElements()>0) { + numGlobalIndices += 1; + } + LO numMyRows; double gatheringFactor = pow(double(this->MpiComm_->getSize())/double(NumProcsCoarseSolve_),1.0/double(gatheringSteps)); for (int i=0; iMpiComm_->getSize())/double(NumProcsCoarseSolve_),1.0/double(gatheringSteps)); - LO numProcsGatheringStep = this->MpiComm_->getSize(); - GO numGlobalIndices = CoarseMap_->getMaxAllGlobalIndex(); - GO numMyRows; - numMyRows = 0; - if (this->MpiComm_->getRank()%(this->MpiComm_->getSize()/NumProcsCoarseSolve_) == 0 && this->MpiComm_->getRank()/(this->MpiComm_->getSize()/NumProcsCoarseSolve_) < NumProcsCoarseSolve_) { - if (this->MpiComm_->getRank()==0) { - numMyRows = numGlobalIndices - (numGlobalIndices/NumProcsCoarseSolve_)*(NumProcsCoarseSolve_-1); - } else { - numMyRows = numGlobalIndices/NumProcsCoarseSolve_; - } - } - - XMapPtr tmpCoarseMap = Xpetra::MapFactory::Build(CoarseMap_->lib(),-1,numMyRows,0,this->MpiComm_); - if (tmpCoarseMap->getNodeNumElements()>0) { + // if (this->MpiComm_->getRank()==0) { + // numMyRows = numGlobalIndices - (numGlobalIndices/NumProcsCoarseSolve_)*(NumProcsCoarseSolve_-1); + // } else { + // numMyRows = numGlobalIndices/NumProcsCoarseSolve_; + // } OnCoarseSolveComm_=true; } + + // // XMapPtr tmpCoarseMap = Xpetra::MapFactory::Build(CoarseMap_->lib(),-1,numMyRows,0,this->MpiComm_); + // if (tmpCoarseMap->getNodeNumElements()>0) { + // OnCoarseSolveComm_=true; + // } CoarseSolveComm_ = this->MpiComm_->split(!OnCoarseSolveComm_,this->MpiComm_->getRank()); //Gathering Steps for RepeatedMap################################################# @@ -742,6 +747,7 @@ namespace FROSch { GO MLnumGlobalIndices = SubdomainConnectGraph_->getRowMap()->getMaxAllGlobalIndex()+1; GO MLnumMyRows; + LO numProcsGatheringStep = 
this->MpiComm_->getSize(); MLGatheringMaps_[0] = Xpetra::MapFactory::Build(this->K_->getMap()->lib(),-1,1,0,this->K_->getMap()->getComm()); for (int i=1; i::Build(CoarseMap_->lib(),-1,MLnumMyRows,0,this->MpiComm_); - } MLnumMyRows = 0; @@ -800,7 +805,11 @@ namespace FROSch { if (OnCoarseSolveComm_) { //Coarse DofsMaps so far only one Block will work ConstXMapPtrVecPtr2D CoarseDofsMaps(1); - FROSch::BuildRepMapZoltan(SubdomainConnectGraph_,ElementNodeList_, DistributionList_,MLCoarseMap_->getComm(),CoarseSolveRepeatedMap_); +#ifdef HAVE_SHYLU_DDFROSCH_ZOLTAN2 + BuildRepMapZoltan(SubdomainConnectGraph_,ElementNodeList_, DistributionList_,MLCoarseMap_->getComm(),CoarseSolveRepeatedMap_); +#else + ThrowErrorMissingPackage("FROSch::CoarseOperator","Zoltan2"); +#endif ConstRepMap = CoarseSolveRepeatedMap_; ConstXMapPtrVecPtr NodesMapVector(1); //MapVector for next Level @@ -838,7 +847,26 @@ namespace FROSch { sublist(sublist(sublist(this->ParameterList_,"CoarseSolver"),"FROSchPreconditioner"),"TwoLevelPreconditioner")->set("Nodes Map Vector",NodesMapVector); } - Teuchos::RCP > tmpMap = Xpetra::MapFactory::Build(CoarseMap_->lib(),-1,uniEle,0,this->MpiComm_); + GatheringMaps_[gatheringSteps-1] = Xpetra::MapFactory::Build(CoarseMap_->lib(),-1,uniEle,0,this->MpiComm_); + + GO numGlobalIndices = coarseMapUnique->getMaxAllGlobalIndex(); + if (coarseMapUnique->lib()==UseEpetra || coarseMapUnique->getGlobalNumElements()>0) { + numGlobalIndices += 1; + } + LO numMyRows; + double gatheringFactor = pow(double(this->MpiComm_->getSize())/double(NumProcsCoarseSolve_),1.0/double(gatheringSteps)); + + // + // double gatheringFactor = pow(double(this->MpiComm_->getSize())/double(NumProcsCoarseSolve_),1.0/double(gatheringSteps)); + // LO numProcsGatheringStep = this->MpiComm_->getSize(); + // GO numGlobalIndices = CoarseMap_->getMaxAllGlobalIndex(); + // + // LO numProcsGatheringStep = this->MpiComm_->getSize(); + // GO numGlobalIndices = coarseMapUnique->getMaxAllGlobalIndex(); + // if 
(coarseMapUnique->lib()==UseEpetra || coarseMapUnique->getGlobalNumElements()>0) { + // numGlobalIndices += 1; + // } + // GO numMyRows; for (int i=0; i::Build(CoarseMap_->lib(),-1,numMyRows,0,this->MpiComm_); } - GatheringMaps_[gatheringSteps-1] = tmpMap; - CoarseSolveMap_ = Xpetra::MapFactory::Build(CoarseMap_->lib(),-1,tmpMap->getNodeElementList(),0,CoarseSolveComm_); + + CoarseSolveMap_ = Xpetra::MapFactory::Build(CoarseMap_->lib(),-1,GatheringMaps_[gatheringSteps-1]->getNodeElementList(),0,CoarseSolveComm_); + } else if (!DistributionList_->get("Type","linear").compare("Zoltan2")) { #ifdef HAVE_SHYLU_DDFROSCH_ZOLTAN2 GatheringMaps_.resize(1); @@ -868,6 +897,88 @@ namespace FROSch { FROSCH_ASSERT(false,"FROSch::CoarseOperator: Distribution type unknown."); } + // Output information about the Gatherin Steps + GO global,sum,numRanks; + LO local,minVal,maxVal; + SC avg; + + global = coarseMapUnique->getMaxAllGlobalIndex(); + if (coarseMapUnique->lib()==UseEpetra || coarseMapUnique->getGlobalNumElements()>0) { + global += 1; + } + + local = (LO) max((LO) coarseMapUnique->getNodeNumElements(),(LO) 0); + reduceAll(*this->MpiComm_,REDUCE_SUM,GO(local),ptr(&sum)); + avg = max(sum/SC(this->MpiComm_->getSize()),0.0); + reduceAll(*this->MpiComm_,REDUCE_MIN,local,ptr(&minVal)); + reduceAll(*this->MpiComm_,REDUCE_MAX,local,ptr(&maxVal)); + + if (this->Verbose_) { + cout + << "\n" << setw(FROSCH_OUTPUT_INDENT) << " " + << setw(89) << "-----------------------------------------------------------------------------------------" + << "\n" << setw(FROSCH_OUTPUT_INDENT) << " " + << "| " + << left << setw(74) << "> Gathering Steps Statistics " << right << setw(8) << "(Level " << setw(2) << this->LevelID_ << ")" + << " |" + << "\n" << setw(FROSCH_OUTPUT_INDENT) << " " + << setw(89) << "=========================================================================================" + << "\n" << setw(FROSCH_OUTPUT_INDENT) << " " + << "| " << left << setw(7) << " " << right + << " | " << 
setw(10) << "ranks" + << " | " << setw(10) << "total" + << " | " << setw(10) << "avg" + << " | " << setw(10) << "min" + << " | " << setw(10) << "max" + << " | " << setw(10) << "global sum" + << " |" + << "\n" << setw(FROSCH_OUTPUT_INDENT) << " " + << setw(89) << "-----------------------------------------------------------------------------------------" + << "\n" << setw(FROSCH_OUTPUT_INDENT) << " " + << "| " << left << setw(4) << "Map " << setw(3) << "0" << right + << " | " << setw(10) << this->MpiComm_->getSize() + << " | " << setw(10) << global + << " | " << setw(10) << setprecision(5) << avg + << " | " << setw(10) << minVal + << " | " << setw(10) << maxVal + << " | " << setw(10) << sum + << " |"; + } + + for (int i=0; igetMaxAllGlobalIndex(); + if (GatheringMaps_[i]->lib()==UseEpetra || GatheringMaps_[i]->getGlobalNumElements()>0) { + global += 1; + } + + local = (LO) max((LO) GatheringMaps_[i]->getNodeNumElements(),(LO) 0); + reduceAll(*this->MpiComm_,REDUCE_SUM,GO(local),ptr(&sum)); + reduceAll(*this->MpiComm_,REDUCE_SUM,GO(GatheringMaps_[i]->getNodeNumElements()>0),ptr(&numRanks)); + avg = max(sum/SC(numRanks),0.0); + reduceAll(*this->MpiComm_,REDUCE_MIN,(GatheringMaps_[i]->getNodeNumElements()>0 ? 
local : numeric_limits::max()),ptr(&minVal)); + reduceAll(*this->MpiComm_,REDUCE_MAX,local,ptr(&maxVal)); + + if (this->Verbose_) { + cout + << "\n" << setw(FROSCH_OUTPUT_INDENT) << " " + << "| " << setw(4) << left << "Map " << setw(3) << i+1 << right + << " | " << setw(10) << numRanks + << " | " << setw(10) << global + << " | " << setw(10) << setprecision(3) << avg + << " | " << setw(10) << minVal + << " | " << setw(10) << maxVal + << " | " << setw(10) << sum + << " |"; + } + } + + if (this->Verbose_) { + cout + << "\n" << setw(FROSCH_OUTPUT_INDENT) << " " + << setw(89) << "-----------------------------------------------------------------------------------------" + << endl; + } + return 0; } diff --git a/packages/shylu/shylu_dd/frosch/src/SchwarzOperators/FROSch_GDSWCoarseOperator_def.hpp b/packages/shylu/shylu_dd/frosch/src/SchwarzOperators/FROSch_GDSWCoarseOperator_def.hpp index c0bb5c01d11c..427bdceeb4aa 100644 --- a/packages/shylu/shylu_dd/frosch/src/SchwarzOperators/FROSch_GDSWCoarseOperator_def.hpp +++ b/packages/shylu/shylu_dd/frosch/src/SchwarzOperators/FROSch_GDSWCoarseOperator_def.hpp @@ -548,39 +548,39 @@ namespace FROSch { << "\n" << setw(FROSCH_OUTPUT_INDENT) << " " << setw(89) << "=========================================================================================" << "\n" << setw(FROSCH_OUTPUT_INDENT) << " " - << "| " << left << setw(20) << "Vertices " << " | " << setw(19) << " Translations" << right + << "| " << left << setw(19) << "Vertices " << " | " << setw(19) << "Translations " << right << " | " << setw(41) << boolalpha << useVertexTranslations << noboolalpha << " |" << "\n" << setw(FROSCH_OUTPUT_INDENT) << " " - << "| " << left << setw(20) << "ShortEdges " << " | " << setw(19) << " Translations" << right + << "| " << left << setw(19) << "ShortEdges " << " | " << setw(19) << "Translations " << right << " | " << setw(41) << boolalpha << useShortEdgeTranslations << noboolalpha << " |" << "\n" << setw(FROSCH_OUTPUT_INDENT) << " " - << "| " 
<< left << setw(20) << "ShortEdges " << " | " << setw(19) << " Rotations" << right + << "| " << left << setw(19) << "ShortEdges " << " | " << setw(19) << "Rotations " << right << " | " << setw(41) << boolalpha << useShortEdgeRotations << noboolalpha << " |" << "\n" << setw(FROSCH_OUTPUT_INDENT) << " " - << "| " << left << setw(20) << "StraightEdges " << " | " << setw(19) << " Translations" << right + << "| " << left << setw(19) << "StraightEdges " << " | " << setw(19) << "Translations " << right << " | " << setw(41) << boolalpha << useStraightEdgeTranslations << noboolalpha << " |" << "\n" << setw(FROSCH_OUTPUT_INDENT) << " " - << "| " << left << setw(20) << "StraightEdges " << " | " << setw(19) << " Rotations" << right + << "| " << left << setw(19) << "StraightEdges " << " | " << setw(19) << "Rotations " << right << " | " << setw(41) << boolalpha << useStraightEdgeRotations << noboolalpha << " |" << "\n" << setw(FROSCH_OUTPUT_INDENT) << " " - << "| " << left << setw(20) << "Edges " << " | " << setw(19) << " Translations" << right + << "| " << left << setw(19) << "Edges " << " | " << setw(19) << "Translations " << right << " | " << setw(41) << boolalpha << useEdgeTranslations << noboolalpha << " |" << "\n" << setw(FROSCH_OUTPUT_INDENT) << " " - << "| " << left << setw(20) << "Edges " << " | " << setw(19) << " Rotations" << right + << "| " << left << setw(19) << "Edges " << " | " << setw(19) << "Rotations " << right << " | " << setw(41) << boolalpha << useEdgeRotations << noboolalpha << " |" << "\n" << setw(FROSCH_OUTPUT_INDENT) << " " - << "| " << left << setw(20) << "Faces " << " | " << setw(19) << " Translations" << right + << "| " << left << setw(19) << "Faces " << " | " << setw(19) << "Translations " << right << " | " << setw(41) << boolalpha << useFaceTranslations << noboolalpha << " |" << "\n" << setw(FROSCH_OUTPUT_INDENT) << " " - << "| " << left << setw(20) << "Faces " << " | " << setw(19) << " Rotations" << right + << "| " << left << setw(19) << "Faces " 
<< " | " << setw(19) << "Rotations " << right << " | " << setw(41) << boolalpha << useFaceRotations << noboolalpha << " |" << "\n" << setw(FROSCH_OUTPUT_INDENT) << " " diff --git a/packages/shylu/shylu_dd/frosch/src/SchwarzOperators/FROSch_HarmonicCoarseOperator_decl.hpp b/packages/shylu/shylu_dd/frosch/src/SchwarzOperators/FROSch_HarmonicCoarseOperator_decl.hpp index 5d833762ca7d..0b1255b647c0 100644 --- a/packages/shylu/shylu_dd/frosch/src/SchwarzOperators/FROSch_HarmonicCoarseOperator_decl.hpp +++ b/packages/shylu/shylu_dd/frosch/src/SchwarzOperators/FROSch_HarmonicCoarseOperator_decl.hpp @@ -179,7 +179,8 @@ namespace FROSch { ConstXMapPtr rowMap, ConstXMapPtr rangeMap, ConstXMapPtr repeatedMap, - SC treshold); + SC tresholdDropping, + SC tresholdOrthogonalization); virtual XMultiVectorPtr computeExtensions(ConstXMapPtr localMap, GOVecView indicesGammaDofsAll, diff --git a/packages/shylu/shylu_dd/frosch/src/SchwarzOperators/FROSch_HarmonicCoarseOperator_def.hpp b/packages/shylu/shylu_dd/frosch/src/SchwarzOperators/FROSch_HarmonicCoarseOperator_def.hpp index 6f38352159e0..87ddaddbe5d6 100644 --- a/packages/shylu/shylu_dd/frosch/src/SchwarzOperators/FROSch_HarmonicCoarseOperator_def.hpp +++ b/packages/shylu/shylu_dd/frosch/src/SchwarzOperators/FROSch_HarmonicCoarseOperator_def.hpp @@ -102,7 +102,7 @@ namespace FROSch { //Detect linear dependencies if (!this->ParameterList_->get("Skip DetectLinearDependencies",false)) { - LOVecPtr linearDependentVectors = detectLinearDependencies(indicesGammaDofsAll(),this->K_->getRowMap(),this->K_->getRangeMap(),repeatedMap,this->ParameterList_->get("Threshold Phi",1.e-8)); + LOVecPtr linearDependentVectors = detectLinearDependencies(indicesGammaDofsAll(),this->K_->getRowMap(),this->K_->getRangeMap(),repeatedMap,this->ParameterList_->get("Phi: Dropping Threshold",1.e-8),this->ParameterList_->get("Phi: Orthogonalization Threshold",1.e-12)); // cout << this->MpiComm_->getRank() << " " << linearDependentVectors.size() << endl; 
AssembledInterfaceCoarseSpace_->zeroOutBasisVectors(linearDependentVectors()); } @@ -263,11 +263,11 @@ namespace FROSch { << "\n" << setw(FROSCH_OUTPUT_INDENT) << " " << setw(89) << "=========================================================================================" << "\n" << setw(FROSCH_OUTPUT_INDENT) << " " - << "| " << left << setw(20) << "Volumes " << " | " << setw(19) << " Translations" << right + << "| " << left << setw(19) << "Volumes " << " | " << setw(19) << "Translations " << right << " | " << setw(41) << boolalpha << useForCoarseSpace << noboolalpha << " |" << "\n" << setw(FROSCH_OUTPUT_INDENT) << " " - << "| " << left << setw(20) << "Volumes " << " | " << setw(19) << " Rotations" << right + << "| " << left << setw(19) << "Volumes " << " | " << setw(19) << "Rotations " << right << " | " << setw(41) << boolalpha << useRotations << noboolalpha << " |" << "\n" << setw(FROSCH_OUTPUT_INDENT) << " " @@ -491,7 +491,8 @@ namespace FROSch { ConstXMapPtr rowMap, ConstXMapPtr rangeMap, ConstXMapPtr repeatedMap, - SC treshold) + SC tresholdDropping, + SC tresholdOrthogonalization) { FROSCH_DETAILTIMER_START_LEVELID(detectLinearDependenciesTime,"HarmonicCoarseOperator::detectLinearDependencies"); LOVecPtr linearDependentVectors(AssembledInterfaceCoarseSpace_->getBasisMap()->getNodeNumElements()); //if (this->Verbose_) cout << AssembledInterfaceCoarseSpace_->getAssembledBasis()->getNumVectors() << " " << AssembledInterfaceCoarseSpace_->getAssembledBasis()->getLocalLength() << " " << indicesGammaDofsAll.size() << endl; @@ -499,6 +500,16 @@ namespace FROSch { //Construct matrix phiGamma XMatrixPtr phiGamma = MatrixFactory::Build(rowMap,AssembledInterfaceCoarseSpace_->getBasisMap()->getNodeNumElements()); + // Array for scaling the columns of PhiGamma (1/norm(PhiGamma(:,i))) + SCVec scale(AssembledInterfaceCoarseSpace_->getAssembledBasis()->getNumVectors(),0.0); + for (UN i = 0; i < AssembledInterfaceCoarseSpace_->getAssembledBasis()->getNumVectors(); i++) { + 
ConstSCVecPtr assembledInterfaceCoarseSpaceData = AssembledInterfaceCoarseSpace_->getAssembledBasis()->getData(i); + for (UN j = 0; j < AssembledInterfaceCoarseSpace_->getAssembledBasis()->getLocalLength(); j++) { + scale[i] += assembledInterfaceCoarseSpaceData[j]*assembledInterfaceCoarseSpaceData[j]; + } + scale[i] = 1.0/sqrt(scale[i]); + } + LO iD; SC valueTmp; for (UN i=0; igetAssembledBasis()->getLocalLength(); i++) { @@ -506,9 +517,9 @@ namespace FROSch { SCVec values; for (UN j=0; jgetAssembledBasis()->getNumVectors(); j++) { valueTmp=AssembledInterfaceCoarseSpace_->getAssembledBasis()->getData(j)[i]; - if (fabs(valueTmp)>treshold) { + if (fabs(valueTmp)>tresholdDropping) { indices.push_back(AssembledInterfaceCoarseSpace_->getBasisMap()->getGlobalElement(j)); - values.push_back(valueTmp); + values.push_back(valueTmp*scale[j]); } } iD = repeatedMap->getGlobalElement(indicesGammaDofsAll[i]); @@ -560,11 +571,16 @@ namespace FROSch { TSerialDenseMatrixPtr r = qRSolver->getR(); LO tmp = 0; for (LO i=0; inumRows(); i++) { - SC normRow = 0.0; - for (LO j=0; jnumCols(); j++) { - normRow += (*r)(i,j)*(*r)(i,j); - } - if (sqrt(normRow)numCols(); j++) { + // normRow += (*r)(i,j)*(*r)(i,j); + // } + // if (sqrt(normRow)MpiComm_->getRank() << " " << i << " " << AssembledInterfaceCoarseSpace_->getBasisMap()->getGlobalElement(i) << " " << sqrt(normRow) << std::endl; + // linearDependentVectors[tmp] = i; + // tmp++; + // } + if (fabs((*r)(i,i))MpiComm_->getRank() << " " << i << " " << AssembledInterfaceCoarseSpace_->getBasisMap()->getGlobalElement(i) << " " << sqrt(normRow) << std::endl; linearDependentVectors[tmp] = i; tmp++; diff --git a/packages/shylu/shylu_dd/frosch/src/SchwarzOperators/FROSch_RGDSWCoarseOperator_def.hpp b/packages/shylu/shylu_dd/frosch/src/SchwarzOperators/FROSch_RGDSWCoarseOperator_def.hpp index 6752e5eb2dea..a0456f38d271 100644 --- a/packages/shylu/shylu_dd/frosch/src/SchwarzOperators/FROSch_RGDSWCoarseOperator_def.hpp +++ 
b/packages/shylu/shylu_dd/frosch/src/SchwarzOperators/FROSch_RGDSWCoarseOperator_def.hpp @@ -227,11 +227,11 @@ namespace FROSch { << "\n" << setw(FROSCH_OUTPUT_INDENT) << " " << setw(89) << "=========================================================================================" << "\n" << setw(FROSCH_OUTPUT_INDENT) << " " - << "| " << left << setw(20) << "Coarse nodes " << " | " << setw(19) << " Translations" << right + << "| " << left << setw(19) << "Coarse nodes " << " | " << setw(19) << "Translations " << right << " | " << setw(41) << boolalpha << useForCoarseSpace << noboolalpha << " |" << "\n" << setw(FROSCH_OUTPUT_INDENT) << " " - << "| " << left << setw(20) << "Coarse nodes " << " | " << setw(19) << " Rotations" << right + << "| " << left << setw(19) << "Coarse nodes " << " | " << setw(19) << "Rotations " << right << " | " << setw(41) << boolalpha << useRotations << noboolalpha << " |" << "\n" << setw(FROSCH_OUTPUT_INDENT) << " " diff --git a/packages/shylu/shylu_dd/frosch/src/SchwarzOperators/FROSch_SchwarzOperator_decl.hpp b/packages/shylu/shylu_dd/frosch/src/SchwarzOperators/FROSch_SchwarzOperator_decl.hpp index dc953563acec..05be7eb645fa 100644 --- a/packages/shylu/shylu_dd/frosch/src/SchwarzOperators/FROSch_SchwarzOperator_decl.hpp +++ b/packages/shylu/shylu_dd/frosch/src/SchwarzOperators/FROSch_SchwarzOperator_decl.hpp @@ -49,6 +49,7 @@ #include #include +#include #include diff --git a/packages/shylu/shylu_dd/frosch/src/SchwarzOperators/FROSch_SchwarzOperator_def.hpp b/packages/shylu/shylu_dd/frosch/src/SchwarzOperators/FROSch_SchwarzOperator_def.hpp index ec275a55609f..05bda9f7f47c 100644 --- a/packages/shylu/shylu_dd/frosch/src/SchwarzOperators/FROSch_SchwarzOperator_def.hpp +++ b/packages/shylu/shylu_dd/frosch/src/SchwarzOperators/FROSch_SchwarzOperator_def.hpp @@ -55,7 +55,7 @@ namespace FROSch { MpiComm_ (comm), Verbose_ (comm->getRank()==0) { - + } template @@ -122,10 +122,11 @@ namespace FROSch { template void SchwarzOperator::residual(const 
XMultiVector & X, const XMultiVector & B, - XMultiVector& R) const { - SC one = Teuchos::ScalarTraits::one(), negone = -one; - apply(X,R); - R.update(one,B,negone); + XMultiVector& R) const + { + SC one = ScalarTraits::one(), negone = -one; + apply(X,R); + R.update(one,B,negone); } } diff --git a/packages/shylu/shylu_dd/frosch/src/SchwarzPreconditioners/FROSch_TwoLevelBlockPreconditioner_def.hpp b/packages/shylu/shylu_dd/frosch/src/SchwarzPreconditioners/FROSch_TwoLevelBlockPreconditioner_def.hpp index 24ddaa7681a6..40c130de7106 100644 --- a/packages/shylu/shylu_dd/frosch/src/SchwarzPreconditioners/FROSch_TwoLevelBlockPreconditioner_def.hpp +++ b/packages/shylu/shylu_dd/frosch/src/SchwarzPreconditioners/FROSch_TwoLevelBlockPreconditioner_def.hpp @@ -61,17 +61,17 @@ namespace FROSch { // Set the LevelID in the sublist parameterList->sublist("IPOUHarmonicCoarseOperator").set("Level ID",this->LevelID_); // FROSCH_ASSERT(false,"not implemented for block."); - this->ParameterList_->sublist("IPOUHarmonicCoarseOperator").sublist("InterfacePartitionOfUnity").set("Test Unconnected Interface",false); + this->ParameterList_->sublist("IPOUHarmonicCoarseOperator").sublist("InterfacePartitionOfUnity").set("Test Unconnected Interface",true); CoarseOperator_ = IPOUHarmonicCoarseOperatorPtr(new IPOUHarmonicCoarseOperator(k,sublist(parameterList,"IPOUHarmonicCoarseOperator"))); } else if (!this->ParameterList_->get("CoarseOperator Type","IPOUHarmonicCoarseOperator").compare("GDSWCoarseOperator")) { // Set the LevelID in the sublist parameterList->sublist("GDSWCoarseOperator").set("Level ID",this->LevelID_); - this->ParameterList_->sublist("GDSWCoarseOperator").set("Test Unconnected Interface",false); + this->ParameterList_->sublist("GDSWCoarseOperator").set("Test Unconnected Interface",true); CoarseOperator_ = GDSWCoarseOperatorPtr(new GDSWCoarseOperator(k,sublist(parameterList,"GDSWCoarseOperator"))); } else if (!this->ParameterList_->get("CoarseOperator 
Type","IPOUHarmonicCoarseOperator").compare("RGDSWCoarseOperator")) { // Set the LevelID in the sublist parameterList->sublist("RGDSWCoarseOperator").set("Level ID",this->LevelID_); - this->ParameterList_->sublist("RGDSWCoarseOperator").set("Test Unconnected Interface",false); + this->ParameterList_->sublist("RGDSWCoarseOperator").set("Test Unconnected Interface",true); CoarseOperator_ = RGDSWCoarseOperatorPtr(new RGDSWCoarseOperator(k,sublist(parameterList,"RGDSWCoarseOperator"))); } else { FROSCH_ASSERT(false,"CoarseOperator Type unkown."); diff --git a/packages/shylu/shylu_dd/frosch/src/SolverInterfaces/FROSch_ThyraPreconditioner_def.hpp b/packages/shylu/shylu_dd/frosch/src/SolverInterfaces/FROSch_ThyraPreconditioner_def.hpp index ea897ab9361f..a5e28b1c7723 100644 --- a/packages/shylu/shylu_dd/frosch/src/SolverInterfaces/FROSch_ThyraPreconditioner_def.hpp +++ b/packages/shylu/shylu_dd/frosch/src/SolverInterfaces/FROSch_ThyraPreconditioner_def.hpp @@ -92,6 +92,11 @@ namespace FROSch { } ThyraPreconditioner_->getUnspecifiedPrecOp()->apply(tMode,*xThyra,yThyra.ptr(),alpha,beta); + + // It seems that we have to convert the Thyra vector back to Xpetra. Is there a cheaper/more elegant way? + // Same for ThyraSolver + XMultiVectorPtr yXpetra = ThyraUtils::toXpetra(yThyra,y.getMap()->getComm()); + y = *yXpetra; } template diff --git a/packages/shylu/shylu_dd/frosch/src/SolverInterfaces/FROSch_ThyraSolver_def.hpp b/packages/shylu/shylu_dd/frosch/src/SolverInterfaces/FROSch_ThyraSolver_def.hpp index 060a6b35f76e..ef94aba9d78a 100644 --- a/packages/shylu/shylu_dd/frosch/src/SolverInterfaces/FROSch_ThyraSolver_def.hpp +++ b/packages/shylu/shylu_dd/frosch/src/SolverInterfaces/FROSch_ThyraSolver_def.hpp @@ -93,6 +93,12 @@ namespace FROSch { } SolveStatus status = solve(*ThyraSolver_,tMode,*xThyra,YT_.ptr()); + + // It seems that we have to convert the Thyra vector back to Xpetra. Is there a cheaper/more elegant way? 
+ // Same for ThyraPreconditioner + XMultiVectorPtr yXpetra = ThyraUtils::toXpetra(YT_,y.getMap()->getComm()); + y = *yXpetra; + y.update(alpha,*YX_,beta); } diff --git a/packages/shylu/shylu_dd/frosch/src/Tools/FROSch_Output.h b/packages/shylu/shylu_dd/frosch/src/Tools/FROSch_Output.h index f5f61805487d..f9c8d97f44c7 100644 --- a/packages/shylu/shylu_dd/frosch/src/Tools/FROSch_Output.h +++ b/packages/shylu/shylu_dd/frosch/src/Tools/FROSch_Output.h @@ -52,22 +52,22 @@ #ifndef FROSCH_ASSERT #define FROSCH_ASSERT(COND,MSG) \ { \ - const bool throw_exception = !(COND); \ - if(throw_exception) { \ - Teuchos::TestForException_incrThrowNumber(); \ - std::ostringstream omsg; \ - omsg \ - << std::setw(FROSCH_OUTPUT_INDENT) << " " << __FILE__ << ":" << __LINE__ << ":\n\n" \ - << "Throw number = " << Teuchos::TestForException_getThrowNumber() \ - << "\n\n" \ - << std::setw(FROSCH_OUTPUT_INDENT) << " " << "Throw test that evaluated to true: "#COND \ - << "\n\n" \ - << std::setw(FROSCH_OUTPUT_INDENT) << " " << "[ERROR] " << MSG; \ - const std::string &omsgstr = omsg.str(); \ - TEUCHOS_STORE_STACKTRACE(); \ - Teuchos::TestForException_break(omsgstr); \ - throw std::logic_error(omsgstr); \ - } \ + const bool throw_exception = !(COND); \ + if(throw_exception) { \ + Teuchos::TestForException_incrThrowNumber(); \ + std::ostringstream omsg; \ + omsg \ + << std::setw(FROSCH_OUTPUT_INDENT) << " " << __FILE__ << ":" << __LINE__ << ":\n\n" \ + << "Throw number = " << Teuchos::TestForException_getThrowNumber() \ + << "\n\n" \ + << std::setw(FROSCH_OUTPUT_INDENT) << " " << "Throw test that evaluated to true: "#COND \ + << "\n\n" \ + << std::setw(FROSCH_OUTPUT_INDENT) << " " << "[ERROR] " << MSG; \ + const std::string &omsgstr = omsg.str(); \ + TEUCHOS_STORE_STACKTRACE(); \ + Teuchos::TestForException_break(omsgstr); \ + throw std::logic_error(omsgstr); \ + } \ } #endif diff --git a/packages/shylu/shylu_dd/frosch/src/Tools/FROSch_Tools_decl.hpp 
b/packages/shylu/shylu_dd/frosch/src/Tools/FROSch_Tools_decl.hpp index dc147819f3bb..b5a0095c3526 100644 --- a/packages/shylu/shylu_dd/frosch/src/Tools/FROSch_Tools_decl.hpp +++ b/packages/shylu/shylu_dd/frosch/src/Tools/FROSch_Tools_decl.hpp @@ -195,6 +195,13 @@ namespace FROSch { template void readMM(std::string fileName, Teuchos::RCP > &matrix_,RCP > &comm); + template + RCP > BuildRepeatedMapGaleriStruct2D(RCP > matrix,int M,int Dim); + + + template + RCP > BuildRepeatedMapGaleriStruct3D(RCP > matrix,int M,int Dim); + template RCP > BuildUniqueMap(const RCP > map, bool useCreateOneToOneMap = true, diff --git a/packages/shylu/shylu_dd/frosch/src/Tools/FROSch_Tools_def.hpp b/packages/shylu/shylu_dd/frosch/src/Tools/FROSch_Tools_def.hpp index 6d81c41d0903..52360fc17016 100644 --- a/packages/shylu/shylu_dd/frosch/src/Tools/FROSch_Tools_def.hpp +++ b/packages/shylu/shylu_dd/frosch/src/Tools/FROSch_Tools_def.hpp @@ -243,6 +243,191 @@ namespace FROSch { matrix_ = rcp_dynamic_cast >(tmpMatrix); } + template + RCP > BuildRepeatedMapGaleriStruct2D(RCP > matrix,int M,int Dim) + { + Teuchos::ArrayView< const GO> eleList; + eleList = matrix->getMap()->getNodeElementList(); + Teuchos::RCP< const Teuchos::Comm< int > > Comm = matrix->getMap()->getComm(); + + int size = Comm->getSize(); + int rank = Comm->getRank(); + + Teuchos::Array vert; + vert.reserve(M*Dim); + Teuchos::Array horz; + horz.reserve((M+1)*Dim); + int numSubPerRow = sqrt(size); + GO nodesInRow = M*Dim*numSubPerRow; + Teuchos::Array newEle; + newEle.reserve(eleList.size()+M*Dim+(M+1)*Dim); + int count = 0; + for (int i = 0;i::Build(matrix->getMap()->lib(),matrix->getMap()->getGlobalNumElements(),newEle(),0,Comm); + + } + + template + RCP > BuildRepeatedMapGaleriStruct3D(RCP > matrix,int M,int Dim) + { + + FROSCH_DETAILTIMER_START(Galeri3DMap,"BuildGeometricMap3D"); + + Teuchos::ArrayView< const GO> eleList; + eleList = matrix->getNodeElementList(); + Teuchos::RCP< const Teuchos::Comm< int > > Comm = 
matrix->getComm(); + + int size = Comm->getSize(); + int rank = Comm->getRank(); + + Teuchos::Array vert; + vert.reserve(M*Dim); + Teuchos::Array horz; + horz.reserve((M+1)*Dim); + int numSubPerRow = std::pow(size,1/3.)+0.7; + //int numSubPerRow = numSubPerRow1; + // numSubPerRow = numSubPerRow+1; + //if(Comm->getRank() == 0) std::cout<<"Size "<getRank() == 0) std::cout<<"numSubPerRow1 "<getRank() == 0) std::cout<<"subInLev "<getRank() == 0) std::cout<<"nodesInRow "<getRank() == 0) std::cout<<"nodesInLev "< newEle; + newEle.reserve(eleList.size()+M*Dim+(M+1)*Dim); + GO startval = eleList[0]/Dim; + + //Differentiate between locations of the sub + //not back + if (rank::Build(matrix->getMap()->lib(),matrix->getMap()->getGlobalNumElements(),newEle(),0,Comm); + + } + template RCP > BuildUniqueMap(const RCP > map, bool useCreateOneToOneMap, @@ -1032,8 +1217,8 @@ namespace FROSch { template ArrayRCP > > BuildNodeMapsFromDofMaps(ArrayRCP > > > dofsMapsVecVec, - ArrayRCP dofsPerNodeVec, - ArrayRCP dofOrderingVec) + ArrayRCP dofsPerNodeVec, + ArrayRCP dofOrderingVec) { typedef Map Map; @@ -1107,8 +1292,7 @@ namespace FROSch { } nodeMapsVec[block] = MapFactory::Build( dofsMapsVecVec[block][0]->lib(), -1,globalIndicesNode(), 0, dofsMapsVecVec[block][0]->getComm() ); - } - else{ //DimensionWise + } else { //DimensionWise GO minGID = dofsMapsVecVec[block][0]->getMinAllGlobalIndex(); ArrayView< const GO > globalIndices = dofsMapsVecVec[block][0]->getNodeElementList(); Array globalIndicesNode( globalIndices ); diff --git a/packages/shylu/shylu_dd/frosch/test/Thyra_Xpetra_Elasticity/main.cpp b/packages/shylu/shylu_dd/frosch/test/Thyra_Xpetra_Elasticity/main.cpp index 10bd24bfd97f..8a834b5178c5 100644 --- a/packages/shylu/shylu_dd/frosch/test/Thyra_Xpetra_Elasticity/main.cpp +++ b/packages/shylu/shylu_dd/frosch/test/Thyra_Xpetra_Elasticity/main.cpp @@ -125,7 +125,8 @@ int main(int argc, char *argv[]) My_CLP.setOption("PLIST",&xmlFile,"File name of the parameter list."); bool 
useepetra = false; My_CLP.setOption("USEEPETRA","USETPETRA",&useepetra,"Use Epetra infrastructure for the linear algebra."); - + bool useGeoMap = false; + My_CLP.setOption("useGeoMap","useAlgMap",&useGeoMap,"Use Geometric Map"); My_CLP.recogniseAllOptions(true); My_CLP.throwExceptions(false); CommandLineProcessor::EParseCommandLineReturn parseReturn = My_CLP.parse(argc,argv); @@ -200,7 +201,25 @@ int main(int argc, char *argv[]) RCP,CrsMatrixWrap,MultiVector > > Problem = Galeri::Xpetra::BuildProblem,CrsMatrixWrap,MultiVector >("Elasticity3D",UniqueMap,GaleriList); K = Problem->BuildMatrix(); } - RCP > RepeatedMap = BuildRepeatedMapNonConst(K->getCrsGraph()); + + + RCP > FullRepeatedMap; + RCP > RepeatedMap; + RCP > FullRepeatedMapNode; + if (useGeoMap) { + if (Dimension == 2) { + FullRepeatedMap = BuildRepeatedMapGaleriStruct2D(K,M,Dimension); + RepeatedMap = FullRepeatedMap; + } else if (Dimension == 3) { + FullRepeatedMapNode = BuildRepeatedMapGaleriStruct3D(K->getMap(),M,Dimension); + FullRepeatedMap = BuildMapFromNodeMap(FullRepeatedMapNode,Dimension,NodeWise); + //FullRepeatedMapNode->describe(*fancy,Teuchos::VERB_EXTREME); + RepeatedMap = FullRepeatedMap; + } + } else { + RepeatedMap = BuildRepeatedMapNonConst(K->getCrsGraph()); + } + RCP > xSolution = MultiVectorFactory::Build(UniqueMap,1); RCP > xRightHandSide = MultiVectorFactory::Build(UniqueMap,1); @@ -214,7 +233,7 @@ int main(int argc, char *argv[]) RCP >thyraB = ThyraUtils::toThyraMultiVector(xRightHandSide); //-----------Set Coordinates and RepMap in ParameterList-------------------------- - RCP plList = sublist(parameterList,"Preconditioner Types"); + RCP plList = sublist(parameterList,"Preconditioner Types"); sublist(plList,"FROSch")->set("Dimension",Dimension); sublist(plList,"FROSch")->set("Overlap",Overlap); sublist(plList,"FROSch")->set("DofOrdering","NodeWise"); diff --git a/packages/shylu/shylu_node/tacho/src/impl/Tacho_NumericTools_LevelSet.hpp 
b/packages/shylu/shylu_node/tacho/src/impl/Tacho_NumericTools_LevelSet.hpp index ef87eaf00942..9730acaf0525 100644 --- a/packages/shylu/shylu_node/tacho/src/impl/Tacho_NumericTools_LevelSet.hpp +++ b/packages/shylu/shylu_node/tacho/src/impl/Tacho_NumericTools_LevelSet.hpp @@ -524,7 +524,7 @@ namespace Tacho { track_free(_factorize_mode.span()*sizeof(ordinal_type)); track_free(_solve_mode.span()*sizeof(ordinal_type)); track_free(_level_sids.span()*sizeof(ordinal_type)); - if (verbose || true) { + if (verbose) { printf("Summary: LevelSetTools-Variant-%d (Release)\n", variant); printf("============================================\n"); print_stat_memory(); @@ -616,7 +616,7 @@ namespace Tacho { for (ordinal_type i=0;i<_nstreams;++i) { ExecSpaceFactory::createInstance(_cuda_streams[i], _exec_instances[i]); } - if (verbose || true) { + if (verbose) { printf("Summary: CreateStream : %3d\n", _nstreams); printf("===========================\n"); } @@ -1726,7 +1726,18 @@ namespace Tacho { const ordinal_type half_level = _nlevel/2; //const ordinal_type team_size_factor[2] = { 64, 16 }, vector_size_factor[2] = { 8, 8}; //const ordinal_type team_size_factor[2] = { 16, 16 }, vector_size_factor[2] = { 32, 32}; +#if defined (CUDA_VERSION) +#if (11000 > CUDA_VERSION) + /// cuda 11.1 below + const ordinal_type team_size_factor[2] = { 32, 64 }, vector_size_factor[2] = { 8, 4}; +#else + /// cuda 11.1 and higher + const ordinal_type team_size_factor[2] = { 64, 64 }, vector_size_factor[2] = { 8, 4}; +#endif +#else + /// not cuda ... whatever.. 
const ordinal_type team_size_factor[2] = { 64, 64 }, vector_size_factor[2] = { 8, 4}; +#endif const ordinal_type team_size_update[2] = { 16, 8 }, vector_size_update[2] = { 32, 32}; { typedef TeamFunctor_FactorizeLDL functor_type; @@ -1848,7 +1859,18 @@ namespace Tacho { #endif // this should be considered with average problem sizes in levels const ordinal_type half_level = _nlevel/2; +#if defined (CUDA_VERSION) +#if (11000 > CUDA_VERSION) + /// cuda 11.1 below + const ordinal_type team_size_solve[2] = { 32, 16 }, vector_size_solve[2] = { 8, 8}; +#else + /// cuda 11.1 and higher + const ordinal_type team_size_solve[2] = { 32, 16 }, vector_size_solve[2] = { 8, 8}; +#endif +#else + /// not cuda whatever... const ordinal_type team_size_solve[2] = { 64, 16 }, vector_size_solve[2] = { 8, 8}; +#endif const ordinal_type team_size_update[2] = { 128, 32}, vector_size_update[2] = { 1, 1}; { typedef TeamFunctor_SolveLowerLDL functor_type; diff --git a/packages/stokhos/test/UnitTest/Stokhos_TpetraCrsMatrixMPVectorUnitTest.hpp b/packages/stokhos/test/UnitTest/Stokhos_TpetraCrsMatrixMPVectorUnitTest.hpp index a6d2b21b2b61..b3f44635b715 100644 --- a/packages/stokhos/test/UnitTest/Stokhos_TpetraCrsMatrixMPVectorUnitTest.hpp +++ b/packages/stokhos/test/UnitTest/Stokhos_TpetraCrsMatrixMPVectorUnitTest.hpp @@ -55,6 +55,7 @@ #include "Tpetra_Vector.hpp" #include "Tpetra_CrsGraph.hpp" #include "Tpetra_CrsMatrix.hpp" +#include "Tpetra_Details_WrappedDualView.hpp" #include "Stokhos_Tpetra_CG.hpp" // Belos solver @@ -1001,6 +1002,46 @@ TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL( } } +// +// Test interaction between Tpetra WrappedDualView and MP::Vector +// +TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL( + Tpetra_CrsMatrix_MP, WrappedDualView, Storage, LocalOrdinal, GlobalOrdinal, Node ) +{ + //BMK 6-2021: This test is required because a View of MP::Vector has slightly different behavior than a typical Kokkos::View. + //If you construct a Kokkos::View with a label and 0 extent, it gets a non-null allocation. 
+ //But for View, the same constructor produces a null data pointer but + //an active reference counting node (use_count() > 0). + //This test makes sure that Tpetra WrappedDualView works correctly with a View where data() == nullptr but use_count() > 0. + using Teuchos::RCP; + using Teuchos::rcp; + using Teuchos::ArrayView; + using Teuchos::Array; + using Teuchos::ArrayRCP; + + typedef typename Storage::value_type BaseScalar; + typedef Sacado::MP::Vector Scalar; + + using DualViewType = Kokkos::DualView; + using WDV = Tpetra::Details::WrappedDualView; + using values_view = typename DualViewType::t_dev; + + // Ensure device is initialized + if ( !Kokkos::is_initialized() ) + Kokkos::initialize(); + + WDV wdv; + { + values_view myView("emptyTestView", 0); + wdv = WDV(myView); + } + size_t use_h = wdv.getHostView(Tpetra::Access::ReadOnly).use_count(); + size_t use_d = wdv.getDeviceView(Tpetra::Access::ReadOnly).use_count(); + //The WrappedDualView is now the only object holding references to the host and device views, + //so they should have identical use counts. 
+ TEST_EQUALITY(use_h, use_d); +} + // // Test simple CG solve without preconditioning for a 1-D Laplacian matrix // @@ -2448,6 +2489,7 @@ TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL( TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(Tpetra_CrsMatrix_MP, MultiVectorDotSub, S, LO, GO, N ) \ TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(Tpetra_CrsMatrix_MP, MatrixVectorMultiply, S, LO, GO, N ) \ TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(Tpetra_CrsMatrix_MP, MatrixMultiVectorMultiply, S, LO, GO, N ) \ + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(Tpetra_CrsMatrix_MP, WrappedDualView, S, LO, GO, N ) \ TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(Tpetra_CrsMatrix_MP, Flatten, S, LO, GO, N ) \ TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(Tpetra_CrsMatrix_MP, SimpleCG, S, LO, GO, N ) \ TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(Tpetra_CrsMatrix_MP, SimplePCG_Muelu, S, LO, GO, N ) \ diff --git a/packages/tempus/src/Tempus_Integrator.hpp b/packages/tempus/src/Tempus_Integrator.hpp index 05e6e5d80d4f..4dbcd4a2f79d 100644 --- a/packages/tempus/src/Tempus_Integrator.hpp +++ b/packages/tempus/src/Tempus_Integrator.hpp @@ -82,6 +82,8 @@ class Integrator virtual void setTempusParameterList(Teuchos::RCP pl) = 0; /// Returns the SolutionHistory for this Integrator virtual Teuchos::RCP > getSolutionHistory() const = 0; + /// Returns the SolutionHistory for this Integrator + virtual Teuchos::RCP > getNonConstSolutionHistory() = 0; /// Returns the TimeStepControl for this Integrator virtual Teuchos::RCP > getTimeStepControl() const = 0; virtual Teuchos::RCP > getNonConstTimeStepControl() = 0; diff --git a/packages/tempus/src/Tempus_IntegratorAdjointSensitivity_decl.hpp b/packages/tempus/src/Tempus_IntegratorAdjointSensitivity_decl.hpp index b8d3479e7ac6..7780cb750c7f 100644 --- a/packages/tempus/src/Tempus_IntegratorAdjointSensitivity_decl.hpp +++ b/packages/tempus/src/Tempus_IntegratorAdjointSensitivity_decl.hpp @@ -108,6 +108,8 @@ class IntegratorAdjointSensitivity : virtual void setTempusParameterList(Teuchos::RCP pl) override; /// Get the 
SolutionHistory virtual Teuchos::RCP > getSolutionHistory() const override; + /// Get the SolutionHistory + virtual Teuchos::RCP > getNonConstSolutionHistory() override; /// Get the TimeStepControl virtual Teuchos::RCP > getTimeStepControl() const override; virtual Teuchos::RCP > getNonConstTimeStepControl() override; diff --git a/packages/tempus/src/Tempus_IntegratorAdjointSensitivity_impl.hpp b/packages/tempus/src/Tempus_IntegratorAdjointSensitivity_impl.hpp index 4339983292cd..b33e26d13289 100644 --- a/packages/tempus/src/Tempus_IntegratorAdjointSensitivity_impl.hpp +++ b/packages/tempus/src/Tempus_IntegratorAdjointSensitivity_impl.hpp @@ -280,6 +280,14 @@ getSolutionHistory() const return solutionHistory_; } +template +Teuchos::RCP > +IntegratorAdjointSensitivity:: +getNonConstSolutionHistory() +{ + return solutionHistory_; +} + template Teuchos::RCP > IntegratorAdjointSensitivity:: @@ -376,12 +384,13 @@ describe( Teuchos::FancyOStream &out, const Teuchos::EVerbosityLevel verbLevel) const { - auto l_out = Teuchos::fancyOStream( out.getOStream() ); + Teuchos::OSTab ostab(*l_out, 2, this->description()); l_out->setOutputToRootOnly(0); + *l_out << description() << "::describe" << std::endl; - state_integrator_->describe(out, verbLevel); - adjoint_integrator_->describe(out, verbLevel); + state_integrator_->describe(*l_out, verbLevel); + adjoint_integrator_->describe(*l_out, verbLevel); } template diff --git a/packages/tempus/src/Tempus_IntegratorBasic.cpp b/packages/tempus/src/Tempus_IntegratorBasic.cpp index 7607f8f8bde2..bbfc6b59bf73 100644 --- a/packages/tempus/src/Tempus_IntegratorBasic.cpp +++ b/packages/tempus/src/Tempus_IntegratorBasic.cpp @@ -16,6 +16,10 @@ namespace Tempus { TEMPUS_INSTANTIATE_TEMPLATE_CLASS(IntegratorBasic) + // Nonmember ctor + template Teuchos::RCP > createIntegratorBasic( + Teuchos::RCP parameterList); + // Nonmember ctor template Teuchos::RCP > createIntegratorBasic( Teuchos::RCP parameterList, diff --git 
a/packages/tempus/src/Tempus_IntegratorBasicOld_decl.hpp b/packages/tempus/src/Tempus_IntegratorBasicOld_decl.hpp index 5f1d13677a05..b28288b2fae4 100644 --- a/packages/tempus/src/Tempus_IntegratorBasicOld_decl.hpp +++ b/packages/tempus/src/Tempus_IntegratorBasicOld_decl.hpp @@ -111,6 +111,9 @@ class IntegratorBasicOld /// Get the SolutionHistory virtual Teuchos::RCP > getSolutionHistory() const override {return solutionHistory_;} + /// Get the SolutionHistory + virtual Teuchos::RCP > getNonConstSolutionHistory() override + {return solutionHistory_;} /// Set the SolutionHistory virtual void setSolutionHistory( Teuchos::RCP > sh = Teuchos::null); diff --git a/packages/tempus/src/Tempus_IntegratorBasicOld_impl.hpp b/packages/tempus/src/Tempus_IntegratorBasicOld_impl.hpp index 3c73274007b2..68c0379e70ec 100644 --- a/packages/tempus/src/Tempus_IntegratorBasicOld_impl.hpp +++ b/packages/tempus/src/Tempus_IntegratorBasicOld_impl.hpp @@ -387,8 +387,8 @@ bool IntegratorBasicOld::advanceTime() startIntegrator(); integratorObserver_->observeStartIntegrator(*this); - while (integratorStatus_ == WORKING and - timeStepControl_->timeInRange (solutionHistory_->getCurrentTime()) and + while (integratorStatus_ == WORKING && + timeStepControl_->timeInRange (solutionHistory_->getCurrentTime()) && timeStepControl_->indexInRange(solutionHistory_->getCurrentIndex())){ stepperTimer_->reset(); @@ -483,11 +483,11 @@ void IntegratorBasicOld::checkTimeStep() } // Check Stepper failure. 
- if (ws->getSolutionStatus() == Status::FAILED or + if (ws->getSolutionStatus() == Status::FAILED || // Constant time step failure - ((timeStepControl_->getStepType() == "Constant") and - (ws->getTimeStep() != timeStepControl_->getInitTimeStep()) and - (ws->getOutput() != true) and + ((timeStepControl_->getStepType() == "Constant") && + (ws->getTimeStep() != timeStepControl_->getInitTimeStep()) && + (ws->getOutput() != true) && (ws->getTime() != timeStepControl_->getFinalTime()) ) ) @@ -502,7 +502,7 @@ void IntegratorBasicOld::checkTimeStep() if (ws->getSolutionStatus() == Status::FAILED) { *out << "Solution Status = " << toString(ws->getSolutionStatus()) << std::endl; - } else if ((timeStepControl_->getStepType() == "Constant") and + } else if ((timeStepControl_->getStepType() == "Constant") && (ws->getTimeStep() != timeStepControl_->getInitTimeStep())) { *out << "dt != Constant dt (="<getInitTimeStep()<<")" << std::endl; @@ -526,7 +526,7 @@ void IntegratorBasicOld::endIntegrator() { std::string exitStatus; if (solutionHistory_->getCurrentState()->getSolutionStatus() == - Status::FAILED or integratorStatus_ == Status::FAILED) { + Status::FAILED || integratorStatus_ == Status::FAILED) { exitStatus = "Time integration FAILURE!"; } else { integratorStatus_ = Status::PASSED; diff --git a/packages/tempus/src/Tempus_IntegratorBasic_decl.hpp b/packages/tempus/src/Tempus_IntegratorBasic_decl.hpp index a6bbfbd0e1ed..1170b5f65146 100644 --- a/packages/tempus/src/Tempus_IntegratorBasic_decl.hpp +++ b/packages/tempus/src/Tempus_IntegratorBasic_decl.hpp @@ -100,6 +100,9 @@ class IntegratorBasic : virtual public Tempus::Integrator /// Get the SolutionHistory virtual Teuchos::RCP > getSolutionHistory() const override {return solutionHistory_;} + /// Get the SolutionHistory + virtual Teuchos::RCP > getNonConstSolutionHistory() override + {return solutionHistory_;} /// Set the SolutionHistory virtual void setSolutionHistory( Teuchos::RCP > sh = Teuchos::null); @@ -208,6 +211,12 
@@ class IntegratorBasic : virtual public Tempus::Integrator }; +/// Nonmember constructor +template +Teuchos::RCP > createIntegratorBasic( + Teuchos::RCP pList); + + /// Nonmember constructor template Teuchos::RCP > createIntegratorBasic( diff --git a/packages/tempus/src/Tempus_IntegratorBasic_impl.hpp b/packages/tempus/src/Tempus_IntegratorBasic_impl.hpp index b07d696e1379..a38565776c35 100644 --- a/packages/tempus/src/Tempus_IntegratorBasic_impl.hpp +++ b/packages/tempus/src/Tempus_IntegratorBasic_impl.hpp @@ -250,25 +250,33 @@ std::string IntegratorBasic::description() const template void IntegratorBasic::describe( - Teuchos::FancyOStream &in_out, + Teuchos::FancyOStream &out, const Teuchos::EVerbosityLevel verbLevel) const { - auto out = Teuchos::fancyOStream( in_out.getOStream() ); - out->setOutputToRootOnly(0); - *out << description() << "::describe" << std::endl; - *out << "solutionHistory= " << solutionHistory_->description()<description() << " ---" << std::endl; + + if ( solutionHistory_ != Teuchos::null ) { + solutionHistory_->describe(*l_out,verbLevel); + } else { + *l_out << "solutionHistory = " << solutionHistory_ << std::endl; } + + if ( timeStepControl_ != Teuchos::null ) { + timeStepControl_->describe(out,verbLevel); + } else { + *l_out << "timeStepControl = " << timeStepControl_ << std::endl; + } + + if ( stepper_ != Teuchos::null ) { + stepper_->describe(out,verbLevel); + } else { + *l_out << "stepper = " << stepper_ << std::endl; + } + *l_out << std::string(this->description().length()+8, '-') <::checkTimeStep() if (ws->getNFailures() >= timeStepControl_->getMaxFailures()) { RCP out = this->getOStream(); out->setOutputToRootOnly(0); - Teuchos::OSTab ostab(out,2,"checkTimeStep"); + Teuchos::OSTab ostab(out, 2, "checkTimeStep"); *out << "Failure - Stepper has failed more than the maximum allowed.\n" << " (nFailures = "<getNFailures()<< ") >= (nFailuresMax = " << timeStepControl_->getMaxFailures()<<")" << std::endl; @@ -404,7 +412,7 @@ void 
IntegratorBasic::checkTimeStep() >= timeStepControl_->getMaxConsecFailures()){ RCP out = this->getOStream(); out->setOutputToRootOnly(0); - Teuchos::OSTab ostab(out,1,"checkTimeStep"); + Teuchos::OSTab ostab(out, 1, "checkTimeStep"); *out << "Failure - Stepper has failed more than the maximum " << "consecutive allowed.\n" << " (nConsecutiveFailures = "<getNConsecutiveFailures() @@ -427,7 +435,7 @@ void IntegratorBasic::checkTimeStep() { RCP out = this->getOStream(); out->setOutputToRootOnly(0); - Teuchos::OSTab ostab(out,0,"checkTimeStep"); + Teuchos::OSTab ostab(out, 0, "checkTimeStep"); *out <getIndex() <getTime() @@ -550,8 +558,7 @@ IntegratorBasic::getValidParameters() const // ------------------------------------------------------------------------ template Teuchos::RCP > createIntegratorBasic( - Teuchos::RCP tempusPL, - const Teuchos::RCP >& model) + Teuchos::RCP tempusPL) { auto integratorName = tempusPL->get("Integrator Name"); auto integratorPL = Teuchos::sublist(tempusPL, integratorName, true); @@ -573,12 +580,11 @@ Teuchos::RCP > createIntegratorBasic( auto stepperPL = Teuchos::sublist(tempusPL, stepperName, true); stepperPL->setName(stepperName); auto sf = Teuchos::rcp(new StepperFactory()); - integrator->setStepper(sf->createStepper(stepperPL, model)); + integrator->setStepper(sf->createStepper(stepperPL)); } else { // Construct default Stepper - Teuchos::RCP > constModel = model; - integrator->setStepper( - createStepperForwardEuler(constModel, Teuchos::null)); + auto stepper = Teuchos::rcp(new StepperForwardEuler()); + integrator->setStepper(stepper); } // Set TimeStepControl @@ -591,23 +597,16 @@ Teuchos::RCP > createIntegratorBasic( integrator->setTimeStepControl(rcp(new TimeStepControl())); } - // Construct default IC state from the application model and TimeStepControl - auto newState = createSolutionStateME(integrator->getStepper()->getModel(), - integrator->getStepper()->getDefaultStepperState()); - newState->setTime 
(integrator->getTimeStepControl()->getInitTime()); - newState->setIndex (integrator->getTimeStepControl()->getInitIndex()); - newState->setTimeStep(integrator->getTimeStepControl()->getInitTimeStep()); - newState->setTolRel (integrator->getTimeStepControl()->getMaxRelError()); - newState->setTolAbs (integrator->getTimeStepControl()->getMaxAbsError()); - newState->setOrder (integrator->getStepper()->getOrder()); - newState->setSolutionStatus(Status::PASSED); // ICs are considered passing. - // Set SolutionHistory - auto shPL = Teuchos::sublist(integratorPL, "Solution History", true); - auto sh = createSolutionHistoryPL(shPL); - sh->addState(newState); - integrator->getStepper()->setInitialConditions(sh); - integrator->setSolutionHistory(sh); + if (integratorPL->isSublist("Solution History")) { + // Construct from Integrator ParameterList + auto shPL = Teuchos::sublist(integratorPL, "Solution History", true); + auto sh = createSolutionHistoryPL(shPL); + integrator->setSolutionHistory(sh); + } else { + // Construct default SolutionHistory + integrator->setSolutionHistory(createSolutionHistory()); + } // Set Observer to default. integrator->setObserver(Teuchos::null); @@ -635,6 +634,39 @@ Teuchos::RCP > createIntegratorBasic( auto vStepperPL = Teuchos::sublist(validPL, vStepperName, true); stepperPL->validateParametersAndSetDefaults(*vStepperPL); + return integrator; // integrator is not initialized (missing model and IC). 
+} + + +// Nonmember constructor +// ------------------------------------------------------------------------ +template +Teuchos::RCP > createIntegratorBasic( + Teuchos::RCP tempusPL, + const Teuchos::RCP >& model) +{ + auto integrator = createIntegratorBasic(tempusPL); + if ( model == Teuchos::null ) return integrator; + + Teuchos::RCP > constModel = model; + integrator->setModel(constModel); + + // Construct default IC state from the application model and TimeStepControl + auto newState = createSolutionStateME(integrator->getStepper()->getModel(), + integrator->getStepper()->getDefaultStepperState()); + newState->setTime (integrator->getTimeStepControl()->getInitTime()); + newState->setIndex (integrator->getTimeStepControl()->getInitIndex()); + newState->setTimeStep(integrator->getTimeStepControl()->getInitTimeStep()); + newState->setTolRel (integrator->getTimeStepControl()->getMaxRelError()); + newState->setTolAbs (integrator->getTimeStepControl()->getMaxAbsError()); + newState->setOrder (integrator->getStepper()->getOrder()); + newState->setSolutionStatus(Status::PASSED); // ICs are considered passing. 
+ + // Set SolutionHistory IC + auto sh = integrator->getNonConstSolutionHistory(); + sh->addState(newState); + integrator->getStepper()->setInitialConditions(sh); + integrator->initialize(); return integrator; diff --git a/packages/tempus/src/Tempus_IntegratorForwardSensitivity_decl.hpp b/packages/tempus/src/Tempus_IntegratorForwardSensitivity_decl.hpp index 199c9a07c0ab..c556e796a6dd 100644 --- a/packages/tempus/src/Tempus_IntegratorForwardSensitivity_decl.hpp +++ b/packages/tempus/src/Tempus_IntegratorForwardSensitivity_decl.hpp @@ -164,6 +164,9 @@ class IntegratorForwardSensitivity /// Get the SolutionHistory virtual Teuchos::RCP > getSolutionHistory() const override { return integrator_->getSolutionHistory(); } + /// Get the SolutionHistory + virtual Teuchos::RCP > getNonConstSolutionHistory() override + { return integrator_->getNonConstSolutionHistory(); } /// Set the SolutionHistory virtual void setSolutionHistory( Teuchos::RCP > sh = Teuchos::null) diff --git a/packages/tempus/src/Tempus_IntegratorForwardSensitivity_impl.hpp b/packages/tempus/src/Tempus_IntegratorForwardSensitivity_impl.hpp index c71273a2910e..7b8cb372d7ea 100644 --- a/packages/tempus/src/Tempus_IntegratorForwardSensitivity_impl.hpp +++ b/packages/tempus/src/Tempus_IntegratorForwardSensitivity_impl.hpp @@ -242,13 +242,15 @@ template void IntegratorForwardSensitivity:: describe( - Teuchos::FancyOStream &in_out, + Teuchos::FancyOStream &out, const Teuchos::EVerbosityLevel verbLevel) const { - auto out = Teuchos::fancyOStream( in_out.getOStream() ); - out->setOutputToRootOnly(0); - *out << description() << "::describe" << std::endl; - integrator_->describe(in_out, verbLevel); + auto l_out = Teuchos::fancyOStream( out.getOStream() ); + Teuchos::OSTab ostab(*l_out, 2, this->description()); + l_out->setOutputToRootOnly(0); + + *l_out << description() << "::describe" << std::endl; + integrator_->describe(*l_out, verbLevel); } template diff --git 
a/packages/tempus/src/Tempus_IntegratorObserverBasic_impl.hpp b/packages/tempus/src/Tempus_IntegratorObserverBasic_impl.hpp index 4472e3285e40..fc8e559d7e44 100644 --- a/packages/tempus/src/Tempus_IntegratorObserverBasic_impl.hpp +++ b/packages/tempus/src/Tempus_IntegratorObserverBasic_impl.hpp @@ -76,7 +76,7 @@ observeEndTimeStep(const Integrator& integrator){ const Teuchos::RCP out = integrator.getOStream(); out->setOutputToRootOnly(0); - Teuchos::OSTab ostab(out,0,"ScreenOutput"); + Teuchos::OSTab ostab(out, 0, "ScreenOutput"); *out<getIndex() <getTime() diff --git a/packages/tempus/src/Tempus_IntegratorObserverSubcycling_impl.hpp b/packages/tempus/src/Tempus_IntegratorObserverSubcycling_impl.hpp index 287913a77fdf..87060ddac28b 100644 --- a/packages/tempus/src/Tempus_IntegratorObserverSubcycling_impl.hpp +++ b/packages/tempus/src/Tempus_IntegratorObserverSubcycling_impl.hpp @@ -25,7 +25,7 @@ observeStartIntegrator(const Integrator& integrator){ const Teuchos::RCP out = integrator.getOStream(); out->setOutputToRootOnly(0); - Teuchos::OSTab ostab(out,0,"ScreenOutput"); + Teuchos::OSTab ostab(out, 0, "ScreenOutput"); *out << "\n Begin Subcycling -------------------------------------------------------\n"; // << " Step Time dt Abs Error Rel Error Order nFail dCompTime" // << std::endl; @@ -68,7 +68,7 @@ observeEndTimeStep(const Integrator& integrator){ const Teuchos::RCP out = integrator.getOStream(); out->setOutputToRootOnly(0); - Teuchos::OSTab ostab(out,0,"ScreenOutput"); + Teuchos::OSTab ostab(out, 0, "ScreenOutput"); *out<getIndex() <getTime() @@ -89,7 +89,7 @@ observeEndIntegrator(const Integrator& integrator){ const Teuchos::RCP out = integrator.getOStream(); out->setOutputToRootOnly(0); - Teuchos::OSTab ostab(out,0,"ScreenOutput"); + Teuchos::OSTab ostab(out, 0, "ScreenOutput"); *out << " End Subcycling ---------------------------------------------------------\n\n"; } diff --git 
a/packages/tempus/src/Tempus_IntegratorPseudoTransientAdjointSensitivity_decl.hpp b/packages/tempus/src/Tempus_IntegratorPseudoTransientAdjointSensitivity_decl.hpp index 187d37960096..ec2430a47cbf 100644 --- a/packages/tempus/src/Tempus_IntegratorPseudoTransientAdjointSensitivity_decl.hpp +++ b/packages/tempus/src/Tempus_IntegratorPseudoTransientAdjointSensitivity_decl.hpp @@ -109,6 +109,8 @@ class IntegratorPseudoTransientAdjointSensitivity virtual void setTempusParameterList(Teuchos::RCP pl) override; /// Get the SolutionHistory virtual Teuchos::RCP > getSolutionHistory() const override; + /// Get the SolutionHistory + virtual Teuchos::RCP > getNonConstSolutionHistory() override; /// Get the TimeStepControl virtual Teuchos::RCP > getTimeStepControl() const override; virtual Teuchos::RCP > getNonConstTimeStepControl() override; diff --git a/packages/tempus/src/Tempus_IntegratorPseudoTransientAdjointSensitivity_impl.hpp b/packages/tempus/src/Tempus_IntegratorPseudoTransientAdjointSensitivity_impl.hpp index ca347ab8981d..0867093373e6 100644 --- a/packages/tempus/src/Tempus_IntegratorPseudoTransientAdjointSensitivity_impl.hpp +++ b/packages/tempus/src/Tempus_IntegratorPseudoTransientAdjointSensitivity_impl.hpp @@ -163,6 +163,14 @@ getSolutionHistory() const return solutionHistory_; } +template +Teuchos::RCP > +IntegratorPseudoTransientAdjointSensitivity:: +getNonConstSolutionHistory() +{ + return solutionHistory_; +} + template Teuchos::RCP > IntegratorPseudoTransientAdjointSensitivity:: @@ -271,13 +279,16 @@ template void IntegratorPseudoTransientAdjointSensitivity:: describe( - Teuchos::FancyOStream &in_out, + Teuchos::FancyOStream &out, const Teuchos::EVerbosityLevel verbLevel) const { - auto out = Teuchos::fancyOStream( in_out.getOStream() ); - *out << description() << "::describe" << std::endl; - state_integrator_->describe(*out, verbLevel); - sens_integrator_->describe(*out, verbLevel); + auto l_out = Teuchos::fancyOStream( out.getOStream() ); + Teuchos::OSTab 
ostab(*l_out, 2, this->description()); + l_out->setOutputToRootOnly(0); + + *l_out << description() << "::describe" << std::endl; + state_integrator_->describe(*l_out, verbLevel); + sens_integrator_->describe(*l_out, verbLevel); } template diff --git a/packages/tempus/src/Tempus_IntegratorPseudoTransientForwardSensitivity_decl.hpp b/packages/tempus/src/Tempus_IntegratorPseudoTransientForwardSensitivity_decl.hpp index 0bfeb3cff6e4..01dfd1996e73 100644 --- a/packages/tempus/src/Tempus_IntegratorPseudoTransientForwardSensitivity_decl.hpp +++ b/packages/tempus/src/Tempus_IntegratorPseudoTransientForwardSensitivity_decl.hpp @@ -123,6 +123,8 @@ class IntegratorPseudoTransientForwardSensitivity virtual void setTempusParameterList(Teuchos::RCP pl) override; /// Get the SolutionHistory virtual Teuchos::RCP > getSolutionHistory() const override; + /// Get the SolutionHistory + virtual Teuchos::RCP > getNonConstSolutionHistory() override; /// Get the TimeStepControl virtual Teuchos::RCP > getTimeStepControl() const override; virtual Teuchos::RCP > getNonConstTimeStepControl() override; diff --git a/packages/tempus/src/Tempus_IntegratorPseudoTransientForwardSensitivity_impl.hpp b/packages/tempus/src/Tempus_IntegratorPseudoTransientForwardSensitivity_impl.hpp index 0e7d8496befa..142c007f06ac 100644 --- a/packages/tempus/src/Tempus_IntegratorPseudoTransientForwardSensitivity_impl.hpp +++ b/packages/tempus/src/Tempus_IntegratorPseudoTransientForwardSensitivity_impl.hpp @@ -176,6 +176,14 @@ getSolutionHistory() const return solutionHistory_; } +template +Teuchos::RCP > +IntegratorPseudoTransientForwardSensitivity:: +getNonConstSolutionHistory() +{ + return solutionHistory_; +} + template Teuchos::RCP > IntegratorPseudoTransientForwardSensitivity:: @@ -343,14 +351,16 @@ template void IntegratorPseudoTransientForwardSensitivity:: describe( - Teuchos::FancyOStream &in_out, + Teuchos::FancyOStream &out, const Teuchos::EVerbosityLevel verbLevel) const { - auto out = 
Teuchos::fancyOStream( in_out.getOStream() ); - out->setOutputToRootOnly(0); - *out << description() << "::describe" << std::endl; - state_integrator_->describe(in_out, verbLevel); - sens_integrator_->describe(in_out, verbLevel); + auto l_out = Teuchos::fancyOStream( out.getOStream() ); + Teuchos::OSTab ostab(*l_out, 2, this->description()); + l_out->setOutputToRootOnly(0); + + *l_out << description() << "::describe" << std::endl; + state_integrator_->describe(*l_out, verbLevel); + sens_integrator_->describe(*l_out, verbLevel); } template diff --git a/packages/tempus/src/Tempus_InterpolatorLagrange_decl.hpp b/packages/tempus/src/Tempus_InterpolatorLagrange_decl.hpp index 11ec213225f7..41bfec9f8223 100644 --- a/packages/tempus/src/Tempus_InterpolatorLagrange_decl.hpp +++ b/packages/tempus/src/Tempus_InterpolatorLagrange_decl.hpp @@ -48,7 +48,10 @@ class InterpolatorLagrange : virtual public Interpolator std::string description() const { return "Tempus::InterpolatorLagrange"; } void describe(Teuchos::FancyOStream &out, const Teuchos::EVerbosityLevel /* verbLevel */) const - { out << description() << "::describe" << std::endl; } + { + out.setOutputToRootOnly(0); + out << description() << "::describe" << std::endl; + } //@} /// \name Overridden from Teuchos::ParameterListAcceptor diff --git a/packages/tempus/src/Tempus_PhysicsState_impl.hpp b/packages/tempus/src/Tempus_PhysicsState_impl.hpp index 19da288f686e..5a06fbd1b054 100644 --- a/packages/tempus/src/Tempus_PhysicsState_impl.hpp +++ b/packages/tempus/src/Tempus_PhysicsState_impl.hpp @@ -55,18 +55,19 @@ void PhysicsState::setName(std::string pN) template std::string PhysicsState::description() const { - return physicsName_; + return "Tempus::PhysicsState - '" + physicsName_ + "'"; } template void PhysicsState::describe( - Teuchos::FancyOStream & in_out, + Teuchos::FancyOStream & out, const Teuchos::EVerbosityLevel /* verbLevel */) const { - auto out = Teuchos::fancyOStream( in_out.getOStream() ); - 
out->setOutputToRootOnly(0); - *out << description() << "::describe" << std::endl - << " physicsName = " << physicsName_ << std::endl; + auto l_out = Teuchos::fancyOStream( out.getOStream() ); + Teuchos::OSTab ostab(*l_out, 2, this->description()); + l_out->setOutputToRootOnly(0); + + *l_out << "\n--- " << this->description() << " ---" << std::endl; } diff --git a/packages/tempus/src/Tempus_RKButcherTableau.hpp b/packages/tempus/src/Tempus_RKButcherTableau.hpp index e89578b822a5..c8dc901fcfc6 100644 --- a/packages/tempus/src/Tempus_RKButcherTableau.hpp +++ b/packages/tempus/src/Tempus_RKButcherTableau.hpp @@ -158,6 +158,8 @@ class RKButcherTableau : virtual void describe( Teuchos::FancyOStream &out, const Teuchos::EVerbosityLevel verbLevel) const { + out.setOutputToRootOnly(0); + if (verbLevel != Teuchos::VERB_NONE) { out << this->description() << std::endl; out << "number of Stages = " << this->numStages() << std::endl; diff --git a/packages/tempus/src/Tempus_SolutionHistory.cpp b/packages/tempus/src/Tempus_SolutionHistory.cpp index 47a554bc684f..7bb6e1e3b6ab 100644 --- a/packages/tempus/src/Tempus_SolutionHistory.cpp +++ b/packages/tempus/src/Tempus_SolutionHistory.cpp @@ -16,10 +16,13 @@ namespace Tempus { TEMPUS_INSTANTIATE_TEMPLATE_CLASS(SolutionHistory) + // Nonmember constructor + template Teuchos::RCP > + createSolutionHistory(); + // Nonmember constructor from a ParameterList template Teuchos::RCP > - createSolutionHistoryPL( - Teuchos::RCP pList); + createSolutionHistoryPL(Teuchos::RCP pList); // Nonmember contructor from a SolutionState. 
template Teuchos::RCP > diff --git a/packages/tempus/src/Tempus_SolutionHistory_decl.hpp b/packages/tempus/src/Tempus_SolutionHistory_decl.hpp index b4fa8f88100f..bc911aa2cf0f 100644 --- a/packages/tempus/src/Tempus_SolutionHistory_decl.hpp +++ b/packages/tempus/src/Tempus_SolutionHistory_decl.hpp @@ -347,6 +347,11 @@ class SolutionHistory }; +/// Nonmember constructor +template +Teuchos::RCP > +createSolutionHistory(); + /// Nonmember constructor from a ParameterList template Teuchos::RCP > diff --git a/packages/tempus/src/Tempus_SolutionHistory_impl.hpp b/packages/tempus/src/Tempus_SolutionHistory_impl.hpp index d6929af6ecb5..fcb3693627bf 100644 --- a/packages/tempus/src/Tempus_SolutionHistory_impl.hpp +++ b/packages/tempus/src/Tempus_SolutionHistory_impl.hpp @@ -158,10 +158,10 @@ void SolutionHistory::removeState( if (state->getTime() == (*state_it)->getTime()) break; } - TEUCHOS_TEST_FOR_EXCEPTION(state_it == history_->rend(), std::logic_error, + TEUCHOS_TEST_FOR_EXCEPTION( + state_it == history_->rend(), std::logic_error, "Error - removeState() Could not remove state = " - // << state_it->describe() - ); + << (*state_it)->description()); // Need to be careful when erasing a reverse iterator. history_->erase(std::next(state_it).base()); @@ -394,10 +394,10 @@ SolutionHistory::getStateTimeIndexN(bool warn) const const int m = history_->size(); if ( m < 1 ) { if ( warn ) { - Teuchos::RCP out = this->getOStream(); - Teuchos::OSTab ostab(out,1,"SolutionHistory::getStateTimeIndexN"); - *out << "Warning - getStateTimeIndexN() No states in SolutionHistory!" - << std::endl; + Teuchos::RCP out = this->getOStream(); + Teuchos::OSTab ostab(out,1,"SolutionHistory::getStateTimeIndexN"); + *out << "Warning - getStateTimeIndexN() No states in SolutionHistory!" 
+ << std::endl; } } else { state = (*history_)[m-1]; @@ -514,7 +514,7 @@ SolutionHistory::getStateTimeIndex(int index, bool warn) const template std::string SolutionHistory::description() const { - return ("Tempus::SolutionHistory - name = '" + name_ + "'"); + return ("Tempus::SolutionHistory - '" + name_ + "'"); } @@ -523,24 +523,30 @@ void SolutionHistory::describe( Teuchos::FancyOStream &out, const Teuchos::EVerbosityLevel verbLevel) const { + auto l_out = Teuchos::fancyOStream( out.getOStream() ); + Teuchos::OSTab ostab(*l_out, 2, this->description()); + l_out->setOutputToRootOnly(0); + + *l_out << "\n--- " << this->description() << " ---" << std::endl; + if ((Teuchos::as(verbLevel)==Teuchos::as(Teuchos::VERB_DEFAULT)) || (Teuchos::as(verbLevel)>=Teuchos::as(Teuchos::VERB_LOW) ) ){ - out << description() << std::endl; - //out << "interpolator = " << interpolator->description() << std::endl; - out << "storageLimit = " << storageLimit_ << std::endl; - out << "storageType = " << getStorageTypeString() << std::endl; - out << "number of states = " << history_->size() << std::endl; - out << "time range = (" << history_->front()->getTime() << ", " - << history_->back()->getTime() << ")" - << std::endl; + //*l_out << " interpolator = " << interpolator->description() << std::endl; + *l_out << " storageLimit = " << storageLimit_ << std::endl; + *l_out << " storageType = " << getStorageTypeString() << std::endl; + *l_out << " number of states = " << history_->size() << std::endl; + if ( history_->size() > 0 ) { + *l_out<<" time range = (" << history_->front()->getTime() << ", " + << history_->back()->getTime() << ")" + << std::endl; + } } if (Teuchos::as(verbLevel) >= Teuchos::as(Teuchos::VERB_MEDIUM)) { - for (int i=0; i<(int)history_->size() ; ++i) { - out << "SolutionState[" << i << "] -- "; - (*history_)[i]->describe(out, verbLevel); - } + for (int i=0; i<(int)history_->size() ; ++i) + (*history_)[i]->describe(*l_out, verbLevel); } + *l_out << 
std::string(this->description().length()+8, '-') << std::endl; } @@ -682,11 +688,22 @@ void SolutionHistory::initialize() const // Nonmember constructors. // ------------------------------------------------------------------------ +template +Teuchos::RCP > createSolutionHistory() +{ + auto sh = rcp(new SolutionHistory()); + sh->setName("From createSolutionHistory"); + + return sh; +} + + template Teuchos::RCP > createSolutionHistoryPL( Teuchos::RCP pl) { auto sh = rcp(new SolutionHistory()); + sh->setName("From createSolutionHistoryPL"); if (pl == Teuchos::null) return sh; // Return default SolutionHistory. diff --git a/packages/tempus/src/Tempus_SolutionStateMetaData_impl.hpp b/packages/tempus/src/Tempus_SolutionStateMetaData_impl.hpp index 71ee89134856..22c8b487ec62 100644 --- a/packages/tempus/src/Tempus_SolutionStateMetaData_impl.hpp +++ b/packages/tempus/src/Tempus_SolutionStateMetaData_impl.hpp @@ -181,32 +181,36 @@ void SolutionStateMetaData::describe( Teuchos::FancyOStream &out, const Teuchos::EVerbosityLevel verbLevel) const { - if (verbLevel == Teuchos::VERB_EXTREME) { - auto l_out = Teuchos::fancyOStream( out.getOStream() ); - l_out->setOutputToRootOnly(0); - *l_out << description() << "::describe:" << std::endl - << "time = " << time_ << std::endl - << "iStep = " << iStep_ << std::endl - << "dt = " << dt_ << std::endl - << "errorAbs = " << errorAbs_ << std::endl - << "errorRel = " << errorRel_ << std::endl - << "order = " << order_ << std::endl - << "nFailures = " << nFailures_ << std::endl - << "nRunningFailures = " << nRunningFailures_<< std::endl - << "nConsecutiveFailures = " << nConsecutiveFailures_ << std::endl - << "tolRel = " << tolRel_ << std::endl - << "tolAbs = " << tolAbs_ << std::endl - << "xNormL2 = " << xNormL2_ << std::endl - << "dxNormL2Rel = " << dxNormL2Rel_ << std::endl - << "dxNormL2Abs = " << dxNormL2Abs_ << std::endl - << "computeNorms = " << computeNorms_ << std::endl - << "solutionStatus = " << toString(solutionStatus_) << 
std::endl - << "output = " << output_ << std::endl - << "outputScreen = " << outputScreen_ << std::endl - << "isSynced = " << isSynced_ << std::endl - << "isInterpolated = " << isInterpolated_ << std::endl - << "accuracy = " << accuracy_ << std::endl; + auto l_out = Teuchos::fancyOStream( out.getOStream() ); + Teuchos::OSTab ostab(*l_out, 2, this->description()); + l_out->setOutputToRootOnly(0); + + *l_out << "\n--- " << this->description() << " ---" <= Teuchos::VERB_MEDIUM) { + *l_out << " time = " << time_ << std::endl + << " iStep = " << iStep_ << std::endl + << " dt = " << dt_ << std::endl + << " errorAbs = " << errorAbs_ << std::endl + << " errorRel = " << errorRel_ << std::endl + << " order = " << order_ << std::endl + << " nFailures = " << nFailures_ << std::endl + << " nRunningFailures = " << nRunningFailures_<< std::endl + << " nConsecutiveFailures = " << nConsecutiveFailures_ << std::endl + << " tolRel = " << tolRel_ << std::endl + << " tolAbs = " << tolAbs_ << std::endl + << " xNormL2 = " << xNormL2_ << std::endl + << " dxNormL2Rel = " << dxNormL2Rel_ << std::endl + << " dxNormL2Abs = " << dxNormL2Abs_ << std::endl + << " computeNorms = " << computeNorms_ << std::endl + << " solutionStatus = " << toString(solutionStatus_) << std::endl + << " output = " << output_ << std::endl + << " outputScreen = " << outputScreen_ << std::endl + << " isSynced = " << isSynced_ << std::endl + << " isInterpolated = " << isInterpolated_ << std::endl + << " accuracy = " << accuracy_ << std::endl; } + *l_out << std::string(this->description().length()+8, '-') <::operator== (const Scalar& t) const template std::string SolutionState::description() const { - std::string name = "Tempus::SolutionState"; - return (name); + std::ostringstream out; + out << "SolutionState" + << " (index =" <getIndex() + << "; time =" <getTime() + << "; dt =" <getTimeStep() + << ")"; + return out.str(); } template @@ -408,35 +413,33 @@ void SolutionState::describe( Teuchos::FancyOStream &out, const 
Teuchos::EVerbosityLevel verbLevel) const { - if (verbLevel == Teuchos::VERB_MEDIUM) { - out << "(index =" <getIndex() - << "; time =" <getTime() - << "; dt =" <getTimeStep() - << ")" << std::endl; - } + auto l_out = Teuchos::fancyOStream( out.getOStream() ); + Teuchos::OSTab ostab(*l_out, 2, this->description()); + l_out->setOutputToRootOnly(0); + + *l_out << "\n--- " << this->description() << " ---" << std::endl; + + if (Teuchos::as(verbLevel) >= Teuchos::as(Teuchos::VERB_EXTREME)) { + + metaData_->describe(*l_out,verbLevel); + *l_out << " x = " << std::endl; + x_->describe(*l_out,verbLevel); - if (verbLevel == Teuchos::VERB_EXTREME) { - out << description() << "::describe:" << std::endl - << "metaData = " << std::endl; - metaData_->describe(out,verbLevel); - out << "x = " << std::endl; - x_->describe(out,verbLevel); if (xdot_ != Teuchos::null) { - out << "xdot_ = " << std::endl; - xdot_->describe(out,verbLevel); + *l_out << " xdot_ = " << std::endl; + xdot_->describe(*l_out,verbLevel); } if (xdotdot_ != Teuchos::null) { - out << "xdotdot = " << std::endl; - xdotdot_->describe(out,verbLevel); - } - if (stepperState_ != Teuchos::null) { - out << "stepperState = " << std::endl; - stepperState_->describe(out,verbLevel); - } - if (physicsState_ != Teuchos::null) { - out << "physicsState = " << std::endl; - physicsState_->describe(out,verbLevel); + *l_out << " xdotdot = " << std::endl; + xdotdot_->describe(*l_out,verbLevel); } + + if (stepperState_ != Teuchos::null) + stepperState_->describe(*l_out,verbLevel); + if (physicsState_ != Teuchos::null) + physicsState_->describe(*l_out,verbLevel); + + *l_out << std::string(this->description().length()+8, '-') <::setModel( const Teuchos::RCP >& appModel) { StepperImplicit::setModel(appModel); + // If the startUpStepper's model is not set, set it to the stepper model. 
if (startUpStepper_->getModel() == Teuchos::null) { startUpStepper_->setModel(appModel); startUpStepper_->initialize(); @@ -275,6 +276,7 @@ void StepperBDF2::describe( const Teuchos::EVerbosityLevel verbLevel ) const { auto l_out = Teuchos::fancyOStream( out.getOStream() ); + Teuchos::OSTab ostab(*l_out, 2, this->description()); l_out->setOutputToRootOnly(0); *l_out << std::endl; Stepper::describe(out, verbLevel); @@ -340,13 +342,13 @@ createStepperBDF2( auto stepper = Teuchos::rcp(new StepperBDF2()); stepper->setStepperImplicitValues(pl); + std::string startUpStepperName = "DIRK 1 Stage Theta Method"; + if (pl != Teuchos::null) startUpStepperName = + pl->get("Start Up Stepper Type", startUpStepperName); + stepper->setStartUpStepper(startUpStepperName); + if (model != Teuchos::null) { stepper->setModel(model); - - std::string startUpStepperName = "DIRK 1 Stage Theta Method"; - if (pl != Teuchos::null) startUpStepperName = - pl->get("Start Up Stepper Type", startUpStepperName); - stepper->setStartUpStepper(startUpStepperName); stepper->initialize(); } diff --git a/packages/tempus/src/Tempus_StepperBackwardEuler_decl.hpp b/packages/tempus/src/Tempus_StepperBackwardEuler_decl.hpp index ef8b500f3aec..fdb3cfa8e0f9 100644 --- a/packages/tempus/src/Tempus_StepperBackwardEuler_decl.hpp +++ b/packages/tempus/src/Tempus_StepperBackwardEuler_decl.hpp @@ -108,76 +108,81 @@ class StepperBackwardEuler : void setPredictor(std::string predictorType = "None"); void setPredictor(Teuchos::RCP > predictorStepper); + /// Set the model + virtual void setModel( + const Teuchos::RCP >& appModel) override; + /// Set the initial conditions and make them consistent. virtual void setInitialConditions ( - const Teuchos::RCP >& solutionHistory); + const Teuchos::RCP >& solutionHistory) override; /// Take the specified timestep, dt, and return true if successful. 
virtual void takeStep( - const Teuchos::RCP >& solutionHistory); + const Teuchos::RCP >& solutionHistory) override; /// Get a default (initial) StepperState - virtual Teuchos::RCP > getDefaultStepperState(); - virtual Scalar getOrder() const {return 1.0;} - virtual Scalar getOrderMin() const {return 1.0;} - virtual Scalar getOrderMax() const {return 1.0;} - - virtual bool isExplicit() const {return false;} - virtual bool isImplicit() const {return true;} - virtual bool isExplicitImplicit() const + virtual Teuchos::RCP > getDefaultStepperState() override; + virtual Scalar getOrder() const override {return 1.0;} + virtual Scalar getOrderMin() const override {return 1.0;} + virtual Scalar getOrderMax() const override {return 1.0;} + + virtual bool isExplicit() const override {return false;} + virtual bool isImplicit() const override {return true;} + virtual bool isExplicitImplicit() const override {return isExplicit() && isImplicit();} - virtual bool isOneStepMethod() const {return true;} - virtual bool isMultiStepMethod() const {return !isOneStepMethod();} - virtual OrderODE getOrderODE() const {return FIRST_ORDER_ODE;} + virtual bool isOneStepMethod() const override {return true;} + virtual bool isMultiStepMethod() const override {return !isOneStepMethod();} + virtual OrderODE getOrderODE() const override {return FIRST_ORDER_ODE;} //@} /// Return alpha = d(xDot)/dx. - virtual Scalar getAlpha(const Scalar dt) const { return Scalar(1.0)/dt; } + virtual Scalar getAlpha(const Scalar dt) const override { return Scalar(1.0)/dt; } /// Return beta = d(x)/dx. - virtual Scalar getBeta (const Scalar ) const { return Scalar(1.0); } + virtual Scalar getBeta (const Scalar ) const override { return Scalar(1.0); } /// Compute predictor given the supplied stepper virtual void computePredictor( const Teuchos::RCP >& solutionHistory); - Teuchos::RCP getValidParameters() const; + /// Return a valid ParameterList with current settings. 
+ Teuchos::RCP getValidParameters() const override; /// \name Overridden from Teuchos::Describable //@{ virtual void describe(Teuchos::FancyOStream & out, - const Teuchos::EVerbosityLevel verbLevel) const; + const Teuchos::EVerbosityLevel verbLevel) const override; //@} - virtual bool isValidSetup(Teuchos::FancyOStream & out) const; + virtual bool isValidSetup(Teuchos::FancyOStream & out) const override; /// \name Implementation of StepperOptimizationInterface //@{ - virtual int stencilLength() const; + virtual int stencilLength() const override; virtual void computeStepResidual( Thyra::VectorBase& residual, const Teuchos::Array< Teuchos::RCP > >& x, const Teuchos::Array& t, const Thyra::VectorBase& p, - const int param_index) const; + const int param_index) const override; virtual void computeStepJacobian( Thyra::LinearOpBase& jacobian, const Teuchos::Array< Teuchos::RCP > >& x, const Teuchos::Array& t, const Thyra::VectorBase& p, const int param_index, - const int deriv_index) const; + const int deriv_index) const override; virtual void computeStepParamDeriv( Thyra::LinearOpBase& deriv, const Teuchos::Array< Teuchos::RCP > >& x, const Teuchos::Array& t, const Thyra::VectorBase& p, - const int param_index) const; + const int param_index) const override; virtual void computeStepSolver( Thyra::LinearOpWithSolveBase& jacobian_solver, const Teuchos::Array< Teuchos::RCP > >& x, const Teuchos::Array& t, const Thyra::VectorBase& p, - const int param_index) const; + const int param_index) const override; //@} private: diff --git a/packages/tempus/src/Tempus_StepperBackwardEuler_impl.hpp b/packages/tempus/src/Tempus_StepperBackwardEuler_impl.hpp index b1dd127a2fd0..a9f5f792bb2e 100644 --- a/packages/tempus/src/Tempus_StepperBackwardEuler_impl.hpp +++ b/packages/tempus/src/Tempus_StepperBackwardEuler_impl.hpp @@ -120,6 +120,24 @@ void StepperBackwardEuler::setAppAction( } +template +void StepperBackwardEuler::setModel( + const Teuchos::RCP >& appModel) +{ + 
StepperImplicit::setModel(appModel); + + if (predictorStepper_ != Teuchos::null) { + // If predictor's model is not set, set it to the stepper model. + if (predictorStepper_->getModel() == Teuchos::null) { + predictorStepper_->setModel(appModel); + predictorStepper_->initialize(); + } + } + + this->isInitialized_ = false; +} + + template void StepperBackwardEuler::setInitialConditions( const Teuchos::RCP >& solutionHistory) @@ -247,6 +265,7 @@ void StepperBackwardEuler::describe( Teuchos::FancyOStream &out, const Teuchos::EVerbosityLevel verbLevel) const { + out.setOutputToRootOnly(0); out << std::endl; Stepper::describe(out, verbLevel); StepperImplicit::describe(out, verbLevel); @@ -269,6 +288,7 @@ void StepperBackwardEuler::describe( template bool StepperBackwardEuler::isValidSetup(Teuchos::FancyOStream & out) const { + out.setOutputToRootOnly(0); bool isValidSetup = true; if ( !Stepper::isValidSetup(out) ) isValidSetup = false; @@ -438,14 +458,14 @@ createStepperBackwardEuler( stepper->setStepperImplicitValues(pl); + if (pl != Teuchos::null) { + std::string predictorName = + pl->get("Predictor Stepper Type", "None"); + stepper->setPredictor(predictorName); + } + if (model != Teuchos::null) { stepper->setModel(model); - - if (pl != Teuchos::null) { - std::string predictorName = - pl->get("Predictor Stepper Type", "None"); - stepper->setPredictor(predictorName); - } stepper->initialize(); } diff --git a/packages/tempus/src/Tempus_StepperDIRK_decl.hpp b/packages/tempus/src/Tempus_StepperDIRK_decl.hpp index d3098d4a7d85..6658a66c789e 100644 --- a/packages/tempus/src/Tempus_StepperDIRK_decl.hpp +++ b/packages/tempus/src/Tempus_StepperDIRK_decl.hpp @@ -161,11 +161,15 @@ class StepperDIRK : virtual public Tempus::StepperImplicit, /// \name Basic stepper methods //@{ /// Initialize after construction and changing input parameters. 
- virtual void initialize(); + virtual void initialize() override; + + /// Set the model + virtual void setModel( + const Teuchos::RCP >& appModel) override; /// Set the initial conditions and make them consistent. virtual void setInitialConditions ( - const Teuchos::RCP >& solutionHistory); + const Teuchos::RCP >& solutionHistory) override; /// Set parameter so that the initial guess is reset at the beginning of each timestep. virtual void setResetInitialGuess(bool reset_guess) @@ -175,26 +179,26 @@ class StepperDIRK : virtual public Tempus::StepperImplicit, /// Take the specified timestep, dt, and return true if successful. virtual void takeStep( - const Teuchos::RCP >& solutionHistory); + const Teuchos::RCP >& solutionHistory) override; /// Get a default (initial) StepperState - virtual Teuchos::RCP >getDefaultStepperState(); + virtual Teuchos::RCP >getDefaultStepperState() override; - virtual bool isExplicit() const + virtual bool isExplicit() const override { const int numStages = this->tableau_->numStages(); Teuchos::SerialDenseMatrix A = this->tableau_->A(); bool isExplicit = false; for (int i=0; itableau_->isDIRK(); } - virtual bool isImplicit() const {return true;} - virtual bool isExplicitImplicit() const + virtual bool isImplicit() const override {return true;} + virtual bool isExplicitImplicit() const override {return isExplicit() && isImplicit();} - virtual bool isOneStepMethod() const {return true;} - virtual bool isMultiStepMethod() const {return !isOneStepMethod();} + virtual bool isOneStepMethod() const override {return true;} + virtual bool isMultiStepMethod() const override {return !isOneStepMethod();} - virtual OrderODE getOrderODE() const {return FIRST_ORDER_ODE;} + virtual OrderODE getOrderODE() const override {return FIRST_ORDER_ODE;} virtual std::string getDescription() const = 0; //@} @@ -203,25 +207,38 @@ class StepperDIRK : virtual public Tempus::StepperImplicit, Teuchos::RCP >& getXTilde() {return xTilde_;} /// Return alpha = d(xDot)/dx. 
- virtual Scalar getAlpha(const Scalar dt) const + virtual Scalar getAlpha(const Scalar dt) const override { + const int numStages = this->tableau_->numStages(); const Teuchos::SerialDenseMatrix & A=this->tableau_->A(); - return Scalar(1.0)/(dt*A(0,0)); // Getting the first diagonal coeff! + Scalar aii = A(0,0); + for (int i=0; i::infinity() : Scalar(1.0)/(dt*aii); } /// Return beta = d(x)/dx. - virtual Scalar getBeta (const Scalar ) const { return Scalar(1.0); } + virtual Scalar getBeta (const Scalar ) const override { return Scalar(1.0); } + + /// Return alpha = d(xDot)/dx for stage i. + virtual Scalar getAlpha(const Scalar dt, int i) const + { + const Teuchos::SerialDenseMatrix & A=this->tableau_->A(); + return (A(i,i) == 0.0) ? std::numeric_limits::infinity() : Scalar(1.0)/(dt*A(i,i)); + } - virtual Teuchos::RCP getValidParameters() const; + virtual Teuchos::RCP getValidParameters() const override; Teuchos::RCP getValidParametersBasicDIRK() const; /// \name Overridden from Teuchos::Describable //@{ virtual void describe(Teuchos::FancyOStream & out, - const Teuchos::EVerbosityLevel verbLevel) const; + const Teuchos::EVerbosityLevel verbLevel) const override; //@} - virtual bool isValidSetup(Teuchos::FancyOStream & out) const; + virtual bool isValidSetup(Teuchos::FancyOStream & out) const override; /// Set StepperDIRK member data from the ParameterList. 
virtual void setStepperDIRKValues(Teuchos::RCP pl) @@ -258,6 +275,9 @@ class StepperDIRK : virtual public Tempus::StepperImplicit, virtual void setupTableau() = 0; + virtual void setEmbeddedMemory() override; + + std::vector > > stageXDot_; Teuchos::RCP > xTilde_; diff --git a/packages/tempus/src/Tempus_StepperDIRK_impl.hpp b/packages/tempus/src/Tempus_StepperDIRK_impl.hpp index b9265ce89aa6..24656daf38e6 100644 --- a/packages/tempus/src/Tempus_StepperDIRK_impl.hpp +++ b/packages/tempus/src/Tempus_StepperDIRK_impl.hpp @@ -76,24 +76,59 @@ StepperDIRK::getValidParametersBasicDIRK() const template void StepperDIRK::initialize() { - // Initialize the stage vectors + TEUCHOS_TEST_FOR_EXCEPTION( + this->tableau_ == Teuchos::null, std::logic_error, + "Error - Need to set the tableau, before calling " + "StepperDIRK::initialize()\n"); + + TEUCHOS_TEST_FOR_EXCEPTION( + this->wrapperModel_==Teuchos::null, std::logic_error, + "Error - Need to set the model, setModel(), before calling " + "StepperDIRK::initialize()\n"); + + StepperImplicit::initialize(); +} + + +template +void StepperDIRK::setModel( + const Teuchos::RCP >& appModel) +{ + StepperImplicit::setModel(appModel); + + // Set the stage vectors const int numStages = this->tableau_->numStages(); stageXDot_.resize(numStages); for (int i=0; iwrapperModel_->get_f_space()); assign(stageXDot_[i].ptr(), Teuchos::ScalarTraits::zero()); } - xTilde_ = Thyra::createMember(this->wrapperModel_->get_x_space()); - assign(xTilde_.ptr(), Teuchos::ScalarTraits::zero()); + xTilde_ = Thyra::createMember(this->wrapperModel_->get_x_space()); + assign(xTilde_.ptr(), Teuchos::ScalarTraits::zero()); + + this->setEmbeddedMemory(); + + this->isInitialized_ = false; +} + + +template +void StepperDIRK::setEmbeddedMemory() +{ + if (this->getModel() == Teuchos::null) + return; // Embedded memory will be set when setModel() is called. 
if (this->tableau_->isEmbedded() && this->getUseEmbedded()) { this->ee_ = Thyra::createMember(this->wrapperModel_->get_f_space()); this->abs_u0 = Thyra::createMember(this->wrapperModel_->get_f_space()); this->abs_u = Thyra::createMember(this->wrapperModel_->get_f_space()); this->sc = Thyra::createMember(this->wrapperModel_->get_f_space()); + } else { + this->ee_ = Teuchos::null; + this->abs_u0 = Teuchos::null; + this->abs_u = Teuchos::null; + this->sc = Teuchos::null; } - - StepperImplicit::initialize(); } @@ -302,6 +337,7 @@ void StepperDIRK::describe( Teuchos::FancyOStream &out, const Teuchos::EVerbosityLevel verbLevel) const { + out.setOutputToRootOnly(0); out << std::endl; Stepper::describe(out, verbLevel); StepperImplicit::describe(out, verbLevel); @@ -328,6 +364,7 @@ void StepperDIRK::describe( template bool StepperDIRK::isValidSetup(Teuchos::FancyOStream & out) const { + out.setOutputToRootOnly(0); bool isValidSetup = true; if ( !Stepper::isValidSetup(out) ) isValidSetup = false; diff --git a/packages/tempus/src/Tempus_StepperExplicitRK_decl.hpp b/packages/tempus/src/Tempus_StepperExplicitRK_decl.hpp index 8f664f351f89..60dceefd8cd0 100644 --- a/packages/tempus/src/Tempus_StepperExplicitRK_decl.hpp +++ b/packages/tempus/src/Tempus_StepperExplicitRK_decl.hpp @@ -108,6 +108,10 @@ class StepperExplicitRK : virtual public Tempus::StepperExplicit, /// Initialize during construction and after changing input parameters. virtual void initialize(); + /// Set model + virtual void setModel( + const Teuchos::RCP >& appModel); + /// Set the initial conditions and make them consistent. 
virtual void setInitialConditions ( const Teuchos::RCP >& solutionHistory); @@ -162,6 +166,8 @@ class StepperExplicitRK : virtual public Tempus::StepperExplicit, virtual void setupTableau() = 0; + virtual void setEmbeddedMemory(); + std::vector > > stageXDot_; diff --git a/packages/tempus/src/Tempus_StepperExplicitRK_impl.hpp b/packages/tempus/src/Tempus_StepperExplicitRK_impl.hpp index fcfd33c0d7b8..cb6afef5a5e6 100644 --- a/packages/tempus/src/Tempus_StepperExplicitRK_impl.hpp +++ b/packages/tempus/src/Tempus_StepperExplicitRK_impl.hpp @@ -167,7 +167,17 @@ void StepperExplicitRK::initialize() "Error - Need to set the model, setModel(), before calling " "StepperExplicitRK::initialize()\n"); - // Initialize the stage vectors + Stepper::initialize(); +} + + +template +void StepperExplicitRK::setModel( + const Teuchos::RCP >& appModel) +{ + StepperExplicit::setModel(appModel); + + // Set the stage vectors int numStages = this->tableau_->numStages(); stageXDot_.resize(numStages); for (int i=0; i::initialize() assign(stageXDot_[i].ptr(), Teuchos::ScalarTraits::zero()); } + this->setEmbeddedMemory(); + + this->isInitialized_ = false; +} + + +template +void StepperExplicitRK::setEmbeddedMemory() +{ + if (this->getModel() == Teuchos::null) + return; // Embedded memory will be set when setModel() is called. 
+ if ( this->tableau_->isEmbedded() && this->getUseEmbedded() ){ - this->ee_ = Thyra::createMember(this->appModel_->get_f_space()); - this->abs_u0 = Thyra::createMember(this->appModel_->get_f_space()); - this->abs_u = Thyra::createMember(this->appModel_->get_f_space()); - this->sc = Thyra::createMember(this->appModel_->get_f_space()); + this->ee_ = Thyra::createMember(this->appModel_->get_f_space()); + this->abs_u0 = Thyra::createMember(this->appModel_->get_f_space()); + this->abs_u = Thyra::createMember(this->appModel_->get_f_space()); + this->sc = Thyra::createMember(this->appModel_->get_f_space()); + } else { + this->ee_ = Teuchos::null; + this->abs_u0 = Teuchos::null; + this->abs_u = Teuchos::null; + this->sc = Teuchos::null; } - - Stepper::initialize(); } @@ -313,7 +338,7 @@ void StepperExplicitRK::takeStep( Teuchos::SerialDenseVector errWght = b ; errWght -= this->tableau_->bstar(); - //compute local truncation error estimate: | u^{n+1} - \hat{u}^{n+1} | + // Compute local truncation error estimate: | u^{n+1} - \hat{u}^{n+1} | // Sum for solution: ee_n = Sum{ (b(i) - bstar(i)) * dt*f(i) } assign(this->ee_.ptr(), Teuchos::ScalarTraits::zero()); for (int i=0; i < numStages; ++i) { @@ -322,13 +347,13 @@ void StepperExplicitRK::takeStep( } } - // compute: Atol + max(|u^n|, |u^{n+1}| ) * Rtol + // Compute: Atol + max(|u^n|, |u^{n+1}| ) * Rtol Thyra::abs( *(currentState->getX()), this->abs_u0.ptr()); Thyra::abs( *(workingState->getX()), this->abs_u.ptr()); Thyra::pair_wise_max_update(tolRel, *this->abs_u0, this->abs_u.ptr()); Thyra::add_scalar(tolAbs, this->abs_u.ptr()); - //compute: || ee / sc || + // Compute: || ee / sc || assign(this->sc.ptr(), Teuchos::ScalarTraits::zero()); Thyra::ele_wise_divide(Teuchos::as(1.0), *this->ee_, *this->abs_u,this->sc.ptr()); @@ -336,7 +361,7 @@ void StepperExplicitRK::takeStep( Scalar err = std::abs(Thyra::norm(*this->sc)) / space_dim ; workingState->setErrorRel(err); - // test if step should be rejected + // Test if step should 
be rejected if (std::isinf(err) || std::isnan(err) || err > Teuchos::as(1.0)) workingState->setSolutionStatus(Status::FAILED); } @@ -371,6 +396,7 @@ void StepperExplicitRK::describe( Teuchos::FancyOStream &out, const Teuchos::EVerbosityLevel verbLevel) const { + out.setOutputToRootOnly(0); out << std::endl; Stepper::describe(out, verbLevel); StepperExplicit::describe(out, verbLevel); @@ -396,6 +422,7 @@ void StepperExplicitRK::describe( template bool StepperExplicitRK::isValidSetup(Teuchos::FancyOStream & out) const { + out.setOutputToRootOnly(0); bool isValidSetup = true; if ( !Stepper::isValidSetup(out) ) isValidSetup = false; diff --git a/packages/tempus/src/Tempus_StepperExplicit_decl.hpp b/packages/tempus/src/Tempus_StepperExplicit_decl.hpp index a3a9a7efc76e..354b8a63b8a1 100644 --- a/packages/tempus/src/Tempus_StepperExplicit_decl.hpp +++ b/packages/tempus/src/Tempus_StepperExplicit_decl.hpp @@ -46,11 +46,13 @@ class StepperExplicit : virtual public Tempus::Stepper /// \name Basic explicit stepper methods //@{ + /// Set model virtual void setModel( const Teuchos::RCP >& appModel); + /// Return the application ModelEvaluator. 
virtual Teuchos::RCP > - getModel(){return appModel_;} + getModel() const {return appModel_;} virtual Scalar getInitTimeStep( const Teuchos::RCP >& /* solutionHistory */) const diff --git a/packages/tempus/src/Tempus_StepperExplicit_impl.hpp b/packages/tempus/src/Tempus_StepperExplicit_impl.hpp index af7f2868a30c..fcf39e451320 100644 --- a/packages/tempus/src/Tempus_StepperExplicit_impl.hpp +++ b/packages/tempus/src/Tempus_StepperExplicit_impl.hpp @@ -319,16 +319,21 @@ template void StepperExplicit::describe(Teuchos::FancyOStream & out, const Teuchos::EVerbosityLevel verbLevel) const { - out << "--- StepperExplicit ---\n"; - out << " appModel_ = " << appModel_ << std::endl; - out << " inArgs_ = " << inArgs_ << std::endl; - out << " outArgs_ = " << outArgs_ << std::endl; + auto l_out = Teuchos::fancyOStream( out.getOStream() ); + Teuchos::OSTab ostab(*l_out, 2, this->description()); + l_out->setOutputToRootOnly(0); + + *l_out << "--- StepperExplicit ---\n" + << " appModel_ = " << appModel_ << std::endl + << " inArgs_ = " << inArgs_ << std::endl + << " outArgs_ = " << outArgs_ << std::endl; } template bool StepperExplicit::isValidSetup(Teuchos::FancyOStream & out) const { + out.setOutputToRootOnly(0); bool isValidSetup = true; if (appModel_ == Teuchos::null) { diff --git a/packages/tempus/src/Tempus_StepperFactory_impl.hpp b/packages/tempus/src/Tempus_StepperFactory_impl.hpp index 46568dd249c0..c6b29a299fa3 100644 --- a/packages/tempus/src/Tempus_StepperFactory_impl.hpp +++ b/packages/tempus/src/Tempus_StepperFactory_impl.hpp @@ -198,6 +198,7 @@ createStepper( else { Teuchos::RCP out = Teuchos::VerboseObjectBase::getDefaultOStream(); + out->setOutputToRootOnly(0); Teuchos::OSTab ostab(out,1,"StepperFactory::createStepper"); *out << "Unknown Stepper Type! 
('"+stepperType+"').\n" diff --git a/packages/tempus/src/Tempus_StepperForwardEuler_impl.hpp b/packages/tempus/src/Tempus_StepperForwardEuler_impl.hpp index 917164a7a8ec..3ddefeb75b13 100644 --- a/packages/tempus/src/Tempus_StepperForwardEuler_impl.hpp +++ b/packages/tempus/src/Tempus_StepperForwardEuler_impl.hpp @@ -185,18 +185,24 @@ void StepperForwardEuler::describe( Teuchos::FancyOStream &out, const Teuchos::EVerbosityLevel verbLevel) const { - out << std::endl; - Stepper::describe(out, verbLevel); - StepperExplicit::describe(out, verbLevel); - out << " stepperFEAppAction_ = " - << stepperFEAppAction_ << std::endl; - out << "----------------------------" << std::endl; + auto l_out = Teuchos::fancyOStream( out.getOStream() ); + Teuchos::OSTab ostab(*l_out, 2, this->description()); + l_out->setOutputToRootOnly(0); + + *l_out << std::endl; + Stepper::describe(*l_out, verbLevel); + StepperExplicit::describe(*l_out, verbLevel); + *l_out << " stepperFEAppAction_ = " + << stepperFEAppAction_ << std::endl + << "----------------------------" << std::endl; } template bool StepperForwardEuler::isValidSetup(Teuchos::FancyOStream & out) const { + out.setOutputToRootOnly(0); + bool isValidSetup = true; if ( !Stepper::isValidSetup(out) ) isValidSetup = false; diff --git a/packages/tempus/src/Tempus_StepperHHTAlpha_impl.hpp b/packages/tempus/src/Tempus_StepperHHTAlpha_impl.hpp index ba1c317ada88..447e82e700ba 100644 --- a/packages/tempus/src/Tempus_StepperHHTAlpha_impl.hpp +++ b/packages/tempus/src/Tempus_StepperHHTAlpha_impl.hpp @@ -488,17 +488,18 @@ void StepperHHTAlpha::describe( Teuchos::FancyOStream &out, const Teuchos::EVerbosityLevel verbLevel) const { + auto l_out = Teuchos::fancyOStream( out.getOStream() ); + Teuchos::OSTab ostab(*l_out, 2, this->description()); + l_out->setOutputToRootOnly(0); #ifdef VERBOSE_DEBUG_OUTPUT *out_ << "DEBUG: " << __PRETTY_FUNCTION__ << "\n"; #endif - out << std::endl; - Stepper::describe(out, verbLevel); - StepperImplicit::describe(out, 
verbLevel); + *l_out << std::endl; + Stepper::describe(*l_out, verbLevel); + StepperImplicit::describe(*l_out, verbLevel); - auto l_out = Teuchos::fancyOStream( out.getOStream() ); - l_out->setOutputToRootOnly(0); *l_out << "--- StepperHHTAlpha ---\n"; *l_out << " schemeName_ = " << schemeName_ << std::endl; *l_out << " beta_ = " << beta_ << std::endl; @@ -512,6 +513,7 @@ void StepperHHTAlpha::describe( template bool StepperHHTAlpha::isValidSetup(Teuchos::FancyOStream & out) const { + out.setOutputToRootOnly(0); bool isValidSetup = true; if ( !Stepper::isValidSetup(out) ) isValidSetup = false; diff --git a/packages/tempus/src/Tempus_StepperIMEX_RK_Partition_decl.hpp b/packages/tempus/src/Tempus_StepperIMEX_RK_Partition_decl.hpp index 40dd3f51a884..81344b2cb50d 100644 --- a/packages/tempus/src/Tempus_StepperIMEX_RK_Partition_decl.hpp +++ b/packages/tempus/src/Tempus_StepperIMEX_RK_Partition_decl.hpp @@ -377,7 +377,7 @@ class StepperIMEX_RK_Partition : virtual public Tempus::StepperImplicit, virtual void setModel( const Teuchos::RCP >& appModel); - virtual Teuchos::RCP > getModel() + virtual Teuchos::RCP > getModel() const { return this->wrapperModel_; } virtual void setModelPair( diff --git a/packages/tempus/src/Tempus_StepperIMEX_RK_Partition_impl.hpp b/packages/tempus/src/Tempus_StepperIMEX_RK_Partition_impl.hpp index 361a32c6f1d4..0a8eae3d6f1d 100644 --- a/packages/tempus/src/Tempus_StepperIMEX_RK_Partition_impl.hpp +++ b/packages/tempus/src/Tempus_StepperIMEX_RK_Partition_impl.hpp @@ -788,6 +788,7 @@ void StepperIMEX_RK_Partition::describe( Teuchos::FancyOStream &out, const Teuchos::EVerbosityLevel verbLevel) const { + out.setOutputToRootOnly(0); out << std::endl; Stepper::describe(out, verbLevel); StepperImplicit::describe(out, verbLevel); @@ -817,6 +818,8 @@ void StepperIMEX_RK_Partition::describe( template bool StepperIMEX_RK_Partition::isValidSetup(Teuchos::FancyOStream & out) const { + out.setOutputToRootOnly(0); + bool isValidSetup = true; if ( 
!Stepper::isValidSetup(out) ) isValidSetup = false; diff --git a/packages/tempus/src/Tempus_StepperIMEX_RK_decl.hpp b/packages/tempus/src/Tempus_StepperIMEX_RK_decl.hpp index 9d707971c285..b69b52c4b670 100644 --- a/packages/tempus/src/Tempus_StepperIMEX_RK_decl.hpp +++ b/packages/tempus/src/Tempus_StepperIMEX_RK_decl.hpp @@ -347,7 +347,7 @@ class StepperIMEX_RK : virtual public Tempus::StepperImplicit, virtual void setModel( const Teuchos::RCP >& appModel); - virtual Teuchos::RCP > getModel() + virtual Teuchos::RCP > getModel() const { return this->wrapperModel_; } virtual void setModelPair( diff --git a/packages/tempus/src/Tempus_StepperIMEX_RK_impl.hpp b/packages/tempus/src/Tempus_StepperIMEX_RK_impl.hpp index 1e88b3b7693f..1ac0de02f7c9 100644 --- a/packages/tempus/src/Tempus_StepperIMEX_RK_impl.hpp +++ b/packages/tempus/src/Tempus_StepperIMEX_RK_impl.hpp @@ -859,6 +859,8 @@ void StepperIMEX_RK::describe( Teuchos::FancyOStream &out, const Teuchos::EVerbosityLevel verbLevel) const { + out.setOutputToRootOnly(0); + out << std::endl; Stepper::describe(out, verbLevel); StepperImplicit::describe(out, verbLevel); @@ -888,6 +890,7 @@ void StepperIMEX_RK::describe( template bool StepperIMEX_RK::isValidSetup(Teuchos::FancyOStream & out) const { + out.setOutputToRootOnly(0); bool isValidSetup = true; if ( !Stepper::isValidSetup(out) ) isValidSetup = false; diff --git a/packages/tempus/src/Tempus_StepperImplicit_decl.hpp b/packages/tempus/src/Tempus_StepperImplicit_decl.hpp index 49ce7ef997ac..d95af0a98c98 100644 --- a/packages/tempus/src/Tempus_StepperImplicit_decl.hpp +++ b/packages/tempus/src/Tempus_StepperImplicit_decl.hpp @@ -231,10 +231,11 @@ class StepperImplicit : virtual public Tempus::Stepper /// \name Basic implicit stepper methods //@{ + /// Set the model virtual void setModel( - const Teuchos::RCP >& appModel); + const Teuchos::RCP >& appModel) override; - virtual Teuchos::RCP > getModel() + virtual Teuchos::RCP > getModel() const override { Teuchos::RCP > 
model; if (wrapperModel_ != Teuchos::null) model = wrapperModel_->getAppModel(); @@ -248,14 +249,14 @@ class StepperImplicit : virtual public Tempus::Stepper /// Set solver. virtual void setSolver( - Teuchos::RCP > solver); + Teuchos::RCP > solver) override; - virtual Teuchos::RCP > getSolver() const + virtual Teuchos::RCP > getSolver() const override { return solver_; } /// Set the initial conditions and make them consistent. virtual void setInitialConditions ( - const Teuchos::RCP >& solutionHistory); + const Teuchos::RCP >& solutionHistory) override; /// Return alpha = d(xDot)/dx. virtual Scalar getAlpha(const Scalar dt) const = 0; @@ -284,7 +285,7 @@ class StepperImplicit : virtual public Tempus::Stepper /// Pass initial guess to Newton solver (only relevant for implicit solvers) virtual void setInitialGuess( - Teuchos::RCP > initialGuess) + Teuchos::RCP > initialGuess) override { initialGuess_ = initialGuess; this->isInitialized_ = false; @@ -299,19 +300,19 @@ class StepperImplicit : virtual public Tempus::Stepper virtual bool getZeroInitialGuess() const { return zeroInitialGuess_; } virtual Scalar getInitTimeStep( - const Teuchos::RCP >& /* solutionHistory */) const + const Teuchos::RCP >& /* solutionHistory */) const override {return Scalar(1.0e+99);} //@} /// \name Overridden from Teuchos::Describable //@{ virtual void describe(Teuchos::FancyOStream & out, - const Teuchos::EVerbosityLevel verbLevel) const; + const Teuchos::EVerbosityLevel verbLevel) const override; //@} - virtual bool isValidSetup(Teuchos::FancyOStream & out) const; + virtual bool isValidSetup(Teuchos::FancyOStream & out) const override; - virtual Teuchos::RCP getValidParameters() const; + virtual Teuchos::RCP getValidParameters() const override; Teuchos::RCP getValidParametersBasicImplicit() const; diff --git a/packages/tempus/src/Tempus_StepperImplicit_impl.hpp b/packages/tempus/src/Tempus_StepperImplicit_impl.hpp index d3efae855f47..3048735a90c0 100644 --- 
a/packages/tempus/src/Tempus_StepperImplicit_impl.hpp +++ b/packages/tempus/src/Tempus_StepperImplicit_impl.hpp @@ -328,6 +328,7 @@ template void StepperImplicit::describe(Teuchos::FancyOStream & out, const Teuchos::EVerbosityLevel verbLevel) const { + out.setOutputToRootOnly(0); out << "--- StepperImplicit ---\n"; out << " wrapperModel_ = " << wrapperModel_ << std::endl; out << " solver_ = " << solver_ << std::endl; @@ -340,6 +341,7 @@ void StepperImplicit::describe(Teuchos::FancyOStream & out, template bool StepperImplicit::isValidSetup(Teuchos::FancyOStream & out) const { + out.setOutputToRootOnly(0); bool isValidSetup = true; if (wrapperModel_->getAppModel() == Teuchos::null) { diff --git a/packages/tempus/src/Tempus_StepperLeapfrog_impl.hpp b/packages/tempus/src/Tempus_StepperLeapfrog_impl.hpp index 4e9cf5a19d65..3c4a8cf09e7b 100644 --- a/packages/tempus/src/Tempus_StepperLeapfrog_impl.hpp +++ b/packages/tempus/src/Tempus_StepperLeapfrog_impl.hpp @@ -189,6 +189,7 @@ void StepperLeapfrog::describe( Teuchos::FancyOStream &out, const Teuchos::EVerbosityLevel verbLevel) const { + out.setOutputToRootOnly(0); out << std::endl; Stepper::describe(out, verbLevel); StepperExplicit::describe(out, verbLevel); @@ -203,6 +204,7 @@ void StepperLeapfrog::describe( template bool StepperLeapfrog::isValidSetup(Teuchos::FancyOStream & out) const { + out.setOutputToRootOnly(0); bool isValidSetup = true; if ( !Stepper::isValidSetup(out) ) isValidSetup = false; diff --git a/packages/tempus/src/Tempus_StepperNewmarkExplicitAForm_impl.hpp b/packages/tempus/src/Tempus_StepperNewmarkExplicitAForm_impl.hpp index 31ca61c79ea9..5ebe36dcdabc 100644 --- a/packages/tempus/src/Tempus_StepperNewmarkExplicitAForm_impl.hpp +++ b/packages/tempus/src/Tempus_StepperNewmarkExplicitAForm_impl.hpp @@ -343,6 +343,7 @@ void StepperNewmarkExplicitAForm::describe( Teuchos::FancyOStream &out, const Teuchos::EVerbosityLevel verbLevel) const { + out.setOutputToRootOnly(0); out << std::endl; 
Stepper::describe(out, verbLevel); StepperExplicit::describe(out, verbLevel); @@ -356,6 +357,7 @@ void StepperNewmarkExplicitAForm::describe( template bool StepperNewmarkExplicitAForm::isValidSetup(Teuchos::FancyOStream & out) const { + out.setOutputToRootOnly(0); bool isValidSetup = true; if ( !Stepper::isValidSetup(out) ) isValidSetup = false; diff --git a/packages/tempus/src/Tempus_StepperNewmarkImplicitAForm_impl.hpp b/packages/tempus/src/Tempus_StepperNewmarkImplicitAForm_impl.hpp index 919314c459d4..8b076752e38d 100644 --- a/packages/tempus/src/Tempus_StepperNewmarkImplicitAForm_impl.hpp +++ b/packages/tempus/src/Tempus_StepperNewmarkImplicitAForm_impl.hpp @@ -549,6 +549,7 @@ void StepperNewmarkImplicitAForm::describe( template bool StepperNewmarkImplicitAForm::isValidSetup(Teuchos::FancyOStream & out) const { + out.setOutputToRootOnly(0); bool isValidSetup = true; out.setOutputToRootOnly(0); diff --git a/packages/tempus/src/Tempus_StepperNewmarkImplicitDForm_impl.hpp b/packages/tempus/src/Tempus_StepperNewmarkImplicitDForm_impl.hpp index 96556c93af97..dc69d0fbdfa8 100644 --- a/packages/tempus/src/Tempus_StepperNewmarkImplicitDForm_impl.hpp +++ b/packages/tempus/src/Tempus_StepperNewmarkImplicitDForm_impl.hpp @@ -455,6 +455,7 @@ StepperNewmarkImplicitDForm::describe( *out_ << "DEBUG: " << __PRETTY_FUNCTION__ << "\n"; #endif + out.setOutputToRootOnly(0); out << std::endl; Stepper::describe(out, verbLevel); StepperImplicit::describe(out, verbLevel); @@ -470,6 +471,7 @@ StepperNewmarkImplicitDForm::describe( template bool StepperNewmarkImplicitDForm::isValidSetup(Teuchos::FancyOStream & out) const { + out.setOutputToRootOnly(0); bool isValidSetup = true; if ( !Stepper::isValidSetup(out) ) isValidSetup = false; diff --git a/packages/tempus/src/Tempus_StepperOperatorSplit_decl.hpp b/packages/tempus/src/Tempus_StepperOperatorSplit_decl.hpp index 5af4c130b3d9..e5f3c57c7e23 100644 --- a/packages/tempus/src/Tempus_StepperOperatorSplit_decl.hpp +++ 
b/packages/tempus/src/Tempus_StepperOperatorSplit_decl.hpp @@ -90,8 +90,7 @@ class StepperOperatorSplit : virtual public Tempus::Stepper virtual void setModel( const Teuchos::RCP >& appModel); - virtual Teuchos::RCP > - getModel(); + virtual Teuchos::RCP > getModel() const; virtual void setSolver( Teuchos::RCP > solver); diff --git a/packages/tempus/src/Tempus_StepperOperatorSplit_impl.hpp b/packages/tempus/src/Tempus_StepperOperatorSplit_impl.hpp index ac3e68e711c9..cd6fef62fe57 100644 --- a/packages/tempus/src/Tempus_StepperOperatorSplit_impl.hpp +++ b/packages/tempus/src/Tempus_StepperOperatorSplit_impl.hpp @@ -85,7 +85,7 @@ void StepperOperatorSplit::setModel( template Teuchos::RCP > -StepperOperatorSplit::getModel() +StepperOperatorSplit::getModel() const { Teuchos::RCP > model; typename std::vector > >::const_iterator @@ -350,6 +350,7 @@ void StepperOperatorSplit::describe( Teuchos::FancyOStream &out, const Teuchos::EVerbosityLevel verbLevel) const { + out.setOutputToRootOnly(0); out << std::endl; Stepper::describe(out, verbLevel); @@ -375,6 +376,7 @@ void StepperOperatorSplit::describe( template bool StepperOperatorSplit::isValidSetup(Teuchos::FancyOStream & out) const { + out.setOutputToRootOnly(0); bool isValidSetup = true; if ( !Stepper::isValidSetup(out) ) isValidSetup = false; diff --git a/packages/tempus/src/Tempus_StepperRKBase.hpp b/packages/tempus/src/Tempus_StepperRKBase.hpp index 89c133b22af7..b393d608a20d 100644 --- a/packages/tempus/src/Tempus_StepperRKBase.hpp +++ b/packages/tempus/src/Tempus_StepperRKBase.hpp @@ -43,7 +43,13 @@ class StepperRKBase : virtual public Tempus::Stepper virtual int getStageNumber() const { return stageNumber_; } virtual void setStageNumber(int s) { stageNumber_ = s; } - virtual void setUseEmbedded(bool a) { useEmbedded_ = a; } + virtual void setUseEmbedded(bool a) + { + useEmbedded_ = a; + this->setEmbeddedMemory(); + this->isInitialized_ = false; + } + virtual bool getUseEmbedded() const { return useEmbedded_; } 
virtual void setAppAction(Teuchos::RCP > appAction) @@ -181,6 +187,8 @@ class StepperRKBase : virtual public Tempus::Stepper protected: + virtual void setEmbeddedMemory() {} + Teuchos::RCP > tableau_; // For Embedded RK diff --git a/packages/tempus/src/Tempus_StepperStaggeredForwardSensitivity_decl.hpp b/packages/tempus/src/Tempus_StepperStaggeredForwardSensitivity_decl.hpp index 20a86add290c..94432e9695ba 100644 --- a/packages/tempus/src/Tempus_StepperStaggeredForwardSensitivity_decl.hpp +++ b/packages/tempus/src/Tempus_StepperStaggeredForwardSensitivity_decl.hpp @@ -82,7 +82,7 @@ class StepperStaggeredForwardSensitivity : //@{ virtual void setModel( const Teuchos::RCP >& appModel); - virtual Teuchos::RCP > getModel(); + virtual Teuchos::RCP > getModel() const; virtual void setSolver( Teuchos::RCP > solver = Teuchos::null); diff --git a/packages/tempus/src/Tempus_StepperStaggeredForwardSensitivity_impl.hpp b/packages/tempus/src/Tempus_StepperStaggeredForwardSensitivity_impl.hpp index e4f513d785bf..898b6e417804 100644 --- a/packages/tempus/src/Tempus_StepperStaggeredForwardSensitivity_impl.hpp +++ b/packages/tempus/src/Tempus_StepperStaggeredForwardSensitivity_impl.hpp @@ -89,7 +89,7 @@ setModel( template Teuchos::RCP > StepperStaggeredForwardSensitivity:: -getModel() +getModel() const { return combined_fsa_model_; } @@ -274,6 +274,7 @@ describe( Teuchos::FancyOStream &out, const Teuchos::EVerbosityLevel verbLevel) const { + out.setOutputToRootOnly(0); out << std::endl; Stepper::describe(out, verbLevel); @@ -296,6 +297,7 @@ describe( template bool StepperStaggeredForwardSensitivity::isValidSetup(Teuchos::FancyOStream & out) const { + out.setOutputToRootOnly(0); bool isValidSetup = true; if ( !Stepper::isValidSetup(out) ) isValidSetup = false; diff --git a/packages/tempus/src/Tempus_StepperState.hpp b/packages/tempus/src/Tempus_StepperState.hpp index 78b1ea236f0e..df2dc59ed091 100644 --- a/packages/tempus/src/Tempus_StepperState.hpp +++ 
b/packages/tempus/src/Tempus_StepperState.hpp @@ -59,13 +59,19 @@ class StepperState : /// \name Overridden from Teuchos::Describable //@{ - virtual std::string description() const { return "Tempus::StepperState"; } + virtual std::string description() const + { + return "Tempus::StepperState - '" + stepperName_ + "'"; + } virtual void describe(Teuchos::FancyOStream & out, const Teuchos::EVerbosityLevel /* verbLevel */) const { - out << description() << "::describe" << std::endl - << " stepperName = " << stepperName_ << std::endl; + auto l_out = Teuchos::fancyOStream( out.getOStream() ); + Teuchos::OSTab ostab(*l_out,2, this->description()); + l_out->setOutputToRootOnly(0); + + *l_out << "\n--- " << this->description() << " ---" << std::endl; } //@} diff --git a/packages/tempus/src/Tempus_StepperSubcycling_decl.hpp b/packages/tempus/src/Tempus_StepperSubcycling_decl.hpp index e14f9722e6cf..4112bdafbae0 100644 --- a/packages/tempus/src/Tempus_StepperSubcycling_decl.hpp +++ b/packages/tempus/src/Tempus_StepperSubcycling_decl.hpp @@ -80,7 +80,7 @@ class StepperSubcycling : virtual public Tempus::Stepper const Teuchos::RCP >& appModel); virtual Teuchos::RCP > - getModel(){return scIntegrator_->getStepper()->getModel();} + getModel() const {return scIntegrator_->getStepper()->getModel();} virtual void setAppAction( Teuchos::RCP > appAction = Teuchos::null); diff --git a/packages/tempus/src/Tempus_StepperSubcycling_impl.hpp b/packages/tempus/src/Tempus_StepperSubcycling_impl.hpp index 127bfe9d0310..500547139ba4 100644 --- a/packages/tempus/src/Tempus_StepperSubcycling_impl.hpp +++ b/packages/tempus/src/Tempus_StepperSubcycling_impl.hpp @@ -508,6 +508,7 @@ void StepperSubcycling::describe( Teuchos::FancyOStream &out, const Teuchos::EVerbosityLevel verbLevel) const { + out.setOutputToRootOnly(0); out << std::endl; Stepper::describe(out, verbLevel); diff --git a/packages/tempus/src/Tempus_StepperTrapezoidal_impl.hpp b/packages/tempus/src/Tempus_StepperTrapezoidal_impl.hpp 
index e876d52327c6..3e0b01aade0f 100644 --- a/packages/tempus/src/Tempus_StepperTrapezoidal_impl.hpp +++ b/packages/tempus/src/Tempus_StepperTrapezoidal_impl.hpp @@ -188,6 +188,7 @@ void StepperTrapezoidal::describe( Teuchos::FancyOStream &out, const Teuchos::EVerbosityLevel verbLevel) const { + out.setOutputToRootOnly(0); out << std::endl; Stepper::describe(out, verbLevel); StepperImplicit::describe(out, verbLevel); @@ -201,6 +202,7 @@ void StepperTrapezoidal::describe( template bool StepperTrapezoidal::isValidSetup(Teuchos::FancyOStream & out) const { + out.setOutputToRootOnly(0); bool isValidSetup = true; if ( !Stepper::isValidSetup(out) ) isValidSetup = false; diff --git a/packages/tempus/src/Tempus_Stepper_decl.hpp b/packages/tempus/src/Tempus_Stepper_decl.hpp index aced12426a84..6dbed0b6ce5f 100644 --- a/packages/tempus/src/Tempus_Stepper_decl.hpp +++ b/packages/tempus/src/Tempus_Stepper_decl.hpp @@ -70,7 +70,7 @@ class Stepper const Teuchos::RCP >& /* appModel */){} #endif - virtual Teuchos::RCP > getModel() + virtual Teuchos::RCP > getModel() const { return Teuchos::null; } /// Set solver. 
diff --git a/packages/tempus/src/Tempus_Stepper_impl.hpp b/packages/tempus/src/Tempus_Stepper_impl.hpp index 0c6d6392125e..36814abcef9d 100644 --- a/packages/tempus/src/Tempus_Stepper_impl.hpp +++ b/packages/tempus/src/Tempus_Stepper_impl.hpp @@ -120,12 +120,14 @@ Stepper::getStepperXDotDot(Teuchos::RCP > state) template -void Stepper::describe(Teuchos::FancyOStream & in_out, +void Stepper::describe(Teuchos::FancyOStream & out, const Teuchos::EVerbosityLevel verbLevel) const { - auto out = Teuchos::fancyOStream( in_out.getOStream() ); - out->setOutputToRootOnly(0); - *out << "--- Stepper ---\n" + auto l_out = Teuchos::fancyOStream( out.getOStream() ); + Teuchos::OSTab ostab(*l_out, 2, this->description()); + l_out->setOutputToRootOnly(0); + + *l_out << "--- Stepper ---\n" << " isInitialized_ = " << Teuchos::toString(isInitialized_) << std::endl << " stepperType_ = " << stepperType_ << std::endl << " useFSAL_ = " << Teuchos::toString(useFSAL_) << std::endl @@ -139,18 +141,19 @@ void Stepper::describe(Teuchos::FancyOStream & in_out, template bool Stepper::isValidSetup( - Teuchos::FancyOStream & in_out) const + Teuchos::FancyOStream & out) const { + out.setOutputToRootOnly(0); bool isValidSetup = true; if ( !(ICConsistency_ == "None" || ICConsistency_ == "Zero" || ICConsistency_ == "App" || ICConsistency_ == "Consistent") ) { isValidSetup = false; - auto out = Teuchos::fancyOStream( in_out.getOStream() ); - out->setOutputToRootOnly(0); - *out << "The IC consistency does not have a valid value!\n" - << "('None', 'Zero', 'App' or 'Consistent')\n" - << " ICConsistency = " << ICConsistency_ << "\n"; + auto l_out = Teuchos::fancyOStream( out.getOStream() ); + l_out->setOutputToRootOnly(0); + *l_out << "The IC consistency does not have a valid value!\n" + << "('None', 'Zero', 'App' or 'Consistent')\n" + << " ICConsistency = " << ICConsistency_ << "\n"; } return isValidSetup; diff --git a/packages/tempus/src/Tempus_TimeEventBase.hpp 
b/packages/tempus/src/Tempus_TimeEventBase.hpp index bfa8e00038ae..8d86918c77dd 100644 --- a/packages/tempus/src/Tempus_TimeEventBase.hpp +++ b/packages/tempus/src/Tempus_TimeEventBase.hpp @@ -89,6 +89,7 @@ class TimeEventBase { Teuchos::RCP out = Teuchos::VerboseObjectBase::getDefaultOStream(); + out->setOutputToRootOnly(0); *out << "TimeEventBase name = " << getName() << std::endl; } //@} diff --git a/packages/tempus/src/Tempus_TimeEventComposite.hpp b/packages/tempus/src/Tempus_TimeEventComposite.hpp index 15236e24e022..5f9f6878b3e0 100644 --- a/packages/tempus/src/Tempus_TimeEventComposite.hpp +++ b/packages/tempus/src/Tempus_TimeEventComposite.hpp @@ -188,6 +188,7 @@ class TimeEventComposite : virtual public TimeEventBase { Teuchos::RCP out = Teuchos::VerboseObjectBase::getDefaultOStream(); + out->setOutputToRootOnly(0); *out << "TimeEventComposite:" << "\n" << "name = " << this->getName() << "\n" << "Number of TimeEvents = " << timeEvents_.size() << std::endl; diff --git a/packages/tempus/src/Tempus_TimeEventListIndex_impl.hpp b/packages/tempus/src/Tempus_TimeEventListIndex_impl.hpp index a2166b9b01fa..6b82ad5def44 100644 --- a/packages/tempus/src/Tempus_TimeEventListIndex_impl.hpp +++ b/packages/tempus/src/Tempus_TimeEventListIndex_impl.hpp @@ -126,6 +126,7 @@ void TimeEventListIndex::describe() const { Teuchos::RCP out = Teuchos::VerboseObjectBase::getDefaultOStream(); + out->setOutputToRootOnly(0); *out << "TimeEventListIndex:" << "\n" << "name = " << this->getName() << "\n" << "IndexList_ = " << std::endl; diff --git a/packages/tempus/src/Tempus_TimeEventList_impl.hpp b/packages/tempus/src/Tempus_TimeEventList_impl.hpp index 797b28ae7ce3..5151f1306fe9 100644 --- a/packages/tempus/src/Tempus_TimeEventList_impl.hpp +++ b/packages/tempus/src/Tempus_TimeEventList_impl.hpp @@ -176,6 +176,7 @@ void TimeEventList::describe() const { Teuchos::RCP out = Teuchos::VerboseObjectBase::getDefaultOStream(); + out->setOutputToRootOnly(0); *out << "TimeEventList:" << "\n" 
<< "name = " << this->getName() << "\n" << "timeScale_ = " << timeScale_ << "\n" diff --git a/packages/tempus/src/Tempus_TimeEventRangeIndex_impl.hpp b/packages/tempus/src/Tempus_TimeEventRangeIndex_impl.hpp index 0bb6ccb340c4..9dff4f43b2f0 100644 --- a/packages/tempus/src/Tempus_TimeEventRangeIndex_impl.hpp +++ b/packages/tempus/src/Tempus_TimeEventRangeIndex_impl.hpp @@ -138,6 +138,7 @@ void TimeEventRangeIndex::describe() const { Teuchos::RCP out = Teuchos::VerboseObjectBase::getDefaultOStream(); + out->setOutputToRootOnly(0); *out << "TimeEventRange:" << "\n" << "name = " << this->getName() << "\n" << "start_ = " << start_ << "\n" diff --git a/packages/tempus/src/Tempus_TimeEventRange_impl.hpp b/packages/tempus/src/Tempus_TimeEventRange_impl.hpp index ec4c97a10d34..c8c24ed5b2ad 100644 --- a/packages/tempus/src/Tempus_TimeEventRange_impl.hpp +++ b/packages/tempus/src/Tempus_TimeEventRange_impl.hpp @@ -205,6 +205,7 @@ void TimeEventRange::describe() const { Teuchos::RCP out = Teuchos::VerboseObjectBase::getDefaultOStream(); + out->setOutputToRootOnly(0); *out << "TimeEventRange:" << "\n" << "name = " << this->getName() << "\n" << "start_ = " << start_ << "\n" diff --git a/packages/tempus/src/Tempus_TimeStepControlStrategyBasicVS.hpp b/packages/tempus/src/Tempus_TimeStepControlStrategyBasicVS.hpp index 69bedd796255..fa9ac4bad1ae 100644 --- a/packages/tempus/src/Tempus_TimeStepControlStrategyBasicVS.hpp +++ b/packages/tempus/src/Tempus_TimeStepControlStrategyBasicVS.hpp @@ -212,13 +212,21 @@ class TimeStepControlStrategyBasicVS void describe(Teuchos::FancyOStream &out, const Teuchos::EVerbosityLevel verbLevel) const override { - Teuchos::OSTab ostab(out,2,"describe"); - out << description() << "::describe:" << std::endl - << "StrategyType = " << this->getStrategyType()<< std::endl - << "Amplification Factor = " << getAmplFactor() << std::endl - << "Reduction Factor = " << getReductFactor() << std::endl - << "Minimum Value Monitoring Function = " << getMinEta() << 
std::endl - << "Maximum Value Monitoring Function = " << getMaxEta() << std::endl; + auto l_out = Teuchos::fancyOStream( out.getOStream() ); + Teuchos::OSTab ostab(*l_out, 2, this->description()); + l_out->setOutputToRootOnly(0); + + *l_out << "\n--- " << this->description() << " ---" << std::endl; + + if (Teuchos::as(verbLevel) >= Teuchos::as(Teuchos::VERB_MEDIUM)) { + *l_out << " StrategyType = " << this->getStrategyType()<< std::endl + << " Step Type = " << this->getStepType() << std::endl + << " Amplification Factor = " << getAmplFactor() << std::endl + << " Reduction Factor = " << getReductFactor() << std::endl + << " Minimum Value Monitoring Function = " << getMinEta() << std::endl + << " Maximum Value Monitoring Function = " << getMaxEta() << std::endl; + *l_out << std::string(this->description().length()+8, '-') <setOutputToRootOnly(0); - Teuchos::OSTab ostab(*out,2,"describe"); - *out << description() << "::describe:" << std::endl - << "Strategy Type = " << this->getStrategyType()<< std::endl - << "Step Type = " << this->getStepType()<< std::endl; - - std::stringstream sList; - for(std::size_t i = 0; i < strategies_.size(); ++i) { - sList << strategies_[i]->getStrategyType(); - if (i < strategies_.size()-1) sList << ", "; - } - *out << "Strategy List = " << sList.str() << std::endl; + auto l_out = Teuchos::fancyOStream( out.getOStream() ); + Teuchos::OSTab ostab(*l_out, 2, this->description()); + l_out->setOutputToRootOnly(0); + + *l_out << "\n--- " << this->description() << " ---" << std::endl; - for(auto& s : strategies_) - s->describe(*out, verbLevel); + if (Teuchos::as(verbLevel) >= Teuchos::as(Teuchos::VERB_MEDIUM)) { + *l_out << " Strategy Type = " << this->getStrategyType()<< std::endl + << " Step Type = " << this->getStepType()<< std::endl; + + std::stringstream sList; + for(std::size_t i = 0; i < strategies_.size(); ++i) { + sList << strategies_[i]->getStrategyType(); + if (i < strategies_.size()-1) sList << ", "; + } + *l_out << " Strategy List = 
" << sList.str() << std::endl; + + for(auto& s : strategies_) + s->describe(*l_out, verbLevel); + + *l_out << std::string(this->description().length()+8, '-') < out = tsc.getOStream(); Teuchos::OSTab ostab(out,1,"setNextTimeStep"); + out->setOutputToRootOnly(0); // Check constant time step @@ -124,11 +125,19 @@ class TimeStepControlStrategyConstant void describe(Teuchos::FancyOStream &out, const Teuchos::EVerbosityLevel verbLevel) const override { - Teuchos::OSTab ostab(out,2,"describe"); - out << description() << std::endl - << "Strategy Type = " << this->getStrategyType() << std::endl - << "Step Type = " << this->getStepType() << std::endl - << "Time Step = " << getConstantTimeStep() << std::endl; + auto l_out = Teuchos::fancyOStream( out.getOStream() ); + Teuchos::OSTab ostab(*l_out, 2, this->description()); + l_out->setOutputToRootOnly(0); + + *l_out << "\n--- " << this->description() << " ---" << std::endl; + + if (Teuchos::as(verbLevel) >= Teuchos::as(Teuchos::VERB_MEDIUM)) { + *l_out << " Strategy Type = " << this->getStrategyType() << std::endl + << " Step Type = " << this->getStepType() << std::endl + << " Time Step = " << getConstantTimeStep() << std::endl; + + *l_out << std::string(this->description().length()+8, '-') <description() << " ---" << std::endl; + + if (Teuchos::as(verbLevel) >= Teuchos::as(Teuchos::VERB_MEDIUM)) { + *l_out << " Strategy Type = " << this->getStrategyType() << std::endl + << " Step Type = " << this->getStepType() << std::endl + << " Controller Type = " << getController() << std::endl + << " KI = " << getKI() << std::endl + << " KP = " << getKP() << std::endl + << " KD = " << getKD() << std::endl + << " errN_ = " << errN_ << std::endl + << " errNm1_ = " << errNm1_ << std::endl + << " errNm2_ = " << errNm2_ << std::endl + << " Safety Factor = " << getSafetyFactor() << std::endl + << " Safety Factor After Step Rejection = " << getSafetyFactorAfterReject() << std::endl + << " Maximum Safety Factor (INPUT) = " << facMaxINPUT_ << 
std::endl + << " Maximum Safety Factor = " << getFacMax() << std::endl + << " Minimum Safety Factor = " << getFacMin() << std::endl; + *l_out << std::string(this->description().length()+8, '-') <::describe( Teuchos::FancyOStream &out, const Teuchos::EVerbosityLevel verbLevel) const { - if (verbLevel == Teuchos::VERB_EXTREME) { + auto l_out = Teuchos::fancyOStream( out.getOStream() ); + Teuchos::OSTab ostab(*l_out, 2, this->description()); + l_out->setOutputToRootOnly(0); + *l_out << "\n--- " << this->description() << " ---" <(verbLevel) >= Teuchos::as(Teuchos::VERB_MEDIUM)) { std::vector idx = getOutputIndices(); std::ostringstream listIdx; if (!idx.empty()) { @@ -472,37 +477,36 @@ void TimeStepControl::describe( std::vector times = getOutputTimes(); std::ostringstream listTimes; if (!times.empty()) { - for(std::size_t i = 0; i < times.size()-1; ++i) listTimes << times[i] << ", "; + for(std::size_t i = 0; i < times.size()-1; ++i) + listTimes << times[i] << ", "; listTimes << times[times.size()-1]; } - auto l_out = Teuchos::fancyOStream( out.getOStream() ); - l_out->setOutputToRootOnly(0); - *l_out << description() << "::describe:" << std::endl - << "stepType = " << getStepType() << std::endl - << "initTime = " << getInitTime() << std::endl - << "finalTime = " << getFinalTime() << std::endl - << "minTimeStep = " << getMinTimeStep() << std::endl - << "initTimeStep = " << getInitTimeStep() << std::endl - << "maxTimeStep = " << getMaxTimeStep() << std::endl - << "initIndex = " << getInitIndex() << std::endl - << "finalIndex = " << getFinalIndex() << std::endl - << "maxAbsError = " << getMaxAbsError() << std::endl - << "maxRelError = " << getMaxRelError() << std::endl - << "maxFailures = " << getMaxFailures() << std::endl - << "maxConsecFailures = " << getMaxConsecFailures() << std::endl - << "numTimeSteps = " << getNumTimeSteps() << std::endl - << "printDtChanges = " << getPrintDtChanges() << std::endl - << "outputExactly = " << getOutputExactly() << std::endl - << 
"outputIndices = " << listIdx.str() << std::endl - << "outputTimes = " << listTimes.str() << std::endl - << "outputIndexInterval= " << getOutputIndexInterval() << std::endl - << "outputTimeInterval = " << getOutputTimeInterval() << std::endl - << "outputAdjustedDt = " << outputAdjustedDt_ << std::endl - << "dtAfterOutput = " << dtAfterOutput_ << std::endl - << "stepControlSrategy = " << std::endl; - stepControlStrategy_->describe(out, verbLevel); + *l_out << " stepType = " << getStepType() << std::endl + << " initTime = " << getInitTime() << std::endl + << " finalTime = " << getFinalTime() << std::endl + << " minTimeStep = " << getMinTimeStep() << std::endl + << " initTimeStep = " << getInitTimeStep() << std::endl + << " maxTimeStep = " << getMaxTimeStep() << std::endl + << " initIndex = " << getInitIndex() << std::endl + << " finalIndex = " << getFinalIndex() << std::endl + << " maxAbsError = " << getMaxAbsError() << std::endl + << " maxRelError = " << getMaxRelError() << std::endl + << " maxFailures = " << getMaxFailures() << std::endl + << " maxConsecFailures = " << getMaxConsecFailures() << std::endl + << " numTimeSteps = " << getNumTimeSteps() << std::endl + << " printDtChanges = " << getPrintDtChanges() << std::endl + << " outputExactly = " << getOutputExactly() << std::endl + << " outputIndices = " << listIdx.str() << std::endl + << " outputTimes = " << listTimes.str() << std::endl + << " outputIndexInterval= " << getOutputIndexInterval() << std::endl + << " outputTimeInterval = " << getOutputTimeInterval() << std::endl + << " outputAdjustedDt = " << outputAdjustedDt_ << std::endl + << " dtAfterOutput = " << dtAfterOutput_ <describe(*l_out, verbLevel); } + *l_out << std::string(this->description().length()+8, '-') < pl = getParametersFromXmlFile("Tempus_default.xml"); + + // 2) Setup the ModelEvaluator + RCP > model = Teuchos::rcp(new SinCosModel ()); + + // 3) Setup the Integrator + RCP tempusPL = sublist(pl, "Tempus", true); + RCP > integrator = + 
Tempus::createIntegratorBasic(tempusPL, model); + + std::ostringstream ss; + Teuchos::RCP myOut = + Teuchos::fancyOStream(Teuchos::rcpFromRef(ss)); + + integrator->describe(*myOut, Teuchos::VERB_EXTREME); + + auto testS = ss.str(); + + // Find major headers. + auto npos = std::string::npos; + TEST_ASSERT(npos != testS.find("--- Tempus::IntegratorBasic ---")); + TEST_ASSERT(npos != testS.find("--- Tempus::SolutionHistory")); + TEST_ASSERT(npos != testS.find("--- SolutionState (index = 0; time = 0; dt = 1) ---")); + TEST_ASSERT(npos != testS.find("--- Tempus::SolutionStateMetaData ---")); + TEST_ASSERT(npos != testS.find("--- Tempus::StepperState")); + TEST_ASSERT(npos != testS.find("--- Tempus::PhysicsState")); + TEST_ASSERT(npos != testS.find("--- Tempus::TimeStepControl ---")); + TEST_ASSERT(npos != testS.find("--- Tempus::TimeStepControlStrategyConstant ---")); + TEST_ASSERT(npos != testS.find("--- Stepper ---")); + TEST_ASSERT(npos != testS.find("stepperType_ = Forward Euler")); + TEST_ASSERT(npos != testS.find("--- StepperExplicit ---")); +} + + } // namespace Tempus_Test diff --git a/packages/thyra/adapters/tpetra/src/Thyra_TpetraEuclideanScalarProd_def.hpp b/packages/thyra/adapters/tpetra/src/Thyra_TpetraEuclideanScalarProd_def.hpp index 72ba5ed8a972..7d5e0e4c8018 100644 --- a/packages/thyra/adapters/tpetra/src/Thyra_TpetraEuclideanScalarProd_def.hpp +++ b/packages/thyra/adapters/tpetra/src/Thyra_TpetraEuclideanScalarProd_def.hpp @@ -70,12 +70,6 @@ void TpetraEuclideanScalarProd::scalarPr // in EuclideanScalarProd transposes X... 
X_tpetra->dot(*Y_tpetra, scalarProds_out); } else { - // If one of the casts succeeded, sync that MV to host space - if (nonnull(X_tpetra)) - Teuchos::rcp_const_cast(X_tpetra)->sync_host (); - if (nonnull(Y_tpetra)) - Teuchos::rcp_const_cast(Y_tpetra)->sync_host (); - EuclideanScalarProd::scalarProdsImpl(X, Y, scalarProds_out); } } diff --git a/packages/thyra/adapters/tpetra/src/Thyra_TpetraLinearOp_def.hpp b/packages/thyra/adapters/tpetra/src/Thyra_TpetraLinearOp_def.hpp index 3f26dcba8249..1e01c63e2d67 100644 --- a/packages/thyra/adapters/tpetra/src/Thyra_TpetraLinearOp_def.hpp +++ b/packages/thyra/adapters/tpetra/src/Thyra_TpetraLinearOp_def.hpp @@ -413,14 +413,16 @@ void TpetraLinearOp::getRowStatImpl( size_t numMyRows = tCrsMatrix->getNodeNumRows(); - Teuchos::ArrayView indices; - Teuchos::ArrayView values; + using crs_t = Tpetra::CrsMatrix; + typename crs_t::local_inds_host_view_type indices; + typename crs_t::values_host_view_type values; + for (size_t row=0; row < numMyRows; ++row) { MT sum = STM::zero (); tCrsMatrix->getLocalRowView (row, indices, values); - for (int col = 0; col < values.size(); ++col) { + for (int col = 0; col < (int) values.size(); ++col) { sum += STS::magnitude (values[col]); } diff --git a/packages/thyra/adapters/tpetra/src/Thyra_TpetraMultiVector_def.hpp b/packages/thyra/adapters/tpetra/src/Thyra_TpetraMultiVector_def.hpp index 93aa0a45e584..8bf18ee68116 100644 --- a/packages/thyra/adapters/tpetra/src/Thyra_TpetraMultiVector_def.hpp +++ b/packages/thyra/adapters/tpetra/src/Thyra_TpetraMultiVector_def.hpp @@ -130,9 +130,6 @@ assignMultiVecImpl(const MultiVectorBase& mv) if (nonnull(tmv)) { tpetraMultiVector_.getNonconstObj()->assign(*tmv); } else { - // This version will require/modify the host view of this vector. 
- tpetraMultiVector_.getNonconstObj()->sync_host (); - tpetraMultiVector_.getNonconstObj()->modify_host (); MultiVectorDefaultBase::assignMultiVecImpl(mv); } } @@ -160,9 +157,6 @@ void TpetraMultiVector::updateImpl( typedef Teuchos::ScalarTraits ST; tpetraMultiVector_.getNonconstObj()->update(alpha, *tmv, ST::one()); } else { - // This version will require/modify the host view of this vector. - tpetraMultiVector_.getNonconstObj()->sync_host (); - tpetraMultiVector_.getNonconstObj()->modify_host (); MultiVectorDefaultBase::updateImpl(alpha, mv); } } @@ -239,9 +233,6 @@ void TpetraMultiVector::linearCombinatio *alphaIter, *(*tmvIter), *(alphaIter+1), *(*(tmvIter+1)), ST::one()); } } else { - // This version will require/modify the host view of this vector. - tpetraMultiVector_.getNonconstObj()->sync_host (); - tpetraMultiVector_.getNonconstObj()->modify_host (); MultiVectorDefaultBase::linearCombinationImpl(alpha, mv, beta); } } @@ -260,9 +251,6 @@ void TpetraMultiVector::dotsImpl( if (nonnull(tmv)) { tpetraMultiVector_.getConstObj()->dot(*tmv, prods); } else { - // This version will require/modify the host view of this vector. 
- tpetraMultiVector_.getNonconstObj()->sync_host (); - tpetraMultiVector_.getNonconstObj()->modify_host (); MultiVectorDefaultBase::dotsImpl(mv, prods); } } @@ -459,25 +447,6 @@ mvMultiReductApplyOpImpl( const Ordinal primary_global_offset ) const { - typedef TpetraMultiVector TMV; - - // Sync any non-target Tpetra MVs to host space - for (auto itr = multi_vecs.begin(); itr != multi_vecs.end(); ++itr) { - Ptr tmv = Teuchos::ptr_dynamic_cast(*itr); - if (nonnull(tmv)) { - Teuchos::rcp_const_cast >( - tmv->getConstTpetraMultiVector())-> sync_host (); - } - } - - // Sync any target Tpetra MVs and mark modified - for (auto itr = targ_multi_vecs.begin(); itr != targ_multi_vecs.end(); ++itr) { - Ptr tmv = Teuchos::ptr_dynamic_cast(*itr); - if (nonnull(tmv)) { - tmv->getTpetraMultiVector()->sync_host (); - tmv->getTpetraMultiVector()->modify_host (); - } - } MultiVectorAdapterBase::mvMultiReductApplyOpImpl( primary_op, multi_vecs, targ_multi_vecs, reduct_objs, primary_global_offset); @@ -492,11 +461,6 @@ acquireDetachedMultiVectorViewImpl( RTOpPack::ConstSubMultiVectorView* sub_mv ) const { - // Only viewing data, so just sync dual view to host space - typedef typename Tpetra::MultiVector TMV; - Teuchos::rcp_const_cast( - tpetraMultiVector_.getConstObj())->sync_host (); - SpmdMultiVectorDefaultBase:: acquireDetachedMultiVectorViewImpl(rowRng, colRng, sub_mv); } @@ -510,10 +474,6 @@ acquireNonconstDetachedMultiVectorViewImpl( RTOpPack::SubMultiVectorView* sub_mv ) { - // Sync to host and mark as modified - tpetraMultiVector_.getNonconstObj()->sync_host (); - tpetraMultiVector_.getNonconstObj()->modify_host (); - SpmdMultiVectorDefaultBase:: acquireNonconstDetachedMultiVectorViewImpl(rowRng, colRng, sub_mv); } @@ -528,10 +488,6 @@ commitNonconstDetachedMultiVectorViewImpl( SpmdMultiVectorDefaultBase:: commitNonconstDetachedMultiVectorViewImpl(sub_mv); - // Sync changes from host view to execution space - typedef typename Tpetra::MultiVector< - 
Scalar,LocalOrdinal,GlobalOrdinal,Node>::execution_space execution_space; - tpetraMultiVector_.getNonconstObj()->template sync(); } @@ -627,13 +583,6 @@ void TpetraMultiVector::euclideanApply( // If the cast succeeded, call Tpetra directly. // Otherwise, fall back to the default implementation. if (nonnull(X_tpetra) && nonnull(Y_tpetra)) { - // Sync everything to the execution space - typedef typename TMV::execution_space execution_space; - Teuchos::rcp_const_cast(X_tpetra)->template sync(); - Y_tpetra->template sync(); - Teuchos::rcp_const_cast( - tpetraMultiVector_.getConstObj())->template sync(); - typedef Teuchos::ScalarTraits ST; TEUCHOS_TEST_FOR_EXCEPTION(ST::isComplex && (M_trans == CONJ), std::logic_error, @@ -655,12 +604,9 @@ void TpetraMultiVector::euclideanApply( break; } - Y_tpetra->template modify(); Y_tpetra->multiply(trans, Teuchos::NO_TRANS, alpha, *tpetraMultiVector_.getConstObj(), *X_tpetra, beta); Kokkos::fence(); } else { - Teuchos::rcp_const_cast( - tpetraMultiVector_.getConstObj())->sync_host (); SpmdMultiVectorDefaultBase::euclideanApply(M_trans, X, Y, alpha, beta); } diff --git a/packages/thyra/adapters/tpetra/src/Thyra_TpetraVector_def.hpp b/packages/thyra/adapters/tpetra/src/Thyra_TpetraVector_def.hpp index 88b93f1b7bc0..57bfb80cbdc3 100644 --- a/packages/thyra/adapters/tpetra/src/Thyra_TpetraVector_def.hpp +++ b/packages/thyra/adapters/tpetra/src/Thyra_TpetraVector_def.hpp @@ -176,9 +176,6 @@ void TpetraVector::absImpl( if (nonnull(tx)) { tpetraVector_.getNonconstObj()->abs(*tx); } else { - // This version will require/modify the host view of this vector. - tpetraVector_.getNonconstObj()->sync_host (); - tpetraVector_.getNonconstObj()->modify_host (); VectorDefaultBase::absImpl(x); } } @@ -196,9 +193,6 @@ void TpetraVector::reciprocalImpl( if (nonnull(tx)) { tpetraVector_.getNonconstObj()->reciprocal(*tx); } else { - // This version will require/modify the host view of this vector. 
- tpetraVector_.getNonconstObj()->sync_host (); - tpetraVector_.getNonconstObj()->modify_host (); VectorDefaultBase::reciprocalImpl(x); } } @@ -218,9 +212,6 @@ void TpetraVector::eleWiseScaleImpl( tpetraVector_.getNonconstObj()->elementWiseMultiply( ST::one(), *tx, *tpetraVector_.getConstObj(), ST::zero()); } else { - // This version will require/modify the host view of this vector. - tpetraVector_.getNonconstObj()->sync_host (); - tpetraVector_.getNonconstObj()->modify_host (); VectorDefaultBase::eleWiseScaleImpl(x); } } @@ -245,8 +236,6 @@ TpetraVector::norm2WeightedImpl( ST::one(), *tx, *tpetraVector_.getConstObj(), ST::zero()); return ST::magnitude(ST::squareroot(tpetraVector_.getConstObj()->dot(*temp))); } else { - // This version will require the host view of this vector. - tpetraVector_.getNonconstObj()->sync_host (); return VectorDefaultBase::norm2WeightedImpl(x); } } @@ -261,24 +250,6 @@ void TpetraVector::applyOpImpl( const Ordinal global_offset ) const { - // Sync any non-target Tpetra vecs to host space - for (auto itr = vecs.begin(); itr != vecs.end(); ++itr) { - auto tv = this->getConstTpetraVector(Teuchos::rcpFromPtr(*itr)); - if (nonnull(tv)) { - typedef Tpetra::Vector TV; - Teuchos::rcp_const_cast(tv)->sync_host (); - } - } - - // Sync any target Tpetra vecs and mark modified on host - for (auto itr = targ_vecs.begin(); itr != targ_vecs.end(); ++itr) { - auto tv = this->getTpetraVector(Teuchos::rcpFromPtr(*itr)); - if (nonnull(tv)) { - tv->sync_host (); - tv->modify_host (); - } - } - SpmdVectorDefaultBase::applyOpImpl(op, vecs, targ_vecs, reduct_obj, global_offset); } @@ -290,11 +261,6 @@ acquireDetachedVectorViewImpl( RTOpPack::ConstSubVectorView* sub_vec ) const { - // Only viewing data, so just sync dual view to host space - typedef typename Tpetra::Vector TV; - Teuchos::rcp_const_cast( - tpetraVector_.getConstObj())->sync_host (); - SpmdVectorDefaultBase::acquireDetachedVectorViewImpl(rng, sub_vec); } @@ -306,9 +272,6 @@ 
acquireNonconstDetachedVectorViewImpl( RTOpPack::SubVectorView* sub_vec ) { - // Sync to host and mark as modified - tpetraVector_.getNonconstObj()->sync_host (); - tpetraVector_.getNonconstObj()->modify_host (); SpmdVectorDefaultBase::acquireNonconstDetachedVectorViewImpl(rng, sub_vec); } @@ -321,11 +284,6 @@ commitNonconstDetachedVectorViewImpl( ) { SpmdVectorDefaultBase::commitNonconstDetachedVectorViewImpl(sub_vec); - - // Sync changes from host view to execution space - typedef typename Tpetra::Vector< - Scalar,LocalOrdinal,GlobalOrdinal,Node>::execution_space execution_space; - tpetraVector_.getNonconstObj()->template sync(); } @@ -350,9 +308,6 @@ assignMultiVecImpl(const MultiVectorBase& mv) if (nonnull(tmv)) { tpetraVector_.getNonconstObj()->assign(*tmv); } else { - // This version will require/modify the host view of this vector. - tpetraVector_.getNonconstObj()->sync_host (); - tpetraVector_.getNonconstObj()->modify_host (); MultiVectorDefaultBase::assignMultiVecImpl(mv); } } @@ -379,9 +334,6 @@ void TpetraVector::updateImpl( if (nonnull(tmv)) { tpetraVector_.getNonconstObj()->update(alpha, *tmv, ST::one()); } else { - // This version will require/modify the host view of this vector. - tpetraVector_.getNonconstObj()->sync_host(); - tpetraVector_.getNonconstObj()->modify_host(); MultiVectorDefaultBase::updateImpl(alpha, mv); } } @@ -458,9 +410,6 @@ void TpetraVector::linearCombinationImpl *alphaIter, *(*tmvIter), *(alphaIter+1), *(*(tmvIter+1)), ST::one()); } } else { - // This version will require/modify the host view of this vector. - tpetraVector_.getNonconstObj()->sync_host (); - tpetraVector_.getNonconstObj()->modify_host (); MultiVectorDefaultBase::linearCombinationImpl(alpha, mv, beta); } } @@ -479,9 +428,6 @@ void TpetraVector::dotsImpl( if (nonnull(tmv)) { tpetraVector_.getConstObj()->dot(*tmv, prods); } else { - // This version will require/modify the host view of this vector. 
- tpetraVector_.getNonconstObj()->sync_host (); - tpetraVector_.getNonconstObj()->modify_host (); MultiVectorDefaultBase::dotsImpl(mv, prods); } } @@ -532,12 +478,6 @@ void TpetraVector::applyImpl( // If the cast succeeded, call Tpetra directly. // Otherwise, fall back to the default implementation. if (nonnull(X_tpetra) && nonnull(Y_tpetra)) { - // Sync everything to the execution space - typedef typename TMV::execution_space execution_space; - Teuchos::rcp_const_cast(X_tpetra)->template sync(); - Y_tpetra->template sync(); - Teuchos::rcp_const_cast(tpetraVector_.getConstObj())->template sync(); - typedef Teuchos::ScalarTraits ST; TEUCHOS_TEST_FOR_EXCEPTION(ST::isComplex && (M_trans == CONJ), std::logic_error, @@ -559,11 +499,9 @@ void TpetraVector::applyImpl( break; } - Y_tpetra->template modify(); Y_tpetra->multiply(trans, Teuchos::NO_TRANS, alpha, *tpetraVector_.getConstObj(), *X_tpetra, beta); Kokkos::fence(); } else { - Teuchos::rcp_const_cast(tpetraVector_.getConstObj())->sync_host (); VectorDefaultBase::applyImpl(M_trans, X, Y, alpha, beta); } diff --git a/packages/thyra/adapters/tpetra/test/Simple2DTpetraModelEvaluator_UnitTests.cpp b/packages/thyra/adapters/tpetra/test/Simple2DTpetraModelEvaluator_UnitTests.cpp index 3f8cf917de9f..016db9a0ba76 100644 --- a/packages/thyra/adapters/tpetra/test/Simple2DTpetraModelEvaluator_UnitTests.cpp +++ b/packages/thyra/adapters/tpetra/test/Simple2DTpetraModelEvaluator_UnitTests.cpp @@ -135,9 +135,9 @@ TEUCHOS_UNIT_TEST_TEMPLATE_1_DECL( Simple2DTpetraModelEvaluator, eval, Scalar ) const RCP > W_tpetra = rcp_dynamic_cast >( ConverterT::getTpetraOperator(W_op)); - - ArrayView row_indices; - ArrayView row_values; + using crs_t = Tpetra::CrsMatrix; + typename crs_t::local_inds_host_view_type row_indices; + typename crs_t::values_host_view_type row_values; W_tpetra->getLocalRowView(0, row_indices, row_values); // FIXME (mfh 22 Oct 2015) This test assumes that local indices diff --git 
a/packages/tpetra/core/example/BlockCrs/Tpetra_TestBlockCrs.cpp b/packages/tpetra/core/example/BlockCrs/Tpetra_TestBlockCrs.cpp index d71ca755d6de..9b49fa8cd2c9 100644 --- a/packages/tpetra/core/example/BlockCrs/Tpetra_TestBlockCrs.cpp +++ b/packages/tpetra/core/example/BlockCrs/Tpetra_TestBlockCrs.cpp @@ -281,9 +281,7 @@ int main (int argc, char *argv[]) // - all internal views are allocated on device; mirror as mesh database is constructed on host const auto mesh_gids_host = mesh.getElementGlobalIDs(); const auto mesh_gids = - Kokkos::create_mirror_view (typename exec_space::memory_space {}, - mesh.getElementGlobalIDs ()); - Kokkos::deep_copy(mesh_gids, mesh_gids_host); + Kokkos::create_mirror_view_and_copy (typename exec_space::memory_space(), mesh_gids_host); // for convenience, separate the access to owned and remote gids const auto owned_gids = @@ -315,9 +313,9 @@ int main (int argc, char *argv[]) // Graph Construction // ------------------ // local graph is constructed on device space - typedef tpetra_crs_graph_type::local_graph_type local_graph_type; - typedef local_graph_type::row_map_type::non_const_type rowptr_view_type; - typedef typename local_graph_type::entries_type colidx_view_type; + typedef tpetra_crs_graph_type::local_graph_device_type local_graph_device_type; + typedef local_graph_device_type::row_map_type::non_const_type rowptr_view_type; + typedef typename local_graph_device_type::entries_type colidx_view_type; rowptr_view_type rowptr; colidx_view_type colidx; @@ -344,8 +342,7 @@ int main (int argc, char *argv[]) // the last entry of rowptr is the total number of nonzeros in the local graph // mirror to host to use the information in constructing colidx auto nnz = Kokkos::subview(rowptr, num_owned_elements); - const auto nnz_host = Kokkos::create_mirror_view(nnz); - Kokkos::deep_copy(nnz_host, nnz); + const auto nnz_host = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), nnz); // allocate colidx colidx = colidx_view_type("colidx", 
nnz_host()); @@ -370,7 +367,14 @@ int main (int argc, char *argv[]) RCP bcrs_graph; { TimeMonitor timerGlobalGraphConstruction(*TimeMonitor::getNewTimer("1) GlobalGraphConstruction")); - bcrs_graph = rcp(new tpetra_crs_graph_type(row_map, col_map, local_graph_type(colidx, rowptr), + rowptr_view_type rowptr_tpetra = + rowptr_view_type(Kokkos::ViewAllocateWithoutInitializing("rowptr_tpetra"), rowptr.extent(0)); + colidx_view_type colidx_tpetra = + colidx_view_type(Kokkos::ViewAllocateWithoutInitializing("colidx_tpetra"), colidx.extent(0)); + Kokkos::deep_copy(rowptr_tpetra, rowptr); + Kokkos::deep_copy(colidx_tpetra, colidx); + bcrs_graph = rcp(new tpetra_crs_graph_type(row_map, col_map, + local_graph_device_type(colidx_tpetra, rowptr_tpetra), Teuchos::null)); } // end global graph timer @@ -398,29 +402,31 @@ int main (int argc, char *argv[]) // Tpetra BlockCrsMatrix only has high level access functions // To fill this on device, we need an access to the meta data of blocks - const auto rowptr_host = Kokkos::create_mirror_view(rowptr); - const auto colidx_host = Kokkos::create_mirror_view(colidx); - - Kokkos::deep_copy(rowptr_host, rowptr); - Kokkos::deep_copy(colidx_host, colidx); + const auto rowptr_host = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), rowptr); + const auto colidx_host = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), colidx); blocks = Kokkos::View("blocks", rowptr_host(num_owned_elements)); - - const auto blocks_host = Kokkos::create_mirror_view(blocks); + auto blocks_host = Kokkos::create_mirror_view(Kokkos::HostSpace(), blocks); // This MUST run on host, since it invokes a host-only method, // getLocalBlock. This means we must NOT use KOKKOS_LAMBDA, // since that would build the lambda for both host AND device. 
- Kokkos::parallel_for - (Kokkos::RangePolicy (0, num_owned_elements), - [&] (const LO row) { - const auto beg = rowptr_host(row); - const auto end = rowptr_host(row+1); - typedef typename std::remove_const::type offset_type; - for (offset_type loc = beg; loc < end; ++loc) { - blocks_host(loc) = A_bcrs->getLocalBlock(row, colidx(loc)); - } - }); + /// without UVM, the getLocalBlockDeviceNonConst cannot be called within the parallel for + /// even though it is host execution space as the method can involve kernel launch + /// for memory transfers. + // Kokkos::parallel_for + // (Kokkos::RangePolicy (0, num_owned_elements), + // [&] (const LO row) { + for (LO row=0;row::type offset_type; + for (offset_type loc = beg; loc < end; ++loc) { + blocks_host(loc) = A_bcrs->getLocalBlockDeviceNonConst(row, colidx_host(loc)); + } + } + // }); + Kokkos::deep_copy(blocks, blocks_host); Kokkos::parallel_for @@ -519,7 +525,7 @@ int main (int argc, char *argv[]) // point-wise row map can be obtained from A_bcrs->getDomainMap(). // A constructor exist for crs matrix with a local matrix and a row map. 
// see, Tpetra_CrsMatrix_decl.hpp, line 504 - // CrsMatrix (const local_matrix_type& lclMatrix, + // CrsMatrix (const local_matrix_device_type& lclMatrix, // const Teuchos::RCP& rowMap, // const Teuchos::RCP& colMap = Teuchos::null, // const Teuchos::RCP& domainMap = Teuchos::null, @@ -545,7 +551,7 @@ int main (int argc, char *argv[]) rowptr_view_type crs_rowptr = rowptr_view_type("crs_rowptr", num_owned_elements*blocksize+1); colidx_view_type crs_colidx = colidx_view_type("crs_colidx", colidx.extent(0)*blocksize*blocksize); - typename tpetra_crs_matrix_type::local_matrix_type::values_type + typename tpetra_crs_matrix_type::local_matrix_device_type::values_type crs_values("crs_values", colidx.extent(0)*blocksize*blocksize); Kokkos::parallel_for @@ -579,11 +585,11 @@ int main (int argc, char *argv[]) } }); - typename tpetra_crs_matrix_type::local_matrix_type + typename tpetra_crs_matrix_type::local_matrix_device_type local_matrix("local_crs_matrix", num_owned_and_remote_elements*blocksize, crs_values, - local_graph_type(crs_colidx, crs_rowptr)); + local_graph_device_type(crs_colidx, crs_rowptr)); A_crs = rcp(new tpetra_crs_matrix_type(row_crs_map, col_crs_map, diff --git a/packages/tpetra/core/example/BlockCrs/Tpetra_TestBlockCrsMeshDatabase.hpp b/packages/tpetra/core/example/BlockCrs/Tpetra_TestBlockCrsMeshDatabase.hpp index bce848fbb7b7..d66687be669b 100644 --- a/packages/tpetra/core/example/BlockCrs/Tpetra_TestBlockCrsMeshDatabase.hpp +++ b/packages/tpetra/core/example/BlockCrs/Tpetra_TestBlockCrsMeshDatabase.hpp @@ -470,7 +470,7 @@ namespace BlockCrsTest { local_ordinal_range_type _remote_range_j; local_ordinal_range_type _remote_range_k; - typedef typename tpetra_crs_graph_type::local_graph_type::row_map_type::non_const_type rowptr_view_type; + typedef typename tpetra_crs_graph_type::local_graph_device_type::row_map_type::non_const_type rowptr_view_type; rowptr_view_type _rowptr; typedef typename rowptr_view_type::non_const_value_type scan_value_type; @@ -536,8 
+536,8 @@ namespace BlockCrsTest { struct LocalGraphFill { private: - typedef typename tpetra_crs_graph_type::local_graph_type::row_map_type::non_const_type rowptr_view_type; - typedef typename tpetra_crs_graph_type::local_graph_type::entries_type colidx_view_type; + typedef typename tpetra_crs_graph_type::local_graph_device_type::row_map_type::non_const_type rowptr_view_type; + typedef typename tpetra_crs_graph_type::local_graph_device_type::entries_type colidx_view_type; MeshDatabase::StructuredBlock _sb; MeshDatabase::global_ordinal_view_type _owned_gids; diff --git a/packages/tpetra/core/example/Finite-Element-Assembly/fem_assembly_Element.hpp b/packages/tpetra/core/example/Finite-Element-Assembly/fem_assembly_Element.hpp index b3f6da5620a9..4b7438e0549c 100644 --- a/packages/tpetra/core/example/Finite-Element-Assembly/fem_assembly_Element.hpp +++ b/packages/tpetra/core/example/Finite-Element-Assembly/fem_assembly_Element.hpp @@ -100,7 +100,6 @@ KOKKOS_INLINE_FUNCTION void ReferenceQuad4RHS(ViewType& rhs) { rhs[i] = static_cast(.25); } -template<> void ReferenceQuad4RHS(Teuchos::Array& rhs) { for(int i=0; (int)i(.25); diff --git a/packages/tpetra/core/example/Finite-Element-Assembly/fem_assembly_InsertGlobalIndices_FE.hpp b/packages/tpetra/core/example/Finite-Element-Assembly/fem_assembly_InsertGlobalIndices_FE.hpp index 8938d7586b98..e54711a3472d 100644 --- a/packages/tpetra/core/example/Finite-Element-Assembly/fem_assembly_InsertGlobalIndices_FE.hpp +++ b/packages/tpetra/core/example/Finite-Element-Assembly/fem_assembly_InsertGlobalIndices_FE.hpp @@ -139,10 +139,10 @@ int executeInsertGlobalIndicesFESP_(const Teuchos::RCP // ----------------- // -- https://trilinos.org/docs/dev/packages/tpetra/doc/html/classTpetra_1_1Map.html#a24490b938e94f8d4f31b6c0e4fc0ff77 RCP row_map = - rcp(new map_type(GO_INVALID, mesh.getOwnedNodeGlobalIDs(), + rcp(new map_type(GO_INVALID, mesh.getOwnedNodeGlobalIDs().getDeviceView(Tpetra::Access::ReadOnly), 0, comm)); RCP 
owned_plus_shared_map = - rcp(new map_type(GO_INVALID, mesh.getOwnedAndGhostNodeGlobalIDs(), + rcp(new map_type(GO_INVALID, mesh.getOwnedAndGhostNodeGlobalIDs().getDeviceView(Tpetra::Access::ReadOnly), 0, comm)); if(opts.verbose) row_map->describe(out); @@ -156,7 +156,7 @@ int executeInsertGlobalIndicesFESP_(const Teuchos::RCP auto domain_map = row_map; auto range_map = row_map; - auto owned_element_to_node_ids = mesh.getOwnedElementToNode(); + auto owned_element_to_node_ids = mesh.getOwnedElementToNode().getHostView(Tpetra::Access::ReadOnly); Teuchos::TimeMonitor::getStackedTimer()->startBaseTimer(); RCP timerElementLoopGraph = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("1) ElementLoop (Graph)"))); @@ -176,7 +176,8 @@ int executeInsertGlobalIndicesFESP_(const Teuchos::RCP // each row associated with this element's contribution. for(size_t element_node_idx=0; element_node_idx ReferenceQuad4(element_matrix); ReferenceQuad4RHS(element_rhs); - // Fill the global column ids array for this element for (size_t element_node_idx=0; - element_node_idx < owned_element_to_node_ids.extent(1); - ++element_node_idx) { - column_global_ids[element_node_idx] = - owned_element_to_node_ids(element_gidx, element_node_idx); + element_node_idx < owned_element_to_node_ids.extent(1); + ++element_node_idx) { + column_global_ids[element_node_idx] = + owned_element_to_node_ids(element_gidx, element_node_idx); } // For each node (row) on the current element: @@ -275,16 +275,16 @@ int executeInsertGlobalIndicesFESP_(const Teuchos::RCP // - add the values to the fe_matrix. // Note: hardcoded 4 here because we're using quads. 
for (size_t element_node_idx = 0; element_node_idx < 4; - ++element_node_idx) { - global_ordinal_type global_row_id = - owned_element_to_node_ids(element_gidx, element_node_idx); - - for(size_t col_idx=0; col_idx<4; col_idx++) { - column_scalar_values[col_idx] = element_matrix(element_node_idx, col_idx); - } - - fe_matrix->sumIntoGlobalValues(global_row_id, column_global_ids, column_scalar_values); - rhs->sumIntoGlobalValue(global_row_id, 0, element_rhs[element_node_idx]); + ++element_node_idx) { + global_ordinal_type global_row_id = + owned_element_to_node_ids(element_gidx, element_node_idx); + + for(size_t col_idx=0; col_idx<4; col_idx++) { + column_scalar_values[col_idx] = element_matrix(element_node_idx, col_idx); + } + + fe_matrix->sumIntoGlobalValues(global_row_id, column_global_ids, column_scalar_values); + rhs->sumIntoGlobalValue(global_row_id, 0, element_rhs[element_node_idx]); } } } // timerElementLoopMatrix @@ -357,10 +357,10 @@ int executeInsertGlobalIndicesFESPKokkos_(const Teuchos::RCP row_map = - rcp (new map_type (GO_INVALID, mesh.getOwnedNodeGlobalIDs (), + rcp (new map_type (GO_INVALID, mesh.getOwnedNodeGlobalIDs().getDeviceView(Tpetra::Access::ReadOnly), 0, comm)); RCP owned_plus_shared_map = - rcp (new map_type (GO_INVALID, mesh.getOwnedAndGhostNodeGlobalIDs (), + rcp (new map_type (GO_INVALID, mesh.getOwnedAndGhostNodeGlobalIDs().getDeviceView(Tpetra::Access::ReadOnly), 0, comm)); if (opts.verbose) { @@ -376,7 +376,7 @@ int executeInsertGlobalIndicesFESPKokkos_(const Teuchos::RCPstartBaseTimer(); @@ -471,7 +471,7 @@ int executeInsertGlobalIndicesFESPKokkos_(const Teuchos::RCP rhs = rcp (new fe_multivector_type(domain_map, fe_graph->getImporter(), 1)); - auto localMatrix = fe_matrix->getLocalMatrix(); + auto localMatrix = fe_matrix->getLocalMatrixDevice(); auto localRHS = rhs->getLocalViewDevice(Tpetra::Access::OverwriteAll); auto localMap = owned_plus_shared_map->getLocalMap(); auto localColMap = fe_matrix->getColMap()->getLocalMap(); @@ -482,7 
+482,7 @@ int executeInsertGlobalIndicesFESPKokkos_(const Teuchos::RCP > comm, { // NOTE: Elements/nodes are numbered sequentially with x as the "fast" direction - + // NOTE: assembly is all on host, so the overall scopeguard is sufficient here // Get processor decomp information MyRank_ = comm_->getRank(); ij_from_idx(globalProcs_[0],MyRank_,myProcIJ_[0],myProcIJ_[1]); @@ -202,40 +203,43 @@ MeshDatabase::MeshDatabase(Teuchos::RCP > comm, } // Generate the owned element ids - Kokkos::resize(ownedElementGlobalIDs_,num_my_elements); + auto ownedElementGlobalIDs = ownedElementGlobalIDs_.getHostView(Tpetra::Access::ReadWrite); + Kokkos::resize(ownedElementGlobalIDs,num_my_elements); int ect=0; for(global_ordinal_type j=myElementStart_[1]; j > comm, } // NOTE: This are not recorded in Aztec/Ifpack/ML ordering. Because most apps don't do that. - Kokkos::resize(ghostElementGlobalIDs_,my_ghost_elements.size()); - Kokkos::resize(ghostElementToNode_,my_ghost_elements.size()); + auto _ghostElementGlobalIDs = ghostElementGlobalIDs_.getHostView(Tpetra::Access::ReadWrite); + auto _ghostElementToNode = ghostElementToNode_.getHostView(Tpetra::Access::ReadWrite); + Kokkos::resize(_ghostElementGlobalIDs,my_ghost_elements.size()); + Kokkos::resize(_ghostElementToNode,my_ghost_elements.size()); for(size_t k=0; k my_ghost_nodes; - for(size_t k=0; k >& co // Build Tpetra Maps // ----------------- // -- https://trilinos.org/docs/dev/packages/tpetra/doc/html/classTpetra_1_1Map.html#a24490b938e94f8d4f31b6c0e4fc0ff77 - RCP row_map = rcp(new map_type(GO_INVALID, mesh.getOwnedNodeGlobalIDs(), 0, comm)); - RCP owned_element_map = rcp(new map_type(GO_INVALID, mesh.getOwnedElementGlobalIDs(), 0, comm)); - RCP ghost_element_map = rcp(new map_type(GO_INVALID, mesh.getGhostElementGlobalIDs(), 0, comm)); + RCP row_map = rcp(new map_type(GO_INVALID, mesh.getOwnedNodeGlobalIDs().getDeviceView(Tpetra::Access::ReadOnly), 0, comm)); + RCP owned_element_map = rcp(new map_type(GO_INVALID, 
mesh.getOwnedElementGlobalIDs().getDeviceView(Tpetra::Access::ReadOnly), 0, comm)); + RCP ghost_element_map = rcp(new map_type(GO_INVALID, mesh.getGhostElementGlobalIDs().getDeviceView(Tpetra::Access::ReadOnly), 0, comm)); RCP elementImporter = rcp(new import_type(owned_element_map,ghost_element_map)); if(opts.verbose) row_map->describe(out); @@ -149,8 +149,8 @@ int executeTotalElementLoopSP_(const Teuchos::RCP >& co auto domain_map = row_map; auto range_map = row_map; - auto owned_element_to_node_ids = mesh.getOwnedElementToNode(); - auto ghost_element_to_node_ids = mesh.getGhostElementToNode(); + auto owned_element_to_node_ids = mesh.getOwnedElementToNode().getHostView(Tpetra::Access::ReadOnly); + auto ghost_element_to_node_ids = mesh.getGhostElementToNode().getHostView(Tpetra::Access::ReadOnly); Teuchos::TimeMonitor::getStackedTimer()->startBaseTimer(); RCP timerElementLoopGraph = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("1) ElementLoop (Graph)"))); @@ -183,7 +183,7 @@ int executeTotalElementLoopSP_(const Teuchos::RCP >& co { if(mesh.nodeIsOwned(global_ids_in_row[element_node_idx])) { - crs_graph->insertGlobalIndices(global_ids_in_row[element_node_idx], global_ids_in_row()); + crs_graph->insertGlobalIndices(global_ids_in_row[element_node_idx], global_ids_in_row()); } } } @@ -199,11 +199,12 @@ int executeTotalElementLoopSP_(const Teuchos::RCP >& co { if(mesh.nodeIsOwned(global_ids_in_row[element_node_idx])) { - crs_graph->insertGlobalIndices(global_ids_in_row[element_node_idx], global_ids_in_row()); + crs_graph->insertGlobalIndices(global_ids_in_row[element_node_idx], global_ids_in_row()); } } } + timerElementLoopGraph = Teuchos::null; // 'finalize' the crs_graph by calling fillComplete(). 
@@ -285,10 +286,10 @@ int executeTotalElementLoopSP_(const Teuchos::RCP >& co // Fill the global column ids array for this element for (size_t element_node_idx = 0; - element_node_idx < owned_element_to_node_ids.extent(1); - ++element_node_idx) { + element_node_idx < owned_element_to_node_ids.extent(1); + ++element_node_idx) { column_global_ids[element_node_idx] = - owned_element_to_node_ids(element_gidx, element_node_idx); + owned_element_to_node_ids(element_gidx, element_node_idx); } // For each node (row) on the current element: @@ -296,19 +297,19 @@ int executeTotalElementLoopSP_(const Teuchos::RCP >& co // - add values to crs_matrix if the row is owned. // Note: hardcoded 4 here because we're using quads. for (size_t element_node_idx = 0; element_node_idx < 4; - ++element_node_idx) { + ++element_node_idx) { const global_ordinal_type global_row_id = - owned_element_to_node_ids(element_gidx, element_node_idx); + owned_element_to_node_ids(element_gidx, element_node_idx); if (mesh.nodeIsOwned (global_row_id)) { - for (size_t col_idx = 0; col_idx < 4; ++col_idx) { - column_scalar_values[col_idx] = - element_matrix(element_node_idx, col_idx); - } - crs_matrix.sumIntoGlobalValues (global_row_id, - column_global_ids, - column_scalar_values); - rhs.sumIntoGlobalValue (global_row_id, 0, - element_rhs[element_node_idx]); + for (size_t col_idx = 0; col_idx < 4; ++col_idx) { + column_scalar_values[col_idx] = + element_matrix(element_node_idx, col_idx); + } + crs_matrix.sumIntoGlobalValues (global_row_id, + column_global_ids, + column_scalar_values); + rhs.sumIntoGlobalValue (global_row_id, 0, + element_rhs[element_node_idx]); } } } @@ -331,12 +332,12 @@ int executeTotalElementLoopSP_(const Teuchos::RCP >& co global_ordinal_type global_row_id = ghost_element_to_node_ids(element_gidx, element_node_idx); if(mesh.nodeIsOwned(global_row_id)) { - for(size_t col_idx=0; col_idx<4; col_idx++) + for(size_t col_idx=0; col_idx<4; col_idx++) { - column_scalar_values[col_idx] = 
element_matrix(element_node_idx, col_idx); - } - crs_matrix.sumIntoGlobalValues(global_row_id, column_global_ids, column_scalar_values); - rhs.sumIntoGlobalValue(global_row_id, 0, element_rhs[element_node_idx]); + column_scalar_values[col_idx] = element_matrix(element_node_idx, col_idx); + } + crs_matrix.sumIntoGlobalValues(global_row_id, column_global_ids, column_scalar_values); + rhs.sumIntoGlobalValue(global_row_id, 0, element_rhs[element_node_idx]); } } } @@ -412,9 +413,9 @@ executeTotalElementLoopSPKokkos_ // ----------------- // -- https://trilinos.org/docs/dev/packages/tpetra/doc/html/classTpetra_1_1Map.html#a24490b938e94f8d4f31b6c0e4fc0ff77 RCP row_map = - rcp(new map_type(GO_INVALID, mesh.getOwnedNodeGlobalIDs(), 0, comm)); - RCP owned_element_map = rcp(new map_type(GO_INVALID, mesh.getOwnedElementGlobalIDs(), 0, comm)); - RCP ghost_element_map = rcp(new map_type(GO_INVALID, mesh.getGhostElementGlobalIDs(), 0, comm)); + rcp(new map_type(GO_INVALID, mesh.getOwnedNodeGlobalIDs().getDeviceView(Tpetra::Access::ReadOnly), 0, comm)); + RCP owned_element_map = rcp(new map_type(GO_INVALID, mesh.getOwnedElementGlobalIDs().getDeviceView(Tpetra::Access::ReadOnly), 0, comm)); + RCP ghost_element_map = rcp(new map_type(GO_INVALID, mesh.getGhostElementGlobalIDs().getDeviceView(Tpetra::Access::ReadOnly), 0, comm)); RCP elementImporter = rcp(new import_type(owned_element_map,ghost_element_map)); if(opts.verbose) row_map->describe(out); @@ -428,8 +429,8 @@ executeTotalElementLoopSPKokkos_ auto domain_map = row_map; auto range_map = row_map; - auto owned_element_to_node_ids = mesh.getOwnedElementToNode(); - auto ghost_element_to_node_ids = mesh.getGhostElementToNode(); + auto owned_element_to_node_ids = mesh.getOwnedElementToNode().getHostView(Tpetra::Access::ReadOnly); + auto ghost_element_to_node_ids = mesh.getGhostElementToNode().getHostView(Tpetra::Access::ReadOnly); Teuchos::TimeMonitor::getStackedTimer()->startBaseTimer(); RCP timerElementLoopGraph = rcp(new 
TimeMonitor(*TimeMonitor::getNewTimer("1) ElementLoop (Graph)"))); @@ -462,7 +463,7 @@ executeTotalElementLoopSPKokkos_ { if(mesh.nodeIsOwned(global_ids_in_row[element_node_idx])) { - crs_graph->insertGlobalIndices(global_ids_in_row[element_node_idx], global_ids_in_row()); + crs_graph->insertGlobalIndices(global_ids_in_row[element_node_idx], global_ids_in_row()); } } } @@ -478,7 +479,7 @@ executeTotalElementLoopSPKokkos_ { if(mesh.nodeIsOwned(global_ids_in_row[element_node_idx])) { - crs_graph->insertGlobalIndices(global_ids_in_row[element_node_idx], global_ids_in_row()); + crs_graph->insertGlobalIndices(global_ids_in_row[element_node_idx], global_ids_in_row()); } } } @@ -548,7 +549,7 @@ executeTotalElementLoopSPKokkos_ RCP crs_matrix = rcp(new crs_matrix_type(crs_graph)); RCP rhs = rcp(new multivector_type(crs_graph->getRowMap(), 1)); - auto localMatrix = crs_matrix->getLocalMatrix(); + auto localMatrix = crs_matrix->getLocalMatrixDevice(); auto localRHS = rhs->getLocalViewDevice(Tpetra::Access::OverwriteAll); auto localRowMap = crs_matrix->getRowMap()->getLocalMap(); auto localColMap = crs_matrix->getColMap()->getLocalMap(); @@ -560,7 +561,7 @@ executeTotalElementLoopSPKokkos_ pair_type alln = pair_type(0,nperel); scalar_2d_array_type all_element_matrix("all_element_matrix",nperel*std::max(numOwnedElements,numGhostElements)); scalar_1d_array_type all_element_rhs("all_element_rhs",nperel*std::max(numOwnedElements,numGhostElements)); - local_ordinal_view_type all_lcids("all_lids",nperel*std::max(numOwnedElements,numGhostElements)); + local_ordinal_single_view_type all_lcids("all_lids",nperel*std::max(numOwnedElements,numGhostElements)); timerElementLoopMemory = Teuchos::null; diff --git a/packages/tpetra/core/example/Finite-Element-Assembly/fem_assembly_typedefs.hpp b/packages/tpetra/core/example/Finite-Element-Assembly/fem_assembly_typedefs.hpp index 5b163784b2db..81b7506db893 100644 --- 
a/packages/tpetra/core/example/Finite-Element-Assembly/fem_assembly_typedefs.hpp +++ b/packages/tpetra/core/example/Finite-Element-Assembly/fem_assembly_typedefs.hpp @@ -50,12 +50,14 @@ #include "Tpetra_FECrsMatrix.hpp" #include "Tpetra_MultiVector.hpp" #include "Tpetra_FEMultiVector.hpp" +#include "Tpetra_Details_WrappedDualView.hpp" namespace TpetraExamples { +using deviceType = Tpetra::Map<>::device_type; using local_ordinal_type = Tpetra::Map<>::local_ordinal_type; using global_ordinal_type = Tpetra::Map<>::global_ordinal_type; -using execution_space = Tpetra::Map<>::device_type::execution_space; +using execution_space = deviceType::execution_space; using map_type = Tpetra::Map<>; using crs_graph_type = Tpetra::CrsGraph<>; @@ -69,20 +71,32 @@ using export_type = Tpetra::Export<>; using multivector_type = Tpetra::MultiVector; using fe_multivector_type = Tpetra::FEMultiVector; +using globalDualViewType = Kokkos::DualView; +using localDualViewType = Kokkos::DualView; +using scalarDualViewType = Kokkos::DualView; +using global2DArrayDualViewType = Kokkos::DualView; +using local2DArrayDualViewType = Kokkos::DualView; +using scalar2DArrayDualViewType = Kokkos::DualView; +using boolDualViewType = Kokkos::DualView; using global_ordinal_view_type = - Kokkos::View; + Tpetra::Details::WrappedDualView; using local_ordinal_view_type = + Tpetra::Details::WrappedDualView; +using local_ordinal_single_view_type = Kokkos::View; -using scalar_1d_array_type = Kokkos::View; -using bool_1d_array_type = Kokkos::View; +using scalar_1d_array_type = + Kokkos::View; +using bool_1d_array_type = + Tpetra::Details::WrappedDualView; // NOTE: Arrays are hardwired for QUAD4 using local_ordinal_2d_array_type = - Kokkos::View; + Tpetra::Details::WrappedDualView; using global_ordinal_2d_array_type = - Kokkos::View; -using scalar_2d_array_type = Kokkos::View; + Tpetra::Details::WrappedDualView; +using scalar_2d_array_type = + Kokkos::View; } diff --git 
a/packages/tpetra/core/example/Lesson03-Power-Method/lesson03_power_method.cpp b/packages/tpetra/core/example/Lesson03-Power-Method/lesson03_power_method.cpp index 02e6cf5d1818..f65ac3ef5d3b 100644 --- a/packages/tpetra/core/example/Lesson03-Power-Method/lesson03_power_method.cpp +++ b/packages/tpetra/core/example/Lesson03-Power-Method/lesson03_power_method.cpp @@ -217,6 +217,8 @@ main (int argc, char *argv[]) typedef Tpetra::Vector<>::global_ordinal_type global_ordinal_type; typedef Tpetra::Vector<>::mag_type magnitude_type; typedef Tpetra::CrsMatrix<> crs_matrix_type; + typedef typename crs_matrix_type::nonconst_global_inds_host_view_type gids_type; + typedef typename crs_matrix_type::nonconst_values_host_view_type vals_type; Tpetra::ScopeGuard tpetraScope (&argc, &argv); { @@ -322,8 +324,8 @@ main (int argc, char *argv[]) // the matrix. const global_ordinal_type idOfFirstRow = 0; size_t numEntriesInRow = A->getNumEntriesInGlobalRow (idOfFirstRow); - Array rowvals (numEntriesInRow); - Array rowinds (numEntriesInRow); + vals_type rowvals ("vals",numEntriesInRow); + gids_type rowinds ("gids",numEntriesInRow); // Fill rowvals and rowinds with the values resp. (global) // column indices of the sparse matrix entries owned by the @@ -341,7 +343,7 @@ main (int argc, char *argv[]) // The parentheses after rowinds and rowvalues indicate "a view // of the Array's data." Array::operator() returns an // ArrayView. - A->getGlobalRowCopy (idOfFirstRow, rowinds (), rowvals (), numEntriesInRow); + A->getGlobalRowCopy (idOfFirstRow, rowinds, rowvals, numEntriesInRow); for (size_t i = 0; i < numEntriesInRow; i++) { if (rowinds[i] == idOfFirstRow) { // We have found the diagonal entry; modify it. @@ -354,7 +356,7 @@ main (int argc, char *argv[]) // method throws an exception. If you want to modify the // structure (by adding new entries), you'll need to call // insertGlobalValues(). 
- A->replaceGlobalValues (idOfFirstRow, rowinds (), rowvals ()); + A->replaceGlobalValues (idOfFirstRow, rowinds, rowvals); } // Call fillComplete() again to signal that we are done changing the diff --git a/packages/tpetra/core/example/Lesson07-Kokkos-Fill/05_solve.cpp b/packages/tpetra/core/example/Lesson07-Kokkos-Fill/05_solve.cpp index 05dc8910235a..ff655facd15c 100644 --- a/packages/tpetra/core/example/Lesson07-Kokkos-Fill/05_solve.cpp +++ b/packages/tpetra/core/example/Lesson07-Kokkos-Fill/05_solve.cpp @@ -237,7 +237,7 @@ int main (int argc, char* argv[]) { // then construct a View of it. (Note that a row offset needs to // have a type that can contain the sum of the row counts.) using row_offset_type = - Tpetra::CrsMatrix::local_matrix_type::row_map_type::non_const_value_type; + Tpetra::CrsMatrix::local_matrix_device_type::row_map_type::non_const_value_type; // Use a parallel scan (prefix sum) over the array of row counts, to // compute the array of row offsets for the sparse graph. diff --git a/packages/tpetra/core/example/advanced/Benchmarks/CrsMatrixDenseRowUnpack.cpp b/packages/tpetra/core/example/advanced/Benchmarks/CrsMatrixDenseRowUnpack.cpp index 44e7fd875e98..60b818d57308 100644 --- a/packages/tpetra/core/example/advanced/Benchmarks/CrsMatrixDenseRowUnpack.cpp +++ b/packages/tpetra/core/example/advanced/Benchmarks/CrsMatrixDenseRowUnpack.cpp @@ -233,7 +233,7 @@ RCP