diff --git a/python/taichi/linalg/sparse_matrix.py b/python/taichi/linalg/sparse_matrix.py index 81bc5ef2707e8..e08a8541ca508 100644 --- a/python/taichi/linalg/sparse_matrix.py +++ b/python/taichi/linalg/sparse_matrix.py @@ -243,7 +243,11 @@ def _get_ndarray_addr(self): def print_triplets(self): """Print the triplets stored in the builder""" - self.ptr.print_triplets() + taichi_arch = get_runtime().prog.config().arch + if taichi_arch == _ti_core.Arch.x64 or taichi_arch == _ti_core.Arch.arm64: + self.ptr.print_triplets_eigen() + elif taichi_arch == _ti_core.Arch.cuda: + self.ptr.print_triplets_cuda() def build(self, dtype=f32, _format='CSR'): """Create a sparse matrix using the triplets""" diff --git a/python/taichi/linalg/sparse_solver.py b/python/taichi/linalg/sparse_solver.py index ddf813d81fd90..5085a95b67ba5 100644 --- a/python/taichi/linalg/sparse_solver.py +++ b/python/taichi/linalg/sparse_solver.py @@ -100,32 +100,13 @@ def solve(self, b): # pylint: disable=R1710 return self.solver.solve(b) if isinstance(b, Ndarray): x = ScalarNdarray(b.dtype, [self.matrix.m]) - self.solve_rf(self.matrix, b, x) + self.solver.solve_rf(get_runtime().prog, self.matrix.matrix, b.arr, + x.arr) return x raise TaichiRuntimeError( f"The parameter type: {type(b)} is not supported in linear solvers for now." ) - def solve_cu(self, sparse_matrix, b): - if isinstance(sparse_matrix, SparseMatrix) and isinstance(b, Ndarray): - x = ScalarNdarray(b.dtype, [sparse_matrix.m]) - self.solver.solve_cu(get_runtime().prog, sparse_matrix.matrix, - b.arr, x.arr) - return x - raise TaichiRuntimeError( - f"The parameter type: {type(sparse_matrix)}, {type(b)} and {type(x)} is not supported in linear solvers for now." - ) - - def solve_rf(self, sparse_matrix, b, x): - if isinstance(sparse_matrix, SparseMatrix) and isinstance( - b, Ndarray) and isinstance(x, Ndarray): - self.solver.solve_rf(get_runtime().prog, sparse_matrix.matrix, - b.arr, x.arr) - else: - raise TaichiRuntimeError( - f"The parameter type: {type(sparse_matrix)}, {type(b)} and {type(x)} is not supported in linear solvers for now." - ) - def info(self): """Check if the linear systems are solved successfully. diff --git a/taichi/program/sparse_matrix.cpp b/taichi/program/sparse_matrix.cpp index e631d76bb895f..571cc2e3a0393 100644 --- a/taichi/program/sparse_matrix.cpp +++ b/taichi/program/sparse_matrix.cpp @@ -99,17 +99,53 @@ SparseMatrixBuilder::SparseMatrixBuilder(int rows, prog_, dtype_, std::vector{3 * (int)max_num_triplets_ + 1}); } -void SparseMatrixBuilder::print_triplets() { - num_triplets_ = ndarray_data_base_ptr_->read_int(std::vector{0}); +template +void SparseMatrixBuilder::print_triplets_template() { + auto ptr = get_ndarray_data_ptr(); + G *data = reinterpret_cast(ptr); + num_triplets_ = data[0]; fmt::print("n={}, m={}, num_triplets={} (max={})\n", rows_, cols_, num_triplets_, max_num_triplets_); + data += 1; for (int i = 0; i < num_triplets_; i++) { - auto idx = 3 * i + 1; - auto row = ndarray_data_base_ptr_->read_int(std::vector{idx}); - auto col = ndarray_data_base_ptr_->read_int(std::vector{idx + 1}); - auto val = ndarray_data_base_ptr_->read_float(std::vector{idx + 2}); + fmt::print("[{}, {}] = {}\n", data[i * 3], data[i * 3 + 1], + taichi_union_cast(data[i * 3 + 2])); + } +} + +void SparseMatrixBuilder::print_triplets_eigen() { + auto element_size = data_type_size(dtype_); + switch (element_size) { + case 4: + print_triplets_template(); + break; + case 8: + print_triplets_template(); + break; + default: + TI_ERROR("Unsupported sparse matrix data type!"); + break; + } +} + +void SparseMatrixBuilder::print_triplets_cuda() { +#ifdef TI_WITH_CUDA + CUDADriver::get_instance().memcpy_device_to_host( + &num_triplets_, (void *)get_ndarray_data_ptr(), sizeof(int)); + fmt::print("n={}, m={}, num_triplets={} (max={})\n", rows_, cols_, + num_triplets_, max_num_triplets_); + auto len = 3 * num_triplets_ + 1; + std::vector trips(len); + CUDADriver::get_instance().memcpy_device_to_host( + (void *)trips.data(), (void *)get_ndarray_data_ptr(), + len * sizeof(float32)); + for (auto i = 0; i < num_triplets_; i++) { + int row = taichi_union_cast(trips[3 * i + 1]); + int col = taichi_union_cast(trips[3 * i + 2]); + auto val = trips[i * 3 + 3]; fmt::print("[{}, {}] = {}\n", row, col, val); } +#endif } intptr_t SparseMatrixBuilder::get_ndarray_data_ptr() const { diff --git a/taichi/program/sparse_matrix.h b/taichi/program/sparse_matrix.h index c79f4f0679784..c3d8b261a28d4 100644 --- a/taichi/program/sparse_matrix.h +++ b/taichi/program/sparse_matrix.h @@ -22,7 +22,8 @@ class SparseMatrixBuilder { const std::string &storage_format, Program *prog); - void print_triplets(); + void print_triplets_eigen(); + void print_triplets_cuda(); intptr_t get_ndarray_data_ptr() const; @@ -36,6 +37,9 @@ class SparseMatrixBuilder { template void build_template(std::unique_ptr &); + template + void print_triplets_template(); + private: uint64 num_triplets_{0}; std::unique_ptr ndarray_data_base_ptr_{nullptr}; diff --git a/taichi/program/sparse_solver.cpp b/taichi/program/sparse_solver.cpp index a0c43e32671f2..71e34e6d6397a 100644 --- a/taichi/program/sparse_solver.cpp +++ b/taichi/program/sparse_solver.cpp @@ -176,6 +176,10 @@ INSTANTIATE_LU_SOLVE_RF(float64, LU, AMD, Eigen::VectorXd) INSTANTIATE_LU_SOLVE_RF(float64, LU, COLAMD, Eigen::VectorXd) CuSparseSolver::CuSparseSolver() { + init_solver(); +} + +void CuSparseSolver::init_solver() { #if defined(TI_WITH_CUDA) if (!CUSPARSEDriver::get_instance().is_loaded()) { bool load_success = CUSPARSEDriver::get_instance().load_cusparse(); @@ -191,19 +195,14 @@ CuSparseSolver::CuSparseSolver() { } #endif } -// Reference: -// https://github.com/NVIDIA/cuda-samples/blob/master/Samples/4_CUDA_Libraries/cuSolverSp_LowlevelCholesky/cuSolverSp_LowlevelCholesky.cpp -void CuSparseSolver::analyze_pattern(const SparseMatrix &sm) { +void CuSparseSolver::reorder(const CuSparseMatrix &A) { #if defined(TI_WITH_CUDA) - // Retrive the info of the sparse matrix - SparseMatrix *sm_no_cv = const_cast(&sm); - CuSparseMatrix *A = dynamic_cast(sm_no_cv); - size_t rowsA = A->num_rows(); - size_t colsA = A->num_cols(); - size_t nnzA = A->get_nnz(); - void *d_csrRowPtrA = A->get_row_ptr(); - void *d_csrColIndA = A->get_col_ind(); - void *d_csrValA = A->get_val_ptr(); + size_t rowsA = A.num_rows(); + size_t colsA = A.num_cols(); + size_t nnzA = A.get_nnz(); + void *d_csrRowPtrA = A.get_row_ptr(); + void *d_csrColIndA = A.get_col_ind(); + void *d_csrValA = A.get_val_ptr(); CUSOLVERDriver::get_instance().csSpCreate(&cusolver_handle_); CUSPARSEDriver::get_instance().cpCreate(&cusparse_handel_); CUSPARSEDriver::get_instance().cpCreateMatDescr(&descr_); @@ -211,80 +210,143 @@ void CuSparseSolver::analyze_pattern(const SparseMatrix &sm) { CUSPARSE_MATRIX_TYPE_GENERAL); CUSPARSEDriver::get_instance().cpSetMatIndexBase(descr_, CUSPARSE_INDEX_BASE_ZERO); - - // step 1: reorder the sparse matrix float *h_csrValA = nullptr; - h_Q = (int *)malloc(sizeof(int) * colsA); - h_csrRowPtrB = (int *)malloc(sizeof(int) * (rowsA + 1)); - h_csrColIndB = (int *)malloc(sizeof(int) * nnzA); - h_csrValB = (float *)malloc(sizeof(float) * nnzA); + h_Q_ = (int *)malloc(sizeof(int) * colsA); + h_csrRowPtrB_ = (int *)malloc(sizeof(int) * (rowsA + 1)); + h_csrColIndB_ = (int *)malloc(sizeof(int) * nnzA); + h_csrValB_ = (float *)malloc(sizeof(float) * nnzA); h_csrValA = (float *)malloc(sizeof(float) * nnzA); - h_mapBfromA = (int *)malloc(sizeof(int) * nnzA); - assert(nullptr != h_Q); - assert(nullptr != h_csrRowPtrB); - assert(nullptr != h_csrColIndB); - assert(nullptr != h_csrValB); - assert(nullptr != h_mapBfromA); - - CUDADriver::get_instance().memcpy_device_to_host(h_csrRowPtrB, d_csrRowPtrA, + h_mapBfromA_ = (int *)malloc(sizeof(int) * nnzA); + assert(nullptr != h_Q_); + assert(nullptr != h_csrRowPtrB_); + assert(nullptr != h_csrColIndB_); + assert(nullptr != h_csrValB_); + assert(nullptr != h_mapBfromA_); + + CUDADriver::get_instance().memcpy_device_to_host(h_csrRowPtrB_, d_csrRowPtrA, sizeof(int) * (rowsA + 1)); - CUDADriver::get_instance().memcpy_device_to_host(h_csrColIndB, d_csrColIndA, + CUDADriver::get_instance().memcpy_device_to_host(h_csrColIndB_, d_csrColIndA, sizeof(int) * nnzA); CUDADriver::get_instance().memcpy_device_to_host(h_csrValA, d_csrValA, sizeof(float) * nnzA); - // compoute h_Q - CUSOLVERDriver::get_instance().csSpXcsrsymamdHost( - cusolver_handle_, rowsA, nnzA, descr_, h_csrRowPtrB, h_csrColIndB, h_Q); - CUDADriver::get_instance().malloc((void **)&d_Q, sizeof(int) * colsA); - CUDADriver::get_instance().memcpy_host_to_device((void *)d_Q, (void *)h_Q, + // compoute h_Q_ + CUSOLVERDriver::get_instance().csSpXcsrsymamdHost(cusolver_handle_, rowsA, + nnzA, descr_, h_csrRowPtrB_, + h_csrColIndB_, h_Q_); + CUDADriver::get_instance().malloc((void **)&d_Q_, sizeof(int) * colsA); + CUDADriver::get_instance().memcpy_host_to_device((void *)d_Q_, (void *)h_Q_, sizeof(int) * (colsA)); size_t size_perm = 0; CUSOLVERDriver::get_instance().csSpXcsrperm_bufferSizeHost( - cusolver_handle_, rowsA, colsA, nnzA, descr_, h_csrRowPtrB, h_csrColIndB, - h_Q, h_Q, &size_perm); + cusolver_handle_, rowsA, colsA, nnzA, descr_, h_csrRowPtrB_, + h_csrColIndB_, h_Q_, h_Q_, &size_perm); void *buffer_cpu = (void *)malloc(sizeof(char) * size_perm); assert(nullptr != buffer_cpu); for (int j = 0; j < nnzA; j++) { - h_mapBfromA[j] = j; + h_mapBfromA_[j] = j; } CUSOLVERDriver::get_instance().csSpXcsrpermHost( - cusolver_handle_, rowsA, colsA, nnzA, descr_, h_csrRowPtrB, h_csrColIndB, - h_Q, h_Q, h_mapBfromA, buffer_cpu); + cusolver_handle_, rowsA, colsA, nnzA, descr_, h_csrRowPtrB_, + h_csrColIndB_, h_Q_, h_Q_, h_mapBfromA_, buffer_cpu); // B = A( mapBfromA ) for (int j = 0; j < nnzA; j++) { - h_csrValB[j] = h_csrValA[h_mapBfromA[j]]; + h_csrValB_[j] = h_csrValA[h_mapBfromA_[j]]; } - CUDADriver::get_instance().malloc((void **)&d_csrRowPtrB, + CUDADriver::get_instance().malloc((void **)&d_csrRowPtrB_, sizeof(int) * (rowsA + 1)); - CUDADriver::get_instance().malloc((void **)&d_csrColIndB, sizeof(int) * nnzA); - CUDADriver::get_instance().malloc((void **)&d_csrValB, sizeof(float) * nnzA); + CUDADriver::get_instance().malloc((void **)&d_csrColIndB_, + sizeof(int) * nnzA); + CUDADriver::get_instance().malloc((void **)&d_csrValB_, sizeof(float) * nnzA); CUDADriver::get_instance().memcpy_host_to_device( - (void *)d_csrRowPtrB, (void *)h_csrRowPtrB, sizeof(int) * (rowsA + 1)); + (void *)d_csrRowPtrB_, (void *)h_csrRowPtrB_, sizeof(int) * (rowsA + 1)); CUDADriver::get_instance().memcpy_host_to_device( - (void *)d_csrColIndB, (void *)h_csrColIndB, sizeof(int) * nnzA); + (void *)d_csrColIndB_, (void *)h_csrColIndB_, sizeof(int) * nnzA); CUDADriver::get_instance().memcpy_host_to_device( - (void *)d_csrValB, (void *)h_csrValB, sizeof(float) * nnzA); + (void *)d_csrValB_, (void *)h_csrValB_, sizeof(float) * nnzA); free(h_csrValA); free(buffer_cpu); +#endif +} + +// Reference: +// https://github.com/NVIDIA/cuda-samples/blob/master/Samples/4_CUDA_Libraries/cuSolverSp_LowlevelCholesky/cuSolverSp_LowlevelCholesky.cpp +void CuSparseSolver::analyze_pattern(const SparseMatrix &sm) { + switch (solver_type_) { + case SolverType::Cholesky: + analyze_pattern_cholesky(sm); + break; + case SolverType::LU: + analyze_pattern_lu(sm); + break; + default: + TI_NOT_IMPLEMENTED + } +} +void CuSparseSolver::analyze_pattern_cholesky(const SparseMatrix &sm) { +#if defined(TI_WITH_CUDA) + // Retrive the info of the sparse matrix + SparseMatrix &sm_no_cv = const_cast(sm); + CuSparseMatrix &A = static_cast(sm_no_cv); + + // step 1: reorder the sparse matrix + reorder(A); // step 2: create opaque info structure CUSOLVERDriver::get_instance().csSpCreateCsrcholInfo(&info_); // step 3: analyze chol(A) to know structure of L + size_t rowsA = A.num_rows(); + size_t nnzA = A.get_nnz(); CUSOLVERDriver::get_instance().csSpXcsrcholAnalysis( - cusolver_handle_, rowsA, nnzA, descr_, d_csrRowPtrB, d_csrColIndB, info_); + cusolver_handle_, rowsA, nnzA, descr_, d_csrRowPtrB_, d_csrColIndB_, + info_); is_analyzed_ = true; #else TI_NOT_IMPLEMENTED #endif } +void CuSparseSolver::analyze_pattern_lu(const SparseMatrix &sm) { +#if defined(TI_WITH_CUDA) + // Retrive the info of the sparse matrix + SparseMatrix &sm_no_cv = const_cast(sm); + CuSparseMatrix &A = static_cast(sm_no_cv); + // step 1: reorder the sparse matrix + reorder(A); + + // step 2: create opaque info structure + CUSOLVERDriver::get_instance().csSpCreateCsrluInfoHost(&lu_info_); + + // step 3: analyze LU(B) to know structure of Q and R, and upper bound for + // nnz(L+U) + size_t rowsA = A.num_rows(); + size_t nnzA = A.get_nnz(); + CUSOLVERDriver::get_instance().csSpXcsrluAnalysisHost( + cusolver_handle_, rowsA, nnzA, descr_, h_csrRowPtrB_, h_csrColIndB_, + lu_info_); + is_analyzed_ = true; +#else + TI_NOT_IMPLEMENTED +#endif +} void CuSparseSolver::factorize(const SparseMatrix &sm) { + switch (solver_type_) { + case SolverType::Cholesky: + factorize_cholesky(sm); + break; + case SolverType::LU: + factorize_lu(sm); + break; + default: + TI_NOT_IMPLEMENTED + } +} +void CuSparseSolver::factorize_cholesky(const SparseMatrix &sm) { #if defined(TI_WITH_CUDA) // Retrive the info of the sparse matrix SparseMatrix *sm_no_cv = const_cast(&sm); - CuSparseMatrix *A = dynamic_cast(sm_no_cv); + CuSparseMatrix *A = static_cast(sm_no_cv); size_t rowsA = A->num_rows(); size_t nnzA = A->get_nnz(); @@ -292,16 +354,16 @@ void CuSparseSolver::factorize(const SparseMatrix &sm) { size_t size_chol = 0; // size of working space for csrlu // step 1: workspace for chol(A) CUSOLVERDriver::get_instance().csSpScsrcholBufferInfo( - cusolver_handle_, rowsA, nnzA, descr_, d_csrValB, d_csrRowPtrB, - d_csrColIndB, info_, &size_internal, &size_chol); + cusolver_handle_, rowsA, nnzA, descr_, d_csrValB_, d_csrRowPtrB_, + d_csrColIndB_, info_, &size_internal, &size_chol); if (size_chol > 0) CUDADriver::get_instance().malloc(&gpu_buffer_, sizeof(char) * size_chol); // step 2: compute A = L*L^T CUSOLVERDriver::get_instance().csSpScsrcholFactor( - cusolver_handle_, rowsA, nnzA, descr_, d_csrValB, d_csrRowPtrB, - d_csrColIndB, info_, gpu_buffer_); + cusolver_handle_, rowsA, nnzA, descr_, d_csrValB_, d_csrRowPtrB_, + d_csrColIndB_, info_, gpu_buffer_); // step 3: check if the matrix is singular const float tol = 1.e-14; int singularity = 0; @@ -313,176 +375,61 @@ void CuSparseSolver::factorize(const SparseMatrix &sm) { TI_NOT_IMPLEMENTED #endif } - -void CuSparseSolver::solve_cu(Program *prog, - const SparseMatrix &sm, - const Ndarray &b, - const Ndarray &x) { -#ifdef TI_WITH_CUDA - cusparseHandle_t cusparseHandle = nullptr; - CUSPARSEDriver::get_instance().cpCreate(&cusparseHandle); - cusolverSpHandle_t handle = nullptr; - CUSOLVERDriver::get_instance().csSpCreate(&handle); - - int major_version, minor_version, patch_level; - CUSOLVERDriver::get_instance().csGetProperty(MAJOR_VERSION, &major_version); - CUSOLVERDriver::get_instance().csGetProperty(MINOR_VERSION, &minor_version); - CUSOLVERDriver::get_instance().csGetProperty(PATCH_LEVEL, &patch_level); - printf("Cusolver version: %d.%d.%d\n", major_version, minor_version, - patch_level); - +void CuSparseSolver::factorize_lu(const SparseMatrix &sm) { +#if defined(TI_WITH_CUDA) // Retrive the info of the sparse matrix SparseMatrix *sm_no_cv = const_cast(&sm); - CuSparseMatrix *A = dynamic_cast(sm_no_cv); - size_t nrows = A->num_rows(); - size_t ncols = A->num_cols(); - size_t nnz = A->get_nnz(); - void *drow_offsets = A->get_row_ptr(); - void *dcol_indices = A->get_col_ind(); - void *dvalues = A->get_val_ptr(); - - size_t db = prog->get_ndarray_data_ptr_as_int(&b); - size_t dx = prog->get_ndarray_data_ptr_as_int(&x); - - float *h_z = (float *)malloc(sizeof(float) * ncols); - float *h_x = (float *)malloc(sizeof(float) * ncols); - float *h_b = (float *)malloc(sizeof(float) * nrows); - float *h_Qb = (float *)malloc(sizeof(float) * nrows); - - int *h_Q = (int *)malloc(sizeof(int) * ncols); - int *hrow_offsets_B = (int *)malloc(sizeof(int) * (nrows + 1)); - int *hcol_indices_B = (int *)malloc(sizeof(int) * nnz); - float *h_val_B = (float *)malloc(sizeof(float) * nnz); - int *h_mapBfromA = (int *)malloc(sizeof(int) * nnz); - - assert(nullptr != h_z); - assert(nullptr != h_x); - assert(nullptr != h_b); - assert(nullptr != h_Qb); - assert(nullptr != h_Q); - assert(nullptr != hrow_offsets_B); - assert(nullptr != hcol_indices_B); - assert(nullptr != h_val_B); - assert(nullptr != h_mapBfromA); - - int *hrow_offsets = nullptr, *hcol_indices = nullptr; - hrow_offsets = (int *)malloc(sizeof(int) * (nrows + 1)); - hcol_indices = (int *)malloc(sizeof(int) * nnz); - assert(nullptr != hrow_offsets); - assert(nullptr != hcol_indices); - // Attention: drow_offsets is not freed at other palces - CUDADriver::get_instance().memcpy_device_to_host( - (void *)hrow_offsets, drow_offsets, sizeof(int) * (nrows + 1)); - CUDADriver::get_instance().memcpy_device_to_host( - (void *)hcol_indices, dcol_indices, sizeof(int) * nnz); - - /* configure matrix descriptor*/ - cusparseMatDescr_t descrA = nullptr; - CUSPARSEDriver::get_instance().cpCreateMatDescr(&descrA); - CUSPARSEDriver::get_instance().cpSetMatType(descrA, - CUSPARSE_MATRIX_TYPE_GENERAL); - CUSPARSEDriver::get_instance().cpSetMatIndexBase(descrA, - CUSPARSE_INDEX_BASE_ZERO); - int issym = 0; - CUSOLVERDriver::get_instance().csSpXcsrissymHost( - handle, nrows, nnz, descrA, (void *)hrow_offsets, - (void *)(hrow_offsets + 1), (void *)hcol_indices, &issym); - if (!issym) { - TI_ERROR("A has no symmetric pattern, please use LU or QR!"); - return; - } - - // step 2:reorder the matrix to minimize zero fill-in - CUSOLVERDriver::get_instance().csSpXcsrsymrcmHost( - handle, nrows, nnz, descrA, (void *)hrow_offsets, (void *)hcol_indices, - (void *)h_Q); // symrcm method - - // B = A(Q, Q) - memcpy(hrow_offsets_B, hrow_offsets, sizeof(int) * (nrows + 1)); - memcpy(hcol_indices_B, hcol_indices, sizeof(int) * nnz); - - size_t size_perm; - CUSOLVERDriver::get_instance().csSpXcsrperm_bufferSizeHost( - handle, nrows, ncols, nnz, descrA, (void *)hrow_offsets_B, - (void *)hcol_indices_B, (void *)h_Q, (void *)h_Q, &size_perm); - void *buffer_cpu = (void *)malloc(sizeof(char) * size_perm); - for (int j = 0; j < nnz; j++) - h_mapBfromA[j] = j; - - CUSOLVERDriver::get_instance().csSpXcsrpermHost( - handle, nrows, ncols, nnz, descrA, (void *)hrow_offsets_B, - (void *)hcol_indices_B, (void *)h_Q, (void *)h_Q, (void *)h_mapBfromA, - (void *)buffer_cpu); - - float *h_val_A = (float *)malloc(sizeof(float) * nnz); - CUDADriver::get_instance().memcpy_device_to_host( - (void *)h_val_A, (void *)dvalues, sizeof(int) * nnz); - for (int j = 0; j < nnz; j++) - h_val_B[j] = h_val_A[h_mapBfromA[j]]; - - CUDADriver::get_instance().memcpy_device_to_host((void *)h_b, (void *)db, - sizeof(float) * nrows); - for (int row = 0; row < nrows; row++) { - h_Qb[row] = h_b[h_Q[row]]; - } - - // step 4: solve B*z = Q*b - float tol = 1e-6; - int reorder = 1; - int singularity = 0; /* -1 if A is invertible under tol. */ - // use Cholesky decomposition as defualt - CUSOLVERDriver::get_instance().csSpScsrlsvcholHost( - handle, nrows, nnz, descrA, (void *)h_val_B, (void *)hrow_offsets_B, - (void *)hcol_indices_B, (void *)h_Qb, tol, reorder, (void *)h_z, - &singularity); - if (!singularity) { - TI_ERROR("A is a sigular matrix!"); - return; - } - - // step 5: Q*x = z - for (int row = 0; row < nrows; row++) - h_x[h_Q[row]] = h_z[row]; - - CUDADriver::get_instance().memcpy_host_to_device((void *)dx, (void *)h_x, - sizeof(float) * ncols); - - CUSOLVERDriver::get_instance().csSpDestory(handle); - CUSPARSEDriver::get_instance().cpDestroy(cusparseHandle); - - if (hrow_offsets != nullptr) - free(hrow_offsets); - if (hcol_indices != nullptr) - free(hcol_indices); - if (hrow_offsets_B != nullptr) - free(hrow_offsets_B); - if (hcol_indices_B != nullptr) - free(hcol_indices_B); - if (h_Q != nullptr) - free(h_Q); - if (h_mapBfromA != nullptr) - free(h_mapBfromA); - if (h_z != nullptr) - free(h_z); - if (h_b != nullptr) - free(h_b); - if (h_Qb != nullptr) - free(h_Qb); - if (h_x != nullptr) - free(h_x); - if (buffer_cpu != nullptr) - free(buffer_cpu); - if (h_val_A != nullptr) - free(h_val_A); - if (h_val_B != nullptr) - free(h_val_B); + CuSparseMatrix *A = static_cast(sm_no_cv); + size_t rowsA = A->num_rows(); + size_t nnzA = A->get_nnz(); + // step 4: workspace for LU(B) + size_t size_lu = 0; + size_t buffer_size = 0; + CUSOLVERDriver::get_instance().csSpScsrluBufferInfoHost( + cusolver_handle_, rowsA, nnzA, descr_, h_csrValB_, h_csrRowPtrB_, + h_csrColIndB_, lu_info_, &buffer_size, &size_lu); + + if (cpu_buffer_) + free(cpu_buffer_); + cpu_buffer_ = (void *)malloc(sizeof(char) * size_lu); + assert(nullptr != cpu_buffer_); + + // step 5: compute Ppivot * B = L * U + CUSOLVERDriver::get_instance().csSpScsrluFactorHost( + cusolver_handle_, rowsA, nnzA, descr_, h_csrValB_, h_csrRowPtrB_, + h_csrColIndB_, lu_info_, 1.0f, cpu_buffer_); + + // step 6: check singularity by tol + int singularity = 0; + const float tol = 1.e-6; + CUSOLVERDriver::get_instance().csSpScsrluZeroPivotHost( + cusolver_handle_, lu_info_, tol, &singularity); + TI_ASSERT(singularity == -1); + is_factorized_ = true; +#else + TI_NOT_IMPLEMENTED #endif } - void CuSparseSolver::solve_rf(Program *prog, const SparseMatrix &sm, const Ndarray &b, const Ndarray &x) { + switch (solver_type_) { + case SolverType::Cholesky: + solve_cholesky(prog, sm, b, x); + break; + case SolverType::LU: + solve_lu(prog, sm, b, x); + break; + default: + TI_NOT_IMPLEMENTED + } +} + +void CuSparseSolver::solve_cholesky(Program *prog, + const SparseMatrix &sm, + const Ndarray &b, + const Ndarray &x) { #if defined(TI_WITH_CUDA) if (is_analyzed_ == false) { analyze_pattern(sm); @@ -492,7 +439,7 @@ void CuSparseSolver::solve_rf(Program *prog, } // Retrive the info of the sparse matrix SparseMatrix *sm_no_cv = const_cast(&sm); - CuSparseMatrix *A = dynamic_cast(sm_no_cv); + CuSparseMatrix *A = static_cast(sm_no_cv); size_t rowsA = A->num_rows(); size_t colsA = A->num_cols(); size_t d_b = prog->get_ndarray_data_ptr_as_int(&b); @@ -506,7 +453,7 @@ void CuSparseSolver::solve_rf(Program *prog, CUSPARSEDriver::get_instance().cpCreateDnVec(&vec_b, (int)rowsA, (void *)d_b, CUDA_R_32F); CUSPARSEDriver::get_instance().cpCreateSpVec( - &vec_Qb, (int)rowsA, (int)rowsA, d_Q, d_Qb, CUSPARSE_INDEX_32I, + &vec_Qb, (int)rowsA, (int)rowsA, d_Q_, d_Qb, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F); CUSPARSEDriver::get_instance().cpGather(cusparse_handel_, vec_b, vec_Qb); @@ -520,7 +467,7 @@ void CuSparseSolver::solve_rf(Program *prog, cusparseSpVecDescr_t vecX; cusparseDnVecDescr_t vecY; CUSPARSEDriver::get_instance().cpCreateSpVec( - &vecX, (int)colsA, (int)colsA, d_Q, d_z, CUSPARSE_INDEX_32I, + &vecX, (int)colsA, (int)colsA, d_Q_, d_z, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F); CUSPARSEDriver::get_instance().cpCreateDnVec(&vecY, (int)colsA, (void *)d_x, CUDA_R_32F); @@ -539,6 +486,59 @@ void CuSparseSolver::solve_rf(Program *prog, #endif } +void CuSparseSolver::solve_lu(Program *prog, + const SparseMatrix &sm, + const Ndarray &b, + const Ndarray &x) { +#if defined(TI_WITH_CUDA) + if (is_analyzed_ == false) { + analyze_pattern(sm); + } + if (is_factorized_ == false) { + factorize(sm); + } + + // Retrive the info of the sparse matrix + SparseMatrix *sm_no_cv = const_cast(&sm); + CuSparseMatrix *A = static_cast(sm_no_cv); + size_t rowsA = A->num_rows(); + size_t colsA = A->num_cols(); + + // step 7: solve L*U*x = b + size_t d_b = prog->get_ndarray_data_ptr_as_int(&b); + size_t d_x = prog->get_ndarray_data_ptr_as_int(&x); + float *h_b = (float *)malloc(sizeof(float) * rowsA); + float *h_b_hat = (float *)malloc(sizeof(float) * rowsA); + float *h_x = (float *)malloc(sizeof(float) * colsA); + float *h_x_hat = (float *)malloc(sizeof(float) * colsA); + assert(nullptr != h_b); + assert(nullptr != h_b_hat); + assert(nullptr != h_x); + assert(nullptr != h_x_hat); + CUDADriver::get_instance().memcpy_device_to_host((void *)h_b, (void *)d_b, + sizeof(float) * rowsA); + CUDADriver::get_instance().memcpy_device_to_host((void *)h_x, (void *)d_x, + sizeof(float) * colsA); + for (int j = 0; j < rowsA; j++) { + h_b_hat[j] = h_b[h_Q_[j]]; + } + CUSOLVERDriver::get_instance().csSpScsrluSolveHost( + cusolver_handle_, rowsA, h_b_hat, h_x_hat, lu_info_, cpu_buffer_); + for (int j = 0; j < colsA; j++) { + h_x[h_Q_[j]] = h_x_hat[j]; + } + CUDADriver::get_instance().memcpy_host_to_device((void *)d_x, (void *)h_x, + sizeof(float) * colsA); + + free(h_b); + free(h_b_hat); + free(h_x); + free(h_x_hat); +#else + TI_NOT_IMPLEMENTED +#endif +} + std::unique_ptr make_sparse_solver(DataType dt, const std::string &solver_type, const std::string &ordering) { @@ -579,18 +579,22 @@ std::unique_ptr make_sparse_solver(DataType dt, CuSparseSolver::~CuSparseSolver() { #if defined(TI_WITH_CUDA) - if (h_Q != nullptr) - free(h_Q); - if (h_csrRowPtrB != nullptr) - free(h_csrRowPtrB); - if (h_csrColIndB != nullptr) - free(h_csrColIndB); - if (h_csrValB != nullptr) - free(h_csrValB); - if (h_mapBfromA != nullptr) - free(h_mapBfromA); + if (h_Q_ != nullptr) + free(h_Q_); + if (h_csrRowPtrB_ != nullptr) + free(h_csrRowPtrB_); + if (h_csrColIndB_ != nullptr) + free(h_csrColIndB_); + if (h_csrValB_ != nullptr) + free(h_csrValB_); + if (h_mapBfromA_ != nullptr) + free(h_mapBfromA_); + if (cpu_buffer_ != nullptr) + free(cpu_buffer_); if (info_ != nullptr) CUSOLVERDriver::get_instance().csSpDestroyCsrcholInfo(info_); + if (lu_info_ != nullptr) + CUSOLVERDriver::get_instance().csSpDestroyCsrluInfoHost(lu_info_); if (cusolver_handle_ != nullptr) CUSOLVERDriver::get_instance().csSpDestory(cusolver_handle_); if (cusparse_handel_ != nullptr) @@ -599,20 +603,27 @@ CuSparseSolver::~CuSparseSolver() { CUSPARSEDriver::get_instance().cpDestroyMatDescr(descr_); if (gpu_buffer_ != nullptr) CUDADriver::get_instance().mem_free(gpu_buffer_); - if (d_Q != nullptr) - CUDADriver::get_instance().mem_free(d_Q); - if (d_csrRowPtrB != nullptr) - CUDADriver::get_instance().mem_free(d_csrRowPtrB); - if (d_csrColIndB != nullptr) - CUDADriver::get_instance().mem_free(d_csrColIndB); - if (d_csrValB != nullptr) - CUDADriver::get_instance().mem_free(d_csrValB); + if (d_Q_ != nullptr) + CUDADriver::get_instance().mem_free(d_Q_); + if (d_csrRowPtrB_ != nullptr) + CUDADriver::get_instance().mem_free(d_csrRowPtrB_); + if (d_csrColIndB_ != nullptr) + CUDADriver::get_instance().mem_free(d_csrColIndB_); + if (d_csrValB_ != nullptr) + CUDADriver::get_instance().mem_free(d_csrValB_); #endif } std::unique_ptr make_cusparse_solver( DataType dt, const std::string &solver_type, const std::string &ordering) { - return std::make_unique(); + if (solver_type == "LLT" || solver_type == "LDLT") { + return std::make_unique( + CuSparseSolver::SolverType::Cholesky); + } else if (solver_type == "LU") { + return std::make_unique(CuSparseSolver::SolverType::LU); + } else { + TI_ERROR("Not supported sparse solver type: {}", solver_type); + } } } // namespace taichi::lang diff --git a/taichi/program/sparse_solver.h b/taichi/program/sparse_solver.h index 0b4e930dfc6d6..2ee17992a4e45 100644 --- a/taichi/program/sparse_solver.h +++ b/taichi/program/sparse_solver.h @@ -76,26 +76,36 @@ DECLARE_EIGEN_LU_SOLVER(float64, LU, AMD); DECLARE_EIGEN_LU_SOLVER(float64, LU, COLAMD); class CuSparseSolver : public SparseSolver { + public: + enum class SolverType { Cholesky, LU }; + private: + SolverType solver_type_{SolverType::Cholesky}; csrcholInfo_t info_{nullptr}; + csrluInfoHost_t lu_info_{nullptr}; cusolverSpHandle_t cusolver_handle_{nullptr}; cusparseHandle_t cusparse_handel_{nullptr}; cusparseMatDescr_t descr_{nullptr}; void *gpu_buffer_{nullptr}; + void *cpu_buffer_{nullptr}; bool is_analyzed_{false}; bool is_factorized_{false}; - int *h_Q{nullptr}; /* n, B = Q*A*Q' or B = A(Q,Q) by MATLAB notation */ - int *d_Q{nullptr}; - int *h_csrRowPtrB{nullptr}; /* n+1 */ - int *h_csrColIndB{nullptr}; /* nnzA */ - float *h_csrValB{nullptr}; /* nnzA */ - int *h_mapBfromA{nullptr}; /* nnzA */ - int *d_csrRowPtrB{nullptr}; /* n+1 */ - int *d_csrColIndB{nullptr}; /* nnzA */ - float *d_csrValB{nullptr}; /* nnzA */ + int *h_Q_{ + nullptr}; /* n, B = Q*A*Q' or B = A(Q,Q) by MATLAB notation */ + int *d_Q_{nullptr}; + int *h_csrRowPtrB_{nullptr}; /* n+1 */ + int *h_csrColIndB_{nullptr}; /* nnzA */ + float *h_csrValB_{nullptr}; /* nnzA */ + int *h_mapBfromA_{nullptr}; /* nnzA */ + int *d_csrRowPtrB_{nullptr}; /* n+1 */ + int *d_csrColIndB_{nullptr}; /* nnzA */ + float *d_csrValB_{nullptr}; /* nnzA */ public: CuSparseSolver(); + explicit CuSparseSolver(SolverType solver_type) : solver_type_(solver_type) { + init_solver(); + } ~CuSparseSolver() override; bool compute(const SparseMatrix &sm) override { TI_NOT_IMPLEMENTED; @@ -103,17 +113,30 @@ class CuSparseSolver : public SparseSolver { void analyze_pattern(const SparseMatrix &sm) override; void factorize(const SparseMatrix &sm) override; - void solve_cu(Program *prog, - const SparseMatrix &sm, - const Ndarray &b, - const Ndarray &x); void solve_rf(Program *prog, const SparseMatrix &sm, const Ndarray &b, const Ndarray &x); + bool info() override { TI_NOT_IMPLEMENTED; }; + + private: + void init_solver(); + void reorder(const CuSparseMatrix &sm); + void analyze_pattern_cholesky(const SparseMatrix &sm); + void analyze_pattern_lu(const SparseMatrix &sm); + void factorize_cholesky(const SparseMatrix &sm); + void factorize_lu(const SparseMatrix &sm); + void solve_cholesky(Program *prog, + const SparseMatrix &sm, + const Ndarray &b, + const Ndarray &x); + void solve_lu(Program *prog, + const SparseMatrix &sm, + const Ndarray &b, + const Ndarray &x); }; std::unique_ptr make_sparse_solver(DataType dt, diff --git a/taichi/python/export_lang.cpp b/taichi/python/export_lang.cpp index a34fd12dd2e2b..17980a8f3cd93 100644 --- a/taichi/python/export_lang.cpp +++ b/taichi/python/export_lang.cpp @@ -1185,7 +1185,8 @@ void export_lang(py::module &m) { // Sparse Matrix py::class_(m, "SparseMatrixBuilder") - .def("print_triplets", &SparseMatrixBuilder::print_triplets) + .def("print_triplets_eigen", &SparseMatrixBuilder::print_triplets_eigen) + .def("print_triplets_cuda", &SparseMatrixBuilder::print_triplets_cuda) .def("get_ndarray_data_ptr", &SparseMatrixBuilder::get_ndarray_data_ptr) .def("build", &SparseMatrixBuilder::build) .def("build_cuda", &SparseMatrixBuilder::build_cuda) @@ -1288,7 +1289,6 @@ void export_lang(py::module &m) { .def("analyze_pattern", &CuSparseSolver::analyze_pattern) .def("factorize", &CuSparseSolver::factorize) .def("solve_rf", &CuSparseSolver::solve_rf) - .def("solve_cu", &CuSparseSolver::solve_cu) .def("info", &CuSparseSolver::info); m.def("make_sparse_solver", &make_sparse_solver); diff --git a/taichi/rhi/cuda/cuda_types.h b/taichi/rhi/cuda/cuda_types.h index e731ece7929de..c45748f572160 100644 --- a/taichi/rhi/cuda/cuda_types.h +++ b/taichi/rhi/cuda/cuda_types.h @@ -561,4 +561,6 @@ struct csrcholInfoHost; typedef struct csrcholInfoHost *csrcholInfoHost_t; struct csrcholInfo; typedef struct csrcholInfo *csrcholInfo_t; +struct csrluInfoHost; +typedef struct csrluInfoHost *csrluInfoHost_t; #endif diff --git a/taichi/rhi/cuda/cusolver_functions.inc.h b/taichi/rhi/cuda/cusolver_functions.inc.h index b57025a1afe2b..ddd6d00a8fda1 100644 --- a/taichi/rhi/cuda/cusolver_functions.inc.h +++ b/taichi/rhi/cuda/cusolver_functions.inc.h @@ -14,7 +14,7 @@ PER_CUSOLVER_FUNCTION(csSpXcsrperm_bufferSizeHost, cusolverSpXcsrperm_bufferSize PER_CUSOLVER_FUNCTION(csSpXcsrpermHost, cusolverSpXcsrpermHost, cusolverSpHandle_t ,int ,int ,int ,const cusparseMatDescr_t ,void *,void *,const void *,const void *,void *,void *); PER_CUSOLVER_FUNCTION(csSpScsrlsvcholHost, cusolverSpScsrlsvcholHost, cusolverSpHandle_t ,int ,int ,const cusparseMatDescr_t ,const void *,const void *,const void *,const void *,float ,int ,void *,void *); -// cusolver preview API +// cusolver preview API for cholesky PER_CUSOLVER_FUNCTION(csSpCreateCsrcholInfo, cusolverSpCreateCsrcholInfo, csrcholInfo_t*); PER_CUSOLVER_FUNCTION(csSpDestroyCsrcholInfo, cusolverSpDestroyCsrcholInfo, csrcholInfo_t); PER_CUSOLVER_FUNCTION(csSpXcsrcholAnalysis, cusolverSpXcsrcholAnalysis, cusolverSpHandle_t,int ,int ,const cusparseMatDescr_t , void *, void *,csrcholInfo_t ); @@ -22,3 +22,12 @@ PER_CUSOLVER_FUNCTION(csSpScsrcholBufferInfo, cusolverSpScsrcholBufferInfo, cus PER_CUSOLVER_FUNCTION(csSpScsrcholFactor, cusolverSpScsrcholFactor, cusolverSpHandle_t ,int ,int ,const cusparseMatDescr_t ,void *,void *,void *,csrcholInfo_t ,void *); PER_CUSOLVER_FUNCTION(csSpScsrcholZeroPivot, cusolverSpScsrcholZeroPivot, cusolverSpHandle_t, csrcholInfo_t ,float ,void *); PER_CUSOLVER_FUNCTION(csSpScsrcholSolve, cusolverSpScsrcholSolve, cusolverSpHandle_t ,int ,void *,void *,csrcholInfo_t ,void *); + +// cusolver preview API for LU +PER_CUSOLVER_FUNCTION(csSpCreateCsrluInfoHost, cusolverSpCreateCsrluInfoHost, csrluInfoHost_t*); +PER_CUSOLVER_FUNCTION(csSpDestroyCsrluInfoHost, cusolverSpDestroyCsrluInfoHost, csrluInfoHost_t); +PER_CUSOLVER_FUNCTION(csSpXcsrluAnalysisHost, cusolverSpXcsrluAnalysisHost, cusolverSpHandle_t,int ,int ,const cusparseMatDescr_t , void *, void *,csrluInfoHost_t ); +PER_CUSOLVER_FUNCTION(csSpScsrluBufferInfoHost, cusolverSpScsrluBufferInfoHost, cusolverSpHandle_t ,int ,int ,const cusparseMatDescr_t ,void *,void *,void *,csrluInfoHost_t ,size_t *,size_t *); +PER_CUSOLVER_FUNCTION(csSpScsrluFactorHost, cusolverSpScsrluFactorHost, cusolverSpHandle_t ,int ,int ,const cusparseMatDescr_t ,void *,void *,void *,csrluInfoHost_t,float, void *); +PER_CUSOLVER_FUNCTION(csSpScsrluZeroPivotHost, cusolverSpScsrluZeroPivotHost, cusolverSpHandle_t, csrluInfoHost_t ,float ,void *); +PER_CUSOLVER_FUNCTION(csSpScsrluSolveHost, cusolverSpScsrluSolveHost, cusolverSpHandle_t ,int ,void *,void *,csrluInfoHost_t ,void *); diff --git a/tests/python/test_sparse_linear_solver.py b/tests/python/test_sparse_linear_solver.py index 7408c8da1981c..14a235740c6ca 100644 --- a/tests/python/test_sparse_linear_solver.py +++ b/tests/python/test_sparse_linear_solver.py @@ -116,14 +116,10 @@ def fill(A_builder: ti.types.sparse_matrix_builder(), fill(A_builder, A_coo.row, A_coo.col, A_coo.data) A_ti = A_builder.build() x_ti = ti.ndarray(shape=ncols, dtype=ti.float32) - solver = ti.linalg.SparseSolver() - x_ti = solver.solve_cu(A_ti, b) - ti.sync() # solve Ax=b using numpy b_np = b.to_numpy() x_np = np.linalg.solve(A_psd, b_np) - assert (np.allclose(x_ti.to_numpy(), x_np, rtol=5.0e-3)) # solve Ax=b using cusolver refectorization solver = ti.linalg.SparseSolver() @@ -142,8 +138,9 @@ def fill(A_builder: ti.types.sparse_matrix_builder(), @pytest.mark.parametrize("dtype", [ti.f32]) +@pytest.mark.parametrize("solver_type", ["LLT", "LU"]) @test_utils.test(arch=ti.cuda) -def test_gpu_sparse_solver2(dtype): +def test_gpu_sparse_solver2(dtype, solver_type): n = 10 A = np.random.rand(n, n) A_psd = np.dot(A, A.transpose()) @@ -160,7 +157,7 @@ def fill(Abuilder: ti.types.sparse_matrix_builder(), fill(Abuilder, A_psd, b) A = Abuilder.build() - solver = ti.linalg.SparseSolver(dtype=dtype) + solver = ti.linalg.SparseSolver(dtype=dtype, solver_type=solver_type) solver.analyze_pattern(A) solver.factorize(A) x = solver.solve(b)