Add compact factor storage support to triangular solvers #1072

Merged · 4 commits · Jul 19, 2022
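
The changes below thread a new `unit_diagonal` solver parameter from `LowerTrs`/`UpperTrs` through the generate and solve kernels, so triangular factors stored without an explicit diagonal (compact factor storage) are solved as if the diagonal were 1. A minimal usage sketch follows; the `with_unit_diagonal` setter name is assumed from Ginkgo's factory-parameter convention for the `unit_diagonal` parameter referenced in this diff, and the matrix setup is purely illustrative.

#include <ginkgo/ginkgo.hpp>

int main()
{
    auto exec = gko::CudaExecutor::create(0, gko::OmpExecutor::create());
    // L holds only the strictly lower triangle of the factor; the unit
    // diagonal is implicit (compact factor storage).
    auto L = gko::share(gko::matrix::Csr<double, gko::int32>::create(exec));
    // Assumed factory setter generated from the unit_diagonal parameter.
    auto solver = gko::solver::LowerTrs<double, gko::int32>::build()
                      .with_unit_diagonal(true)
                      .on(exec)
                      ->generate(L);
    // solver->apply(b, x) now divides by an implicit 1 instead of a
    // stored diagonal entry.
}
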
13 changes: 7 additions & 6 deletions core/solver/lower_trs.cpp
@@ -134,9 +134,9 @@ template <typename ValueType, typename IndexType>
void LowerTrs<ValueType, IndexType>::generate()
{
if (this->get_system_matrix()) {
this->get_executor()->run(
lower_trs::make_generate(this->get_system_matrix().get(),
this->solve_struct_, parameters_.num_rhs));
this->get_executor()->run(lower_trs::make_generate(
this->get_system_matrix().get(), this->solve_struct_,
this->get_parameters().unit_diagonal, parameters_.num_rhs));
}
}

@@ -176,9 +176,10 @@ void LowerTrs<ValueType, IndexType>::apply_impl(const LinOp* b, LinOp* x) const
trans_x = this->template create_workspace_op<Vector>(
ws::transposed_x, gko::transpose(dense_x->get_size()));
}
exec->run(lower_trs::make_solve(lend(this->get_system_matrix()),
lend(this->solve_struct_), trans_b,
trans_x, dense_b, dense_x));
exec->run(lower_trs::make_solve(
lend(this->get_system_matrix()), lend(this->solve_struct_),
this->get_parameters().unit_diagonal, trans_b, trans_x, dense_b,
dense_x));
},
b, x);
}
4 changes: 2 additions & 2 deletions core/solver/lower_trs_kernels.hpp
@@ -62,13 +62,13 @@ namespace lower_trs {
void generate(std::shared_ptr<const DefaultExecutor> exec, \
const matrix::Csr<_vtype, _itype>* matrix, \
std::shared_ptr<solver::SolveStruct>& solve_struct, \
const gko::size_type num_rhs)
bool unit_diag, const gko::size_type num_rhs)


#define GKO_DECLARE_LOWER_TRS_SOLVE_KERNEL(_vtype, _itype) \
void solve(std::shared_ptr<const DefaultExecutor> exec, \
const matrix::Csr<_vtype, _itype>* matrix, \
const solver::SolveStruct* solve_struct, \
const solver::SolveStruct* solve_struct, bool unit_diag, \
matrix::Dense<_vtype>* trans_b, matrix::Dense<_vtype>* trans_x, \
const matrix::Dense<_vtype>* b, matrix::Dense<_vtype>* x)

13 changes: 7 additions & 6 deletions core/solver/upper_trs.cpp
@@ -134,9 +134,9 @@ template <typename ValueType, typename IndexType>
void UpperTrs<ValueType, IndexType>::generate()
{
if (this->get_system_matrix()) {
this->get_executor()->run(
upper_trs::make_generate(this->get_system_matrix().get(),
this->solve_struct_, parameters_.num_rhs));
this->get_executor()->run(upper_trs::make_generate(
this->get_system_matrix().get(), this->solve_struct_,
this->get_parameters().unit_diagonal, parameters_.num_rhs));
}
}

@@ -176,9 +176,10 @@ void UpperTrs<ValueType, IndexType>::apply_impl(const LinOp* b, LinOp* x) const
trans_x = this->template create_workspace_op<Vector>(
ws::transposed_x, gko::transpose(dense_x->get_size()));
}
exec->run(upper_trs::make_solve(lend(this->get_system_matrix()),
lend(this->solve_struct_), trans_b,
trans_x, dense_b, dense_x));
exec->run(upper_trs::make_solve(
lend(this->get_system_matrix()), lend(this->solve_struct_),
this->get_parameters().unit_diagonal, trans_b, trans_x, dense_b,
dense_x));
},
b, x);
}
4 changes: 2 additions & 2 deletions core/solver/upper_trs_kernels.hpp
@@ -62,13 +62,13 @@ namespace upper_trs {
void generate(std::shared_ptr<const DefaultExecutor> exec, \
const matrix::Csr<_vtype, _itype>* matrix, \
std::shared_ptr<gko::solver::SolveStruct>& solve_struct, \
const gko::size_type num_rhs)
bool unit_diag, const gko::size_type num_rhs)


#define GKO_DECLARE_UPPER_TRS_SOLVE_KERNEL(_vtype, _itype) \
void solve(std::shared_ptr<const DefaultExecutor> exec, \
const matrix::Csr<_vtype, _itype>* matrix, \
const solver::SolveStruct* solve_struct, \
const solver::SolveStruct* solve_struct, bool unit_diag, \
matrix::Dense<_vtype>* trans_b, matrix::Dense<_vtype>* trans_x, \
const matrix::Dense<_vtype>* b, matrix::Dense<_vtype>* x)

7 changes: 7 additions & 0 deletions cuda/base/cusparse_bindings.hpp
@@ -790,6 +790,13 @@ inline void set_mat_fill_mode(cusparseMatDescr_t descr,
}


inline void set_mat_diag_type(cusparseMatDescr_t descr,
cusparseDiagType_t diag_type)
{
GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseSetMatDiagType(descr, diag_type));
}


inline void destroy(cusparseMatDescr_t descr)
{
GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseDestroyMatDescr(descr));
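
The new set_mat_diag_type binding complements the existing set_mat_fill_mode wrapper when describing a triangular factor to cuSPARSE's legacy csrsv2 interface. Below is a rough sketch of how such a descriptor is typically configured with raw cuSPARSE calls (error checking omitted; this is illustrative, not the exact code from this PR).

#include <cusparse.h>

void configure_lower_unit_factor_descr(cusparseMatDescr_t& descr)
{
    cusparseCreateMatDescr(&descr);
    cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO);
    cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL);
    // Solve with the lower triangle of the stored matrix.
    cusparseSetMatFillMode(descr, CUSPARSE_FILL_MODE_LOWER);
    // With compact factor storage the diagonal is not stored, so cuSPARSE
    // must assume a unit diagonal rather than look for stored entries.
    cusparseSetMatDiagType(descr, CUSPARSE_DIAG_TYPE_UNIT);
}
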
65 changes: 42 additions & 23 deletions cuda/solver/common_trs_kernels.cuh
@@ -89,7 +89,7 @@ struct CudaSolveStruct : gko::solver::SolveStruct {

CudaSolveStruct(std::shared_ptr<const gko::CudaExecutor> exec,
const matrix::Csr<ValueType, IndexType>* matrix,
size_type num_rhs, bool is_upper)
size_type num_rhs, bool is_upper, bool unit_diag)
: handle{exec->get_cusparse_handle()},
spsm_descr{},
descr_a{},
@@ -107,7 +107,8 @@ struct CudaSolveStruct : gko::solver::SolveStruct {
descr_a, CUSPARSE_SPMAT_FILL_MODE,
is_upper ? CUSPARSE_FILL_MODE_UPPER : CUSPARSE_FILL_MODE_LOWER);
cusparse::set_attribute<cusparseDiagType_t>(
descr_a, CUSPARSE_SPMAT_DIAG_TYPE, CUSPARSE_DIAG_TYPE_NON_UNIT);
descr_a, CUSPARSE_SPMAT_DIAG_TYPE,
unit_diag ? CUSPARSE_DIAG_TYPE_UNIT : CUSPARSE_DIAG_TYPE_NON_UNIT);

const auto rows = matrix->get_size()[0];
// workaround suggested by NVIDIA engineers: for some reason
@@ -193,7 +194,7 @@

CudaSolveStruct(std::shared_ptr<const gko::CudaExecutor> exec,
const matrix::Csr<ValueType, IndexType>* matrix,
size_type num_rhs, bool is_upper)
size_type num_rhs, bool is_upper, bool unit_diag)
: exec{exec},
handle{exec->get_cusparse_handle()},
algorithm{},
@@ -208,6 +209,9 @@ struct CudaSolveStruct : gko::solver::SolveStruct {
cusparse::set_mat_fill_mode(
factor_descr,
is_upper ? CUSPARSE_FILL_MODE_UPPER : CUSPARSE_FILL_MODE_LOWER);
cusparse::set_mat_diag_type(
factor_descr,
unit_diag ? CUSPARSE_DIAG_TYPE_UNIT : CUSPARSE_DIAG_TYPE_NON_UNIT);
algorithm = 0;
policy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

@@ -286,14 +290,15 @@ template <typename ValueType, typename IndexType>
void generate_kernel(std::shared_ptr<const CudaExecutor> exec,
const matrix::Csr<ValueType, IndexType>* matrix,
std::shared_ptr<solver::SolveStruct>& solve_struct,
const gko::size_type num_rhs, bool is_upper)
const gko::size_type num_rhs, bool is_upper,
bool unit_diag)
{
if (matrix->get_size()[0] == 0) {
return;
}
if (cusparse::is_supported<ValueType, IndexType>::value) {
solve_struct = std::make_shared<CudaSolveStruct<ValueType, IndexType>>(
exec, matrix, num_rhs, is_upper);
exec, matrix, num_rhs, is_upper, unit_diag);
} else {
GKO_NOT_IMPLEMENTED;
}
@@ -377,7 +382,8 @@ __global__ void sptrsv_naive_caching_kernel(
const IndexType* const rowptrs, const IndexType* const colidxs,
const ValueType* const vals, const ValueType* const b, size_type b_stride,
ValueType* const x, size_type x_stride, const size_type n,
const size_type nrhs, bool* nan_produced, IndexType* atomic_counter)
const size_type nrhs, bool unit_diag, bool* nan_produced,
IndexType* atomic_counter)
{
__shared__ uninitialized_array<ValueType, default_block_size> x_s_array;
__shared__ IndexType block_base_idx;
@@ -409,12 +415,16 @@
// upper tri matrix: start at last entry (row_end - 1), run backward
// until first entry, which is the diagonal entry
const auto row_begin = is_upper ? rowptrs[row + 1] - 1 : rowptrs[row];
const auto row_diag = is_upper ? rowptrs[row] : rowptrs[row + 1] - 1;
const auto row_end = is_upper ? rowptrs[row] - 1 : rowptrs[row + 1];
const int row_step = is_upper ? -1 : 1;

ValueType sum = 0.0;
for (auto i = row_begin; i != row_diag; i += row_step) {
auto sum = zero<ValueType>();
auto i = row_begin;
for (; i != row_end; i += row_step) {
const auto dependency = colidxs[i];
if (is_upper ? dependency <= row : dependency >= row) {
break;
}
auto x_p = &x[dependency * x_stride + rhs];

const auto dependency_gid = is_upper ? (n - 1 - dependency) * nrhs + rhs
@@ -434,7 +444,9 @@ __global__ void sptrsv_naive_caching_kernel(
sum += x * vals[i];
}

const auto r = (b[row * b_stride + rhs] - sum) / vals[row_diag];
// The first entry past the triangular part will be the diagonal
const auto diag = unit_diag ? one<ValueType>() : vals[i];
const auto r = (b[row * b_stride + rhs] - sum) / diag;

store(x_s, self_shid, r);
x[row * x_stride + rhs] = r;
@@ -453,7 +465,8 @@ __global__ void sptrsv_naive_legacy_kernel(
const IndexType* const rowptrs, const IndexType* const colidxs,
const ValueType* const vals, const ValueType* const b, size_type b_stride,
ValueType* const x, size_type x_stride, const size_type n,
const size_type nrhs, bool* nan_produced, IndexType* atomic_counter)
const size_type nrhs, bool unit_diag, bool* nan_produced,
IndexType* atomic_counter)
{
__shared__ IndexType block_base_idx;
if (threadIdx.x == 0) {
@@ -470,29 +483,35 @@ __global__ void sptrsv_naive_legacy_kernel(
return;
}

// lower tri matrix: start at beginning, run forward until last entry,
// (row_end - 1) which is the diagonal entry
// lower tri matrix: start at beginning, run forward
// upper tri matrix: start at last entry (row_end - 1), run backward
// until first entry, which is the diagonal entry
const auto row_begin = is_upper ? rowptrs[row + 1] - 1 : rowptrs[row];
const auto row_diag = is_upper ? rowptrs[row] : rowptrs[row + 1] - 1;
const auto row_end = is_upper ? rowptrs[row] - 1 : rowptrs[row + 1];
const int row_step = is_upper ? -1 : 1;

ValueType sum = 0.0;
auto j = row_begin;
while (j != row_diag + row_step) {
auto col = colidxs[j];
auto col = colidxs[j];
while (j != row_end) {
auto x_val = load(x, col * x_stride + rhs);
while (!is_nan(x_val)) {
sum += vals[j] * x_val;
j += row_step;
col = colidxs[j];
x_val = load(x, col * x_stride + rhs);
}
if (row == col) {
const auto r = (b[row * b_stride + rhs] - sum) / vals[row_diag];
// to avoid the kernel hanging on matrices without diagonal,
// we bail out if we are past the triangle, even if it's not
// the diagonal entry. This may lead to incorrect results,
// but prevents an infinite loop.
if (is_upper ? row >= col : row <= col) {
// assert(row == col);
auto diag = unit_diag ? one<ValueType>() : vals[j];
const auto r = (b[row * b_stride + rhs] - sum) / diag;
store(x, row * x_stride + rhs, r);
j += row_step;
// after we encountered the diagonal, we are done
// this also skips entries outside the triangle
j = row_end;
if (is_nan(r)) {
store(x, row * x_stride + rhs, zero<ValueType>());
*nan_produced = true;
@@ -514,7 +533,7 @@ __global__ void sptrsv_init_kernel(bool* const nan_produced,
template <bool is_upper, typename ValueType, typename IndexType>
void sptrsv_naive_caching(std::shared_ptr<const CudaExecutor> exec,
const matrix::Csr<ValueType, IndexType>* matrix,
const matrix::Dense<ValueType>* b,
bool unit_diag, const matrix::Dense<ValueType>* b,
matrix::Dense<ValueType>* x)
{
// Pre-Volta GPUs may deadlock due to missing independent thread scheduling.
@@ -540,14 +559,14 @@ void sptrsv_naive_caching(std::shared_ptr<const CudaExecutor> exec,
matrix->get_const_row_ptrs(), matrix->get_const_col_idxs(),
as_cuda_type(matrix->get_const_values()),
as_cuda_type(b->get_const_values()), b->get_stride(),
as_cuda_type(x->get_values()), x->get_stride(), n, nrhs,
as_cuda_type(x->get_values()), x->get_stride(), n, nrhs, unit_diag,
nan_produced.get_data(), atomic_counter.get_data());
} else {
sptrsv_naive_caching_kernel<is_upper><<<grid_size, block_size>>>(
matrix->get_const_row_ptrs(), matrix->get_const_col_idxs(),
as_cuda_type(matrix->get_const_values()),
as_cuda_type(b->get_const_values()), b->get_stride(),
as_cuda_type(x->get_values()), x->get_stride(), n, nrhs,
as_cuda_type(x->get_values()), x->get_stride(), n, nrhs, unit_diag,
nan_produced.get_data(), atomic_counter.get_data());
}

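The kernel changes above drop the old assumption that the diagonal is the last stored entry of each row: the loops now stop at the first entry on or past the diagonal, and when unit_diag is set the divisor becomes 1 instead of a stored value. A sequential CPU sketch of the same idea for a lower-triangular CSR solve (illustrative only, not the CUDA kernel itself):

#include <vector>

// Solves L * x = b for a lower-triangular CSR matrix. With unit_diag set,
// the diagonal is assumed to be 1 and may be absent from the stored
// pattern (compact factor storage); entries past the diagonal are ignored.
template <typename ValueType, typename IndexType>
void lower_trs_reference(const std::vector<IndexType>& row_ptrs,
                         const std::vector<IndexType>& col_idxs,
                         const std::vector<ValueType>& vals,
                         const std::vector<ValueType>& b,
                         std::vector<ValueType>& x, bool unit_diag)
{
    const auto n = static_cast<IndexType>(b.size());
    for (IndexType row = 0; row < n; ++row) {
        ValueType sum = 0.0;
        ValueType diag = 1.0;
        for (auto i = row_ptrs[row]; i < row_ptrs[row + 1]; ++i) {
            const auto col = col_idxs[i];
            if (col < row) {
                // strictly lower part: accumulate already-computed entries
                sum += vals[i] * x[col];
            } else if (col == row && !unit_diag) {
                diag = vals[i];  // explicitly stored diagonal entry
            }
            // col > row: outside the lower triangle, skipped
        }
        x[row] = (b[row] - sum) / diag;
    }
}
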
8 changes: 4 additions & 4 deletions cuda/solver/lower_trs_kernels.cu
@@ -73,11 +73,11 @@ template <typename ValueType, typename IndexType>
void generate(std::shared_ptr<const CudaExecutor> exec,
const matrix::Csr<ValueType, IndexType>* matrix,
std::shared_ptr<solver::SolveStruct>& solve_struct,
const gko::size_type num_rhs)
bool unit_diag, const gko::size_type num_rhs)
{
if (matrix->get_strategy()->get_name() == "sparselib") {
generate_kernel<ValueType, IndexType>(exec, matrix, solve_struct,
num_rhs, false);
num_rhs, false, unit_diag);
}
}

@@ -88,15 +88,15 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
template <typename ValueType, typename IndexType>
void solve(std::shared_ptr<const CudaExecutor> exec,
const matrix::Csr<ValueType, IndexType>* matrix,
const solver::SolveStruct* solve_struct,
const solver::SolveStruct* solve_struct, bool unit_diag,
matrix::Dense<ValueType>* trans_b, matrix::Dense<ValueType>* trans_x,
const matrix::Dense<ValueType>* b, matrix::Dense<ValueType>* x)
{
if (matrix->get_strategy()->get_name() == "sparselib") {
solve_kernel<ValueType, IndexType>(exec, matrix, solve_struct, trans_b,
trans_x, b, x);
} else {
sptrsv_naive_caching<false>(exec, matrix, b, x);
sptrsv_naive_caching<false>(exec, matrix, unit_diag, b, x);
}
}

8 changes: 4 additions & 4 deletions cuda/solver/upper_trs_kernels.cu
@@ -73,11 +73,11 @@ template <typename ValueType, typename IndexType>
void generate(std::shared_ptr<const CudaExecutor> exec,
const matrix::Csr<ValueType, IndexType>* matrix,
std::shared_ptr<solver::SolveStruct>& solve_struct,
const gko::size_type num_rhs)
bool unit_diag, const gko::size_type num_rhs)
{
if (matrix->get_strategy()->get_name() == "sparselib") {
generate_kernel<ValueType, IndexType>(exec, matrix, solve_struct,
num_rhs, true);
num_rhs, true, unit_diag);
}
}

@@ -88,15 +88,15 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
template <typename ValueType, typename IndexType>
void solve(std::shared_ptr<const CudaExecutor> exec,
const matrix::Csr<ValueType, IndexType>* matrix,
const solver::SolveStruct* solve_struct,
const solver::SolveStruct* solve_struct, bool unit_diag,
matrix::Dense<ValueType>* trans_b, matrix::Dense<ValueType>* trans_x,
const matrix::Dense<ValueType>* b, matrix::Dense<ValueType>* x)
{
if (matrix->get_strategy()->get_name() == "sparselib") {
solve_kernel<ValueType, IndexType>(exec, matrix, solve_struct, trans_b,
trans_x, b, x);
} else {
sptrsv_naive_caching<true>(exec, matrix, b, x);
sptrsv_naive_caching<true>(exec, matrix, unit_diag, b, x);
}
}

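In both of the files above, generate() and solve() route through cuSPARSE only when the CSR matrix uses the strategy named "sparselib"; any other strategy falls back to the naive sptrsv kernels. A hedged sketch of opting into the cuSPARSE path from user code (the set_strategy call and sparselib strategy class are taken from the current Csr interface; matrix contents are omitted):

using Csr = gko::matrix::Csr<double, gko::int32>;
auto exec = gko::CudaExecutor::create(0, gko::OmpExecutor::create());
auto factor = gko::share(Csr::create(exec));
// The "sparselib" strategy makes LowerTrs/UpperTrs dispatch to cuSPARSE;
// other strategies use the sptrsv_naive_caching kernels instead.
factor->set_strategy(std::make_shared<Csr::sparselib>());
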
5 changes: 5 additions & 0 deletions dev_tools/scripts/config
@@ -34,6 +34,11 @@
- FixInclude: "common/unified/base/kernel_launch_solver.hpp"
- "test/base/kernel_launch_generic.cpp"
- FixInclude: "common/unified/base/kernel_launch.hpp"
- "^test/solver/(lower|upper)_trs_kernels.cpp"
- CoreSuffix: "_kernels"
- PathPrefix: "ginkgo/core"
- PathIgnore: "0"
- RemoveTest: "true"
- "elimination_forest\.cpp"
- FixInclude: "core/factorization/elimination_forest.hpp"
- "common/unified/.*.cpp"
4 changes: 2 additions & 2 deletions dpcpp/solver/lower_trs_kernels.dp.cpp
@@ -70,7 +70,7 @@ template <typename ValueType, typename IndexType>
void generate(std::shared_ptr<const DpcppExecutor> exec,
const matrix::Csr<ValueType, IndexType>* matrix,
std::shared_ptr<solver::SolveStruct>& solve_struct,
const gko::size_type num_rhs) GKO_NOT_IMPLEMENTED;
bool unit_diag, const gko::size_type num_rhs) GKO_NOT_IMPLEMENTED;

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
GKO_DECLARE_LOWER_TRS_GENERATE_KERNEL);
@@ -83,7 +83,7 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
template <typename ValueType, typename IndexType>
void solve(std::shared_ptr<const DpcppExecutor> exec,
const matrix::Csr<ValueType, IndexType>* matrix,
const solver::SolveStruct* solve_struct,
const solver::SolveStruct* solve_struct, bool unit_diag,
matrix::Dense<ValueType>* trans_b, matrix::Dense<ValueType>* trans_x,
const matrix::Dense<ValueType>* b,
matrix::Dense<ValueType>* x) GKO_NOT_IMPLEMENTED;
4 changes: 2 additions & 2 deletions dpcpp/solver/upper_trs_kernels.dp.cpp
@@ -70,7 +70,7 @@ template <typename ValueType, typename IndexType>
void generate(std::shared_ptr<const DpcppExecutor> exec,
const matrix::Csr<ValueType, IndexType>* matrix,
std::shared_ptr<solver::SolveStruct>& solve_struct,
const gko::size_type num_rhs) GKO_NOT_IMPLEMENTED;
bool unit_diag, const gko::size_type num_rhs) GKO_NOT_IMPLEMENTED;

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
GKO_DECLARE_UPPER_TRS_GENERATE_KERNEL);
@@ -83,7 +83,7 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
template <typename ValueType, typename IndexType>
void solve(std::shared_ptr<const DpcppExecutor> exec,
const matrix::Csr<ValueType, IndexType>* matrix,
const solver::SolveStruct* solve_struct,
const solver::SolveStruct* solve_struct, bool unit_diag,
matrix::Dense<ValueType>* trans_b, matrix::Dense<ValueType>* trans_x,
const matrix::Dense<ValueType>* b,
matrix::Dense<ValueType>* x) GKO_NOT_IMPLEMENTED;