Add SparsityCsr spmv with mixed precision support
This PR adds a SparsityCsr spmv kernel, copied from the classical csr spmv, with mixed precision support.
It is mainly used by amgx_pgm, so it only uses a subwarp_size of 2 for cuda/hip and a subgroup_size of 1 for dpcpp.

Related PR: #970
yhmtsai authored Feb 16, 2022
2 parents 9d2ad37 + ff428ce commit 7048448
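
As context for the diff below, a minimal host-side sketch of what the mixed precision support enables: a SparsityCsr<double> matrix applied directly to single precision vectors. The sketch is not part of this commit; the executor choice, the 2x2 pattern, and the values are purely illustrative.

// Hedged usage sketch (not from this commit): after this change, the input and
// output precision of apply() may differ from the SparsityCsr value type.
#include <ginkgo/ginkgo.hpp>

int main()
{
    auto exec = gko::ReferenceExecutor::create();
    // SparsityCsr stores only the pattern plus a single scalar value shared by
    // all stored entries.
    auto mtx = gko::matrix::SparsityCsr<double, gko::int32>::create(exec);
    mtx->read(gko::matrix_data<double, gko::int32>{
        gko::dim<2>{2, 2}, {{0, 0, 1.0}, {0, 1, 1.0}, {1, 1, 1.0}}});
    auto b = gko::matrix::Dense<float>::create(exec, gko::dim<2>{2, 1});
    auto x = gko::matrix::Dense<float>::create(exec, gko::dim<2>{2, 1});
    b->fill(1.0f);
    // With the mixed precision dispatch added here, this apply no longer needs
    // a temporary conversion of b and x to double.
    mtx->apply(b.get(), x.get());
    return 0;
}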
Showing 19 changed files with 790 additions and 453 deletions.
110 changes: 110 additions & 0 deletions common/cuda_hip/matrix/sparsity_csr_kernels.hpp.inc
@@ -0,0 +1,110 @@
/*******************************<GINKGO LICENSE>******************************
Copyright (c) 2017-2022, the Ginkgo authors
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
3. Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
******************************<GINKGO LICENSE>*******************************/

namespace kernel {


template <size_type subwarp_size, typename MatrixValueType,
typename input_accessor, typename output_accessor, typename IndexType,
typename Closure>
__device__ void device_classical_spmv(const size_type num_rows,
const MatrixValueType* __restrict__ val,
const IndexType* __restrict__ col_idxs,
const IndexType* __restrict__ row_ptrs,
acc::range<input_accessor> b,
acc::range<output_accessor> c,
Closure scale)
{
using arithmetic_type = typename output_accessor::arithmetic_type;
auto subwarp_tile =
group::tiled_partition<subwarp_size>(group::this_thread_block());
const auto subrow = thread::get_subwarp_num_flat<subwarp_size>();
const auto subid = subwarp_tile.thread_rank();
const IndexType column_id = blockIdx.y;
const arithmetic_type value = val[0];
auto row = thread::get_subwarp_id_flat<subwarp_size>();
for (; row < num_rows; row += subrow) {
const auto ind_end = row_ptrs[row + 1];
arithmetic_type temp_val = zero<arithmetic_type>();
for (auto ind = row_ptrs[row] + subid; ind < ind_end;
ind += subwarp_size) {
temp_val += value * b(col_idxs[ind], column_id);
}
auto subwarp_result =
reduce(subwarp_tile, temp_val,
[](const arithmetic_type& a, const arithmetic_type& b) {
return a + b;
});
if (subid == 0) {
c(row, column_id) = scale(subwarp_result, c(row, column_id));
}
}
}


template <size_type subwarp_size, typename MatrixValueType,
typename input_accessor, typename output_accessor, typename IndexType>
__global__ __launch_bounds__(spmv_block_size) void abstract_classical_spmv(
const size_type num_rows, const MatrixValueType* __restrict__ val,
const IndexType* __restrict__ col_idxs,
const IndexType* __restrict__ row_ptrs, acc::range<input_accessor> b,
acc::range<output_accessor> c)
{
using type = typename output_accessor::arithmetic_type;
device_classical_spmv<subwarp_size>(
num_rows, val, col_idxs, row_ptrs, b, c,
[](const type& x, const type& y) { return x; });
}


template <size_type subwarp_size, typename MatrixValueType,
typename input_accessor, typename output_accessor, typename IndexType>
__global__ __launch_bounds__(spmv_block_size) void abstract_classical_spmv(
const size_type num_rows, const MatrixValueType* __restrict__ alpha,
const MatrixValueType* __restrict__ val,
const IndexType* __restrict__ col_idxs,
const IndexType* __restrict__ row_ptrs, acc::range<input_accessor> b,
const typename output_accessor::storage_type* __restrict__ beta,
acc::range<output_accessor> c)
{
using type = typename output_accessor::arithmetic_type;
const type alpha_val = alpha[0];
const type beta_val = beta[0];
device_classical_spmv<subwarp_size>(
num_rows, val, col_idxs, row_ptrs, b, c,
[&alpha_val, &beta_val](const type& x, const type& y) {
return alpha_val * x + beta_val * y;
});
}


} // namespace kernel
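
For readers unfamiliar with the subwarp-per-row pattern used in the kernel above, the following self-contained CUDA sketch shows the same idea specialized to subwarp_size = 2: each pair of threads strides over one row's column indices, scales by the single stored value, and merges its two partial sums with a warp shuffle. It is not part of the diff; the kernel name, launch configuration, and tiny test matrix are illustrative only.

// Standalone CUDA sketch of the subwarp-per-row reduction, with subwarp_size = 2.
#include <cstdio>
#include <cuda_runtime.h>

constexpr int subwarp_size = 2;

__global__ void sparsity_spmv(int num_rows, double value,
                              const int* __restrict__ col_idxs,
                              const int* __restrict__ row_ptrs,
                              const double* __restrict__ b,
                              double* __restrict__ c)
{
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = tid / subwarp_size;  // two threads cooperate on each row
    const int lane = tid % subwarp_size;
    double sum = 0.0;
    if (row < num_rows) {
        for (int i = row_ptrs[row] + lane; i < row_ptrs[row + 1];
             i += subwarp_size) {
            sum += value * b[col_idxs[i]];  // every entry shares the one value
        }
    }
    // combine the two partial sums inside the 2-thread subwarp
    sum += __shfl_down_sync(0xffffffffu, sum, 1, subwarp_size);
    if (row < num_rows && lane == 0) {
        c[row] = sum;
    }
}

int main()
{
    // dense 2x2 all-ones pattern, b = (1, 2)^T, expected result c = (3, 3)^T
    const int h_row_ptrs[] = {0, 2, 4};
    const int h_col_idxs[] = {0, 1, 0, 1};
    const double h_b[] = {1.0, 2.0};
    int *d_row_ptrs, *d_col_idxs;
    double *d_b, *d_c;
    cudaMalloc(&d_row_ptrs, sizeof(h_row_ptrs));
    cudaMalloc(&d_col_idxs, sizeof(h_col_idxs));
    cudaMalloc(&d_b, sizeof(h_b));
    cudaMalloc(&d_c, sizeof(h_b));
    cudaMemcpy(d_row_ptrs, h_row_ptrs, sizeof(h_row_ptrs), cudaMemcpyHostToDevice);
    cudaMemcpy(d_col_idxs, h_col_idxs, sizeof(h_col_idxs), cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, h_b, sizeof(h_b), cudaMemcpyHostToDevice);
    sparsity_spmv<<<1, 32>>>(2, 1.0, d_col_idxs, d_row_ptrs, d_b, d_c);
    double h_c[2];
    cudaMemcpy(h_c, d_c, sizeof(h_c), cudaMemcpyDeviceToHost);  // implicit sync
    std::printf("c = (%g, %g)\n", h_c[0], h_c[1]);
    // device memory cleanup omitted for brevity
    return 0;
}
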
5 changes: 3 additions & 2 deletions core/device_hooks/common_kernels.inc.cpp
@@ -452,8 +452,9 @@ GKO_STUB_NON_COMPLEX_VALUE_TYPE(GKO_DECLARE_MULTIGRID_KCYCLE_CHECK_STOP_KERNEL);
namespace sparsity_csr {


GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SPARSITY_CSR_SPMV_KERNEL);
GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SPARSITY_CSR_ADVANCED_SPMV_KERNEL);
GKO_STUB_MIXED_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SPARSITY_CSR_SPMV_KERNEL);
GKO_STUB_MIXED_VALUE_AND_INDEX_TYPE(
GKO_DECLARE_SPARSITY_CSR_ADVANCED_SPMV_KERNEL);
GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SPARSITY_CSR_FILL_IN_DENSE_KERNEL);
GKO_STUB_VALUE_AND_INDEX_TYPE(
GKO_DECLARE_SPARSITY_CSR_COUNT_NUM_DIAGONAL_ELEMENTS_KERNEL);
13 changes: 8 additions & 5 deletions core/matrix/sparsity_csr.cpp
@@ -76,7 +76,7 @@ template <typename ValueType, typename IndexType>
void SparsityCsr<ValueType, IndexType>::apply_impl(const LinOp* b,
LinOp* x) const
{
precision_dispatch_real_complex<ValueType>(
mixed_precision_dispatch_real_complex<ValueType>(
[this](auto dense_b, auto dense_x) {
this->get_executor()->run(
sparsity_csr::make_spmv(this, dense_b, dense_x));
@@ -91,12 +91,15 @@ void SparsityCsr<ValueType, IndexType>::apply_impl(const LinOp* alpha,
const LinOp* beta,
LinOp* x) const
{
precision_dispatch_real_complex<ValueType>(
[this](auto dense_alpha, auto dense_b, auto dense_beta, auto dense_x) {
mixed_precision_dispatch_real_complex<ValueType>(
[this, alpha, beta](auto dense_b, auto dense_x) {
auto dense_alpha = make_temporary_conversion<ValueType>(alpha);
auto dense_beta = make_temporary_conversion<
typename std::decay_t<decltype(*dense_x)>::value_type>(beta);
this->get_executor()->run(sparsity_csr::make_advanced_spmv(
dense_alpha, this, dense_b, dense_beta, dense_x));
dense_alpha.get(), this, dense_b, dense_beta.get(), dense_x));
},
alpha, b, beta, x);
b, x);
}


40 changes: 24 additions & 16 deletions core/matrix/sparsity_csr_kernels.hpp
@@ -48,18 +48,22 @@ namespace gko {
namespace kernels {


#define GKO_DECLARE_SPARSITY_CSR_SPMV_KERNEL(ValueType, IndexType) \
void spmv(std::shared_ptr<const DefaultExecutor> exec, \
const matrix::SparsityCsr<ValueType, IndexType>* a, \
const matrix::Dense<ValueType>* b, matrix::Dense<ValueType>* c)

#define GKO_DECLARE_SPARSITY_CSR_ADVANCED_SPMV_KERNEL(ValueType, IndexType) \
void advanced_spmv(std::shared_ptr<const DefaultExecutor> exec, \
const matrix::Dense<ValueType>* alpha, \
const matrix::SparsityCsr<ValueType, IndexType>* a, \
const matrix::Dense<ValueType>* b, \
const matrix::Dense<ValueType>* beta, \
matrix::Dense<ValueType>* c)
#define GKO_DECLARE_SPARSITY_CSR_SPMV_KERNEL(MatrixValueType, InputValueType, \
OutputValueType, IndexType) \
void spmv(std::shared_ptr<const DefaultExecutor> exec, \
const matrix::SparsityCsr<MatrixValueType, IndexType>* a, \
const matrix::Dense<InputValueType>* b, \
matrix::Dense<OutputValueType>* c)

#define GKO_DECLARE_SPARSITY_CSR_ADVANCED_SPMV_KERNEL( \
MatrixValueType, InputValueType, OutputValueType, IndexType) \
void advanced_spmv( \
std::shared_ptr<const DefaultExecutor> exec, \
const matrix::Dense<MatrixValueType>* alpha, \
const matrix::SparsityCsr<MatrixValueType, IndexType>* a, \
const matrix::Dense<InputValueType>* b, \
const matrix::Dense<OutputValueType>* beta, \
matrix::Dense<OutputValueType>* c)

#define GKO_DECLARE_SPARSITY_CSR_FILL_IN_DENSE_KERNEL(ValueType, IndexType) \
void fill_in_dense(std::shared_ptr<const DefaultExecutor> exec, \
@@ -98,10 +102,14 @@ namespace kernels {
bool* is_sorted)

#define GKO_DECLARE_ALL_AS_TEMPLATES \
template <typename ValueType, typename IndexType> \
GKO_DECLARE_SPARSITY_CSR_SPMV_KERNEL(ValueType, IndexType); \
template <typename ValueType, typename IndexType> \
GKO_DECLARE_SPARSITY_CSR_ADVANCED_SPMV_KERNEL(ValueType, IndexType); \
template <typename MatrixValueType, typename InputValueType, \
typename OutputValueType, typename IndexType> \
GKO_DECLARE_SPARSITY_CSR_SPMV_KERNEL(MatrixValueType, InputValueType, \
OutputValueType, IndexType); \
template <typename MatrixValueType, typename InputValueType, \
typename OutputValueType, typename IndexType> \
GKO_DECLARE_SPARSITY_CSR_ADVANCED_SPMV_KERNEL( \
MatrixValueType, InputValueType, OutputValueType, IndexType); \
template <typename ValueType, typename IndexType> \
GKO_DECLARE_SPARSITY_CSR_FILL_IN_DENSE_KERNEL(ValueType, IndexType); \
template <typename ValueType, typename IndexType> \
14 changes: 13 additions & 1 deletion core/multigrid/amgx_pgm.cpp
@@ -43,6 +43,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include <ginkgo/core/matrix/dense.hpp>
#include <ginkgo/core/matrix/identity.hpp>
#include <ginkgo/core/matrix/row_gatherer.hpp>
#include <ginkgo/core/matrix/sparsity_csr.hpp>


#include "core/base/utils.hpp"
@@ -161,6 +162,16 @@ void AmgxPgm<ValueType, IndexType>::generate()
one<ValueType>()));
// TODO: implement the restrict_op from aggregation.
auto restrict_op = gko::as<csr_type>(share(prolong_csr->transpose()));
auto restrict_sparsity =
share(matrix::SparsityCsr<ValueType, IndexType>::create(
exec, restrict_op->get_size(),
restrict_op->get_num_stored_elements()));
exec->copy_from(exec.get(), static_cast<size_type>(num_agg + 1),
restrict_op->get_const_row_ptrs(),
restrict_sparsity->get_row_ptrs());
exec->copy_from(exec.get(), restrict_op->get_num_stored_elements(),
restrict_op->get_const_col_idxs(),
restrict_sparsity->get_col_idxs());

// Construct the coarse matrix
// TODO: use less memory footprint to improve it
@@ -170,7 +181,8 @@ void AmgxPgm<ValueType, IndexType>::generate()
amgxpgm_op->apply(prolong_csr.get(), tmp.get());
restrict_op->apply(tmp.get(), coarse_matrix.get());

this->set_multigrid_level(prolong_row_gather, coarse_matrix, restrict_op);
this->set_multigrid_level(prolong_row_gather, coarse_matrix,
restrict_sparsity);
}


35 changes: 25 additions & 10 deletions core/test/matrix/fbcsr_sample.hpp
@@ -114,22 +114,25 @@ class FbcsrSample {
static_cast<gko::acc::size_type>(bs)},
v);

if (mtx->get_size()[0] % bs != 0)
if (mtx->get_size()[0] % bs != 0) {
throw gko::BadDimension(__FILE__, __LINE__, __func__, "test fbcsr",
mtx->get_size()[0], mtx->get_size()[1],
"block size does not divide the size!");
}

for (index_type ibrow = 0; ibrow < mtx->get_num_block_rows(); ibrow++) {
const index_type* const browptr = mtx->get_row_ptrs();
for (index_type inz = browptr[ibrow]; inz < browptr[ibrow + 1];
inz++) {
const index_type bcolind = mtx->get_col_idxs()[inz];
const value_type base = (ibrow + 1) * (bcolind + 1);
for (int ival = 0; ival < bs; ival++)
for (int jval = 0; jval < bs; jval++)
for (int ival = 0; ival < bs; ival++) {
for (int jval = 0; jval < bs; jval++) {
vals(inz, ival, jval) =
base + static_cast<gko::remove_complex<value_type>>(
ival * bs + jval);
}
}
}
}

@@ -175,10 +178,12 @@ class FbcsrSample {
gko::Array<IndexType> colids(exec, nbnz);
gko::Array<IndexType> rowptrs(exec, nbrows + 1);
const std::unique_ptr<const Fbcsr> fbmat = generate_fbcsr();
for (index_type i = 0; i < nbrows + 1; i++)
for (index_type i = 0; i < nbrows + 1; i++) {
rowptrs.get_data()[i] = fbmat->get_const_row_ptrs()[i];
for (index_type i = 0; i < nbnz; i++)
}
for (index_type i = 0; i < nbnz; i++) {
colids.get_data()[i] = fbmat->get_const_col_idxs()[i];
}
return SparCsr::create(exec, gko::dim<2>{nbrows, nbcols}, colids,
rowptrs);
}
@@ -302,7 +307,9 @@ class FbcsrSample2 {
gko::Array<index_type> c(exec, {0, 0, 3, 2});
gko::Array<value_type> vals(exec, nnz);
value_type* const v = vals.get_data();
for (IndexType i = 0; i < nnz; i++) v[i] = 0.15 + fbcsr_test_offset;
for (IndexType i = 0; i < nnz; i++) {
v[i] = 0.15 + fbcsr_test_offset;
}

v[0] = 1;
v[1] = 3;
@@ -328,7 +335,9 @@ class FbcsrSample2 {
exec, {0, 1, 0, 1, 0, 1, 6, 7, 0, 1, 6, 7, 4, 5, 4, 5});
gko::Array<value_type> vals(exec, nnz);
value_type* const v = vals.get_data();
for (IndexType i = 0; i < nnz; i++) v[i] = 0.15 + fbcsr_test_offset;
for (IndexType i = 0; i < nnz; i++) {
v[i] = 0.15 + fbcsr_test_offset;
}
v[0] = 1;
v[1] = 2;
v[2] = 3;
@@ -401,7 +410,9 @@ class FbcsrSampleSquare {
gko::Array<index_type> r(exec, {0, 1, 2});
gko::Array<value_type> vals(exec, nnz);
value_type* const v = vals.get_data();
for (IndexType i = 0; i < nnz; i++) v[i] = i;
for (IndexType i = 0; i < nnz; i++) {
v[i] = i;
}

return Fbcsr::create(exec,
gko::dim<2>{static_cast<size_type>(nrows),
@@ -447,7 +458,9 @@ class FbcsrSampleComplex {
gko::Array<index_type> c(exec, {0, 0, 3, 2});
gko::Array<value_type> vals(exec, nnz);
value_type* const v = vals.get_data();
for (IndexType i = 0; i < nnz; i++) v[i] = 0.15 + fbcsr_test_offset;
for (IndexType i = 0; i < nnz; i++) {
v[i] = 0.15 + fbcsr_test_offset;
}

using namespace std::complex_literals;
v[0] = 1.0 + 1.15i;
@@ -474,7 +487,9 @@ class FbcsrSampleComplex {
exec, {0, 1, 0, 1, 0, 1, 6, 7, 0, 1, 6, 7, 4, 5, 4, 5});
gko::Array<value_type> vals(exec, nnz);
value_type* const v = vals.get_data();
for (IndexType i = 0; i < nnz; i++) v[i] = 0.15 + fbcsr_test_offset;
for (IndexType i = 0; i < nnz; i++) {
v[i] = 0.15 + fbcsr_test_offset;
}

using namespace std::complex_literals;
v[0] = 1.0 + 1.15i;