Add SparsityCsr spmv with mixed precision support
This PR adds a SparsityCsr spmv kernel, copied from the classical csr spmv, with mixed precision support.
It is mainly used by amgx_pgm, so it only uses a subwarp_size of 2 for cuda/hip and a subgroup_size of 1 for dpcpp.

Related PR: #970
yhmtsai authored Feb 16, 2022
2 parents 9d2ad37 + ff428ce commit 7048448
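
As context for the diff below, a minimal host-side sketch of what the mixed precision support enables: a SparsityCsr<double> matrix applied directly to single precision vectors. The sketch is not part of this commit; the executor choice, the 2x2 pattern, and the values are purely illustrative.

// Hedged usage sketch (not from this commit): after this change, the input and
// output precision of apply() may differ from the SparsityCsr value type.
#include <ginkgo/ginkgo.hpp>

int main()
{
    auto exec = gko::ReferenceExecutor::create();
    // SparsityCsr stores only the pattern plus a single scalar value shared by
    // all stored entries.
    auto mtx = gko::matrix::SparsityCsr<double, gko::int32>::create(exec);
    mtx->read(gko::matrix_data<double, gko::int32>{
        gko::dim<2>{2, 2}, {{0, 0, 1.0}, {0, 1, 1.0}, {1, 1, 1.0}}});
    auto b = gko::matrix::Dense<float>::create(exec, gko::dim<2>{2, 1});
    auto x = gko::matrix::Dense<float>::create(exec, gko::dim<2>{2, 1});
    b->fill(1.0f);
    // With the mixed precision dispatch added here, this apply no longer needs
    // a temporary conversion of b and x to double.
    mtx->apply(b.get(), x.get());
    return 0;
}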
Showing 19 changed files with 790 additions and 453 deletions.
110 changes: 110 additions & 0 deletions common/cuda_hip/matrix/sparsity_csr_kernels.hpp.inc
@@ -0,0 +1,110 @@
/*******************************<GINKGO LICENSE>******************************
Copyright (c) 2017-2022, the Ginkgo authors
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
3. Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
******************************<GINKGO LICENSE>*******************************/

namespace kernel {


template <size_type subwarp_size, typename MatrixValueType,
typename input_accessor, typename output_accessor, typename IndexType,
typename Closure>
__device__ void device_classical_spmv(const size_type num_rows,
const MatrixValueType* __restrict__ val,
const IndexType* __restrict__ col_idxs,
const IndexType* __restrict__ row_ptrs,
acc::range<input_accessor> b,
acc::range<output_accessor> c,
Closure scale)
{
using arithmetic_type = typename output_accessor::arithmetic_type;
auto subwarp_tile =
group::tiled_partition<subwarp_size>(group::this_thread_block());
const auto subrow = thread::get_subwarp_num_flat<subwarp_size>();
const auto subid = subwarp_tile.thread_rank();
const IndexType column_id = blockIdx.y;
const arithmetic_type value = val[0];
auto row = thread::get_subwarp_id_flat<subwarp_size>();
for (; row < num_rows; row += subrow) {
const auto ind_end = row_ptrs[row + 1];
arithmetic_type temp_val = zero<arithmetic_type>();
for (auto ind = row_ptrs[row] + subid; ind < ind_end;
ind += subwarp_size) {
temp_val += value * b(col_idxs[ind], column_id);
}
auto subwarp_result =
reduce(subwarp_tile, temp_val,
[](const arithmetic_type& a, const arithmetic_type& b) {
return a + b;
});
if (subid == 0) {
c(row, column_id) = scale(subwarp_result, c(row, column_id));
}
}
}


template <size_type subwarp_size, typename MatrixValueType,
typename input_accessor, typename output_accessor, typename IndexType>
__global__ __launch_bounds__(spmv_block_size) void abstract_classical_spmv(
const size_type num_rows, const MatrixValueType* __restrict__ val,
const IndexType* __restrict__ col_idxs,
const IndexType* __restrict__ row_ptrs, acc::range<input_accessor> b,
acc::range<output_accessor> c)
{
using type = typename output_accessor::arithmetic_type;
device_classical_spmv<subwarp_size>(
num_rows, val, col_idxs, row_ptrs, b, c,
[](const type& x, const type& y) { return x; });
}


template <size_type subwarp_size, typename MatrixValueType,
typename input_accessor, typename output_accessor, typename IndexType>
__global__ __launch_bounds__(spmv_block_size) void abstract_classical_spmv(
const size_type num_rows, const MatrixValueType* __restrict__ alpha,
const MatrixValueType* __restrict__ val,
const IndexType* __restrict__ col_idxs,
const IndexType* __restrict__ row_ptrs, acc::range<input_accessor> b,
const typename output_accessor::storage_type* __restrict__ beta,
acc::range<output_accessor> c)
{
using type = typename output_accessor::arithmetic_type;
const type alpha_val = alpha[0];
const type beta_val = beta[0];
device_classical_spmv<subwarp_size>(
num_rows, val, col_idxs, row_ptrs, b, c,
[&alpha_val, &beta_val](const type& x, const type& y) {
return alpha_val * x + beta_val * y;
});
}


} // namespace kernel
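
For readers unfamiliar with the subwarp-per-row pattern used in the kernel above, the following self-contained CUDA sketch shows the same idea specialized to subwarp_size = 2: each pair of threads strides over one row's column indices, scales by the single stored value, and merges its two partial sums with a warp shuffle. It is not part of the diff; the kernel name, launch configuration, and tiny test matrix are illustrative only.

// Standalone CUDA sketch of the subwarp-per-row reduction, with subwarp_size = 2.
#include <cstdio>
#include <cuda_runtime.h>

constexpr int subwarp_size = 2;

__global__ void sparsity_spmv(int num_rows, double value,
                              const int* __restrict__ col_idxs,
                              const int* __restrict__ row_ptrs,
                              const double* __restrict__ b,
                              double* __restrict__ c)
{
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = tid / subwarp_size;  // two threads cooperate on each row
    const int lane = tid % subwarp_size;
    double sum = 0.0;
    if (row < num_rows) {
        for (int i = row_ptrs[row] + lane; i < row_ptrs[row + 1];
             i += subwarp_size) {
            sum += value * b[col_idxs[i]];  // every entry shares the one value
        }
    }
    // combine the two partial sums inside the 2-thread subwarp
    sum += __shfl_down_sync(0xffffffffu, sum, 1, subwarp_size);
    if (row < num_rows && lane == 0) {
        c[row] = sum;
    }
}

int main()
{
    // dense 2x2 all-ones pattern, b = (1, 2)^T, expected result c = (3, 3)^T
    const int h_row_ptrs[] = {0, 2, 4};
    const int h_col_idxs[] = {0, 1, 0, 1};
    const double h_b[] = {1.0, 2.0};
    int *d_row_ptrs, *d_col_idxs;
    double *d_b, *d_c;
    cudaMalloc(&d_row_ptrs, sizeof(h_row_ptrs));
    cudaMalloc(&d_col_idxs, sizeof(h_col_idxs));
    cudaMalloc(&d_b, sizeof(h_b));
    cudaMalloc(&d_c, sizeof(h_b));
    cudaMemcpy(d_row_ptrs, h_row_ptrs, sizeof(h_row_ptrs), cudaMemcpyHostToDevice);
    cudaMemcpy(d_col_idxs, h_col_idxs, sizeof(h_col_idxs), cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, h_b, sizeof(h_b), cudaMemcpyHostToDevice);
    sparsity_spmv<<<1, 32>>>(2, 1.0, d_col_idxs, d_row_ptrs, d_b, d_c);
    double h_c[2];
    cudaMemcpy(h_c, d_c, sizeof(h_c), cudaMemcpyDeviceToHost);  // implicit sync
    std::printf("c = (%g, %g)\n", h_c[0], h_c[1]);
    // device memory cleanup omitted for brevity
    return 0;
}
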
5 changes: 3 additions & 2 deletions core/device_hooks/common_kernels.inc.cpp
@@ -452,8 +452,9 @@ GKO_STUB_NON_COMPLEX_VALUE_TYPE(GKO_DECLARE_MULTIGRID_KCYCLE_CHECK_STOP_KERNEL);
namespace sparsity_csr {


GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SPARSITY_CSR_SPMV_KERNEL);
GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SPARSITY_CSR_ADVANCED_SPMV_KERNEL);
GKO_STUB_MIXED_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SPARSITY_CSR_SPMV_KERNEL);
GKO_STUB_MIXED_VALUE_AND_INDEX_TYPE(
GKO_DECLARE_SPARSITY_CSR_ADVANCED_SPMV_KERNEL);
GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SPARSITY_CSR_FILL_IN_DENSE_KERNEL);
GKO_STUB_VALUE_AND_INDEX_TYPE(
GKO_DECLARE_SPARSITY_CSR_COUNT_NUM_DIAGONAL_ELEMENTS_KERNEL);
13 changes: 8 additions & 5 deletions core/matrix/sparsity_csr.cpp
@@ -76,7 +76,7 @@ template <typename ValueType, typename IndexType>
void SparsityCsr<ValueType, IndexType>::apply_impl(const LinOp* b,
LinOp* x) const
{
precision_dispatch_real_complex<ValueType>(
mixed_precision_dispatch_real_complex<ValueType>(
[this](auto dense_b, auto dense_x) {
this->get_executor()->run(
sparsity_csr::make_spmv(this, dense_b, dense_x));
@@ -91,12 +91,15 @@ void SparsityCsr<ValueType, IndexType>::apply_impl(const LinOp* alpha,
const LinOp* beta,
LinOp* x) const
{
precision_dispatch_real_complex<ValueType>(
[this](auto dense_alpha, auto dense_b, auto dense_beta, auto dense_x) {
mixed_precision_dispatch_real_complex<ValueType>(
[this, alpha, beta](auto dense_b, auto dense_x) {
auto dense_alpha = make_temporary_conversion<ValueType>(alpha);
auto dense_beta = make_temporary_conversion<
typename std::decay_t<decltype(*dense_x)>::value_type>(beta);
this->get_executor()->run(sparsity_csr::make_advanced_spmv(
dense_alpha, this, dense_b, dense_beta, dense_x));
dense_alpha.get(), this, dense_b, dense_beta.get(), dense_x));
},
alpha, b, beta, x);
b, x);
}


40 changes: 24 additions & 16 deletions core/matrix/sparsity_csr_kernels.hpp
@@ -48,18 +48,22 @@ namespace gko {
namespace kernels {


#define GKO_DECLARE_SPARSITY_CSR_SPMV_KERNEL(ValueType, IndexType) \
void spmv(std::shared_ptr<const DefaultExecutor> exec, \
const matrix::SparsityCsr<ValueType, IndexType>* a, \
const matrix::Dense<ValueType>* b, matrix::Dense<ValueType>* c)

#define GKO_DECLARE_SPARSITY_CSR_ADVANCED_SPMV_KERNEL(ValueType, IndexType) \
void advanced_spmv(std::shared_ptr<const DefaultExecutor> exec, \
const matrix::Dense<ValueType>* alpha, \
const matrix::SparsityCsr<ValueType, IndexType>* a, \
const matrix::Dense<ValueType>* b, \
const matrix::Dense<ValueType>* beta, \
matrix::Dense<ValueType>* c)
#define GKO_DECLARE_SPARSITY_CSR_SPMV_KERNEL(MatrixValueType, InputValueType, \
OutputValueType, IndexType) \
void spmv(std::shared_ptr<const DefaultExecutor> exec, \
const matrix::SparsityCsr<MatrixValueType, IndexType>* a, \
const matrix::Dense<InputValueType>* b, \
matrix::Dense<OutputValueType>* c)

#define GKO_DECLARE_SPARSITY_CSR_ADVANCED_SPMV_KERNEL( \
MatrixValueType, InputValueType, OutputValueType, IndexType) \
void advanced_spmv( \
std::shared_ptr<const DefaultExecutor> exec, \
const matrix::Dense<MatrixValueType>* alpha, \
const matrix::SparsityCsr<MatrixValueType, IndexType>* a, \
const matrix::Dense<InputValueType>* b, \
const matrix::Dense<OutputValueType>* beta, \
matrix::Dense<OutputValueType>* c)

#define GKO_DECLARE_SPARSITY_CSR_FILL_IN_DENSE_KERNEL(ValueType, IndexType) \
void fill_in_dense(std::shared_ptr<const DefaultExecutor> exec, \
@@ -98,10 +102,14 @@ namespace kernels {
bool* is_sorted)

#define GKO_DECLARE_ALL_AS_TEMPLATES \
template <typename ValueType, typename IndexType> \
GKO_DECLARE_SPARSITY_CSR_SPMV_KERNEL(ValueType, IndexType); \
template <typename ValueType, typename IndexType> \
GKO_DECLARE_SPARSITY_CSR_ADVANCED_SPMV_KERNEL(ValueType, IndexType); \
template <typename MatrixValueType, typename InputValueType, \
typename OutputValueType, typename IndexType> \
GKO_DECLARE_SPARSITY_CSR_SPMV_KERNEL(MatrixValueType, InputValueType, \
OutputValueType, IndexType); \
template <typename MatrixValueType, typename InputValueType, \
typename OutputValueType, typename IndexType> \
GKO_DECLARE_SPARSITY_CSR_ADVANCED_SPMV_KERNEL( \
MatrixValueType, InputValueType, OutputValueType, IndexType); \
template <typename ValueType, typename IndexType> \
GKO_DECLARE_SPARSITY_CSR_FILL_IN_DENSE_KERNEL(ValueType, IndexType); \
template <typename ValueType, typename IndexType> \
14 changes: 13 additions & 1 deletion core/multigrid/amgx_pgm.cpp
@@ -43,6 +43,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include <ginkgo/core/matrix/dense.hpp>
#include <ginkgo/core/matrix/identity.hpp>
#include <ginkgo/core/matrix/row_gatherer.hpp>
#include <ginkgo/core/matrix/sparsity_csr.hpp>


#include "core/base/utils.hpp"
@@ -161,6 +162,16 @@ void AmgxPgm<ValueType, IndexType>::generate()
one<ValueType>()));
// TODO: implement the restrict_op from aggregation.
auto restrict_op = gko::as<csr_type>(share(prolong_csr->transpose()));
auto restrict_sparsity =
share(matrix::SparsityCsr<ValueType, IndexType>::create(
exec, restrict_op->get_size(),
restrict_op->get_num_stored_elements()));
exec->copy_from(exec.get(), static_cast<size_type>(num_agg + 1),
restrict_op->get_const_row_ptrs(),
restrict_sparsity->get_row_ptrs());
exec->copy_from(exec.get(), restrict_op->get_num_stored_elements(),
restrict_op->get_const_col_idxs(),
restrict_sparsity->get_col_idxs());

// Construct the coarse matrix
// TODO: use less memory footprint to improve it
@@ -170,7 +181,8 @@ void AmgxPgm<ValueType, IndexType>::generate()
amgxpgm_op->apply(prolong_csr.get(), tmp.get());
restrict_op->apply(tmp.get(), coarse_matrix.get());

this->set_multigrid_level(prolong_row_gather, coarse_matrix, restrict_op);
this->set_multigrid_level(prolong_row_gather, coarse_matrix,
restrict_sparsity);
}


35 changes: 25 additions & 10 deletions core/test/matrix/fbcsr_sample.hpp
@@ -114,22 +114,25 @@ class FbcsrSample {
static_cast<gko::acc::size_type>(bs)},
v);

if (mtx->get_size()[0] % bs != 0)
if (mtx->get_size()[0] % bs != 0) {
throw gko::BadDimension(__FILE__, __LINE__, __func__, "test fbcsr",
mtx->get_size()[0], mtx->get_size()[1],
"block size does not divide the size!");
}

for (index_type ibrow = 0; ibrow < mtx->get_num_block_rows(); ibrow++) {
const index_type* const browptr = mtx->get_row_ptrs();
for (index_type inz = browptr[ibrow]; inz < browptr[ibrow + 1];
inz++) {
const index_type bcolind = mtx->get_col_idxs()[inz];
const value_type base = (ibrow + 1) * (bcolind + 1);
for (int ival = 0; ival < bs; ival++)
for (int jval = 0; jval < bs; jval++)
for (int ival = 0; ival < bs; ival++) {
for (int jval = 0; jval < bs; jval++) {
vals(inz, ival, jval) =
base + static_cast<gko::remove_complex<value_type>>(
ival * bs + jval);
}
}
}
}

@@ -175,10 +178,12 @@ class FbcsrSample {
gko::Array<IndexType> colids(exec, nbnz);
gko::Array<IndexType> rowptrs(exec, nbrows + 1);
const std::unique_ptr<const Fbcsr> fbmat = generate_fbcsr();
for (index_type i = 0; i < nbrows + 1; i++)
for (index_type i = 0; i < nbrows + 1; i++) {
rowptrs.get_data()[i] = fbmat->get_const_row_ptrs()[i];
for (index_type i = 0; i < nbnz; i++)
}
for (index_type i = 0; i < nbnz; i++) {
colids.get_data()[i] = fbmat->get_const_col_idxs()[i];
}
return SparCsr::create(exec, gko::dim<2>{nbrows, nbcols}, colids,
rowptrs);
}
@@ -302,7 +307,9 @@ class FbcsrSample2 {
gko::Array<index_type> c(exec, {0, 0, 3, 2});
gko::Array<value_type> vals(exec, nnz);
value_type* const v = vals.get_data();
for (IndexType i = 0; i < nnz; i++) v[i] = 0.15 + fbcsr_test_offset;
for (IndexType i = 0; i < nnz; i++) {
v[i] = 0.15 + fbcsr_test_offset;
}

v[0] = 1;
v[1] = 3;
@@ -328,7 +335,9 @@ class FbcsrSample2 {
exec, {0, 1, 0, 1, 0, 1, 6, 7, 0, 1, 6, 7, 4, 5, 4, 5});
gko::Array<value_type> vals(exec, nnz);
value_type* const v = vals.get_data();
for (IndexType i = 0; i < nnz; i++) v[i] = 0.15 + fbcsr_test_offset;
for (IndexType i = 0; i < nnz; i++) {
v[i] = 0.15 + fbcsr_test_offset;
}
v[0] = 1;
v[1] = 2;
v[2] = 3;
@@ -401,7 +410,9 @@ class FbcsrSampleSquare {
gko::Array<index_type> r(exec, {0, 1, 2});
gko::Array<value_type> vals(exec, nnz);
value_type* const v = vals.get_data();
for (IndexType i = 0; i < nnz; i++) v[i] = i;
for (IndexType i = 0; i < nnz; i++) {
v[i] = i;
}

return Fbcsr::create(exec,
gko::dim<2>{static_cast<size_type>(nrows),
@@ -447,7 +458,9 @@ class FbcsrSampleComplex {
gko::Array<index_type> c(exec, {0, 0, 3, 2});
gko::Array<value_type> vals(exec, nnz);
value_type* const v = vals.get_data();
for (IndexType i = 0; i < nnz; i++) v[i] = 0.15 + fbcsr_test_offset;
for (IndexType i = 0; i < nnz; i++) {
v[i] = 0.15 + fbcsr_test_offset;
}

using namespace std::complex_literals;
v[0] = 1.0 + 1.15i;
@@ -474,7 +487,9 @@ class FbcsrSampleComplex {
exec, {0, 1, 0, 1, 0, 1, 6, 7, 0, 1, 6, 7, 4, 5, 4, 5});
gko::Array<value_type> vals(exec, nnz);
value_type* const v = vals.get_data();
for (IndexType i = 0; i < nnz; i++) v[i] = 0.15 + fbcsr_test_offset;
for (IndexType i = 0; i < nnz; i++) {
v[i] = 0.15 + fbcsr_test_offset;
}

using namespace std::complex_literals;
v[0] = 1.0 + 1.15i;