Add compact factor storage support to triangular solvers #1072

Merged · 4 commits · Jul 19, 2022
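
The changes below thread a new `unit_diagonal` solver parameter from `LowerTrs`/`UpperTrs` through the generate and solve kernels, so triangular factors stored without an explicit diagonal (compact factor storage) are solved as if the diagonal were 1. A minimal usage sketch follows; the `with_unit_diagonal` setter name is assumed from Ginkgo's factory-parameter convention for the `unit_diagonal` parameter referenced in this diff, and the matrix setup is purely illustrative.

#include <ginkgo/ginkgo.hpp>

int main()
{
    auto exec = gko::CudaExecutor::create(0, gko::OmpExecutor::create());
    // L holds only the strictly lower triangle of the factor; the unit
    // diagonal is implicit (compact factor storage).
    auto L = gko::share(gko::matrix::Csr<double, gko::int32>::create(exec));
    // Assumed factory setter generated from the unit_diagonal parameter.
    auto solver = gko::solver::LowerTrs<double, gko::int32>::build()
                      .with_unit_diagonal(true)
                      .on(exec)
                      ->generate(L);
    // solver->apply(b, x) now divides by an implicit 1 instead of a
    // stored diagonal entry.
}
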
13 changes: 7 additions & 6 deletions core/solver/lower_trs.cpp
@@ -134,9 +134,9 @@ template <typename ValueType, typename IndexType>
void LowerTrs<ValueType, IndexType>::generate()
{
if (this->get_system_matrix()) {
this->get_executor()->run(
lower_trs::make_generate(this->get_system_matrix().get(),
this->solve_struct_, parameters_.num_rhs));
this->get_executor()->run(lower_trs::make_generate(
this->get_system_matrix().get(), this->solve_struct_,
this->get_parameters().unit_diagonal, parameters_.num_rhs));
}
}

@@ -176,9 +176,10 @@ void LowerTrs<ValueType, IndexType>::apply_impl(const LinOp* b, LinOp* x) const
trans_x = this->template create_workspace_op<Vector>(
ws::transposed_x, gko::transpose(dense_x->get_size()));
}
exec->run(lower_trs::make_solve(lend(this->get_system_matrix()),
lend(this->solve_struct_), trans_b,
trans_x, dense_b, dense_x));
exec->run(lower_trs::make_solve(
lend(this->get_system_matrix()), lend(this->solve_struct_),
this->get_parameters().unit_diagonal, trans_b, trans_x, dense_b,
dense_x));
},
b, x);
}
4 changes: 2 additions & 2 deletions core/solver/lower_trs_kernels.hpp
@@ -62,13 +62,13 @@ namespace lower_trs {
void generate(std::shared_ptr<const DefaultExecutor> exec, \
const matrix::Csr<_vtype, _itype>* matrix, \
std::shared_ptr<solver::SolveStruct>& solve_struct, \
const gko::size_type num_rhs)
bool unit_diag, const gko::size_type num_rhs)


#define GKO_DECLARE_LOWER_TRS_SOLVE_KERNEL(_vtype, _itype) \
void solve(std::shared_ptr<const DefaultExecutor> exec, \
const matrix::Csr<_vtype, _itype>* matrix, \
const solver::SolveStruct* solve_struct, \
const solver::SolveStruct* solve_struct, bool unit_diag, \
matrix::Dense<_vtype>* trans_b, matrix::Dense<_vtype>* trans_x, \
const matrix::Dense<_vtype>* b, matrix::Dense<_vtype>* x)

13 changes: 7 additions & 6 deletions core/solver/upper_trs.cpp
@@ -134,9 +134,9 @@ template <typename ValueType, typename IndexType>
void UpperTrs<ValueType, IndexType>::generate()
{
if (this->get_system_matrix()) {
this->get_executor()->run(
upper_trs::make_generate(this->get_system_matrix().get(),
this->solve_struct_, parameters_.num_rhs));
this->get_executor()->run(upper_trs::make_generate(
this->get_system_matrix().get(), this->solve_struct_,
this->get_parameters().unit_diagonal, parameters_.num_rhs));
}
}

@@ -176,9 +176,10 @@ void UpperTrs<ValueType, IndexType>::apply_impl(const LinOp* b, LinOp* x) const
trans_x = this->template create_workspace_op<Vector>(
ws::transposed_x, gko::transpose(dense_x->get_size()));
}
exec->run(upper_trs::make_solve(lend(this->get_system_matrix()),
lend(this->solve_struct_), trans_b,
trans_x, dense_b, dense_x));
exec->run(upper_trs::make_solve(
lend(this->get_system_matrix()), lend(this->solve_struct_),
this->get_parameters().unit_diagonal, trans_b, trans_x, dense_b,
dense_x));
},
b, x);
}
4 changes: 2 additions & 2 deletions core/solver/upper_trs_kernels.hpp
@@ -62,13 +62,13 @@ namespace upper_trs {
void generate(std::shared_ptr<const DefaultExecutor> exec, \
const matrix::Csr<_vtype, _itype>* matrix, \
std::shared_ptr<gko::solver::SolveStruct>& solve_struct, \
const gko::size_type num_rhs)
bool unit_diag, const gko::size_type num_rhs)


#define GKO_DECLARE_UPPER_TRS_SOLVE_KERNEL(_vtype, _itype) \
void solve(std::shared_ptr<const DefaultExecutor> exec, \
const matrix::Csr<_vtype, _itype>* matrix, \
const solver::SolveStruct* solve_struct, \
const solver::SolveStruct* solve_struct, bool unit_diag, \
matrix::Dense<_vtype>* trans_b, matrix::Dense<_vtype>* trans_x, \
const matrix::Dense<_vtype>* b, matrix::Dense<_vtype>* x)

7 changes: 7 additions & 0 deletions cuda/base/cusparse_bindings.hpp
@@ -790,6 +790,13 @@ inline void set_mat_fill_mode(cusparseMatDescr_t descr,
}


inline void set_mat_diag_type(cusparseMatDescr_t descr,
cusparseDiagType_t diag_type)
{
GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseSetMatDiagType(descr, diag_type));
}


inline void destroy(cusparseMatDescr_t descr)
{
GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseDestroyMatDescr(descr));
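
The new set_mat_diag_type binding complements the existing set_mat_fill_mode wrapper when describing a triangular factor to cuSPARSE's legacy csrsv2 interface. Below is a rough sketch of how such a descriptor is typically configured with raw cuSPARSE calls (error checking omitted; this is illustrative, not the exact code from this PR).

#include <cusparse.h>

void configure_lower_unit_factor_descr(cusparseMatDescr_t& descr)
{
    cusparseCreateMatDescr(&descr);
    cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO);
    cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL);
    // Solve with the lower triangle of the stored matrix.
    cusparseSetMatFillMode(descr, CUSPARSE_FILL_MODE_LOWER);
    // With compact factor storage the diagonal is not stored, so cuSPARSE
    // must assume a unit diagonal rather than look for stored entries.
    cusparseSetMatDiagType(descr, CUSPARSE_DIAG_TYPE_UNIT);
}
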
65 changes: 42 additions & 23 deletions cuda/solver/common_trs_kernels.cuh
@@ -89,7 +89,7 @@ struct CudaSolveStruct : gko::solver::SolveStruct {

CudaSolveStruct(std::shared_ptr<const gko::CudaExecutor> exec,
const matrix::Csr<ValueType, IndexType>* matrix,
size_type num_rhs, bool is_upper)
size_type num_rhs, bool is_upper, bool unit_diag)
: handle{exec->get_cusparse_handle()},
spsm_descr{},
descr_a{},
@@ -107,7 +107,8 @@ struct CudaSolveStruct : gko::solver::SolveStruct {
descr_a, CUSPARSE_SPMAT_FILL_MODE,
is_upper ? CUSPARSE_FILL_MODE_UPPER : CUSPARSE_FILL_MODE_LOWER);
cusparse::set_attribute<cusparseDiagType_t>(
descr_a, CUSPARSE_SPMAT_DIAG_TYPE, CUSPARSE_DIAG_TYPE_NON_UNIT);
descr_a, CUSPARSE_SPMAT_DIAG_TYPE,
unit_diag ? CUSPARSE_DIAG_TYPE_UNIT : CUSPARSE_DIAG_TYPE_NON_UNIT);

const auto rows = matrix->get_size()[0];
// workaround suggested by NVIDIA engineers: for some reason
@@ -193,7 +194,7 @@

CudaSolveStruct(std::shared_ptr<const gko::CudaExecutor> exec,
const matrix::Csr<ValueType, IndexType>* matrix,
size_type num_rhs, bool is_upper)
size_type num_rhs, bool is_upper, bool unit_diag)
: exec{exec},
handle{exec->get_cusparse_handle()},
algorithm{},
@@ -208,6 +209,9 @@ struct CudaSolveStruct : gko::solver::SolveStruct {
cusparse::set_mat_fill_mode(
factor_descr,
is_upper ? CUSPARSE_FILL_MODE_UPPER : CUSPARSE_FILL_MODE_LOWER);
cusparse::set_mat_diag_type(
factor_descr,
unit_diag ? CUSPARSE_DIAG_TYPE_UNIT : CUSPARSE_DIAG_TYPE_NON_UNIT);
algorithm = 0;
policy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

@@ -286,14 +290,15 @@ template <typename ValueType, typename IndexType>
void generate_kernel(std::shared_ptr<const CudaExecutor> exec,
const matrix::Csr<ValueType, IndexType>* matrix,
std::shared_ptr<solver::SolveStruct>& solve_struct,
const gko::size_type num_rhs, bool is_upper)
const gko::size_type num_rhs, bool is_upper,
bool unit_diag)
{
if (matrix->get_size()[0] == 0) {
return;
}
if (cusparse::is_supported<ValueType, IndexType>::value) {
solve_struct = std::make_shared<CudaSolveStruct<ValueType, IndexType>>(
exec, matrix, num_rhs, is_upper);
exec, matrix, num_rhs, is_upper, unit_diag);
} else {
GKO_NOT_IMPLEMENTED;
}
@@ -377,7 +382,8 @@ __global__ void sptrsv_naive_caching_kernel(
const IndexType* const rowptrs, const IndexType* const colidxs,
const ValueType* const vals, const ValueType* const b, size_type b_stride,
ValueType* const x, size_type x_stride, const size_type n,
const size_type nrhs, bool* nan_produced, IndexType* atomic_counter)
const size_type nrhs, bool unit_diag, bool* nan_produced,
IndexType* atomic_counter)
{
__shared__ uninitialized_array<ValueType, default_block_size> x_s_array;
__shared__ IndexType block_base_idx;
@@ -409,12 +415,16 @@
// upper tri matrix: start at last entry (row_end - 1), run backward
// until first entry, which is the diagonal entry
const auto row_begin = is_upper ? rowptrs[row + 1] - 1 : rowptrs[row];
const auto row_diag = is_upper ? rowptrs[row] : rowptrs[row + 1] - 1;
const auto row_end = is_upper ? rowptrs[row] - 1 : rowptrs[row + 1];
const int row_step = is_upper ? -1 : 1;

ValueType sum = 0.0;
for (auto i = row_begin; i != row_diag; i += row_step) {
auto sum = zero<ValueType>();
auto i = row_begin;
for (; i != row_end; i += row_step) {
const auto dependency = colidxs[i];
if (is_upper ? dependency <= row : dependency >= row) {
break;
}
auto x_p = &x[dependency * x_stride + rhs];

const auto dependency_gid = is_upper ? (n - 1 - dependency) * nrhs + rhs
@@ -434,7 +444,9 @@ __global__ void sptrsv_naive_caching_kernel(
sum += x * vals[i];
}

const auto r = (b[row * b_stride + rhs] - sum) / vals[row_diag];
// The first entry past the triangular part will be the diagonal
const auto diag = unit_diag ? one<ValueType>() : vals[i];
const auto r = (b[row * b_stride + rhs] - sum) / diag;

store(x_s, self_shid, r);
x[row * x_stride + rhs] = r;
@@ -453,7 +465,8 @@ __global__ void sptrsv_naive_legacy_kernel(
const IndexType* const rowptrs, const IndexType* const colidxs,
const ValueType* const vals, const ValueType* const b, size_type b_stride,
ValueType* const x, size_type x_stride, const size_type n,
const size_type nrhs, bool* nan_produced, IndexType* atomic_counter)
const size_type nrhs, bool unit_diag, bool* nan_produced,
IndexType* atomic_counter)
{
__shared__ IndexType block_base_idx;
if (threadIdx.x == 0) {
@@ -470,29 +483,35 @@ __global__ void sptrsv_naive_legacy_kernel(
return;
}

// lower tri matrix: start at beginning, run forward until last entry,
// (row_end - 1) which is the diagonal entry
// lower tri matrix: start at beginning, run forward
// upper tri matrix: start at last entry (row_end - 1), run backward
// until first entry, which is the diagonal entry
const auto row_begin = is_upper ? rowptrs[row + 1] - 1 : rowptrs[row];
const auto row_diag = is_upper ? rowptrs[row] : rowptrs[row + 1] - 1;
const auto row_end = is_upper ? rowptrs[row] - 1 : rowptrs[row + 1];
const int row_step = is_upper ? -1 : 1;

ValueType sum = 0.0;
auto j = row_begin;
while (j != row_diag + row_step) {
auto col = colidxs[j];
auto col = colidxs[j];
while (j != row_end) {
auto x_val = load(x, col * x_stride + rhs);
while (!is_nan(x_val)) {
sum += vals[j] * x_val;
j += row_step;
col = colidxs[j];
x_val = load(x, col * x_stride + rhs);
}
if (row == col) {
const auto r = (b[row * b_stride + rhs] - sum) / vals[row_diag];
// to avoid the kernel hanging on matrices without diagonal,
// we bail out if we are past the triangle, even if it's not
// the diagonal entry. This may lead to incorrect results,
// but prevents an infinite loop.
if (is_upper ? row >= col : row <= col) {
// assert(row == col);
auto diag = unit_diag ? one<ValueType>() : vals[j];
const auto r = (b[row * b_stride + rhs] - sum) / diag;
store(x, row * x_stride + rhs, r);
j += row_step;
// after we encountered the diagonal, we are done
// this also skips entries outside the triangle
j = row_end;
if (is_nan(r)) {
store(x, row * x_stride + rhs, zero<ValueType>());
*nan_produced = true;
@@ -514,7 +533,7 @@ __global__ void sptrsv_init_kernel(bool* const nan_produced,
template <bool is_upper, typename ValueType, typename IndexType>
void sptrsv_naive_caching(std::shared_ptr<const CudaExecutor> exec,
const matrix::Csr<ValueType, IndexType>* matrix,
const matrix::Dense<ValueType>* b,
bool unit_diag, const matrix::Dense<ValueType>* b,
matrix::Dense<ValueType>* x)
{
// Pre-Volta GPUs may deadlock due to missing independent thread scheduling.
@@ -540,14 +559,14 @@ void sptrsv_naive_caching(std::shared_ptr<const CudaExecutor> exec,
matrix->get_const_row_ptrs(), matrix->get_const_col_idxs(),
as_cuda_type(matrix->get_const_values()),
as_cuda_type(b->get_const_values()), b->get_stride(),
as_cuda_type(x->get_values()), x->get_stride(), n, nrhs,
as_cuda_type(x->get_values()), x->get_stride(), n, nrhs, unit_diag,
nan_produced.get_data(), atomic_counter.get_data());
} else {
sptrsv_naive_caching_kernel<is_upper><<<grid_size, block_size>>>(
matrix->get_const_row_ptrs(), matrix->get_const_col_idxs(),
as_cuda_type(matrix->get_const_values()),
as_cuda_type(b->get_const_values()), b->get_stride(),
as_cuda_type(x->get_values()), x->get_stride(), n, nrhs,
as_cuda_type(x->get_values()), x->get_stride(), n, nrhs, unit_diag,
nan_produced.get_data(), atomic_counter.get_data());
}

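The kernel changes above drop the old assumption that the diagonal is the last stored entry of each row: the loops now stop at the first entry on or past the diagonal, and when unit_diag is set the divisor becomes 1 instead of a stored value. A sequential CPU sketch of the same idea for a lower-triangular CSR solve (illustrative only, not the CUDA kernel itself):

#include <vector>

// Solves L * x = b for a lower-triangular CSR matrix. With unit_diag set,
// the diagonal is assumed to be 1 and may be absent from the stored
// pattern (compact factor storage); entries past the diagonal are ignored.
template <typename ValueType, typename IndexType>
void lower_trs_reference(const std::vector<IndexType>& row_ptrs,
                         const std::vector<IndexType>& col_idxs,
                         const std::vector<ValueType>& vals,
                         const std::vector<ValueType>& b,
                         std::vector<ValueType>& x, bool unit_diag)
{
    const auto n = static_cast<IndexType>(b.size());
    for (IndexType row = 0; row < n; ++row) {
        ValueType sum = 0.0;
        ValueType diag = 1.0;
        for (auto i = row_ptrs[row]; i < row_ptrs[row + 1]; ++i) {
            const auto col = col_idxs[i];
            if (col < row) {
                // strictly lower part: accumulate already-computed entries
                sum += vals[i] * x[col];
            } else if (col == row && !unit_diag) {
                diag = vals[i];  // explicitly stored diagonal entry
            }
            // col > row: outside the lower triangle, skipped
        }
        x[row] = (b[row] - sum) / diag;
    }
}
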
8 changes: 4 additions & 4 deletions cuda/solver/lower_trs_kernels.cu
@@ -73,11 +73,11 @@ template <typename ValueType, typename IndexType>
void generate(std::shared_ptr<const CudaExecutor> exec,
const matrix::Csr<ValueType, IndexType>* matrix,
std::shared_ptr<solver::SolveStruct>& solve_struct,
const gko::size_type num_rhs)
bool unit_diag, const gko::size_type num_rhs)
{
if (matrix->get_strategy()->get_name() == "sparselib") {
generate_kernel<ValueType, IndexType>(exec, matrix, solve_struct,
num_rhs, false);
num_rhs, false, unit_diag);
}
}

@@ -88,15 +88,15 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
template <typename ValueType, typename IndexType>
void solve(std::shared_ptr<const CudaExecutor> exec,
const matrix::Csr<ValueType, IndexType>* matrix,
const solver::SolveStruct* solve_struct,
const solver::SolveStruct* solve_struct, bool unit_diag,
matrix::Dense<ValueType>* trans_b, matrix::Dense<ValueType>* trans_x,
const matrix::Dense<ValueType>* b, matrix::Dense<ValueType>* x)
{
if (matrix->get_strategy()->get_name() == "sparselib") {
solve_kernel<ValueType, IndexType>(exec, matrix, solve_struct, trans_b,
trans_x, b, x);
} else {
sptrsv_naive_caching<false>(exec, matrix, b, x);
sptrsv_naive_caching<false>(exec, matrix, unit_diag, b, x);
}
}

8 changes: 4 additions & 4 deletions cuda/solver/upper_trs_kernels.cu
@@ -73,11 +73,11 @@ template <typename ValueType, typename IndexType>
void generate(std::shared_ptr<const CudaExecutor> exec,
const matrix::Csr<ValueType, IndexType>* matrix,
std::shared_ptr<solver::SolveStruct>& solve_struct,
const gko::size_type num_rhs)
bool unit_diag, const gko::size_type num_rhs)
{
if (matrix->get_strategy()->get_name() == "sparselib") {
generate_kernel<ValueType, IndexType>(exec, matrix, solve_struct,
num_rhs, true);
num_rhs, true, unit_diag);
}
}

@@ -88,15 +88,15 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
template <typename ValueType, typename IndexType>
void solve(std::shared_ptr<const CudaExecutor> exec,
const matrix::Csr<ValueType, IndexType>* matrix,
const solver::SolveStruct* solve_struct,
const solver::SolveStruct* solve_struct, bool unit_diag,
matrix::Dense<ValueType>* trans_b, matrix::Dense<ValueType>* trans_x,
const matrix::Dense<ValueType>* b, matrix::Dense<ValueType>* x)
{
if (matrix->get_strategy()->get_name() == "sparselib") {
solve_kernel<ValueType, IndexType>(exec, matrix, solve_struct, trans_b,
trans_x, b, x);
} else {
sptrsv_naive_caching<true>(exec, matrix, b, x);
sptrsv_naive_caching<true>(exec, matrix, unit_diag, b, x);
}
}

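In both of the files above, generate() and solve() route through cuSPARSE only when the CSR matrix uses the strategy named "sparselib"; any other strategy falls back to the naive sptrsv kernels. A hedged sketch of opting into the cuSPARSE path from user code (the set_strategy call and sparselib strategy class are taken from the current Csr interface; matrix contents are omitted):

using Csr = gko::matrix::Csr<double, gko::int32>;
auto exec = gko::CudaExecutor::create(0, gko::OmpExecutor::create());
auto factor = gko::share(Csr::create(exec));
// The "sparselib" strategy makes LowerTrs/UpperTrs dispatch to cuSPARSE;
// other strategies use the sptrsv_naive_caching kernels instead.
factor->set_strategy(std::make_shared<Csr::sparselib>());
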
5 changes: 5 additions & 0 deletions dev_tools/scripts/config
@@ -34,6 +34,11 @@
- FixInclude: "common/unified/base/kernel_launch_solver.hpp"
- "test/base/kernel_launch_generic.cpp"
- FixInclude: "common/unified/base/kernel_launch.hpp"
- "^test/solver/(lower|upper)_trs_kernels.cpp"
- CoreSuffix: "_kernels"
- PathPrefix: "ginkgo/core"
- PathIgnore: "0"
- RemoveTest: "true"
- "elimination_forest\.cpp"
- FixInclude: "core/factorization/elimination_forest.hpp"
- "common/unified/.*.cpp"
4 changes: 2 additions & 2 deletions dpcpp/solver/lower_trs_kernels.dp.cpp
@@ -70,7 +70,7 @@ template <typename ValueType, typename IndexType>
void generate(std::shared_ptr<const DpcppExecutor> exec,
const matrix::Csr<ValueType, IndexType>* matrix,
std::shared_ptr<solver::SolveStruct>& solve_struct,
const gko::size_type num_rhs) GKO_NOT_IMPLEMENTED;
bool unit_diag, const gko::size_type num_rhs) GKO_NOT_IMPLEMENTED;

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
GKO_DECLARE_LOWER_TRS_GENERATE_KERNEL);
@@ -83,7 +83,7 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
template <typename ValueType, typename IndexType>
void solve(std::shared_ptr<const DpcppExecutor> exec,
const matrix::Csr<ValueType, IndexType>* matrix,
const solver::SolveStruct* solve_struct,
const solver::SolveStruct* solve_struct, bool unit_diag,
matrix::Dense<ValueType>* trans_b, matrix::Dense<ValueType>* trans_x,
const matrix::Dense<ValueType>* b,
matrix::Dense<ValueType>* x) GKO_NOT_IMPLEMENTED;
4 changes: 2 additions & 2 deletions dpcpp/solver/upper_trs_kernels.dp.cpp
@@ -70,7 +70,7 @@ template <typename ValueType, typename IndexType>
void generate(std::shared_ptr<const DpcppExecutor> exec,
const matrix::Csr<ValueType, IndexType>* matrix,
std::shared_ptr<solver::SolveStruct>& solve_struct,
const gko::size_type num_rhs) GKO_NOT_IMPLEMENTED;
bool unit_diag, const gko::size_type num_rhs) GKO_NOT_IMPLEMENTED;

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
GKO_DECLARE_UPPER_TRS_GENERATE_KERNEL);
@@ -83,7 +83,7 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
template <typename ValueType, typename IndexType>
void solve(std::shared_ptr<const DpcppExecutor> exec,
const matrix::Csr<ValueType, IndexType>* matrix,
const solver::SolveStruct* solve_struct,
const solver::SolveStruct* solve_struct, bool unit_diag,
matrix::Dense<ValueType>* trans_b, matrix::Dense<ValueType>* trans_x,
const matrix::Dense<ValueType>* b,
matrix::Dense<ValueType>* x) GKO_NOT_IMPLEMENTED;