Skip to content

Commit

Permalink
Add CUDA implementation of the DILU
Browse files Browse the repository at this point in the history
preconditioner. Uses graph coloring to exploit
parallelism in upper and triangular solves when
computing a diagonal approximate inverse of a
sparse matrix. Supports blocksizes up to 3.
  • Loading branch information
Tobias Meyer Andersen committed Jan 25, 2024
1 parent 8837528 commit 4b0dd54
Show file tree
Hide file tree
Showing 8 changed files with 869 additions and 124 deletions.
2 changes: 2 additions & 0 deletions CMakeLists_files.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,7 @@ if(CUDA_FOUND)
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/cuistl/CuVector.cpp)
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/cuistl/detail/vector_operations.cu)
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/cuistl/CuSparseMatrix.cpp)
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/cuistl/CuDILU.cpp)
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/cuistl/CuJac.cpp)
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/cuistl/CuSeqILU0.cpp)
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/cuistl/set_device.cpp)
Expand All @@ -172,6 +173,7 @@ if(CUDA_FOUND)
list (APPEND PUBLIC_HEADER_FILES opm/simulators/linalg/cuistl/detail/cuda_check_last_error.hpp)
list (APPEND PUBLIC_HEADER_FILES opm/simulators/linalg/cuistl/detail/CuBlasHandle.hpp)
list (APPEND PUBLIC_HEADER_FILES opm/simulators/linalg/cuistl/detail/CuSparseHandle.hpp)
list (APPEND PUBLIC_HEADER_FILES opm/simulators/linalg/cuistl/CuDILU.hpp)
list (APPEND PUBLIC_HEADER_FILES opm/simulators/linalg/cuistl/CuJac.hpp)
list (APPEND PUBLIC_HEADER_FILES opm/simulators/linalg/cuistl/CuVector.hpp)
list (APPEND PUBLIC_HEADER_FILES opm/simulators/linalg/cuistl/CuSparseMatrix.hpp)
Expand Down
30 changes: 30 additions & 0 deletions opm/simulators/linalg/PreconditionerFactory_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@
#include <opm/simulators/linalg/cuistl/PreconditionerAdapter.hpp>
#include <opm/simulators/linalg/cuistl/PreconditionerConvertFieldTypeAdapter.hpp>
#include <opm/simulators/linalg/cuistl/CuBlockPreconditioner.hpp>
#include <opm/simulators/linalg/cuistl/CuDILU.hpp>
#include <opm/simulators/linalg/cuistl/CuJac.hpp>

#endif
Expand Down Expand Up @@ -270,6 +271,16 @@ struct StandardPreconditioners
auto wrapped = std::make_shared<Opm::cuistl::CuBlockPreconditioner<V, V, Comm>>(adapted, comm);
return wrapped;
});

F::addCreator("CUDILU", [](const O& op, [[maybe_unused]] const P& prm, const std::function<V()>&, std::size_t, const C& comm) {
using field_type = typename V::field_type;
using CuDILU = typename Opm::cuistl::CuDILU<M, Opm::cuistl::CuVector<field_type>, Opm::cuistl::CuVector<field_type>>;
auto cuDILU = std::make_shared<CuDILU>(op.getmat());

auto adapted = std::make_shared<Opm::cuistl::PreconditionerAdapter<V, V, CuDILU>>(cuDILU);
auto wrapped = std::make_shared<Opm::cuistl::CuBlockPreconditioner<V, V, Comm>>(adapted, comm);
return wrapped;
});
#endif
}

Expand Down Expand Up @@ -490,6 +501,25 @@ struct StandardPreconditioners<Operator,Dune::Amg::SequentialInformation>
using CUJac = typename Opm::cuistl::CuJac<M, Opm::cuistl::CuVector<field_type>, Opm::cuistl::CuVector<field_type>>;
return std::make_shared<Opm::cuistl::PreconditionerAdapter<V, V, CUJac>>(std::make_shared<CUJac>(op.getmat(), w));
});

F::addCreator("CUDILU", [](const O& op, [[maybe_unused]] const P& prm, const std::function<V()>&, std::size_t) {
using field_type = typename V::field_type;
using CUDILU = typename Opm::cuistl::CuDILU<M, Opm::cuistl::CuVector<field_type>, Opm::cuistl::CuVector<field_type>>;
return std::make_shared<Opm::cuistl::PreconditionerAdapter<V, V, CUDILU>>(std::make_shared<CUDILU>(op.getmat()));
});

F::addCreator("CUDILUFloat", [](const O& op, [[maybe_unused]] const P& prm, const std::function<V()>&, std::size_t) {
using block_type = typename V::block_type;
using VTo = Dune::BlockVector<Dune::FieldVector<float, block_type::dimension>>;
using matrix_type_to = typename Dune::BCRSMatrix<Dune::FieldMatrix<float, block_type::dimension, block_type::dimension>>;
using CuDILU = typename Opm::cuistl::CuDILU<matrix_type_to, Opm::cuistl::CuVector<float>, Opm::cuistl::CuVector<float>>;
using Adapter = typename Opm::cuistl::PreconditionerAdapter<VTo, VTo, CuDILU>;
using Converter = typename Opm::cuistl::PreconditionerConvertFieldTypeAdapter<Adapter, M, V, V>;
auto converted = std::make_shared<Converter>(op.getmat());
auto adapted = std::make_shared<Adapter>(std::make_shared<CuDILU>(converted->getConvertedMatrix()));
converted->setUnderlyingPreconditioner(adapted);
return converted;
});
#endif
}
};
Expand Down
235 changes: 235 additions & 0 deletions opm/simulators/linalg/cuistl/CuDILU.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,235 @@
/*
Copyright 2022-2023 SINTEF AS
This file is part of the Open Porous Media project (OPM).
OPM is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
OPM is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/
#include <cuda.h>
#include <cuda_runtime.h>
#include <dune/common/fmatrix.hh>
#include <dune/istl/bcrsmatrix.hh>
#include <fmt/core.h>
#include <opm/common/ErrorMacros.hpp>
#include <opm/simulators/linalg/cuistl/CuDILU.hpp>
#include <opm/simulators/linalg/cuistl/CuSparseMatrix.hpp>
#include <opm/simulators/linalg/cuistl/CuVector.hpp>
#include <opm/simulators/linalg/cuistl/detail/cusparse_matrix_operations.hpp>
#include <opm/simulators/linalg/cuistl/detail/safe_conversion.hpp>
#include <opm/simulators/linalg/matrixblock.hh>
#include <vector>

namespace
{
std::vector<int>
createReorderedToNatural(Opm::SparseTable<size_t> levelSets)
{
auto res = std::vector<int>(Opm::cuistl::detail::to_size_t(levelSets.dataSize()));
int globCnt = 0;
for (auto row : levelSets) {
for (auto col : row) {
OPM_ERROR_IF(Opm::cuistl::detail::to_size_t(globCnt) >= res.size(),
fmt::format("Internal error. globCnt = {}, res.size() = {}", globCnt, res.size()));
res[globCnt++] = static_cast<int>(col);
}
}
return res;
}

std::vector<int>
createNaturalToReordered(Opm::SparseTable<size_t> levelSets)
{
auto res = std::vector<int>(Opm::cuistl::detail::to_size_t(levelSets.dataSize()));
int globCnt = 0;
for (auto row : levelSets) {
for (auto col : row) {
OPM_ERROR_IF(Opm::cuistl::detail::to_size_t(globCnt) >= res.size(),
fmt::format("Internal error. globCnt = {}, res.size() = {}", globCnt, res.size()));
res[col] = globCnt++;
}
}
return res;
}

// TODO: When this function is called we already have the natural ordered matrix on the GPU
// TODO: could it be possible to create the reordered one in a kernel to speed up the constructor?
template <class M, class field_type>
Opm::cuistl::CuSparseMatrix<field_type>
createReorderedMatrix(const M& naturalMatrix, std::vector<int> reorderedToNatural)
{
M reorderedMatrix(naturalMatrix.N(), naturalMatrix.N(), naturalMatrix.nonzeroes(), M::row_wise);
for (auto dstRowIt = reorderedMatrix.createbegin(); dstRowIt != reorderedMatrix.createend(); ++dstRowIt) {
auto srcRow = naturalMatrix.begin() + reorderedToNatural[dstRowIt.index()];
// For elements in A
for (auto elem = srcRow->begin(); elem != srcRow->end(); elem++) {
dstRowIt.insert(elem.index());
}
}

// TODO: There is probably a faster way to copy by copying whole rows at a time
for (auto dstRowIt = reorderedMatrix.begin(); dstRowIt != reorderedMatrix.end(); ++dstRowIt) {
auto srcRow = naturalMatrix.begin() + reorderedToNatural[dstRowIt.index()];
for (auto elem = srcRow->begin(); elem != srcRow->end(); elem++) {
reorderedMatrix[dstRowIt.index()][elem.index()] = *elem;
}
}

return Opm::cuistl::CuSparseMatrix<field_type>::fromMatrix(reorderedMatrix, true);
}

} // NAMESPACE

namespace Opm::cuistl
{

template <class M, class X, class Y, int l>
CuDILU<M, X, Y, l>::CuDILU(const M& A)
: m_cpuMatrix(A)
, m_levelSets(Opm::getMatrixRowColoring(m_cpuMatrix, Opm::ColoringType::LOWER))
, m_reorderedToNatural(createReorderedToNatural(m_levelSets))
, m_naturalToReordered(createNaturalToReordered(m_levelSets))
, m_gpuMatrix(CuSparseMatrix<field_type>::fromMatrix(m_cpuMatrix, true))
, m_gpuMatrixReordered(createReorderedMatrix<M, field_type>(m_cpuMatrix, m_reorderedToNatural))
, m_gpuNaturalToReorder(m_naturalToReordered)
, m_gpuReorderToNatural(m_reorderedToNatural)
, m_gpuDInv(m_gpuMatrix.N() * m_gpuMatrix.blockSize() * m_gpuMatrix.blockSize())

{
// TODO: Should in some way verify that this matrix is symmetric, only do it debug mode?
// Some sanity check
OPM_ERROR_IF(A.N() != m_gpuMatrix.N(),
fmt::format("CuSparse matrix not same size as DUNE matrix. {} vs {}.", m_gpuMatrix.N(), A.N()));
OPM_ERROR_IF(A[0][0].N() != m_gpuMatrix.blockSize(),
fmt::format("CuSparse matrix not same blocksize as DUNE matrix. {} vs {}.",
m_gpuMatrix.blockSize(),
A[0][0].N()));
OPM_ERROR_IF(A.N() * A[0][0].N() != m_gpuMatrix.dim(),
fmt::format("CuSparse matrix not same dimension as DUNE matrix. {} vs {}.",
m_gpuMatrix.dim(),
A.N() * A[0][0].N()));
OPM_ERROR_IF(A.nonzeroes() != m_gpuMatrix.nonzeroes(),
fmt::format("CuSparse matrix not same number of non zeroes as DUNE matrix. {} vs {}. ",
m_gpuMatrix.nonzeroes(),
A.nonzeroes()));
update();
}

template <class M, class X, class Y, int l>
void
CuDILU<M, X, Y, l>::pre([[maybe_unused]] X& x, [[maybe_unused]] Y& b)
{
}

template <class M, class X, class Y, int l>
void
CuDILU<M, X, Y, l>::apply(X& v, const Y& d)
{
OPM_TIMEBLOCK(prec_apply);
int levelStartIdx = 0;
for (int level = 0; level < m_levelSets.size(); ++level) {
const int numOfRowsInLevel = m_levelSets[level].size();
detail::computeLowerSolveLevelSet<field_type, blocksize_>(m_gpuMatrixReordered.getNonZeroValues().data(),
m_gpuMatrixReordered.getRowIndices().data(),
m_gpuMatrixReordered.getColumnIndices().data(),
m_gpuReorderToNatural.data(),
levelStartIdx,
numOfRowsInLevel,
m_gpuDInv.data(),
d.data(),
v.data());
levelStartIdx += numOfRowsInLevel;
}

levelStartIdx = m_cpuMatrix.N();
// upper triangular solve: (D + U_A) v = Dy
for (int level = m_levelSets.size() - 1; level >= 0; --level) {
const int numOfRowsInLevel = m_levelSets[level].size();
levelStartIdx -= numOfRowsInLevel;
detail::computeUpperSolveLevelSet<field_type, blocksize_>(m_gpuMatrixReordered.getNonZeroValues().data(),
m_gpuMatrixReordered.getRowIndices().data(),
m_gpuMatrixReordered.getColumnIndices().data(),
m_gpuReorderToNatural.data(),
levelStartIdx,
numOfRowsInLevel,
m_gpuDInv.data(),
v.data());
}
}

template <class M, class X, class Y, int l>
void
CuDILU<M, X, Y, l>::post([[maybe_unused]] X& x)
{
}

template <class M, class X, class Y, int l>
Dune::SolverCategory::Category
CuDILU<M, X, Y, l>::category() const
{
return Dune::SolverCategory::sequential;
}

template <class M, class X, class Y, int l>
void
CuDILU<M, X, Y, l>::update()
{
OPM_TIMEBLOCK(prec_update);

m_gpuMatrix.updateNonzeroValues(m_cpuMatrix, true); // send updated matrix to the gpu

detail::copyMatDataToReordered<field_type, blocksize_>(m_gpuMatrix.getNonZeroValues().data(),
m_gpuMatrix.getRowIndices().data(),
m_gpuMatrixReordered.getNonZeroValues().data(),
m_gpuMatrixReordered.getRowIndices().data(),
m_gpuNaturalToReorder.data(),
m_gpuMatrixReordered.N());

int levelStartIdx = 0;
for (int level = 0; level < m_levelSets.size(); ++level) {
const int numOfRowsInLevel = m_levelSets[level].size();

detail::computeDiluDiagonal<field_type, blocksize_>(m_gpuMatrixReordered.getNonZeroValues().data(),
m_gpuMatrixReordered.getRowIndices().data(),
m_gpuMatrixReordered.getColumnIndices().data(),
m_gpuReorderToNatural.data(),
m_gpuNaturalToReorder.data(),
levelStartIdx,
numOfRowsInLevel,
m_gpuDInv.data());
levelStartIdx += numOfRowsInLevel;
}
}

} // namespace Opm::cuistl
#define INSTANTIATE_CUDILU_DUNE(realtype, blockdim) \
template class ::Opm::cuistl::CuDILU<Dune::BCRSMatrix<Dune::FieldMatrix<realtype, blockdim, blockdim>>, \
::Opm::cuistl::CuVector<realtype>, \
::Opm::cuistl::CuVector<realtype>>; \
template class ::Opm::cuistl::CuDILU<Dune::BCRSMatrix<Opm::MatrixBlock<realtype, blockdim, blockdim>>, \
::Opm::cuistl::CuVector<realtype>, \
::Opm::cuistl::CuVector<realtype>>

INSTANTIATE_CUDILU_DUNE(double, 1);
INSTANTIATE_CUDILU_DUNE(double, 2);
INSTANTIATE_CUDILU_DUNE(double, 3);
INSTANTIATE_CUDILU_DUNE(double, 4);
INSTANTIATE_CUDILU_DUNE(double, 5);
INSTANTIATE_CUDILU_DUNE(double, 6);

INSTANTIATE_CUDILU_DUNE(float, 1);
INSTANTIATE_CUDILU_DUNE(float, 2);
INSTANTIATE_CUDILU_DUNE(float, 3);
INSTANTIATE_CUDILU_DUNE(float, 4);
INSTANTIATE_CUDILU_DUNE(float, 5);
INSTANTIATE_CUDILU_DUNE(float, 6);
Loading

0 comments on commit 4b0dd54

Please sign in to comment.