Add CUDA implementation of the DILU

preconditioner. Uses graph coloring to exploit parallelism in upper and triangular solves when computing a diagonal approximate inverse of a sparse matrix. Supports blocksizes up to 3.
OPM · Jan 25, 2024 · 4b0dd54 · 4b0dd54
1 parent 8837528
commit 4b0dd54
Show file tree

Hide file tree

Showing 8 changed files with 869 additions and 124 deletions.
diff --git a/CMakeLists_files.cmake b/CMakeLists_files.cmake
@@ -160,6 +160,7 @@ if(CUDA_FOUND)
   list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/cuistl/CuVector.cpp)
   list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/cuistl/detail/vector_operations.cu)
   list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/cuistl/CuSparseMatrix.cpp)
+  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/cuistl/CuDILU.cpp)
   list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/cuistl/CuJac.cpp)
   list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/cuistl/CuSeqILU0.cpp)
   list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/cuistl/set_device.cpp)
@@ -172,6 +173,7 @@ if(CUDA_FOUND)
   list (APPEND PUBLIC_HEADER_FILES opm/simulators/linalg/cuistl/detail/cuda_check_last_error.hpp)
   list (APPEND PUBLIC_HEADER_FILES opm/simulators/linalg/cuistl/detail/CuBlasHandle.hpp)
   list (APPEND PUBLIC_HEADER_FILES opm/simulators/linalg/cuistl/detail/CuSparseHandle.hpp)
+  list (APPEND PUBLIC_HEADER_FILES opm/simulators/linalg/cuistl/CuDILU.hpp)
   list (APPEND PUBLIC_HEADER_FILES opm/simulators/linalg/cuistl/CuJac.hpp)
   list (APPEND PUBLIC_HEADER_FILES opm/simulators/linalg/cuistl/CuVector.hpp)
   list (APPEND PUBLIC_HEADER_FILES opm/simulators/linalg/cuistl/CuSparseMatrix.hpp)

diff --git a/opm/simulators/linalg/PreconditionerFactory_impl.hpp b/opm/simulators/linalg/PreconditionerFactory_impl.hpp
@@ -50,6 +50,7 @@
 #include <opm/simulators/linalg/cuistl/PreconditionerAdapter.hpp>
 #include <opm/simulators/linalg/cuistl/PreconditionerConvertFieldTypeAdapter.hpp>
 #include <opm/simulators/linalg/cuistl/CuBlockPreconditioner.hpp>
+#include <opm/simulators/linalg/cuistl/CuDILU.hpp>
 #include <opm/simulators/linalg/cuistl/CuJac.hpp>
 
 #endif
@@ -270,6 +271,16 @@ struct StandardPreconditioners
             auto wrapped = std::make_shared<Opm::cuistl::CuBlockPreconditioner<V, V, Comm>>(adapted, comm);
             return wrapped;
         });
+
+        F::addCreator("CUDILU", [](const O& op, [[maybe_unused]] const P& prm, const std::function<V()>&, std::size_t, const C& comm) {
+            using field_type = typename V::field_type;
+            using CuDILU = typename Opm::cuistl::CuDILU<M, Opm::cuistl::CuVector<field_type>, Opm::cuistl::CuVector<field_type>>;
+            auto cuDILU = std::make_shared<CuDILU>(op.getmat());
+
+            auto adapted = std::make_shared<Opm::cuistl::PreconditionerAdapter<V, V, CuDILU>>(cuDILU);
+            auto wrapped = std::make_shared<Opm::cuistl::CuBlockPreconditioner<V, V, Comm>>(adapted, comm);
+            return wrapped;
+        });
 #endif
     }
 
@@ -490,6 +501,25 @@ struct StandardPreconditioners<Operator,Dune::Amg::SequentialInformation>
             using CUJac = typename Opm::cuistl::CuJac<M, Opm::cuistl::CuVector<field_type>, Opm::cuistl::CuVector<field_type>>;
             return std::make_shared<Opm::cuistl::PreconditionerAdapter<V, V, CUJac>>(std::make_shared<CUJac>(op.getmat(), w));
         });
+
+        F::addCreator("CUDILU", [](const O& op, [[maybe_unused]] const P& prm, const std::function<V()>&, std::size_t) {
+            using field_type = typename V::field_type;
+            using CUDILU = typename Opm::cuistl::CuDILU<M, Opm::cuistl::CuVector<field_type>, Opm::cuistl::CuVector<field_type>>;
+            return std::make_shared<Opm::cuistl::PreconditionerAdapter<V, V, CUDILU>>(std::make_shared<CUDILU>(op.getmat()));
+        });
+
+        F::addCreator("CUDILUFloat", [](const O& op, [[maybe_unused]] const P& prm, const std::function<V()>&, std::size_t) {
+            using block_type = typename V::block_type;
+            using VTo = Dune::BlockVector<Dune::FieldVector<float, block_type::dimension>>;
+            using matrix_type_to = typename Dune::BCRSMatrix<Dune::FieldMatrix<float, block_type::dimension, block_type::dimension>>;
+            using CuDILU = typename Opm::cuistl::CuDILU<matrix_type_to, Opm::cuistl::CuVector<float>, Opm::cuistl::CuVector<float>>;
+            using Adapter = typename Opm::cuistl::PreconditionerAdapter<VTo, VTo, CuDILU>;
+            using Converter = typename Opm::cuistl::PreconditionerConvertFieldTypeAdapter<Adapter, M, V, V>;
+            auto converted = std::make_shared<Converter>(op.getmat());
+            auto adapted = std::make_shared<Adapter>(std::make_shared<CuDILU>(converted->getConvertedMatrix()));
+            converted->setUnderlyingPreconditioner(adapted);
+            return converted;
+        });
 #endif
     }
 };

diff --git a/opm/simulators/linalg/cuistl/CuDILU.cpp b/opm/simulators/linalg/cuistl/CuDILU.cpp
@@ -0,0 +1,235 @@
+/*
+  Copyright 2022-2023 SINTEF AS
+
+  This file is part of the Open Porous Media project (OPM).
+
+  OPM is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  OPM is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
+*/
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <dune/common/fmatrix.hh>
+#include <dune/istl/bcrsmatrix.hh>
+#include <fmt/core.h>
+#include <opm/common/ErrorMacros.hpp>
+#include <opm/simulators/linalg/cuistl/CuDILU.hpp>
+#include <opm/simulators/linalg/cuistl/CuSparseMatrix.hpp>
+#include <opm/simulators/linalg/cuistl/CuVector.hpp>
+#include <opm/simulators/linalg/cuistl/detail/cusparse_matrix_operations.hpp>
+#include <opm/simulators/linalg/cuistl/detail/safe_conversion.hpp>
+#include <opm/simulators/linalg/matrixblock.hh>
+#include <vector>
+
+namespace
+{
+std::vector<int>
+createReorderedToNatural(Opm::SparseTable<size_t> levelSets)
+{
+    auto res = std::vector<int>(Opm::cuistl::detail::to_size_t(levelSets.dataSize()));
+    int globCnt = 0;
+    for (auto row : levelSets) {
+        for (auto col : row) {
+            OPM_ERROR_IF(Opm::cuistl::detail::to_size_t(globCnt) >= res.size(),
+                         fmt::format("Internal error. globCnt = {}, res.size() = {}", globCnt, res.size()));
+            res[globCnt++] = static_cast<int>(col);
+        }
+    }
+    return res;
+}
+
+std::vector<int>
+createNaturalToReordered(Opm::SparseTable<size_t> levelSets)
+{
+    auto res = std::vector<int>(Opm::cuistl::detail::to_size_t(levelSets.dataSize()));
+    int globCnt = 0;
+    for (auto row : levelSets) {
+        for (auto col : row) {
+            OPM_ERROR_IF(Opm::cuistl::detail::to_size_t(globCnt) >= res.size(),
+                         fmt::format("Internal error. globCnt = {}, res.size() = {}", globCnt, res.size()));
+            res[col] = globCnt++;
+        }
+    }
+    return res;
+}
+
+// TODO: When this function is called we already have the natural ordered matrix on the GPU
+// TODO: could it be possible to create the reordered one in a kernel to speed up the constructor?
+template <class M, class field_type>
+Opm::cuistl::CuSparseMatrix<field_type>
+createReorderedMatrix(const M& naturalMatrix, std::vector<int> reorderedToNatural)
+{
+    M reorderedMatrix(naturalMatrix.N(), naturalMatrix.N(), naturalMatrix.nonzeroes(), M::row_wise);
+    for (auto dstRowIt = reorderedMatrix.createbegin(); dstRowIt != reorderedMatrix.createend(); ++dstRowIt) {
+        auto srcRow = naturalMatrix.begin() + reorderedToNatural[dstRowIt.index()];
+        // For elements in A
+        for (auto elem = srcRow->begin(); elem != srcRow->end(); elem++) {
+            dstRowIt.insert(elem.index());
+        }
+    }
+
+    // TODO: There is probably a faster way to copy by copying whole rows at a time
+    for (auto dstRowIt = reorderedMatrix.begin(); dstRowIt != reorderedMatrix.end(); ++dstRowIt) {
+        auto srcRow = naturalMatrix.begin() + reorderedToNatural[dstRowIt.index()];
+        for (auto elem = srcRow->begin(); elem != srcRow->end(); elem++) {
+            reorderedMatrix[dstRowIt.index()][elem.index()] = *elem;
+        }
+    }
+
+    return Opm::cuistl::CuSparseMatrix<field_type>::fromMatrix(reorderedMatrix, true);
+}
+
+} // NAMESPACE
+
+namespace Opm::cuistl
+{
+
+template <class M, class X, class Y, int l>
+CuDILU<M, X, Y, l>::CuDILU(const M& A)
+    : m_cpuMatrix(A)
+    , m_levelSets(Opm::getMatrixRowColoring(m_cpuMatrix, Opm::ColoringType::LOWER))
+    , m_reorderedToNatural(createReorderedToNatural(m_levelSets))
+    , m_naturalToReordered(createNaturalToReordered(m_levelSets))
+    , m_gpuMatrix(CuSparseMatrix<field_type>::fromMatrix(m_cpuMatrix, true))
+    , m_gpuMatrixReordered(createReorderedMatrix<M, field_type>(m_cpuMatrix, m_reorderedToNatural))
+    , m_gpuNaturalToReorder(m_naturalToReordered)
+    , m_gpuReorderToNatural(m_reorderedToNatural)
+    , m_gpuDInv(m_gpuMatrix.N() * m_gpuMatrix.blockSize() * m_gpuMatrix.blockSize())
+
+{
+    // TODO: Should in some way verify that this matrix is symmetric, only do it debug mode?
+    // Some sanity check
+    OPM_ERROR_IF(A.N() != m_gpuMatrix.N(),
+                 fmt::format("CuSparse matrix not same size as DUNE matrix. {} vs {}.", m_gpuMatrix.N(), A.N()));
+    OPM_ERROR_IF(A[0][0].N() != m_gpuMatrix.blockSize(),
+                 fmt::format("CuSparse matrix not same blocksize as DUNE matrix. {} vs {}.",
+                             m_gpuMatrix.blockSize(),
+                             A[0][0].N()));
+    OPM_ERROR_IF(A.N() * A[0][0].N() != m_gpuMatrix.dim(),
+                 fmt::format("CuSparse matrix not same dimension as DUNE matrix. {} vs {}.",
+                             m_gpuMatrix.dim(),
+                             A.N() * A[0][0].N()));
+    OPM_ERROR_IF(A.nonzeroes() != m_gpuMatrix.nonzeroes(),
+                 fmt::format("CuSparse matrix not same number of non zeroes as DUNE matrix. {} vs {}. ",
+                             m_gpuMatrix.nonzeroes(),
+                             A.nonzeroes()));
+    update();
+}
+
+template <class M, class X, class Y, int l>
+void
+CuDILU<M, X, Y, l>::pre([[maybe_unused]] X& x, [[maybe_unused]] Y& b)
+{
+}
+
+template <class M, class X, class Y, int l>
+void
+CuDILU<M, X, Y, l>::apply(X& v, const Y& d)
+{
+    OPM_TIMEBLOCK(prec_apply);
+    int levelStartIdx = 0;
+    for (int level = 0; level < m_levelSets.size(); ++level) {
+        const int numOfRowsInLevel = m_levelSets[level].size();
+        detail::computeLowerSolveLevelSet<field_type, blocksize_>(m_gpuMatrixReordered.getNonZeroValues().data(),
+                                                                  m_gpuMatrixReordered.getRowIndices().data(),
+                                                                  m_gpuMatrixReordered.getColumnIndices().data(),
+                                                                  m_gpuReorderToNatural.data(),
+                                                                  levelStartIdx,
+                                                                  numOfRowsInLevel,
+                                                                  m_gpuDInv.data(),
+                                                                  d.data(),
+                                                                  v.data());
+        levelStartIdx += numOfRowsInLevel;
+    }
+
+    levelStartIdx = m_cpuMatrix.N();
+    //  upper triangular solve: (D + U_A) v = Dy
+    for (int level = m_levelSets.size() - 1; level >= 0; --level) {
+        const int numOfRowsInLevel = m_levelSets[level].size();
+        levelStartIdx -= numOfRowsInLevel;
+        detail::computeUpperSolveLevelSet<field_type, blocksize_>(m_gpuMatrixReordered.getNonZeroValues().data(),
+                                                                  m_gpuMatrixReordered.getRowIndices().data(),
+                                                                  m_gpuMatrixReordered.getColumnIndices().data(),
+                                                                  m_gpuReorderToNatural.data(),
+                                                                  levelStartIdx,
+                                                                  numOfRowsInLevel,
+                                                                  m_gpuDInv.data(),
+                                                                  v.data());
+    }
+}
+
+template <class M, class X, class Y, int l>
+void
+CuDILU<M, X, Y, l>::post([[maybe_unused]] X& x)
+{
+}
+
+template <class M, class X, class Y, int l>
+Dune::SolverCategory::Category
+CuDILU<M, X, Y, l>::category() const
+{
+    return Dune::SolverCategory::sequential;
+}
+
+template <class M, class X, class Y, int l>
+void
+CuDILU<M, X, Y, l>::update()
+{
+    OPM_TIMEBLOCK(prec_update);
+
+    m_gpuMatrix.updateNonzeroValues(m_cpuMatrix, true); // send updated matrix to the gpu
+
+    detail::copyMatDataToReordered<field_type, blocksize_>(m_gpuMatrix.getNonZeroValues().data(),
+                                                           m_gpuMatrix.getRowIndices().data(),
+                                                           m_gpuMatrixReordered.getNonZeroValues().data(),
+                                                           m_gpuMatrixReordered.getRowIndices().data(),
+                                                           m_gpuNaturalToReorder.data(),
+                                                           m_gpuMatrixReordered.N());
+
+    int levelStartIdx = 0;
+    for (int level = 0; level < m_levelSets.size(); ++level) {
+        const int numOfRowsInLevel = m_levelSets[level].size();
+
+        detail::computeDiluDiagonal<field_type, blocksize_>(m_gpuMatrixReordered.getNonZeroValues().data(),
+                                                            m_gpuMatrixReordered.getRowIndices().data(),
+                                                            m_gpuMatrixReordered.getColumnIndices().data(),
+                                                            m_gpuReorderToNatural.data(),
+                                                            m_gpuNaturalToReorder.data(),
+                                                            levelStartIdx,
+                                                            numOfRowsInLevel,
+                                                            m_gpuDInv.data());
+        levelStartIdx += numOfRowsInLevel;
+    }
+}
+
+} // namespace Opm::cuistl
+#define INSTANTIATE_CUDILU_DUNE(realtype, blockdim)                                                                    \
+    template class ::Opm::cuistl::CuDILU<Dune::BCRSMatrix<Dune::FieldMatrix<realtype, blockdim, blockdim>>,            \
+                                         ::Opm::cuistl::CuVector<realtype>,                                            \
+                                         ::Opm::cuistl::CuVector<realtype>>;                                           \
+    template class ::Opm::cuistl::CuDILU<Dune::BCRSMatrix<Opm::MatrixBlock<realtype, blockdim, blockdim>>,             \
+                                         ::Opm::cuistl::CuVector<realtype>,                                            \
+                                         ::Opm::cuistl::CuVector<realtype>>
+
+INSTANTIATE_CUDILU_DUNE(double, 1);
+INSTANTIATE_CUDILU_DUNE(double, 2);
+INSTANTIATE_CUDILU_DUNE(double, 3);
+INSTANTIATE_CUDILU_DUNE(double, 4);
+INSTANTIATE_CUDILU_DUNE(double, 5);
+INSTANTIATE_CUDILU_DUNE(double, 6);
+
+INSTANTIATE_CUDILU_DUNE(float, 1);
+INSTANTIATE_CUDILU_DUNE(float, 2);
+INSTANTIATE_CUDILU_DUNE(float, 3);
+INSTANTIATE_CUDILU_DUNE(float, 4);
+INSTANTIATE_CUDILU_DUNE(float, 5);
+INSTANTIATE_CUDILU_DUNE(float, 6);