diff --git a/CMakeLists_files.cmake b/CMakeLists_files.cmake index 472d9e5a60a..80955007954 100644 --- a/CMakeLists_files.cmake +++ b/CMakeLists_files.cmake @@ -160,6 +160,7 @@ if(CUDA_FOUND) list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/cuistl/CuVector.cpp) list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/cuistl/detail/vector_operations.cu) list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/cuistl/CuSparseMatrix.cpp) + list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/cuistl/CuDILU.cpp) list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/cuistl/CuJac.cpp) list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/cuistl/CuSeqILU0.cpp) list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/cuistl/set_device.cpp) @@ -172,6 +173,7 @@ if(CUDA_FOUND) list (APPEND PUBLIC_HEADER_FILES opm/simulators/linalg/cuistl/detail/cuda_check_last_error.hpp) list (APPEND PUBLIC_HEADER_FILES opm/simulators/linalg/cuistl/detail/CuBlasHandle.hpp) list (APPEND PUBLIC_HEADER_FILES opm/simulators/linalg/cuistl/detail/CuSparseHandle.hpp) + list (APPEND PUBLIC_HEADER_FILES opm/simulators/linalg/cuistl/CuDILU.hpp) list (APPEND PUBLIC_HEADER_FILES opm/simulators/linalg/cuistl/CuJac.hpp) list (APPEND PUBLIC_HEADER_FILES opm/simulators/linalg/cuistl/CuVector.hpp) list (APPEND PUBLIC_HEADER_FILES opm/simulators/linalg/cuistl/CuSparseMatrix.hpp) diff --git a/opm/simulators/linalg/PreconditionerFactory_impl.hpp b/opm/simulators/linalg/PreconditionerFactory_impl.hpp index 23b849d2b26..ba81df3ce86 100644 --- a/opm/simulators/linalg/PreconditionerFactory_impl.hpp +++ b/opm/simulators/linalg/PreconditionerFactory_impl.hpp @@ -50,6 +50,7 @@ #include #include #include +#include #include #endif @@ -270,6 +271,16 @@ struct StandardPreconditioners auto wrapped = std::make_shared>(adapted, comm); return wrapped; }); + + F::addCreator("CUDILU", [](const O& op, [[maybe_unused]] const P& prm, const std::function&, std::size_t, const C& comm) { + using field_type = typename V::field_type; + using CuDILU = typename Opm::cuistl::CuDILU, Opm::cuistl::CuVector>; + auto cuDILU = std::make_shared(op.getmat()); + + auto adapted = std::make_shared>(cuDILU); + auto wrapped = std::make_shared>(adapted, comm); + return wrapped; + }); #endif } @@ -490,6 +501,25 @@ struct StandardPreconditioners using CUJac = typename Opm::cuistl::CuJac, Opm::cuistl::CuVector>; return std::make_shared>(std::make_shared(op.getmat(), w)); }); + + F::addCreator("CUDILU", [](const O& op, [[maybe_unused]] const P& prm, const std::function&, std::size_t) { + using field_type = typename V::field_type; + using CUDILU = typename Opm::cuistl::CuDILU, Opm::cuistl::CuVector>; + return std::make_shared>(std::make_shared(op.getmat())); + }); + + F::addCreator("CUDILUFloat", [](const O& op, [[maybe_unused]] const P& prm, const std::function&, std::size_t) { + using block_type = typename V::block_type; + using VTo = Dune::BlockVector>; + using matrix_type_to = typename Dune::BCRSMatrix>; + using CuDILU = typename Opm::cuistl::CuDILU, Opm::cuistl::CuVector>; + using Adapter = typename Opm::cuistl::PreconditionerAdapter; + using Converter = typename Opm::cuistl::PreconditionerConvertFieldTypeAdapter; + auto converted = std::make_shared(op.getmat()); + auto adapted = std::make_shared(std::make_shared(converted->getConvertedMatrix())); + converted->setUnderlyingPreconditioner(adapted); + return converted; + }); #endif } }; diff --git a/opm/simulators/linalg/cuistl/CuDILU.cpp b/opm/simulators/linalg/cuistl/CuDILU.cpp new file mode 100644 index 00000000000..2033791e4f6 --- 
/dev/null +++ b/opm/simulators/linalg/cuistl/CuDILU.cpp @@ -0,0 +1,235 @@ +/* + Copyright 2022-2023 SINTEF AS + + This file is part of the Open Porous Media project (OPM). + + OPM is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + OPM is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with OPM. If not, see . +*/ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace +{ +std::vector +createReorderedToNatural(Opm::SparseTable levelSets) +{ + auto res = std::vector(Opm::cuistl::detail::to_size_t(levelSets.dataSize())); + int globCnt = 0; + for (auto row : levelSets) { + for (auto col : row) { + OPM_ERROR_IF(Opm::cuistl::detail::to_size_t(globCnt) >= res.size(), + fmt::format("Internal error. globCnt = {}, res.size() = {}", globCnt, res.size())); + res[globCnt++] = static_cast(col); + } + } + return res; +} + +std::vector +createNaturalToReordered(Opm::SparseTable levelSets) +{ + auto res = std::vector(Opm::cuistl::detail::to_size_t(levelSets.dataSize())); + int globCnt = 0; + for (auto row : levelSets) { + for (auto col : row) { + OPM_ERROR_IF(Opm::cuistl::detail::to_size_t(globCnt) >= res.size(), + fmt::format("Internal error. globCnt = {}, res.size() = {}", globCnt, res.size())); + res[col] = globCnt++; + } + } + return res; +} + +// TODO: When this function is called we already have the natural ordered matrix on the GPU +// TODO: could it be possible to create the reordered one in a kernel to speed up the constructor? 
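For illustration only (not part of the patch): the two helpers above build a pair of mutually inverse permutations from the level-set coloring. createReorderedToNatural lists the natural row indices in the order the level sets visit them, while createNaturalToReordered gives each natural row its position in that order. The standalone sketch below assumes a made-up coloring with level sets {1, 2} and {0}; everything in it except the two helper names is hypothetical.

#include <cassert>
#include <cstddef>
#include <vector>

int main()
{
    // Hypothetical coloring: the first level set holds rows 1 and 2, the second holds row 0,
    // so the reordered row order is 1, 2, 0.
    const std::vector<int> reorderedToNatural {1, 2, 0}; // what createReorderedToNatural would return
    const std::vector<int> naturalToReordered {2, 0, 1}; // what createNaturalToReordered would return
    for (std::size_t i = 0; i < reorderedToNatural.size(); ++i) {
        // The two maps are mutual inverses.
        assert(naturalToReordered[static_cast<std::size_t>(reorderedToNatural[i])] == static_cast<int>(i));
    }
    return 0;
}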
+template +Opm::cuistl::CuSparseMatrix +createReorderedMatrix(const M& naturalMatrix, std::vector reorderedToNatural) +{ + M reorderedMatrix(naturalMatrix.N(), naturalMatrix.N(), naturalMatrix.nonzeroes(), M::row_wise); + for (auto dstRowIt = reorderedMatrix.createbegin(); dstRowIt != reorderedMatrix.createend(); ++dstRowIt) { + auto srcRow = naturalMatrix.begin() + reorderedToNatural[dstRowIt.index()]; + // For elements in A + for (auto elem = srcRow->begin(); elem != srcRow->end(); elem++) { + dstRowIt.insert(elem.index()); + } + } + + // TODO: There is probably a faster way to copy by copying whole rows at a time + for (auto dstRowIt = reorderedMatrix.begin(); dstRowIt != reorderedMatrix.end(); ++dstRowIt) { + auto srcRow = naturalMatrix.begin() + reorderedToNatural[dstRowIt.index()]; + for (auto elem = srcRow->begin(); elem != srcRow->end(); elem++) { + reorderedMatrix[dstRowIt.index()][elem.index()] = *elem; + } + } + + return Opm::cuistl::CuSparseMatrix::fromMatrix(reorderedMatrix, true); +} + +} // NAMESPACE + +namespace Opm::cuistl +{ + +template +CuDILU::CuDILU(const M& A) + : m_cpuMatrix(A) + , m_levelSets(Opm::getMatrixRowColoring(m_cpuMatrix, Opm::ColoringType::LOWER)) + , m_reorderedToNatural(createReorderedToNatural(m_levelSets)) + , m_naturalToReordered(createNaturalToReordered(m_levelSets)) + , m_gpuMatrix(CuSparseMatrix::fromMatrix(m_cpuMatrix, true)) + , m_gpuMatrixReordered(createReorderedMatrix(m_cpuMatrix, m_reorderedToNatural)) + , m_gpuNaturalToReorder(m_naturalToReordered) + , m_gpuReorderToNatural(m_reorderedToNatural) + , m_gpuDInv(m_gpuMatrix.N() * m_gpuMatrix.blockSize() * m_gpuMatrix.blockSize()) + +{ + // TODO: Should in some way verify that this matrix is symmetric, only do it debug mode? + // Some sanity check + OPM_ERROR_IF(A.N() != m_gpuMatrix.N(), + fmt::format("CuSparse matrix not same size as DUNE matrix. {} vs {}.", m_gpuMatrix.N(), A.N())); + OPM_ERROR_IF(A[0][0].N() != m_gpuMatrix.blockSize(), + fmt::format("CuSparse matrix not same blocksize as DUNE matrix. {} vs {}.", + m_gpuMatrix.blockSize(), + A[0][0].N())); + OPM_ERROR_IF(A.N() * A[0][0].N() != m_gpuMatrix.dim(), + fmt::format("CuSparse matrix not same dimension as DUNE matrix. {} vs {}.", + m_gpuMatrix.dim(), + A.N() * A[0][0].N())); + OPM_ERROR_IF(A.nonzeroes() != m_gpuMatrix.nonzeroes(), + fmt::format("CuSparse matrix not same number of non zeroes as DUNE matrix. {} vs {}. 
", + m_gpuMatrix.nonzeroes(), + A.nonzeroes())); + update(); +} + +template +void +CuDILU::pre([[maybe_unused]] X& x, [[maybe_unused]] Y& b) +{ +} + +template +void +CuDILU::apply(X& v, const Y& d) +{ + OPM_TIMEBLOCK(prec_apply); + int levelStartIdx = 0; + for (int level = 0; level < m_levelSets.size(); ++level) { + const int numOfRowsInLevel = m_levelSets[level].size(); + detail::computeLowerSolveLevelSet(m_gpuMatrixReordered.getNonZeroValues().data(), + m_gpuMatrixReordered.getRowIndices().data(), + m_gpuMatrixReordered.getColumnIndices().data(), + m_gpuReorderToNatural.data(), + levelStartIdx, + numOfRowsInLevel, + m_gpuDInv.data(), + d.data(), + v.data()); + levelStartIdx += numOfRowsInLevel; + } + + levelStartIdx = m_cpuMatrix.N(); + // upper triangular solve: (D + U_A) v = Dy + for (int level = m_levelSets.size() - 1; level >= 0; --level) { + const int numOfRowsInLevel = m_levelSets[level].size(); + levelStartIdx -= numOfRowsInLevel; + detail::computeUpperSolveLevelSet(m_gpuMatrixReordered.getNonZeroValues().data(), + m_gpuMatrixReordered.getRowIndices().data(), + m_gpuMatrixReordered.getColumnIndices().data(), + m_gpuReorderToNatural.data(), + levelStartIdx, + numOfRowsInLevel, + m_gpuDInv.data(), + v.data()); + } +} + +template +void +CuDILU::post([[maybe_unused]] X& x) +{ +} + +template +Dune::SolverCategory::Category +CuDILU::category() const +{ + return Dune::SolverCategory::sequential; +} + +template +void +CuDILU::update() +{ + OPM_TIMEBLOCK(prec_update); + + m_gpuMatrix.updateNonzeroValues(m_cpuMatrix, true); // send updated matrix to the gpu + + detail::copyMatDataToReordered(m_gpuMatrix.getNonZeroValues().data(), + m_gpuMatrix.getRowIndices().data(), + m_gpuMatrixReordered.getNonZeroValues().data(), + m_gpuMatrixReordered.getRowIndices().data(), + m_gpuNaturalToReorder.data(), + m_gpuMatrixReordered.N()); + + int levelStartIdx = 0; + for (int level = 0; level < m_levelSets.size(); ++level) { + const int numOfRowsInLevel = m_levelSets[level].size(); + + detail::computeDiluDiagonal(m_gpuMatrixReordered.getNonZeroValues().data(), + m_gpuMatrixReordered.getRowIndices().data(), + m_gpuMatrixReordered.getColumnIndices().data(), + m_gpuReorderToNatural.data(), + m_gpuNaturalToReorder.data(), + levelStartIdx, + numOfRowsInLevel, + m_gpuDInv.data()); + levelStartIdx += numOfRowsInLevel; + } +} + +} // namespace Opm::cuistl +#define INSTANTIATE_CUDILU_DUNE(realtype, blockdim) \ + template class ::Opm::cuistl::CuDILU>, \ + ::Opm::cuistl::CuVector, \ + ::Opm::cuistl::CuVector>; \ + template class ::Opm::cuistl::CuDILU>, \ + ::Opm::cuistl::CuVector, \ + ::Opm::cuistl::CuVector> + +INSTANTIATE_CUDILU_DUNE(double, 1); +INSTANTIATE_CUDILU_DUNE(double, 2); +INSTANTIATE_CUDILU_DUNE(double, 3); +INSTANTIATE_CUDILU_DUNE(double, 4); +INSTANTIATE_CUDILU_DUNE(double, 5); +INSTANTIATE_CUDILU_DUNE(double, 6); + +INSTANTIATE_CUDILU_DUNE(float, 1); +INSTANTIATE_CUDILU_DUNE(float, 2); +INSTANTIATE_CUDILU_DUNE(float, 3); +INSTANTIATE_CUDILU_DUNE(float, 4); +INSTANTIATE_CUDILU_DUNE(float, 5); +INSTANTIATE_CUDILU_DUNE(float, 6); diff --git a/opm/simulators/linalg/cuistl/CuDILU.hpp b/opm/simulators/linalg/cuistl/CuDILU.hpp new file mode 100644 index 00000000000..a48c308be1a --- /dev/null +++ b/opm/simulators/linalg/cuistl/CuDILU.hpp @@ -0,0 +1,121 @@ +/* + Copyright 2022-2023 SINTEF AS + + This file is part of the Open Porous Media project (OPM). 
+ + OPM is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + OPM is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with OPM. If not, see . +*/ +#ifndef OPM_CUDILU_HPP +#define OPM_CUDILU_HPP + +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +namespace Opm::cuistl +{ +//! \brief DILU preconditioner on the GPU. +//! +//! \tparam M The matrix type to operate on +//! \tparam X Type of the update +//! \tparam Y Type of the defect +//! \tparam l Ignored. Just there to have the same number of template arguments +//! as other preconditioners. +//! +//! \note We assume X and Y are both CuVector, but we leave them as template +//! arguments in case of future additions. +template +class CuDILU : public Dune::PreconditionerWithUpdate +{ public: + //! \brief The matrix type the preconditioner is for. + using matrix_type = typename std::remove_const::type; + //! \brief The domain type of the preconditioner. + using domain_type = X; + //! \brief The range type of the preconditioner. + using range_type = Y; + //! \brief The field type of the preconditioner. + using field_type = typename X::field_type; + + //! \brief Constructor. + //! + //! The constructor gets all parameters needed to operate the preconditioner. + //! \param A The matrix to operate on. + //! + explicit CuDILU(const M& A); + + //! \brief Prepare the preconditioner. + //! \note Does nothing for the time being. + void pre(X& x, Y& b) override; + + //! \brief Apply the preconditioner. + void apply(X& v, const Y& d) override; + + //! \brief Post processing + //! \note Does nothing at the moment + void post(X& x) override; + + //! Category of the preconditioner (see SolverCategory::Category) + Dune::SolverCategory::Category category() const override; + + //! \brief Updates the matrix data. + void update() final; + + + //! \returns false + static constexpr bool shouldCallPre() + { + return false; + } + + //! \returns false + static constexpr bool shouldCallPost() + { + return false; + } + + +private: + //! \brief Reference to the underlying matrix + const M& m_cpuMatrix; + //! \brief size_t describing the dimensions of the square block elements + static constexpr const size_t blocksize_ = matrix_type::block_type::cols; + //! \brief SparseTable storing each row by level + Opm::SparseTable m_levelSets; + //! \brief converts from an index in the reordered structure to an index in the naturally ordered structure + std::vector m_reorderedToNatural; + //! \brief converts from an index in the naturally ordered structure to an index in the reordered structure + std::vector m_naturalToReordered; + //! \brief The A matrix stored on the gpu, and its reordered version + CuSparseMatrix m_gpuMatrix; + CuSparseMatrix m_gpuMatrixReordered; + //! row conversion from natural to reordered matrix indices stored on the GPU + CuVector m_gpuNaturalToReorder; + //! row conversion from reordered to natural matrix indices stored on the GPU + CuVector m_gpuReorderToNatural; + //! 
\brief Stores the inverted diagonal that we use in DILU + CuVector m_gpuDInv; +}; +} // end namespace Opm::cuistl + +#endif diff --git a/opm/simulators/linalg/cuistl/CuJac.cpp b/opm/simulators/linalg/cuistl/CuJac.cpp index 9472d51f41b..1a67640a03e 100644 --- a/opm/simulators/linalg/cuistl/CuJac.cpp +++ b/opm/simulators/linalg/cuistl/CuJac.cpp @@ -67,12 +67,11 @@ CuJac::CuJac(const M& A, field_type w) A.nonzeroes())); // Compute the inverted diagonal of A and store it in a vector format in m_diagInvFlattened - detail::invertDiagonalAndFlatten(m_gpuMatrix.getNonZeroValues().data(), - m_gpuMatrix.getRowIndices().data(), - m_gpuMatrix.getColumnIndices().data(), - m_gpuMatrix.N(), - m_gpuMatrix.blockSize(), - m_diagInvFlattened.data()); + detail::invertDiagonalAndFlatten(m_gpuMatrix.getNonZeroValues().data(), + m_gpuMatrix.getRowIndices().data(), + m_gpuMatrix.getColumnIndices().data(), + m_gpuMatrix.N(), + m_diagInvFlattened.data()); } template @@ -90,12 +89,8 @@ CuJac::apply(X& v, const Y& d) // Compute the MV product where the matrix is diagonal and therefore stored as a vector. // The product is thus computed as a hadamard product. - detail::weightedDiagMV(m_diagInvFlattened.data(), - m_gpuMatrix.N(), - m_gpuMatrix.blockSize(), - m_relaxationFactor, - d.data(), - v.data()); + detail::weightedDiagMV( + m_diagInvFlattened.data(), m_gpuMatrix.N(), m_gpuMatrix.blockSize(), m_relaxationFactor, d.data(), v.data()); } template @@ -116,12 +111,11 @@ void CuJac::update() { m_gpuMatrix.updateNonzeroValues(m_cpuMatrix); - detail::invertDiagonalAndFlatten(m_gpuMatrix.getNonZeroValues().data(), - m_gpuMatrix.getRowIndices().data(), - m_gpuMatrix.getColumnIndices().data(), - m_gpuMatrix.N(), - m_gpuMatrix.blockSize(), - m_diagInvFlattened.data()); + detail::invertDiagonalAndFlatten(m_gpuMatrix.getNonZeroValues().data(), + m_gpuMatrix.getRowIndices().data(), + m_gpuMatrix.getColumnIndices().data(), + m_gpuMatrix.N(), + m_diagInvFlattened.data()); } } // namespace Opm::cuistl diff --git a/opm/simulators/linalg/cuistl/detail/cusparse_matrix_operations.cu b/opm/simulators/linalg/cuistl/detail/cusparse_matrix_operations.cu index c31172204d0..27293a815b5 100644 --- a/opm/simulators/linalg/cuistl/detail/cusparse_matrix_operations.cu +++ b/opm/simulators/linalg/cuistl/detail/cusparse_matrix_operations.cu @@ -21,44 +21,159 @@ #include namespace Opm::cuistl::detail { - namespace -{ - // TODO: combine the three following kernels into one using template arguments so that - // no compile time optimization is lost while making the code more compact - template - __global__ void cuInvertDiagonalAndFlattenBlocksize1( - T* matNonZeroValues, int* rowIndices, int* colIndices, size_t numberOfRows, T* vec) +{ + + // TODO: figure out if this can be generalized effectively, this seems excessively verbose + // explicit formulas based on Dune cpu code + template + __device__ __forceinline__ void invBlockOutOfPlace(const T* __restrict__ srcBlock, T* __restrict__ dstBlock) { - const auto row = blockDim.x * blockIdx.x + threadIdx.x; - const int blocksize = 1; + if (blocksize == 1) { + dstBlock[0] = 1.0 / (srcBlock[0]); + } else if (blocksize == 2) { + T detInv = 1.0 / (srcBlock[0] * srcBlock[3] - srcBlock[1] * srcBlock[2]); + dstBlock[0] = srcBlock[3] * detInv; + dstBlock[1] = -srcBlock[1] * detInv; + dstBlock[2] = -srcBlock[2] * detInv; + dstBlock[3] = srcBlock[0] * detInv; + } else if (blocksize == 3) { + // based on Dune implementation + T t4 = srcBlock[0] * srcBlock[4]; + T t6 = srcBlock[0] * srcBlock[5]; + T t8 = srcBlock[1] * 
srcBlock[3]; + T t10 = srcBlock[2] * srcBlock[3]; + T t12 = srcBlock[1] * srcBlock[6]; + T t14 = srcBlock[2] * srcBlock[6]; - if (row < numberOfRows) { - size_t nnzIdx = rowIndices[row]; - size_t nnzIdxLim = rowIndices[row + 1]; + T t17 = 1.0 + / (t4 * srcBlock[8] - t6 * srcBlock[7] - t8 * srcBlock[8] + t10 * srcBlock[7] + t12 * srcBlock[5] + - t14 * srcBlock[4]); // t17 is 1/determinant - // this loop will cause some extra checks that we are within the limit in the case of the diagonal having a - // zero element - while (colIndices[nnzIdx] != row && nnzIdx <= nnzIdxLim) { - ++nnzIdx; + dstBlock[0] = (srcBlock[4] * srcBlock[8] - srcBlock[5] * srcBlock[7]) * t17; + dstBlock[1] = -(srcBlock[1] * srcBlock[8] - srcBlock[2] * srcBlock[7]) * t17; + dstBlock[2] = (srcBlock[1] * srcBlock[5] - srcBlock[2] * srcBlock[4]) * t17; + dstBlock[3] = -(srcBlock[3] * srcBlock[8] - srcBlock[5] * srcBlock[6]) * t17; + dstBlock[4] = (srcBlock[0] * srcBlock[8] - t14) * t17; + dstBlock[5] = -(t6 - t10) * t17; + dstBlock[6] = (srcBlock[3] * srcBlock[7] - srcBlock[4] * srcBlock[6]) * t17; + dstBlock[7] = -(srcBlock[0] * srcBlock[7] - t12) * t17; + dstBlock[8] = (t4 - t8) * t17; + } + } + + // explicit formulas based on Dune cpu code + template + __device__ __forceinline__ void invBlockInPlace(T* __restrict__ block) + { + if (blocksize == 1) { + block[0] = 1.0 / (block[0]); + } else if (blocksize == 2) { + T detInv = 1.0 / (block[0] * block[3] - block[1] * block[2]); + + T temp = block[0]; + block[0] = block[3] * detInv; + block[1] = -block[1] * detInv; + block[2] = -block[2] * detInv; + block[3] = temp * detInv; + } else if (blocksize == 3) { + T t4 = block[0] * block[4]; + T t6 = block[0] * block[5]; + T t8 = block[1] * block[3]; + T t10 = block[2] * block[3]; + T t12 = block[1] * block[6]; + T t14 = block[2] * block[6]; + + T det = (t4 * block[8] - t6 * block[7] - t8 * block[8] + t10 * block[7] + t12 * block[5] - t14 * block[4]); + T t17 = T(1.0) / det; + + T matrix01 = block[1]; + T matrix00 = block[0]; + T matrix10 = block[3]; + T matrix11 = block[4]; + + block[0] = (block[4] * block[8] - block[5] * block[7]) * t17; + block[1] = -(block[1] * block[8] - block[2] * block[7]) * t17; + block[2] = (matrix01 * block[5] - block[2] * block[4]) * t17; + block[3] = -(block[3] * block[8] - block[5] * block[6]) * t17; + block[4] = (matrix00 * block[8] - t14) * t17; + block[5] = -(t6 - t10) * t17; + block[6] = (matrix10 * block[7] - matrix11 * block[6]) * t17; + block[7] = -(matrix00 * block[7] - t12) * t17; + block[8] = (t4 - t8) * t17; + } + } + + enum class MVType { SET, PLUS, MINUS }; + // SET: c = A*b + // PLS: c += A*b + // MINUS: c -= A*b + template + __device__ __forceinline__ void matrixVectorProductWithAction(const T* A, const T* b, T* c) + { + for (int i = 0; i < blocksize; ++i) { + if (OP == MVType::SET) { + c[i] = 0; } - // diagBlock points to the start of where the diagonal block is stored - T* diagBlock = &matNonZeroValues[blocksize * blocksize * nnzIdx]; - // vecBlock points to the start of the block element in the vector where the inverse of the diagonal block - // element should be stored - T* vecBlock = &vec[blocksize * blocksize * row]; + for (int j = 0; j < blocksize; ++j) { + if (OP == MVType::SET || OP == MVType::PLUS) { + c[i] += A[i * blocksize + j] * b[j]; + } else if (OP == MVType::MINUS) { + c[i] -= A[i * blocksize + j] * b[j]; + } + } + } + } + + template + __device__ __forceinline__ void mv(const T* a, const T* b, T* c) + { + matrixVectorProductWithAction(a, b, c); + } + + template + 
__device__ __forceinline__ void umv(const T* a, const T* b, T* c) + { + matrixVectorProductWithAction(a, b, c); + } + + template + __device__ __forceinline__ void mmv(const T* a, const T* b, T* c) + { + matrixVectorProductWithAction(a, b, c); + } + + // dst -= A*B*C + template + __device__ __forceinline__ void mmx2Subtraction(T* A, T* B, T* C, T* dst) + { - vecBlock[0] = 1.0 / (diagBlock[0]); + T tmp[blocksize * blocksize] = {0}; + // tmp = A*B + for (int i = 0; i < blocksize; ++i) { + for (int k = 0; k < blocksize; ++k) { + for (int j = 0; j < blocksize; ++j) { + tmp[i * blocksize + j] += A[i * blocksize + k] * B[k * blocksize + j]; + } + } + } + + // dst = tmp*C + for (int i = 0; i < blocksize; ++i) { + for (int k = 0; k < blocksize; ++k) { + for (int j = 0; j < blocksize; ++j) { + dst[i * blocksize + j] -= tmp[i * blocksize + k] * C[k * blocksize + j]; + } + } } } - template - __global__ void cuInvertDiagonalAndFlattenBlocksize2( - T* matNonZeroValues, int* rowIndices, int* colIndices, size_t numberOfRows, T* vec) + template + __global__ void + cuInvertDiagonalAndFlatten(T* matNonZeroValues, int* rowIndices, int* colIndices, size_t numberOfRows, T* vec) { const auto row = blockDim.x * blockIdx.x + threadIdx.x; - const int blocksize = 2; if (row < numberOfRows) { size_t nnzIdx = rowIndices[row]; @@ -76,58 +191,153 @@ namespace // element should be stored T* vecBlock = &vec[blocksize * blocksize * row]; - // based on Dune implementation - T detInv = 1.0 / (diagBlock[0] * diagBlock[3] - diagBlock[1] * diagBlock[2]); - vecBlock[0] = diagBlock[3] * detInv; - vecBlock[1] = -diagBlock[1] * detInv; - vecBlock[2] = -diagBlock[2] * detInv; - vecBlock[3] = diagBlock[0] * detInv; + invBlockOutOfPlace(diagBlock, vecBlock); } } - template - __global__ void cuInvertDiagonalAndFlattenBlocksize3( - T* matNonZeroValues, int* rowIndices, int* colIndices, size_t numberOfRows, T* vec) + + template + __global__ void cuComputeLowerSolveLevelSet(T* mat, + int* rowIndices, + int* colIndices, + int* indexConversion, + int startIdx, + int rowsInLevelSet, + const T* dInv, + const T* d, + T* v) { - const auto row = blockDim.x * blockIdx.x + threadIdx.x; - const int blocksize = 3; + const auto reorderedRowIdx = startIdx + (blockDim.x * blockIdx.x + threadIdx.x); + if (reorderedRowIdx < rowsInLevelSet + startIdx) { - if (row < numberOfRows) { - size_t nnzIdx = rowIndices[row]; - size_t nnzIdxLim = rowIndices[row + 1]; + const size_t nnzIdx = rowIndices[reorderedRowIdx]; + const int naturalRowIdx = indexConversion[reorderedRowIdx]; - // this loop will cause some extra checks that we are within the limit in the case of the diagonal having a - // zero element - while (colIndices[nnzIdx] != row && nnzIdx <= nnzIdxLim) { - ++nnzIdx; + T rhs[blocksize]; + for (int i = 0; i < blocksize; i++) { + rhs[i] = d[naturalRowIdx * blocksize + i]; } - // diagBlock points to the start of where the diagonal block is stored - T* diagBlock = &matNonZeroValues[blocksize * blocksize * nnzIdx]; - // vecBlock points to the start of the block element in the vector where the inverse of the diagonal block - // element should be stored - T* vecBlock = &vec[blocksize * blocksize * row]; + for (int block = nnzIdx; colIndices[block] < naturalRowIdx; ++block) { + const int col = colIndices[block]; + mmv(&mat[block * blocksize * blocksize], &v[col * blocksize], rhs); + } - // based on Dune implementation - T t4 = diagBlock[0] * diagBlock[4]; - T t6 = diagBlock[0] * diagBlock[5]; - T t8 = diagBlock[1] * diagBlock[3]; - T t10 = diagBlock[2] * 
diagBlock[3]; - T t12 = diagBlock[1] * diagBlock[6]; - T t14 = diagBlock[2] * diagBlock[6]; + mv(&dInv[reorderedRowIdx * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]); + } + } - T t17 = 1.0 - / (t4 * diagBlock[8] - t6 * diagBlock[7] - t8 * diagBlock[8] + t10 * diagBlock[7] + t12 * diagBlock[5] - - t14 * diagBlock[4]); // t17 is 1/determinant - - vecBlock[0] = (diagBlock[4] * diagBlock[8] - diagBlock[5] * diagBlock[7]) * t17; - vecBlock[1] = -(diagBlock[1] * diagBlock[8] - diagBlock[2] * diagBlock[7]) * t17; - vecBlock[2] = (diagBlock[1] * diagBlock[5] - diagBlock[2] * diagBlock[4]) * t17; - vecBlock[3] = -(diagBlock[3] * diagBlock[8] - diagBlock[5] * diagBlock[6]) * t17; - vecBlock[4] = (diagBlock[0] * diagBlock[8] - t14) * t17; - vecBlock[5] = -(t6 - t10) * t17; - vecBlock[6] = (diagBlock[3] * diagBlock[7] - diagBlock[4] * diagBlock[6]) * t17; - vecBlock[7] = -(diagBlock[0] * diagBlock[7] - t12) * t17; - vecBlock[8] = (t4 - t8) * t17; + template + __global__ void cuComputeUpperSolveLevelSet(T* mat, + int* rowIndices, + int* colIndices, + int* indexConversion, + int startIdx, + int rowsInLevelSet, + const T* dInv, + T* v) + { + const auto reorderedRowIdx = startIdx + (blockDim.x * blockIdx.x + threadIdx.x); + if (reorderedRowIdx < rowsInLevelSet + startIdx) { + const size_t nnzIdxLim = rowIndices[reorderedRowIdx + 1]; + const int naturalRowIdx = indexConversion[reorderedRowIdx]; + + T rhs[blocksize] = {0}; + + for (int block = nnzIdxLim - 1; colIndices[block] > naturalRowIdx; --block) { + const int col = colIndices[block]; + umv(&mat[block * blocksize * blocksize], &v[col * blocksize], rhs); + } + + mmv(&dInv[reorderedRowIdx * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]); + } + } + + template + __global__ void cuComputeDiluDiagonal(T* mat, + int* rowIndices, + int* colIndices, + int* reorderedToNatural, + int* naturalToReordered, + const int startIdx, + int rowsInLevelSet, + T* dInv) + { + const auto reorderedRowIdx = startIdx + blockDim.x * blockIdx.x + threadIdx.x; + if (reorderedRowIdx < rowsInLevelSet + startIdx) { + const int naturalRowIdx = reorderedToNatural[reorderedRowIdx]; + const size_t nnzIdx = rowIndices[reorderedRowIdx]; + + int diagIdx = nnzIdx; + while (colIndices[diagIdx] != naturalRowIdx) { + ++diagIdx; + } + + T dInvTmp[blocksize * blocksize]; + for (int i = 0; i < blocksize; ++i) { + for (int j = 0; j < blocksize; ++j) { + dInvTmp[i * blocksize + j] = mat[diagIdx * blocksize * blocksize + i * blocksize + j]; + } + } + + for (int block = nnzIdx; colIndices[block] < naturalRowIdx; ++block) { + const int col = naturalToReordered[colIndices[block]]; + // find element with indices swapped + // Binary search over block in the right row, [rowIndices[col], rowindices[col+1]-1] defines the range + // we binary search over + int left = rowIndices[col]; + int right = rowIndices[col + 1] - 1; + int mid; + + while (left <= right) { + mid = left + (right - left) / 2; // overflow-safe average + const int col = colIndices[mid]; + + if (col == naturalRowIdx) { + break; + } else if (col < naturalRowIdx) { + left = mid + 1; + } else { + right = mid - 1; + } + } + + const int symOpposite = mid; + + mmx2Subtraction(&mat[block * blocksize * blocksize], + &dInv[col * blocksize * blocksize], + &mat[symOpposite * blocksize * blocksize], + dInvTmp); + } + + invBlockInPlace(dInvTmp); + + for (int i = 0; i < blocksize; ++i) { + for (int j = 0; j < blocksize; ++j) { + dInv[reorderedRowIdx * blocksize * blocksize + i * blocksize + j] = dInvTmp[i * blocksize + j]; + } + } 
+ } + } + + template + __global__ void cuMoveDataToReordered( + T* srcMatrix, int* srcRowIndices, T* dstMatrix, int* dstRowIndices, int* indexConversion, size_t numberOfRows) + { + const auto srcRow = blockDim.x * blockIdx.x + threadIdx.x; + if (srcRow < numberOfRows) { + + const auto dstRow = indexConversion[srcRow]; + + for (int srcBlock = srcRowIndices[srcRow], dstBlock = dstRowIndices[dstRow]; + srcBlock < srcRowIndices[srcRow + 1]; + ++srcBlock, ++dstBlock) { + for (int i = 0; i < blocksize; ++i) { + for (int j = 0; j < blocksize; ++j) { + dstMatrix[dstBlock * blocksize * blocksize + i * blocksize + j] + = srcMatrix[srcBlock * blocksize * blocksize + i * blocksize + j]; + } + } + } } } @@ -143,32 +353,103 @@ namespace } } // namespace -template +template +void +invertDiagonalAndFlatten(T* mat, int* rowIndices, int* colIndices, size_t numberOfRows, T* vec) +{ + if (blocksize <= 3) { + cuInvertDiagonalAndFlatten + <<>>(mat, rowIndices, colIndices, numberOfRows, vec); + } else { + OPM_THROW(std::invalid_argument, "Inverting diagonal is not implemented for blocksizes > 3"); + } +} + +// perform the lower solve for all rows in the same level set +template +void +computeLowerSolveLevelSet(T* reorderedMat, + int* rowIndices, + int* colIndices, + int* indexConversion, + int startIdx, + int rowsInLevelSet, + const T* dInv, + const T* d, + T* v) +{ + cuComputeLowerSolveLevelSet<<>>( + reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, d, v); +} + +// perform the upper solve for all rows in the same level set +template +void +computeUpperSolveLevelSet(T* reorderedMat, + int* rowIndices, + int* colIndices, + int* indexConversion, + int startIdx, + int rowsInLevelSet, + const T* dInv, + T* v) +{ + cuComputeUpperSolveLevelSet<<>>( + reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, v); +} + +template void -invertDiagonalAndFlatten(T* mat, int* rowIndices, int* colIndices, size_t numberOfRows, size_t blocksize, T* vec) +computeDiluDiagonal(T* reorderedMat, + int* rowIndices, + int* colIndices, + int* reorderedToNatural, + int* naturalToReordered, + const int startIdx, + int rowsInLevelSet, + T* dInv) { - switch (blocksize) { - case 1: - cuInvertDiagonalAndFlattenBlocksize1<<>>( - mat, rowIndices, colIndices, numberOfRows, vec); - break; - case 2: - cuInvertDiagonalAndFlattenBlocksize2<<>>( - mat, rowIndices, colIndices, numberOfRows, vec); - break; - case 3: - cuInvertDiagonalAndFlattenBlocksize3<<>>( - mat, rowIndices, colIndices, numberOfRows, vec); - break; - default: - // TODO: Figure out what is why it did not produce an error or any output in the output stream or the DBG file - // when I forced this case to execute + if (blocksize <= 3) { + cuComputeDiluDiagonal + <<>>(reorderedMat, + rowIndices, + colIndices, + reorderedToNatural, + naturalToReordered, + startIdx, + rowsInLevelSet, + dInv); + } else { OPM_THROW(std::invalid_argument, "Inverting diagonal is not implemented for blocksizes > 3"); - break; } } -template void invertDiagonalAndFlatten(double*, int*, int*, size_t, size_t, double*); -template void invertDiagonalAndFlatten(float*, int*, int*, size_t, size_t, float*); +template +void +copyMatDataToReordered( + T* srcMatrix, int* srcRowIndices, T* dstMatrix, int* dstRowIndices, int* naturalToReordered, size_t numberOfRows) +{ + cuMoveDataToReordered<<>>( + srcMatrix, srcRowIndices, dstMatrix, dstRowIndices, naturalToReordered, numberOfRows); +} + +#define INSTANTIATE_KERNEL_WRAPPERS(T, blocksize) \ + template void 
invertDiagonalAndFlatten(T*, int*, int*, size_t, T*); \ + template void copyMatDataToReordered(T*, int*, T*, int*, int*, size_t); \ + template void computeDiluDiagonal(T*, int*, int*, int*, int*, const int, int, T*); \ + template void computeUpperSolveLevelSet(T*, int*, int*, int*, int, int, const T*, T*); \ + template void computeLowerSolveLevelSet(T*, int*, int*, int*, int, int, const T*, const T*, T*); +INSTANTIATE_KERNEL_WRAPPERS(float, 1); +INSTANTIATE_KERNEL_WRAPPERS(float, 2); +INSTANTIATE_KERNEL_WRAPPERS(float, 3); +INSTANTIATE_KERNEL_WRAPPERS(float, 4); +INSTANTIATE_KERNEL_WRAPPERS(float, 5); +INSTANTIATE_KERNEL_WRAPPERS(float, 6); +INSTANTIATE_KERNEL_WRAPPERS(double, 1); +INSTANTIATE_KERNEL_WRAPPERS(double, 2); +INSTANTIATE_KERNEL_WRAPPERS(double, 3); +INSTANTIATE_KERNEL_WRAPPERS(double, 4); +INSTANTIATE_KERNEL_WRAPPERS(double, 5); +INSTANTIATE_KERNEL_WRAPPERS(double, 6); } // namespace Opm::cuistl::detail diff --git a/opm/simulators/linalg/cuistl/detail/cusparse_matrix_operations.hpp b/opm/simulators/linalg/cuistl/detail/cusparse_matrix_operations.hpp index 59fd45d68b5..5edbc938208 100644 --- a/opm/simulators/linalg/cuistl/detail/cusparse_matrix_operations.hpp +++ b/opm/simulators/linalg/cuistl/detail/cusparse_matrix_operations.hpp @@ -29,11 +29,101 @@ namespace Opm::cuistl::detail * @param rowIndices Pointer to vector on GPU containing row indices compliant wiht bsr format * @param colIndices Pointer to vector on GPU containing col indices compliant wiht bsr format * @param numberOfRows Integer describing the number of rows in the matrix - * @param blocksize Integer describing the sidelength of a block in the sparse matrix * @param[out] vec Pointer to the vector where the inverse of the diagonal matrix should be stored */ -template -void invertDiagonalAndFlatten(T* mat, int* rowIndices, int* colIndices, size_t numberOfRows, size_t blocksize, T* vec); +template +void invertDiagonalAndFlatten(T* mat, int* rowIndices, int* colIndices, size_t numberOfRows, T* vec); +/** + * @brief Perform a lower solve on certain rows in a matrix that can safely be computed in parallel + * @param reorderedMat pointer to GPU memory containing the nonzero values of the sparse matrix. The matrix is reordered such + * that rows in the same level set are contiguous + * @param rowIndices Pointer to vector on GPU containing row indices compliant with bsr format + * @param colIndices Pointer to vector on GPU containing col indices compliant with bsr format + * @param indexConversion Integer array mapping an index in the reordered matrix to its corresponding index + * in the naturally ordered matrix + * @param startIdx Index of the first row of the matrix to be solved + * @param rowsInLevelSet Number of rows in this level set, which is the number of rows solved in parallel by this + * function + * @param dInv The diagonal matrix used by the Diagonal ILU preconditioner. Must be reordered in the same way as + * reorderedMat + * @param d Stores the defect + * @param [out] v Will store the results of the lower solve + */ +template +void computeLowerSolveLevelSet(T* reorderedMat, + int* rowIndices, + int* colIndices, + int* indexConversion, + int startIdx, + int rowsInLevelSet, + const T* dInv, + const T* d, + T* v); + +/** + * @brief Perform an upper solve on certain rows in a matrix that can safely be computed in parallel + * @param reorderedMat pointer to GPU memory containing the nonzero values of the sparse matrix. The matrix is reordered such + * that rows in the same level set are contiguous + * @param rowIndices Pointer to vector on GPU containing row indices compliant with bsr format + * @param colIndices Pointer to vector on GPU containing col indices compliant with bsr format + * @param indexConversion Integer array mapping an index in the reordered matrix to its corresponding index + * in the naturally ordered matrix + * @param startIdx Index of the first row of the matrix to be solved + * @param rowsInLevelSet Number of rows in this level set, which is the number of rows solved in parallel by this + * function + * @param dInv The diagonal matrix used by the Diagonal ILU preconditioner + * @param [out] v Will store the results of the upper solve. On entry it should contain the output from the lower + * solve + */ +template +void computeUpperSolveLevelSet(T* reorderedMat, + int* rowIndices, + int* colIndices, + int* indexConversion, + int startIdx, + int rowsInLevelSet, + const T* dInv, + T* v); + +/** + * @brief Computes the inverted diagonal blocks used by the DILU preconditioner for the reordered matrix and stores them in a reordered vector + * containing the diagonal blocks + * @param reorderedMat pointer to GPU memory containing the nonzero values of the sparse matrix. The matrix is reordered such + * that rows in the same level set are contiguous + * @param rowIndices Pointer to vector on GPU containing row indices compliant with bsr format + * @param colIndices Pointer to vector on GPU containing col indices compliant with bsr format + * @param reorderedToNatural Integer array mapping an index in the reordered matrix to its corresponding + * index in the naturally ordered matrix + * @param naturalToReordered Integer array mapping an index in the naturally ordered matrix to its corresponding + * index in the reordered matrix + * @param startIdx Index of the first row of the matrix to be solved + * @param rowsInLevelSet Number of rows in this level set, which is the number of rows solved in parallel by this + * function + * @param [out] dInv The diagonal matrix used by the Diagonal ILU preconditioner + */ +template +void computeDiluDiagonal(T* reorderedMat, + int* rowIndices, + int* colIndices, + int* reorderedToNatural, + int* naturalToReordered, + int startIdx, + int rowsInLevelSet, + T* dInv); + +/** + * @brief Reorders the elements of a matrix by copying them from one matrix to another using a permutation list + * @param srcMatrix The source matrix we will copy data from + * @param srcRowIndices Pointer to vector on GPU containing row indices for the source matrix compliant with bsr format + * @param [out] dstMatrix The destination matrix that we copy data to + * @param dstRowIndices Pointer to vector on GPU containing row indices for the destination matrix compliant with bsr + * format + * @param naturalToReordered Permutation list that converts indices in the src matrix to the indices in the dst matrix + * @param numberOfRows The number of rows in the matrices + */ +template +void copyMatDataToReordered( + T* srcMatrix, int* srcRowIndices, T* dstMatrix, int* dstRowIndices, int* naturalToReordered, size_t numberOfRows); } // namespace Opm::cuistl::detail #endif diff --git a/tests/cuistl/test_cuSparse_matrix_operations.cpp b/tests/cuistl/test_cuSparse_matrix_operations.cpp index eb7fd39178f..d389a298fd0 100644 --- a/tests/cuistl/test_cuSparse_matrix_operations.cpp +++ b/tests/cuistl/test_cuSparse_matrix_operations.cpp @@ -87,12 +87,8 @@ 
BOOST_AUTO_TEST_CASE_TEMPLATE(FlattenAndInvertDiagonalWith3By3Blocks, T, Numeric Opm::cuistl::CuSparseMatrix m = Opm::cuistl::CuSparseMatrix::fromMatrix(B); Opm::cuistl::CuVector dInvDiag(blocksize * blocksize * N); - Opm::cuistl::detail::invertDiagonalAndFlatten(m.getNonZeroValues().data(), - m.getRowIndices().data(), - m.getColumnIndices().data(), - N, - blocksize, - dInvDiag.data()); + Opm::cuistl::detail::invertDiagonalAndFlatten( + m.getNonZeroValues().data(), m.getRowIndices().data(), m.getColumnIndices().data(), N, dInvDiag.data()); std::vector expectedInvDiag {-1.0 / 4.0, 1.0 / 4.0, @@ -165,12 +161,8 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(FlattenAndInvertDiagonalWith2By2Blocks, T, Numeric Opm::cuistl::CuSparseMatrix m = Opm::cuistl::CuSparseMatrix::fromMatrix(B); Opm::cuistl::CuVector dInvDiag(blocksize * blocksize * N); - Opm::cuistl::detail::invertDiagonalAndFlatten(m.getNonZeroValues().data(), - m.getRowIndices().data(), - m.getColumnIndices().data(), - N, - blocksize, - dInvDiag.data()); + Opm::cuistl::detail::invertDiagonalAndFlatten( + m.getNonZeroValues().data(), m.getRowIndices().data(), m.getColumnIndices().data(), N, dInvDiag.data()); std::vector expectedInvDiag {2.0, -2.0, -1.0 / 2.0, 1.0, -1.0, 0.0, 0.0, -1.0}; std::vector computedInvDiag = dInvDiag.asStdVector(); @@ -179,4 +171,4 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(FlattenAndInvertDiagonalWith2By2Blocks, T, Numeric for (size_t i = 0; i < expectedInvDiag.size(); ++i) { BOOST_CHECK_CLOSE(expectedInvDiag[i], computedInvDiag[i], 1e-7); } -} +} \ No newline at end of file
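For reference, the level-set kernels introduced by this patch implement the classical DILU factorization and solve: the preconditioner is M = (E + L_A) E^{-1} (E + U_A), where E is the diagonal computed by computeDiluDiagonal, and one application of M^{-1} is the forward sweep (computeLowerSolveLevelSet) followed by the backward sweep (computeUpperSolveLevelSet). The following sequential, scalar (blocksize 1), dense sketch is for illustration only and is not part of the patch; the names Matrix, diluDiagonal and diluApply are made up, and it ignores the BSR storage, the block inverses and the level-set reordering, but it computes the same quantities as the kernels above.

#include <cstddef>
#include <vector>

using Matrix = std::vector<std::vector<double>>;

// DILU diagonal: dInv[i] = 1 / (A_ii - sum_{j<i} A_ij * dInv[j] * A_ji),
// the scalar analogue of what computeDiluDiagonal stores per block row.
std::vector<double> diluDiagonal(const Matrix& A)
{
    const std::size_t n = A.size();
    std::vector<double> dInv(n);
    for (std::size_t i = 0; i < n; ++i) {
        double e = A[i][i];
        for (std::size_t j = 0; j < i; ++j) {
            e -= A[i][j] * dInv[j] * A[j][i];
        }
        dInv[i] = 1.0 / e;
    }
    return dInv;
}

// One preconditioner application v = M^{-1} d with M = (E + L_A) E^{-1} (E + U_A),
// where E_ii = 1 / dInv[i]; the two loops mirror the lower and upper level-set kernels.
std::vector<double> diluApply(const Matrix& A, const std::vector<double>& dInv, const std::vector<double>& d)
{
    const std::size_t n = A.size();
    std::vector<double> v(n, 0.0);
    // forward sweep, cf. computeLowerSolveLevelSet: solve (E + L_A) y = d
    for (std::size_t i = 0; i < n; ++i) {
        double rhs = d[i];
        for (std::size_t j = 0; j < i; ++j) {
            rhs -= A[i][j] * v[j];
        }
        v[i] = dInv[i] * rhs;
    }
    // backward sweep, cf. computeUpperSolveLevelSet: solve (D + U_A) v = D y
    for (std::size_t i = n; i-- > 0;) {
        double rhs = 0.0;
        for (std::size_t j = i + 1; j < n; ++j) {
            rhs += A[i][j] * v[j];
        }
        v[i] -= dInv[i] * rhs;
    }
    return v;
}

The GPU version performs exactly these sweeps, but processes all rows of one level set in parallel per kernel launch, which is why the rows are reordered so that each level set is contiguous in memory.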