diff --git a/opm/simulators/linalg/ParallelOverlappingILU0_impl.hpp b/opm/simulators/linalg/ParallelOverlappingILU0_impl.hpp index 56de7d08fd5..20c6f866333 100644 --- a/opm/simulators/linalg/ParallelOverlappingILU0_impl.hpp +++ b/opm/simulators/linalg/ParallelOverlappingILU0_impl.hpp @@ -289,7 +289,7 @@ template void ParallelOverlappingILU0:: apply (Domain& v, const Range& d) { - auto start_time = std::chrono::high_resolution_clock::now(); + // auto start_time = std::chrono::high_resolution_clock::now(); OPM_TIMEBLOCK(apply); Range& md = reorderD(d); @@ -347,10 +347,10 @@ apply (Domain& v, const Range& d) reorderBack(mv, v); - auto end_time = std::chrono::high_resolution_clock::now(); - auto duration = std::chrono::duration_cast(end_time - start_time); + // auto end_time = std::chrono::high_resolution_clock::now(); + // auto duration = std::chrono::duration_cast(end_time - start_time); - std::cout << "Apply: " << duration.count() << " microseconds." << std::endl; + // std::cout << "Apply: " << duration.count() << " microseconds." << std::endl; } template @@ -367,7 +367,7 @@ template void ParallelOverlappingILU0:: update() { - auto start_time = std::chrono::high_resolution_clock::now(); + // auto start_time = std::chrono::high_resolution_clock::now(); OPM_TIMEBLOCK(update); // (For older DUNE versions the communicator might be @@ -535,10 +535,10 @@ update() // store ILU in simple CRS format detail::convertToCRS(*ILU_, lower_, upper_, inv_); - auto end_time = std::chrono::high_resolution_clock::now(); - auto duration = std::chrono::duration_cast(end_time - start_time); + // auto end_time = std::chrono::high_resolution_clock::now(); + // auto duration = std::chrono::duration_cast(end_time - start_time); - std::cout << "Update: " << duration.count() << " microseconds." << std::endl; + // std::cout << "Update: " << duration.count() << " microseconds." 
<< std::endl; } template diff --git a/opm/simulators/linalg/cuistl/CuDILU.cpp b/opm/simulators/linalg/cuistl/CuDILU.cpp index 3391bccacb3..365b90e56d1 100644 --- a/opm/simulators/linalg/cuistl/CuDILU.cpp +++ b/opm/simulators/linalg/cuistl/CuDILU.cpp @@ -113,60 +113,60 @@ printDuneBlockVec(V v, int blocksize) // TODO: I think sending size is excessive std::vector -createReorderedToNatural(int size, Opm::SparseTable level_sets) +createReorderedToNatural(int size, Opm::SparseTable levelSets) { auto res = std::vector(size); int globCnt = 0; - for (int i = 0; i < level_sets.size(); i++) { - for (size_t j = 0; j < level_sets[i].size(); j++) { - res[globCnt++] = (int)level_sets[i][j]; + for (int i = 0; i < levelSets.size(); i++) { + for (size_t j = 0; j < levelSets[i].size(); j++) { + res[globCnt++] = (int)levelSets[i][j]; } } return res; } std::vector -createNaturalToReordered(int size, Opm::SparseTable level_sets) +createNaturalToReordered(int size, Opm::SparseTable levelSets) { auto res = std::vector(size); int globCnt = 0; - for (int i = 0; i < level_sets.size(); i++) { - for (size_t j = 0; j < level_sets[i].size(); j++) { - res[level_sets[i][j]] = globCnt++; + for (int i = 0; i < levelSets.size(); i++) { + for (size_t j = 0; j < levelSets[i].size(); j++) { + res[levelSets[i][j]] = globCnt++; } } return res; } +// TODO: When this function is called we already have the natural ordered matrix on the GPU +// TODO: could it be possible to create the reordered one in a kernel to speed up the constructor? 
template Opm::cuistl::CuSparseMatrix -createReorderedMatrix(M naturalMatrix, std::vector reordered_to_natural) +createReorderedMatrix(M naturalMatrix, std::vector reorderedToNatural) { M reorderedMatrix(naturalMatrix.N(), naturalMatrix.N(), naturalMatrix.nonzeroes(), M::row_wise); - for (auto dst_row_it = reorderedMatrix.createbegin(); dst_row_it != reorderedMatrix.createend(); ++dst_row_it) { - auto src_row = naturalMatrix.begin() + reordered_to_natural[dst_row_it.index()]; + for (auto dstRowIt = reorderedMatrix.createbegin(); dstRowIt != reorderedMatrix.createend(); ++dstRowIt) { + auto srcRow = naturalMatrix.begin() + reorderedToNatural[dstRowIt.index()]; // For elements in A - for (auto elem = src_row->begin(); elem != src_row->end(); elem++) { - dst_row_it.insert(elem.index()); + for (auto elem = srcRow->begin(); elem != srcRow->end(); elem++) { + dstRowIt.insert(elem.index()); } } - for (auto dst_row_it = reorderedMatrix.begin(); dst_row_it != reorderedMatrix.end(); ++dst_row_it) { - auto src_row = naturalMatrix.begin() + reordered_to_natural[dst_row_it.index()]; - for (auto elem = src_row->begin(); elem != src_row->end(); elem++) { - reorderedMatrix[dst_row_it.index()][elem.index()] = *elem; + // TODO: There is probably a faster way to copy by copying whole rows at a time + for (auto dstRowIt = reorderedMatrix.begin(); dstRowIt != reorderedMatrix.end(); ++dstRowIt) { + auto srcRow = naturalMatrix.begin() + reorderedToNatural[dstRowIt.index()]; + for (auto elem = srcRow->begin(); elem != srcRow->end(); elem++) { + reorderedMatrix[dstRowIt.index()][elem.index()] = *elem; } } return Opm::cuistl::CuSparseMatrix::fromMatrix(reorderedMatrix, true); } - -// TODO change variables from snake_case to camelCase namespace Opm::cuistl { -// TODO: optimize construction by omitting unused variable [m_cpuMatrixReordered] template CuDILU::CuDILU(const M& A, field_type w) : m_cpuMatrix(A) diff --git a/opm/simulators/linalg/cuistl/CuJac.cpp 
b/opm/simulators/linalg/cuistl/CuJac.cpp index 415dd6e9e0b..7c21925c1ca 100644 --- a/opm/simulators/linalg/cuistl/CuJac.cpp +++ b/opm/simulators/linalg/cuistl/CuJac.cpp @@ -74,7 +74,6 @@ CuJac::CuJac(const M& A, field_type w) m_gpuMatrix.N(), m_gpuMatrix.blockSize(), m_diagInvFlattened.data()); - printf("gpu code should be run"); } template diff --git a/opm/simulators/linalg/cuistl/detail/cusparse_matrix_operations.hpp b/opm/simulators/linalg/cuistl/detail/cusparse_matrix_operations.hpp index 833f8786a6f..79aebe81394 100644 --- a/opm/simulators/linalg/cuistl/detail/cusparse_matrix_operations.hpp +++ b/opm/simulators/linalg/cuistl/detail/cusparse_matrix_operations.hpp @@ -35,10 +35,21 @@ namespace Opm::cuistl::detail template void invertDiagonalAndFlatten(T* mat, int* rowIndices, int* colIndices, size_t numberOfRows, size_t blocksize, T* vec); -// TODO: document this version when it is stable +/** + * @brief Perform a lower solve on certain rows in a matrix that can safely be computed in parallel + * @param reorderedMat pointer to GPU memory containing nonzero values of the sparse matrix.
The matrix reordered such that rows in the same level sets are contiguous + * @param rowIndices Pointer to vector on GPU containing row indices compliant with bsr format + * @param colIndices Pointer to vector on GPU containing col indices compliant with bsr format + * @param numberOfRows Integer describing the number of rows in the matrix + * @param indexConversion Integer array mapping an index in the reordered matrix to its corresponding index in the naturally ordered matrix + * @param startIdx Index of the first row of the matrix to be solved + * @param rowsInLevelSet Number of rows in this level set, which equals the number of rows solved in parallel by this function + * @param dInv The diagonal matrix used by the Diagonal ILU preconditioner + * @param d Stores the defect + * @param [out] v Will store the results of the lower solve + */ template -void computeLowerSolveLevelSet(T* mat, int* rowIndices, int* colIndices, size_t numberOfRows, int* indexConversion, const int startIdx, int rowsInLevelSet, T* dInv, const T* d, T* v); - +void computeLowerSolveLevelSet(T* reorderedMat, int* rowIndices, int* colIndices, size_t numberOfRows, int* indexConversion, const int startIdx, int rowsInLevelSet, T* dInv, const T* d, T* v); // TODO: document this version when it is stable template