diff --git a/opm/simulators/linalg/ParallelOverlappingILU0_impl.hpp b/opm/simulators/linalg/ParallelOverlappingILU0_impl.hpp index 56de7d08fd5..20c6f866333 100644 --- a/opm/simulators/linalg/ParallelOverlappingILU0_impl.hpp +++ b/opm/simulators/linalg/ParallelOverlappingILU0_impl.hpp @@ -289,7 +289,7 @@ template void ParallelOverlappingILU0:: apply (Domain& v, const Range& d) { - auto start_time = std::chrono::high_resolution_clock::now(); + // auto start_time = std::chrono::high_resolution_clock::now(); OPM_TIMEBLOCK(apply); Range& md = reorderD(d); @@ -347,10 +347,10 @@ apply (Domain& v, const Range& d) reorderBack(mv, v); - auto end_time = std::chrono::high_resolution_clock::now(); - auto duration = std::chrono::duration_cast(end_time - start_time); + // auto end_time = std::chrono::high_resolution_clock::now(); + // auto duration = std::chrono::duration_cast(end_time - start_time); - std::cout << "Apply: " << duration.count() << " microseconds." << std::endl; + // std::cout << "Apply: " << duration.count() << " microseconds." << std::endl; } template @@ -367,7 +367,7 @@ template void ParallelOverlappingILU0:: update() { - auto start_time = std::chrono::high_resolution_clock::now(); + // auto start_time = std::chrono::high_resolution_clock::now(); OPM_TIMEBLOCK(update); // (For older DUNE versions the communicator might be @@ -535,10 +535,10 @@ update() // store ILU in simple CRS format detail::convertToCRS(*ILU_, lower_, upper_, inv_); - auto end_time = std::chrono::high_resolution_clock::now(); - auto duration = std::chrono::duration_cast(end_time - start_time); + // auto end_time = std::chrono::high_resolution_clock::now(); + // auto duration = std::chrono::duration_cast(end_time - start_time); - std::cout << "Update: " << duration.count() << " microseconds." << std::endl; + // std::cout << "Update: " << duration.count() << " microseconds." 
<< std::endl; } template diff --git a/opm/simulators/linalg/cuistl/CuDILU.cpp b/opm/simulators/linalg/cuistl/CuDILU.cpp index 3391bccacb3..365b90e56d1 100644 --- a/opm/simulators/linalg/cuistl/CuDILU.cpp +++ b/opm/simulators/linalg/cuistl/CuDILU.cpp @@ -113,60 +113,60 @@ printDuneBlockVec(V v, int blocksize) // TODO: I think sending size is excessive std::vector -createReorderedToNatural(int size, Opm::SparseTable level_sets) +createReorderedToNatural(int size, Opm::SparseTable levelSets) { auto res = std::vector(size); int globCnt = 0; - for (int i = 0; i < level_sets.size(); i++) { - for (size_t j = 0; j < level_sets[i].size(); j++) { - res[globCnt++] = (int)level_sets[i][j]; + for (int i = 0; i < levelSets.size(); i++) { + for (size_t j = 0; j < levelSets[i].size(); j++) { + res[globCnt++] = (int)levelSets[i][j]; } } return res; } std::vector -createNaturalToReordered(int size, Opm::SparseTable level_sets) +createNaturalToReordered(int size, Opm::SparseTable levelSets) { auto res = std::vector(size); int globCnt = 0; - for (int i = 0; i < level_sets.size(); i++) { - for (size_t j = 0; j < level_sets[i].size(); j++) { - res[level_sets[i][j]] = globCnt++; + for (int i = 0; i < levelSets.size(); i++) { + for (size_t j = 0; j < levelSets[i].size(); j++) { + res[levelSets[i][j]] = globCnt++; } } return res; } +// TODO: When this function is called we already have the natural ordered matrix on the GPU +// TODO: could it be possible to create the reordered one in a kernel to speed up the constructor? 
template Opm::cuistl::CuSparseMatrix -createReorderedMatrix(M naturalMatrix, std::vector reordered_to_natural) +createReorderedMatrix(M naturalMatrix, std::vector reorderedToNatural) { M reorderedMatrix(naturalMatrix.N(), naturalMatrix.N(), naturalMatrix.nonzeroes(), M::row_wise); - for (auto dst_row_it = reorderedMatrix.createbegin(); dst_row_it != reorderedMatrix.createend(); ++dst_row_it) { - auto src_row = naturalMatrix.begin() + reordered_to_natural[dst_row_it.index()]; + for (auto dstRowIt = reorderedMatrix.createbegin(); dstRowIt != reorderedMatrix.createend(); ++dstRowIt) { + auto srcRow = naturalMatrix.begin() + reorderedToNatural[dstRowIt.index()]; // For elements in A - for (auto elem = src_row->begin(); elem != src_row->end(); elem++) { - dst_row_it.insert(elem.index()); + for (auto elem = srcRow->begin(); elem != srcRow->end(); elem++) { + dstRowIt.insert(elem.index()); } } - for (auto dst_row_it = reorderedMatrix.begin(); dst_row_it != reorderedMatrix.end(); ++dst_row_it) { - auto src_row = naturalMatrix.begin() + reordered_to_natural[dst_row_it.index()]; - for (auto elem = src_row->begin(); elem != src_row->end(); elem++) { - reorderedMatrix[dst_row_it.index()][elem.index()] = *elem; + // TODO: There is probably a faster way to copy by copying whole rows at a time + for (auto dstRowIt = reorderedMatrix.begin(); dstRowIt != reorderedMatrix.end(); ++dstRowIt) { + auto srcRow = naturalMatrix.begin() + reorderedToNatural[dstRowIt.index()]; + for (auto elem = srcRow->begin(); elem != srcRow->end(); elem++) { + reorderedMatrix[dstRowIt.index()][elem.index()] = *elem; } } return Opm::cuistl::CuSparseMatrix::fromMatrix(reorderedMatrix, true); } - -// TODO change variables from snake_case to camelCase namespace Opm::cuistl { -// TODO: optimize construction by omitting unused variable [m_cpuMatrixReordered] template CuDILU::CuDILU(const M& A, field_type w) : m_cpuMatrix(A) diff --git a/opm/simulators/linalg/cuistl/CuJac.cpp 
b/opm/simulators/linalg/cuistl/CuJac.cpp index 415dd6e9e0b..7c21925c1ca 100644 --- a/opm/simulators/linalg/cuistl/CuJac.cpp +++ b/opm/simulators/linalg/cuistl/CuJac.cpp @@ -74,7 +74,6 @@ CuJac::CuJac(const M& A, field_type w) m_gpuMatrix.N(), m_gpuMatrix.blockSize(), m_diagInvFlattened.data()); - printf("gpu code should be run"); } template diff --git a/opm/simulators/linalg/cuistl/detail/cusparse_matrix_operations.hpp b/opm/simulators/linalg/cuistl/detail/cusparse_matrix_operations.hpp index 833f8786a6f..79aebe81394 100644 --- a/opm/simulators/linalg/cuistl/detail/cusparse_matrix_operations.hpp +++ b/opm/simulators/linalg/cuistl/detail/cusparse_matrix_operations.hpp @@ -35,10 +35,21 @@ namespace Opm::cuistl::detail template void invertDiagonalAndFlatten(T* mat, int* rowIndices, int* colIndices, size_t numberOfRows, size_t blocksize, T* vec); -// TODO: document this version when it is stable +/** + * @brief Perform a lower solve on certain rows in a matrix that can safely be computed in parallel + * @param reorderedMat pointer to GPU memory containing nonzero values of the sparse matrix.
The matrix reordered such that rows in the same level sets are contiguous + * @param rowIndices Pointer to vector on GPU containing row indices compliant with bsr format + * @param colIndices Pointer to vector on GPU containing col indices compliant with bsr format + * @param numberOfRows Integer describing the number of rows in the matrix + * @param indexConversion Integer array mapping an index in the reordered matrix to its corresponding index in the naturally ordered matrix + * @param startIdx Index of the first row of the matrix to be solved + * @param rowsInLevelSet Number of rows in this level set, which equals the number of rows solved in parallel by this function + * @param dInv The diagonal matrix used by the Diagonal ILU preconditioner + * @param d Stores the defect + * @param [out] v Will store the results of the lower solve + */ template -void computeLowerSolveLevelSet(T* mat, int* rowIndices, int* colIndices, size_t numberOfRows, int* indexConversion, const int startIdx, int rowsInLevelSet, T* dInv, const T* d, T* v); - +void computeLowerSolveLevelSet(T* reorderedMat, int* rowIndices, int* colIndices, size_t numberOfRows, int* indexConversion, const int startIdx, int rowsInLevelSet, T* dInv, const T* d, T* v); // TODO: document this version when it is stable template