diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt index e84705672b8..0630f91e849 100644 --- a/common/CMakeLists.txt +++ b/common/CMakeLists.txt @@ -20,7 +20,9 @@ set(UNIFIED_SOURCES solver/bicgstab_kernels.cpp solver/cg_kernels.cpp solver/cgs_kernels.cpp + solver/common_gmres_kernels.cpp solver/fcg_kernels.cpp + solver/gmres_kernels.cpp solver/ir_kernels.cpp ) list(TRANSFORM UNIFIED_SOURCES PREPEND ${CMAKE_CURRENT_SOURCE_DIR}/unified/) diff --git a/common/cuda_hip/solver/cb_gmres_kernels.hpp.inc b/common/cuda_hip/solver/cb_gmres_kernels.hpp.inc index a12ea9157b0..9626731a7ba 100644 --- a/common/cuda_hip/solver/cb_gmres_kernels.hpp.inc +++ b/common/cuda_hip/solver/cb_gmres_kernels.hpp.inc @@ -50,7 +50,7 @@ __global__ __launch_bounds__(default_block_size) void zero_matrix_kernel( // Must be called with at least `num_rows * stride_krylov` threads in total. template -__global__ __launch_bounds__(block_size) void initialize_2_1_kernel( +__global__ __launch_bounds__(block_size) void restart_1_kernel( size_type num_rows, size_type num_rhs, size_type krylov_dim, Accessor3d krylov_bases, ValueType* __restrict__ residual_norm_collection, size_type stride_residual_nc) @@ -82,7 +82,7 @@ __global__ __launch_bounds__(block_size) void initialize_2_1_kernel( // Must be called with at least `num_rows * num_rhs` threads in total. template -__global__ __launch_bounds__(block_size) void initialize_2_2_kernel( +__global__ __launch_bounds__(block_size) void restart_2_kernel( size_type num_rows, size_type num_rhs, const ValueType* __restrict__ residual, size_type stride_residual, const remove_complex* __restrict__ residual_norm, diff --git a/common/cuda_hip/solver/common_gmres_kernels.hpp.inc b/common/cuda_hip/solver/common_gmres_kernels.hpp.inc index 3be1c712cd0..74ccb634e2d 100644 --- a/common/cuda_hip/solver/common_gmres_kernels.hpp.inc +++ b/common/cuda_hip/solver/common_gmres_kernels.hpp.inc @@ -33,7 +33,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // Must be called with at least `max(stride_b * num_rows, krylov_dim * // num_cols)` threads in total. template -__global__ __launch_bounds__(block_size) void initialize_1_kernel( +__global__ __launch_bounds__(block_size) void initialize_kernel( size_type num_rows, size_type num_cols, size_type krylov_dim, const ValueType* __restrict__ b, size_type stride_b, ValueType* __restrict__ residual, size_type stride_residual, diff --git a/common/cuda_hip/solver/gmres_kernels.hpp.inc b/common/cuda_hip/solver/gmres_kernels.hpp.inc deleted file mode 100644 index 7691bbc5885..00000000000 --- a/common/cuda_hip/solver/gmres_kernels.hpp.inc +++ /dev/null @@ -1,240 +0,0 @@ -/************************************************************* -Copyright (c) 2017-2022, the Ginkgo authors -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions -are met: - -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. - -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in the -documentation and/or other materials provided with the distribution. - -3. Neither the name of the copyright holder nor the names of its -contributors may be used to endorse or promote products derived from -this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS -IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED -TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A -PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*************************************************************/ - -#include "common_gmres_kernels.hpp.inc" - - -// Must be called with at least `num_rows * num_rhs` threads in total. -template -__global__ __launch_bounds__(block_size) void initialize_2_2_kernel( - size_type num_rows, size_type num_rhs, - const ValueType* __restrict__ residual, size_type stride_residual, - const remove_complex* __restrict__ residual_norm, - ValueType* __restrict__ residual_norm_collection, - ValueType* __restrict__ krylov_bases, size_type stride_krylov, - size_type* __restrict__ final_iter_nums) -{ - const auto global_id = thread::get_thread_id_flat(); - const auto row_idx = global_id / num_rhs; - const auto col_idx = global_id % num_rhs; - - if (global_id < num_rhs) { - residual_norm_collection[global_id] = residual_norm[global_id]; - final_iter_nums[global_id] = 0; - } - - if (row_idx < num_rows && col_idx < num_rhs) { - auto value = residual[row_idx * stride_residual + col_idx] / - residual_norm[col_idx]; - krylov_bases[row_idx * stride_krylov + col_idx] = value; - } -} - - -__global__ - __launch_bounds__(default_block_size) void increase_final_iteration_numbers_kernel( - size_type* __restrict__ final_iter_nums, - const stopping_status* __restrict__ stop_status, size_type total_number) -{ - const auto global_id = thread::get_thread_id_flat(); - if (global_id < total_number) { - final_iter_nums[global_id] += !stop_status[global_id].has_stopped(); - } -} - - -template -__global__ __launch_bounds__(default_dot_size) void multidot_kernel( - size_type k, size_type num_rows, size_type num_cols, - const ValueType* __restrict__ krylov_bases, - const ValueType* __restrict__ next_krylov_basis, size_type stride_krylov, - ValueType* __restrict__ hessenberg_iter, size_type stride_hessenberg, - const stopping_status* __restrict__ stop_status) -{ - const auto tidx = threadIdx.x; - const auto tidy = threadIdx.y; - const auto col_idx = blockIdx.x * default_dot_dim + tidx; - const auto num = ceildiv(num_rows, gridDim.y); - const auto start_row = blockIdx.y * num; - const auto end_row = - ((blockIdx.y + 1) * num > num_rows) ? num_rows : (blockIdx.y + 1) * num; - // Used that way to get around dynamic initialization warning and - // template error when using `reduction_helper_array` directly in `reduce` - __shared__ - uninitialized_array - reduction_helper_array; - ValueType* __restrict__ reduction_helper = reduction_helper_array; - - ValueType local_res = zero(); - if (col_idx < num_cols && !stop_status[col_idx].has_stopped()) { - for (size_type i = start_row + tidy; i < end_row; - i += default_dot_dim) { - const auto krylov_idx = i * stride_krylov + col_idx; - local_res += - conj(krylov_bases[krylov_idx]) * next_krylov_basis[krylov_idx]; - } - } - reduction_helper[tidx * (default_dot_dim + 1) + tidy] = local_res; - __syncthreads(); - local_res = reduction_helper[tidy * (default_dot_dim + 1) + tidx]; - const auto tile_block = - group::tiled_partition(group::this_thread_block()); - const auto sum = - reduce(tile_block, local_res, - [](const ValueType& a, const ValueType& b) { return a + b; }); - const auto new_col_idx = blockIdx.x * default_dot_dim + tidy; - if (tidx == 0 && new_col_idx < num_cols && - !stop_status[new_col_idx].has_stopped()) { - const auto hessenberg_idx = k * stride_hessenberg + new_col_idx; - atomic_add(hessenberg_iter + hessenberg_idx, sum); - } -} - - -// Must be called with at least `num_rows * stride_next_krylov` threads in -// total. -template -__global__ __launch_bounds__(block_size) void update_next_krylov_kernel( - size_type k, size_type num_rows, size_type num_cols, - const ValueType* __restrict__ krylov_bases, - ValueType* __restrict__ next_krylov_basis, size_type stride_krylov, - const ValueType* __restrict__ hessenberg_iter, size_type stride_hessenberg, - const stopping_status* __restrict__ stop_status) -{ - const auto global_id = thread::get_thread_id_flat(); - const auto row_idx = global_id / stride_krylov; - const auto col_idx = global_id % stride_krylov; - - if (row_idx < num_rows && col_idx < num_cols && - !stop_status[col_idx].has_stopped()) { - const auto next_krylov_idx = row_idx * stride_krylov + col_idx; - const auto krylov_idx = row_idx * stride_krylov + col_idx; - const auto hessenberg_idx = k * stride_hessenberg + col_idx; - - next_krylov_basis[next_krylov_idx] -= - hessenberg_iter[hessenberg_idx] * krylov_bases[krylov_idx]; - } -} - - -// Must be called with at least `num_cols` blocks, each with `block_size` -// threads. `block_size` must be a power of 2. -template -__global__ __launch_bounds__(block_size) void update_hessenberg_2_kernel( - size_type iter, size_type num_rows, size_type num_cols, - const ValueType* __restrict__ next_krylov_basis, - size_type stride_next_krylov, ValueType* __restrict__ hessenberg_iter, - size_type stride_hessenberg, - const stopping_status* __restrict__ stop_status) -{ - const auto tidx = threadIdx.x; - const auto col_idx = blockIdx.x; - - // Used that way to get around dynamic initialization warning and - // template error when using `reduction_helper_array` directly in `reduce` - __shared__ uninitialized_array - reduction_helper_array; - ValueType* __restrict__ reduction_helper = reduction_helper_array; - - if (col_idx < num_cols && !stop_status[col_idx].has_stopped()) { - ValueType local_res{}; - for (size_type i = tidx; i < num_rows; i += block_size) { - const auto next_krylov_idx = i * stride_next_krylov + col_idx; - const auto next_krylov_value = next_krylov_basis[next_krylov_idx]; - - local_res += next_krylov_value * next_krylov_value; - } - - reduction_helper[tidx] = local_res; - - // Perform thread block reduction. Result is in reduction_helper[0] - reduce(group::this_thread_block(), reduction_helper, - [](const ValueType& a, const ValueType& b) { return a + b; }); - - if (tidx == 0) { - hessenberg_iter[(iter + 1) * stride_hessenberg + col_idx] = - sqrt(reduction_helper[0]); - } - } -} - - -// Must be called with at least `num_rows * stride_krylov` threads in -// total. -template -__global__ __launch_bounds__(block_size) void update_krylov_kernel( - size_type iter, size_type num_rows, size_type num_cols, - ValueType* __restrict__ krylov_bases, size_type stride_krylov, - const ValueType* __restrict__ hessenberg_iter, size_type stride_hessenberg, - const stopping_status* __restrict__ stop_status) -{ - const auto global_id = thread::get_thread_id_flat(); - const auto row_idx = global_id / stride_krylov; - const auto col_idx = global_id % stride_krylov; - const auto hessenberg = - hessenberg_iter[(iter + 1) * stride_hessenberg + col_idx]; - - if (row_idx < num_rows && col_idx < num_cols && - !stop_status[col_idx].has_stopped()) { - const auto krylov_idx = row_idx * stride_krylov + col_idx; - - krylov_bases[krylov_idx] /= hessenberg; - } -} - - -// Must be called with at least `stride_preconditioner * num_rows` threads in -// total. -template -__global__ __launch_bounds__(block_size) void calculate_Qy_kernel( - size_type num_rows, size_type num_cols, size_type num_rhs, - const ValueType* __restrict__ krylov_bases, size_type stride_krylov, - const ValueType* __restrict__ y, size_type stride_y, - ValueType* __restrict__ before_preconditioner, - size_type stride_preconditioner, - const size_type* __restrict__ final_iter_nums) -{ - const auto global_id = thread::get_thread_id_flat(); - const auto row_id = global_id / stride_preconditioner; - const auto col_id = global_id % stride_preconditioner; - - if (row_id < num_rows && col_id < num_cols) { - ValueType temp = zero(); - - for (size_type j = 0; j < final_iter_nums[col_id]; ++j) { - temp += - krylov_bases[(row_id + j * num_rows) * stride_krylov + col_id] * - y[j * stride_y + col_id]; - } - before_preconditioner[global_id] = temp; - } -} diff --git a/common/unified/solver/common_gmres_kernels.cpp b/common/unified/solver/common_gmres_kernels.cpp new file mode 100644 index 00000000000..13b8ff96919 --- /dev/null +++ b/common/unified/solver/common_gmres_kernels.cpp @@ -0,0 +1,197 @@ +/************************************************************* +Copyright (c) 2017-2022, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include "core/solver/common_gmres_kernels.hpp" + + +#include + + +#include "common/unified/base/kernel_launch.hpp" +#include "core/solver/cb_gmres_kernels.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +/** + * @brief The common GMRES solver namespace. + * + * @ingroup gmres + */ +namespace common_gmres { + + +template +void initialize(std::shared_ptr exec, + const matrix::Dense* b, + matrix::Dense* residual, + matrix::Dense* givens_sin, + matrix::Dense* givens_cos, + stopping_status* stop_status) +{ + const auto krylov_dim = givens_sin->get_size()[0]; + run_kernel( + exec, + [] GKO_KERNEL(auto i, auto j, auto b, auto residual, auto givens_sin, + auto givens_cos, auto stop_status, auto krylov_dim, + auto num_rows) { + using value_type = std::decay_t; + if (i == 0) { + stop_status[j].reset(); + } + if (i < num_rows) { + residual(i, j) = b(i, j); + } + if (i < krylov_dim) { + givens_sin(i, j) = zero(); + givens_cos(i, j) = zero(); + } + }, + dim<2>{std::max(b->get_size()[0], krylov_dim), b->get_size()[1]}, b, + residual, givens_sin, givens_cos, stop_status, krylov_dim, + b->get_size()[0]); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_COMMON_GMRES_INITIALIZE_KERNEL); + + +template +void hessenberg_qr(std::shared_ptr exec, + matrix::Dense* givens_sin, + matrix::Dense* givens_cos, + matrix::Dense>* residual_norm, + matrix::Dense* residual_norm_collection, + matrix::Dense* hessenberg_iter, size_type iter, + size_type* final_iter_nums, + const stopping_status* stop_status) +{ + run_kernel( + exec, + [] GKO_KERNEL(auto rhs, auto givens_sin, auto givens_cos, + auto residual_norm, auto residual_norm_collection, + auto hessenberg_iter, auto iter, auto final_iter_nums, + auto stop_status) { + using value_type = std::decay_t; + if (stop_status[rhs].has_stopped()) { + return; + } + // increment iteration count + final_iter_nums[rhs]++; + auto hess_this = hessenberg_iter(0, rhs); + auto hess_next = hessenberg_iter(1, rhs); + // apply previous Givens rotations to column + for (decltype(iter) j = 0; j < iter; ++j) { + // in here: hess_this = hessenberg_iter(j, rhs); + // hess_next = hessenberg_iter(j+1, rhs); + hess_next = hessenberg_iter(j + 1, rhs); + const auto gc = givens_cos(j, rhs); + const auto gs = givens_sin(j, rhs); + const auto out1 = gc * hess_this + gs * hess_next; + const auto out2 = -conj(gs) * hess_this + conj(gc) * hess_next; + hessenberg_iter(j, rhs) = out1; + hessenberg_iter(j + 1, rhs) = hess_this = out2; + hess_next = hessenberg_iter(j + 2, rhs); + } + // hess_this is hessenberg_iter(iter, rhs) and + // hess_next is hessenberg_iter(iter + 1, rhs) + auto gs = givens_sin(iter, rhs); + auto gc = givens_cos(iter, rhs); + // compute new Givens rotation + if (hess_this == zero()) { + givens_cos(iter, rhs) = gc = zero(); + givens_sin(iter, rhs) = gs = one(); + } else { + const auto scale = abs(hess_this) + abs(hess_next); + const auto hypotenuse = + scale * + sqrt(abs(hess_this / scale) * abs(hess_this / scale) + + abs(hess_next / scale) * abs(hess_next / scale)); + givens_cos(iter, rhs) = gc = conj(hess_this) / hypotenuse; + givens_sin(iter, rhs) = gs = conj(hess_next) / hypotenuse; + } + // apply new Givens rotation to column + hessenberg_iter(iter, rhs) = gc * hess_this + gs * hess_next; + hessenberg_iter(iter + 1, rhs) = zero(); + // apply new Givens rotation to RHS of least-squares problem + const auto rnc_new = + -conj(gs) * residual_norm_collection(iter, rhs); + residual_norm_collection(iter + 1, rhs) = rnc_new; + residual_norm_collection(iter, rhs) = + gc * residual_norm_collection(iter, rhs); + residual_norm(0, rhs) = abs(rnc_new); + }, + hessenberg_iter->get_size()[1], givens_sin, givens_cos, residual_norm, + residual_norm_collection, hessenberg_iter, iter, final_iter_nums, + stop_status); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_COMMON_GMRES_HESSENBERG_QR_KERNEL); + + +template +void solve_krylov(std::shared_ptr exec, + const matrix::Dense* residual_norm_collection, + const matrix::Dense* hessenberg, + matrix::Dense* y, const size_type* final_iter_nums, + const stopping_status* stop_status) +{ + run_kernel( + exec, + [] GKO_KERNEL(auto col, auto rhs, auto mtx, auto y, auto sizes, + auto stop, auto num_cols) { + if (stop[col].is_finalized()) { + return; + } + for (int64 i = sizes[col] - 1; i >= 0; i--) { + auto value = rhs(i, col); + for (int64 j = i + 1; j < sizes[col]; j++) { + value -= mtx(i, j * num_cols + col) * y(j, col); + } + // y(i) = (rhs(i) - U(i,i+1:) * y(i+1:)) / U(i, i) + y(i, col) = value / mtx(i, i * num_cols + col); + } + }, + residual_norm_collection->get_size()[1], residual_norm_collection, + hessenberg, y, final_iter_nums, stop_status, + residual_norm_collection->get_size()[1]); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_COMMON_GMRES_SOLVE_KRYLOV_KERNEL); + + +} // namespace common_gmres +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko diff --git a/common/unified/solver/gmres_kernels.cpp b/common/unified/solver/gmres_kernels.cpp new file mode 100644 index 00000000000..aaf8d9140dd --- /dev/null +++ b/common/unified/solver/gmres_kernels.cpp @@ -0,0 +1,130 @@ +/************************************************************* +Copyright (c) 2017-2022, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include "core/solver/gmres_kernels.hpp" + + +#include +#include + + +#include "common/unified/base/kernel_launch.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +/** + * @brief The GMRES solver namespace. + * + * @ingroup gmres + */ +namespace gmres { + + +template +void restart(std::shared_ptr exec, + const matrix::Dense* residual, + const matrix::Dense>* residual_norm, + matrix::Dense* residual_norm_collection, + matrix::Dense* krylov_bases, size_type* final_iter_nums) +{ + if (residual->get_size()[0] == 0) { + run_kernel( + exec, + [] GKO_KERNEL(auto j, auto residual_norm, + auto residual_norm_collection, auto final_iter_nums) { + residual_norm_collection(0, j) = residual_norm(0, j); + final_iter_nums[j] = 0; + }, + residual->get_size()[1], residual_norm, residual_norm_collection, + final_iter_nums); + } else { + run_kernel( + exec, + [] GKO_KERNEL(auto i, auto j, auto residual, auto residual_norm, + auto residual_norm_collection, auto krylov_bases, + auto final_iter_nums) { + if (i == 0) { + residual_norm_collection(0, j) = residual_norm(0, j); + final_iter_nums[j] = 0; + } + krylov_bases(i, j) = residual(i, j) / residual_norm(0, j); + }, + residual->get_size(), residual, residual_norm, + residual_norm_collection, krylov_bases, final_iter_nums); + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GMRES_RESTART_KERNEL); + + +template +void multi_axpy(std::shared_ptr exec, + const matrix::Dense* krylov_bases, + const matrix::Dense* y, + matrix::Dense* before_preconditioner, + const size_type* final_iter_nums, stopping_status* stop_status) +{ + run_kernel( + exec, + [] GKO_KERNEL(auto row, auto col, auto bases, auto y, auto out, + auto sizes, auto stop, auto num_rows) { + if (stop[col].is_finalized()) { + return; + } + auto value = zero(out(row, col)); + for (int i = 0; i < sizes[col]; i++) { + value += bases(row + i * num_rows, col) * y(i, col); + } + out(row, col) = value; + }, + before_preconditioner->get_size(), krylov_bases, y, + before_preconditioner, final_iter_nums, stop_status, + before_preconditioner->get_size()[0]); + run_kernel( + exec, + [] GKO_KERNEL(auto col, auto stop) { + if (!stop[col].is_finalized()) { + stop[col].finalize(); + } + }, + before_preconditioner->get_size()[1], stop_status); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GMRES_MULTI_AXPY_KERNEL); + + +} // namespace gmres +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko diff --git a/core/device_hooks/common_kernels.inc.cpp b/core/device_hooks/common_kernels.inc.cpp index fdaf02b050d..273ac44be5a 100644 --- a/core/device_hooks/common_kernels.inc.cpp +++ b/core/device_hooks/common_kernels.inc.cpp @@ -74,6 +74,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/solver/cb_gmres_kernels.hpp" #include "core/solver/cg_kernels.hpp" #include "core/solver/cgs_kernels.hpp" +#include "core/solver/common_gmres_kernels.hpp" #include "core/solver/fcg_kernels.hpp" #include "core/solver/gmres_kernels.hpp" #include "core/solver/idr_kernels.hpp" @@ -437,13 +438,22 @@ GKO_STUB_VALUE_TYPE(GKO_DECLARE_CGS_STEP_3_KERNEL); } // namespace cgs +namespace common_gmres { + + +GKO_STUB_VALUE_TYPE(GKO_DECLARE_COMMON_GMRES_INITIALIZE_KERNEL); +GKO_STUB_VALUE_TYPE(GKO_DECLARE_COMMON_GMRES_HESSENBERG_QR_KERNEL); +GKO_STUB_VALUE_TYPE(GKO_DECLARE_COMMON_GMRES_SOLVE_KRYLOV_KERNEL); + + +} // namespace common_gmres + + namespace gmres { -GKO_STUB_VALUE_TYPE(GKO_DECLARE_GMRES_INITIALIZE_1_KERNEL); -GKO_STUB_VALUE_TYPE(GKO_DECLARE_GMRES_INITIALIZE_2_KERNEL); -GKO_STUB_VALUE_TYPE(GKO_DECLARE_GMRES_STEP_1_KERNEL); -GKO_STUB_VALUE_TYPE(GKO_DECLARE_GMRES_STEP_2_KERNEL); +GKO_STUB_VALUE_TYPE(GKO_DECLARE_GMRES_RESTART_KERNEL); +GKO_STUB_VALUE_TYPE(GKO_DECLARE_GMRES_MULTI_AXPY_KERNEL); } // namespace gmres @@ -452,10 +462,10 @@ GKO_STUB_VALUE_TYPE(GKO_DECLARE_GMRES_STEP_2_KERNEL); namespace cb_gmres { -GKO_STUB_VALUE_TYPE(GKO_DECLARE_CB_GMRES_INITIALIZE_1_KERNEL); -GKO_STUB_CB_GMRES(GKO_DECLARE_CB_GMRES_INITIALIZE_2_KERNEL); -GKO_STUB_CB_GMRES(GKO_DECLARE_CB_GMRES_STEP_1_KERNEL); -GKO_STUB_CB_GMRES_CONST(GKO_DECLARE_CB_GMRES_STEP_2_KERNEL); +GKO_STUB_VALUE_TYPE(GKO_DECLARE_CB_GMRES_INITIALIZE_KERNEL); +GKO_STUB_CB_GMRES(GKO_DECLARE_CB_GMRES_RESTART_KERNEL); +GKO_STUB_CB_GMRES(GKO_DECLARE_CB_GMRES_ARNOLDI_KERNEL); +GKO_STUB_CB_GMRES_CONST(GKO_DECLARE_CB_GMRES_SOLVE_KRYLOV_KERNEL); } // namespace cb_gmres diff --git a/core/solver/cb_gmres.cpp b/core/solver/cb_gmres.cpp index 3d6d84738e1..f06aac3f9e1 100644 --- a/core/solver/cb_gmres.cpp +++ b/core/solver/cb_gmres.cpp @@ -58,10 +58,10 @@ namespace cb_gmres { namespace { -GKO_REGISTER_OPERATION(initialize_1, cb_gmres::initialize_1); -GKO_REGISTER_OPERATION(initialize_2, cb_gmres::initialize_2); -GKO_REGISTER_OPERATION(step_1, cb_gmres::step_1); -GKO_REGISTER_OPERATION(step_2, cb_gmres::step_2); +GKO_REGISTER_OPERATION(initialize, cb_gmres::initialize); +GKO_REGISTER_OPERATION(restart, cb_gmres::restart); +GKO_REGISTER_OPERATION(arnoldi, cb_gmres::arnoldi); +GKO_REGISTER_OPERATION(solve_krylov, cb_gmres::solve_krylov); } // anonymous namespace @@ -240,18 +240,14 @@ void CbGmres::apply_dense_impl( auto next_krylov_basis = Vector::create_with_config_of(dense_b); std::shared_ptr> preconditioned_vector = Vector::create_with_config_of(dense_b); - auto hessenberg = Vector::create( - exec, dim<2>{krylov_dim + 1, krylov_dim * dense_b->get_size()[1]}); - auto buffer = Vector::create( - exec, dim<2>{krylov_dim + 1, dense_b->get_size()[1]}); - auto givens_sin = - Vector::create(exec, dim<2>{krylov_dim, dense_b->get_size()[1]}); - auto givens_cos = - Vector::create(exec, dim<2>{krylov_dim, dense_b->get_size()[1]}); - auto residual_norm_collection = Vector::create( - exec, dim<2>{krylov_dim + 1, dense_b->get_size()[1]}); - auto residual_norm = - VectorNorms::create(exec, dim<2>{1, dense_b->get_size()[1]}); + auto hessenberg = + Vector::create(exec, dim<2>{krylov_dim + 1, krylov_dim * num_rhs}); + auto buffer = Vector::create(exec, dim<2>{krylov_dim + 1, num_rhs}); + auto givens_sin = Vector::create(exec, dim<2>{krylov_dim, num_rhs}); + auto givens_cos = Vector::create(exec, dim<2>{krylov_dim, num_rhs}); + auto residual_norm_collection = + Vector::create(exec, dim<2>{krylov_dim + 1, num_rhs}); + auto residual_norm = VectorNorms::create(exec, dim<2>{1, num_rhs}); // 1st row of arnoldi_norm: == eta * norm2(old_next_krylov_basis) // with eta == 1 / sqrt(2) // (computed right before updating @@ -260,37 +256,32 @@ void CbGmres::apply_dense_impl( // == norm2(next_krylov_basis) // 3rd row of arnoldi_norm: the infinity norm of next_krylov_basis // (ONLY when using a scalar accessor) - auto arnoldi_norm = - VectorNorms::create(exec, dim<2>{3, dense_b->get_size()[1]}); - array final_iter_nums(this->get_executor(), - dense_b->get_size()[1]); - auto y = - Vector::create(exec, dim<2>{krylov_dim, dense_b->get_size()[1]}); + auto arnoldi_norm = VectorNorms::create(exec, dim<2>{3, num_rhs}); + array final_iter_nums(this->get_executor(), num_rhs); + auto y = Vector::create(exec, dim<2>{krylov_dim, num_rhs}); bool one_changed{}; array reduction_tmp{this->get_executor()}; - array stop_status(this->get_executor(), - dense_b->get_size()[1]); + array stop_status(this->get_executor(), num_rhs); // reorth_status and num_reorth are both helper variables for GPU // implementations at the moment. // num_reorth := Number of vectors which require a re-orthogonalization // reorth_status := stopping status for the re-orthogonalization, // marking which RHS requires one, and which does not - array reorth_status(this->get_executor(), - dense_b->get_size()[1]); + array reorth_status(this->get_executor(), num_rhs); array num_reorth(this->get_executor(), 1); // Initialization - exec->run(cb_gmres::make_initialize_1( - dense_b, residual.get(), givens_sin.get(), givens_cos.get(), - &stop_status, krylov_dim)); + exec->run(cb_gmres::make_initialize(dense_b, residual.get(), + givens_sin.get(), givens_cos.get(), + &stop_status, krylov_dim)); // residual = dense_b // givens_sin = givens_cos = 0 this->get_system_matrix()->apply(neg_one_op.get(), dense_x, one_op.get(), residual.get()); // residual = residual - Ax - exec->run(cb_gmres::make_initialize_2( + exec->run(cb_gmres::make_restart( residual.get(), residual_norm.get(), residual_norm_collection.get(), arnoldi_norm.get(), krylov_bases_range, next_krylov_basis.get(), &final_iter_nums, reduction_tmp, krylov_dim)); @@ -313,10 +304,8 @@ void CbGmres::apply_dense_impl( auto after_preconditioner = matrix::Dense::create_with_config_of(dense_x); - array stop_encountered_rhs(exec->get_master(), - dense_b->get_size()[1]); - array fully_converged_rhs(exec->get_master(), - dense_b->get_size()[1]); + array stop_encountered_rhs(exec->get_master(), num_rhs); + array fully_converged_rhs(exec->get_master(), num_rhs); array host_stop_status( this->get_executor()->get_master(), stop_status); for (size_type i = 0; i < stop_encountered_rhs.get_num_elems(); ++i) { @@ -327,7 +316,7 @@ void CbGmres::apply_dense_impl( // convergence detection constexpr int start_force_reset{10}; bool perform_reset{false}; - // Fraction of the krylov_dim_ (or total_iter if it is lower), + // Fraction of the krylov_dim (or total_iter if it is lower), // determining the number of forced iteration to perform constexpr size_type forced_iteration_fraction{10}; const size_type forced_limit{krylov_dim / forced_iteration_fraction}; @@ -398,10 +387,9 @@ void CbGmres::apply_dense_impl( // Restart // use a view in case this is called earlier auto hessenberg_view = hessenberg->create_submatrix( - span{0, restart_iter}, - span{0, dense_b->get_size()[1] * (restart_iter)}); + span{0, restart_iter}, span{0, num_rhs * (restart_iter)}); - exec->run(cb_gmres::make_step_2( + exec->run(cb_gmres::make_solve_krylov( residual_norm_collection.get(), krylov_bases_range.get_accessor().to_const(), hessenberg_view.get(), y.get(), before_preconditioner.get(), @@ -419,7 +407,7 @@ void CbGmres::apply_dense_impl( this->get_system_matrix()->apply(neg_one_op.get(), dense_x, one_op.get(), residual.get()); // residual = residual - Ax - exec->run(cb_gmres::make_initialize_2( + exec->run(cb_gmres::make_restart( residual.get(), residual_norm.get(), residual_norm_collection.get(), arnoldi_norm.get(), krylov_bases_range, next_krylov_basis.get(), @@ -440,16 +428,15 @@ void CbGmres::apply_dense_impl( // Do Arnoldi and givens rotation auto hessenberg_iter = hessenberg->create_submatrix( span{0, restart_iter + 2}, - span{dense_b->get_size()[1] * restart_iter, - dense_b->get_size()[1] * (restart_iter + 1)}); + span{num_rhs * restart_iter, num_rhs * (restart_iter + 1)}); auto buffer_iter = buffer->create_submatrix( - span{0, restart_iter + 2}, span{0, dense_b->get_size()[1]}); + span{0, restart_iter + 2}, span{0, num_rhs}); // Start of arnoldi this->get_system_matrix()->apply(preconditioned_vector.get(), next_krylov_basis.get()); // next_krylov_basis = A * preconditioned_vector - exec->run(cb_gmres::make_step_1( + exec->run(cb_gmres::make_arnoldi( next_krylov_basis.get(), givens_sin.get(), givens_cos.get(), residual_norm.get(), residual_norm_collection.get(), krylov_bases_range, hessenberg_iter.get(), buffer_iter.get(), @@ -483,10 +470,9 @@ void CbGmres::apply_dense_impl( // Solve x auto hessenberg_small = hessenberg->create_submatrix( - span{0, restart_iter}, - span{0, dense_b->get_size()[1] * (restart_iter)}); + span{0, restart_iter}, span{0, num_rhs * restart_iter}); - exec->run(cb_gmres::make_step_2( + exec->run(cb_gmres::make_solve_krylov( residual_norm_collection.get(), krylov_bases_range.get_accessor().to_const(), hessenberg_small.get(), y.get(), before_preconditioner.get(), diff --git a/core/solver/cb_gmres_kernels.hpp b/core/solver/cb_gmres_kernels.hpp index 3779bd56a80..d2b0325e1fe 100644 --- a/core/solver/cb_gmres_kernels.hpp +++ b/core/solver/cb_gmres_kernels.hpp @@ -125,28 +125,28 @@ namespace gko { namespace kernels { -#define GKO_DECLARE_CB_GMRES_INITIALIZE_1_KERNEL(_type) \ - void initialize_1( \ +#define GKO_DECLARE_CB_GMRES_INITIALIZE_KERNEL(_type) \ + void initialize( \ std::shared_ptr exec, \ const matrix::Dense<_type>* b, matrix::Dense<_type>* residual, \ matrix::Dense<_type>* givens_sin, matrix::Dense<_type>* givens_cos, \ array* stop_status, size_type krylov_dim) -#define GKO_DECLARE_CB_GMRES_INITIALIZE_2_KERNEL(_type1, _range) \ - void initialize_2(std::shared_ptr exec, \ - const matrix::Dense<_type1>* residual, \ - matrix::Dense>* residual_norm, \ - matrix::Dense<_type1>* residual_norm_collection, \ - matrix::Dense>* arnoldi_norm, \ - _range krylov_bases, \ - matrix::Dense<_type1>* next_krylov_basis, \ - array* final_iter_nums, array& tmp, \ - size_type krylov_dim) +#define GKO_DECLARE_CB_GMRES_RESTART_KERNEL(_type1, _range) \ + void restart(std::shared_ptr exec, \ + const matrix::Dense<_type1>* residual, \ + matrix::Dense>* residual_norm, \ + matrix::Dense<_type1>* residual_norm_collection, \ + matrix::Dense>* arnoldi_norm, \ + _range krylov_bases, \ + matrix::Dense<_type1>* next_krylov_basis, \ + array* final_iter_nums, \ + array& reduction_tmp, size_type krylov_dim) -#define GKO_DECLARE_CB_GMRES_STEP_1_KERNEL(_type1, _range) \ - void step_1( \ +#define GKO_DECLARE_CB_GMRES_ARNOLDI_KERNEL(_type1, _range) \ + void arnoldi( \ std::shared_ptr exec, \ matrix::Dense<_type1>* next_krylov_basis, \ matrix::Dense<_type1>* givens_sin, matrix::Dense<_type1>* givens_cos, \ @@ -159,24 +159,25 @@ namespace kernels { const array* stop_status, \ array* reorth_status, array* num_reorth) -#define GKO_DECLARE_CB_GMRES_STEP_2_KERNEL(_type1, _range) \ - void step_2(std::shared_ptr exec, \ - const matrix::Dense<_type1>* residual_norm_collection, \ - _range krylov_bases, const matrix::Dense<_type1>* hessenberg, \ - matrix::Dense<_type1>* y, \ - matrix::Dense<_type1>* before_preconditioner, \ - const array* final_iter_nums) - - -#define GKO_DECLARE_ALL_AS_TEMPLATES \ - template \ - GKO_DECLARE_CB_GMRES_INITIALIZE_1_KERNEL(ValueType); \ - template \ - GKO_DECLARE_CB_GMRES_INITIALIZE_2_KERNEL(ValueType, Accessor3d); \ - template \ - GKO_DECLARE_CB_GMRES_STEP_1_KERNEL(ValueType, Accessor3d); \ - template \ - GKO_DECLARE_CB_GMRES_STEP_2_KERNEL(ValueType, Accessor3d) +#define GKO_DECLARE_CB_GMRES_SOLVE_KRYLOV_KERNEL(_type1, _range) \ + void solve_krylov(std::shared_ptr exec, \ + const matrix::Dense<_type1>* residual_norm_collection, \ + _range krylov_bases, \ + const matrix::Dense<_type1>* hessenberg, \ + matrix::Dense<_type1>* y, \ + matrix::Dense<_type1>* before_preconditioner, \ + const array* final_iter_nums) + + +#define GKO_DECLARE_ALL_AS_TEMPLATES \ + template \ + GKO_DECLARE_CB_GMRES_INITIALIZE_KERNEL(ValueType); \ + template \ + GKO_DECLARE_CB_GMRES_RESTART_KERNEL(ValueType, Accessor3d); \ + template \ + GKO_DECLARE_CB_GMRES_ARNOLDI_KERNEL(ValueType, Accessor3d); \ + template \ + GKO_DECLARE_CB_GMRES_SOLVE_KRYLOV_KERNEL(ValueType, Accessor3d) GKO_DECLARE_FOR_ALL_EXECUTOR_NAMESPACES(cb_gmres, GKO_DECLARE_ALL_AS_TEMPLATES); diff --git a/core/solver/common_gmres_kernels.hpp b/core/solver/common_gmres_kernels.hpp new file mode 100644 index 00000000000..e6ada48e68b --- /dev/null +++ b/core/solver/common_gmres_kernels.hpp @@ -0,0 +1,101 @@ +/************************************************************* +Copyright (c) 2017-2022, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GKO_CORE_SOLVER_COMMON_GMRES_KERNELS_HPP_ +#define GKO_CORE_SOLVER_COMMON_GMRES_KERNELS_HPP_ + + +#include +#include +#include +#include +#include + + +#include "core/base/kernel_declaration.hpp" + + +namespace gko { +namespace kernels { +namespace common_gmres { + + +#define GKO_DECLARE_COMMON_GMRES_INITIALIZE_KERNEL(_type) \ + void initialize( \ + std::shared_ptr exec, \ + const matrix::Dense<_type>* b, matrix::Dense<_type>* residual, \ + matrix::Dense<_type>* givens_sin, matrix::Dense<_type>* givens_cos, \ + stopping_status* stop_status) + + +#define GKO_DECLARE_COMMON_GMRES_HESSENBERG_QR_KERNEL(_type) \ + void hessenberg_qr( \ + std::shared_ptr exec, \ + matrix::Dense<_type>* givens_sin, matrix::Dense<_type>* givens_cos, \ + matrix::Dense>* residual_norm, \ + matrix::Dense<_type>* residual_norm_collection, \ + matrix::Dense<_type>* hessenberg_iter, size_type iter, \ + size_type* final_iter_nums, const stopping_status* stop_status) + + +#define GKO_DECLARE_COMMON_GMRES_SOLVE_KRYLOV_KERNEL(_type1) \ + void solve_krylov( \ + std::shared_ptr exec, \ + const matrix::Dense<_type1>* residual_norm_collection, \ + const matrix::Dense<_type1>* hessenberg, matrix::Dense<_type1>* y, \ + const size_type* final_iter_nums, const stopping_status* stop_status) + + +#define GKO_DECLARE_ALL_AS_TEMPLATES \ + template \ + GKO_DECLARE_COMMON_GMRES_INITIALIZE_KERNEL(ValueType); \ + template \ + GKO_DECLARE_COMMON_GMRES_HESSENBERG_QR_KERNEL(ValueType); \ + template \ + GKO_DECLARE_COMMON_GMRES_SOLVE_KRYLOV_KERNEL(ValueType) + + +} // namespace common_gmres + + +GKO_DECLARE_FOR_ALL_EXECUTOR_NAMESPACES(common_gmres, + GKO_DECLARE_ALL_AS_TEMPLATES); + + +#undef GKO_DECLARE_ALL_AS_TEMPLATES + + +} // namespace kernels +} // namespace gko + + +#endif // GKO_CORE_SOLVER_COMMON_GMRES_KERNELS_HPP_ diff --git a/core/solver/gmres.cpp b/core/solver/gmres.cpp index 24ef6798d30..ea8aaceed01 100644 --- a/core/solver/gmres.cpp +++ b/core/solver/gmres.cpp @@ -45,6 +45,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include "core/solver/common_gmres_kernels.hpp" #include "core/solver/gmres_kernels.hpp" #include "core/solver/solver_boilerplate.hpp" @@ -55,10 +56,11 @@ namespace gmres { namespace { -GKO_REGISTER_OPERATION(initialize_1, gmres::initialize_1); -GKO_REGISTER_OPERATION(initialize_2, gmres::initialize_2); -GKO_REGISTER_OPERATION(step_1, gmres::step_1); -GKO_REGISTER_OPERATION(step_2, gmres::step_2); +GKO_REGISTER_OPERATION(initialize, common_gmres::initialize); +GKO_REGISTER_OPERATION(restart, gmres::restart); +GKO_REGISTER_OPERATION(hessenberg_qr, common_gmres::hessenberg_qr); +GKO_REGISTER_OPERATION(solve_krylov, common_gmres::solve_krylov); +GKO_REGISTER_OPERATION(multi_axpy, gmres::multi_axpy); } // anonymous namespace @@ -107,6 +109,32 @@ void Gmres::apply_impl(const LinOp* b, LinOp* x) const } +template +struct help_compute_norm { + static void compute_next_krylov_norm_into_hessenberg( + const matrix::Dense* next_krylov, + matrix::Dense* hessenberg_norm_entry, + matrix::Dense>*, array& reduction_tmp) + { + next_krylov->compute_norm2(hessenberg_norm_entry, reduction_tmp); + }; +}; + +template +struct help_compute_norm::value>> { + static void compute_next_krylov_norm_into_hessenberg( + const matrix::Dense* next_krylov, + matrix::Dense* hessenberg_norm_entry, + matrix::Dense>* next_krylov_norm_tmp, + array& reduction_tmp) + { + next_krylov->compute_norm2(next_krylov_norm_tmp, reduction_tmp); + next_krylov_norm_tmp->make_complex(hessenberg_norm_entry); + }; +}; + + template void Gmres::apply_dense_impl(const matrix::Dense* dense_b, matrix::Dense* dense_x) const @@ -128,6 +156,7 @@ void Gmres::apply_dense_impl(const matrix::Dense* dense_b, auto krylov_bases = this->create_workspace_op_with_type_of( ws::krylov_bases, dense_b, dim<2>{num_rows * (krylov_dim + 1), num_rhs}); + // rows: rows of Hessenberg matrix, columns: block for each entry auto hessenberg = this->template create_workspace_op( ws::hessenberg, dim<2>{krylov_dim + 1, krylov_dim * num_rhs}); auto givens_sin = this->template create_workspace_op( @@ -140,6 +169,11 @@ void Gmres::apply_dense_impl(const matrix::Dense* dense_b, ws::residual_norm, dim<2>{1, num_rhs}); auto y = this->template create_workspace_op( ws::y, dim<2>{krylov_dim, num_rhs}); + // next_krylov_norm_tmp is only required for complex types to move real + // values into a complex matrix + auto next_krylov_norm_tmp = this->template create_workspace_op( + ws::next_krylov_norm_tmp, + dim<2>{1, is_complex_s::value ? num_rhs : 0}); GKO_SOLVER_VECTOR(before_preconditioner, dense_x); GKO_SOLVER_VECTOR(after_preconditioner, dense_x); @@ -152,19 +186,22 @@ void Gmres::apply_dense_impl(const matrix::Dense* dense_b, ws::final_iter_nums, num_rhs); // Initialization - exec->run(gmres::make_initialize_1(dense_b, residual, givens_sin, - givens_cos, &stop_status, krylov_dim)); // residual = dense_b // givens_sin = givens_cos = 0 - this->get_system_matrix()->apply(neg_one_op, dense_x, one_op, residual); + // reset stop status + exec->run(gmres::make_initialize(dense_b, residual, givens_sin, givens_cos, + stop_status.get_data())); // residual = residual - Ax - exec->run(gmres::make_initialize_2( - residual, residual_norm, residual_norm_collection, krylov_bases, - &final_iter_nums, reduction_tmp, krylov_dim)); + this->get_system_matrix()->apply(neg_one_op, dense_x, one_op, residual); + // residual_norm = norm(residual) + residual->compute_norm2(residual_norm, reduction_tmp); // residual_norm_collection = {residual_norm, unchanged} // krylov_bases(:, 1) = residual / residual_norm // final_iter_nums = {0, ..., 0} + exec->run(gmres::make_restart(residual, residual_norm, + residual_norm_collection, krylov_bases, + final_iter_nums.get_data())); auto stop_criterion = this->get_stop_criterion_factory()->generate( this->get_system_matrix(), @@ -201,19 +238,22 @@ void Gmres::apply_dense_impl(const matrix::Dense* dense_b, .residual(residual) .residual_norm(residual_norm) .solution(dense_x) - .check(RelativeStoppingId, true, &stop_status, &one_changed)) { + .check(RelativeStoppingId, false, &stop_status, &one_changed)) { break; } - if (restart_iter == krylov_dim) { // Restart // Solve upper triangular. // y = hessenberg \ residual_norm_collection + exec->run(gmres::make_solve_krylov(residual_norm_collection, + hessenberg, y, + final_iter_nums.get_const_data(), + stop_status.get_const_data())); // before_preconditioner = krylov_bases * y - exec->run(gmres::make_step_2(residual_norm_collection, krylov_bases, - hessenberg, y, before_preconditioner, - &final_iter_nums)); + exec->run(gmres::make_multi_axpy( + krylov_bases, y, before_preconditioner, + final_iter_nums.get_const_data(), stop_status.get_data())); // x = x + get_preconditioner() * before_preconditioner this->get_preconditioner()->apply(before_preconditioner, @@ -225,12 +265,13 @@ void Gmres::apply_dense_impl(const matrix::Dense* dense_b, this->get_system_matrix()->apply(neg_one_op, dense_x, one_op, residual); // residual_norm = norm(residual) + residual->compute_norm2(residual_norm, reduction_tmp); // residual_norm_collection = {residual_norm, unchanged} // krylov_bases(:, 1) = residual / residual_norm // final_iter_nums = {0, ..., 0} - exec->run(gmres::make_initialize_2( + exec->run(gmres::make_restart( residual, residual_norm, residual_norm_collection, krylov_bases, - &final_iter_nums, reduction_tmp, krylov_dim)); + final_iter_nums.get_data())); restart_iter = 0; } auto this_krylov = krylov_bases->create_submatrix( @@ -240,32 +281,46 @@ void Gmres::apply_dense_impl(const matrix::Dense* dense_b, auto next_krylov = krylov_bases->create_submatrix( span{num_rows * (restart_iter + 1), num_rows * (restart_iter + 2)}, span{0, num_rhs}); - // preconditioned_vector =this->get_preconditioner * this_krylov + // preconditioned_vector = get_preconditioner() * this_krylov this->get_preconditioner()->apply(this_krylov.get(), preconditioned_vector); - // Do Arnoldi and givens rotation + // Create view of current column in the hessenberg matrix: + // hessenberg_iter = hessenberg(:, restart_iter); auto hessenberg_iter = hessenberg->create_submatrix( span{0, restart_iter + 2}, span{num_rhs * restart_iter, num_rhs * (restart_iter + 1)}); - // Start of arnoldi + // Start of Arnoldi // next_krylov = A * preconditioned_vector this->get_system_matrix()->apply(preconditioned_vector, next_krylov.get()); - // final_iter_nums += 1 (unconverged) - // next_krylov_basis is alias for (restart_iter + 1)-th krylov_bases - // for i in 0:restart_iter(include) - // hessenberg(restart_iter, i) = next_krylov_basis' * - // krylov_bases(:, i) - // next_krylov_basis -= hessenberg(restart_iter, i) * - // krylov_bases(:, i) - // end - // hessenberg(restart_iter+1, restart_iter) = norm(next_krylov_basis) - // next_krylov_basis /= hessenberg(restart_iter + 1, restart_iter) - // End of arnoldi - // Start apply givens rotation + for (size_type i = 0; i <= restart_iter; i++) { + // orthogonalize against krylov_bases(:, i): + // hessenberg(i, restart_iter) = next_krylov' * krylov_bases(:, i) + // next_krylov -= hessenberg(i, restart_iter) * krylov_bases(:, i) + auto hessenberg_entry = hessenberg_iter->create_submatrix( + span{i, i + 1}, span{0, num_rhs}); + auto krylov_basis = krylov_bases->create_submatrix( + span{num_rows * i, num_rows * (i + 1)}, span{0, num_rhs}); + next_krylov->compute_conj_dot( + krylov_basis.get(), hessenberg_entry.get(), reduction_tmp); + next_krylov->sub_scaled(hessenberg_entry.get(), krylov_basis.get()); + } + // normalize next_krylov: + // hessenberg(restart_iter+1, restart_iter) = norm(next_krylov) + // next_krylov /= hessenberg(restart_iter+1, restart_iter) + auto hessenberg_norm_entry = hessenberg_iter->create_submatrix( + span{restart_iter + 1, restart_iter + 2}, span{0, num_rhs}); + help_compute_norm::compute_next_krylov_norm_into_hessenberg( + next_krylov.get(), hessenberg_norm_entry.get(), + next_krylov_norm_tmp, reduction_tmp); + next_krylov->inv_scale(hessenberg_norm_entry.get()); + // End of Arnoldi + + // update QR factorization and Krylov RHS for last column: + // apply givens rotation // for j in 0:restart_iter(exclude) // temp = cos(j)*hessenberg(j) + // sin(j)*hessenberg(j+1) @@ -273,32 +328,27 @@ void Gmres::apply_dense_impl(const matrix::Dense* dense_b, // conj(cos(j))*hessenberg(j+1) // hessenberg(j) = temp; // end - // Calculate sin and cos + // calculate next Givens parameters // this_hess = hessenberg(restart_iter) // next_hess = hessenberg(restart_iter+1) - // hypotenuse = sqrt(this_hess * this_hess + next_hess * next_hess); - // cos(restart_iter) = conj(this_hess) / hypotenuse; - // sin(restart_iter) = conj(next_hess) / this_hess - // hessenberg(restart_iter) = - // cos(restart_iter)*hessenberg(restart_iter) + - // sin(restart_iter)*hessenberg(restart_iter) - // hessenberg(restart_iter+1) = 0 - // End apply givens rotation - // Calculate residual norm + // hypotenuse = ||(this_hess, next_hess)|| + // cos(restart_iter) = conj(this_hess) / hypotenuse + // sin(restart_iter) = conj(next_hess) / hypotenuse + // update Krylov approximation of b, apply new Givens rotation // this_rnc = residual_norm_collection(restart_iter) - // next_rnc = -conj(sin(restart_iter)) * this_rnc - // residual_norm_collection(restart_iter) = cos(restart_iter) * this_rnc - // residual_norm = abs(next_rnc) - // residual_norm_collection(restart_iter + 1) = next_rnc - exec->run(gmres::make_step_1( - dense_b->get_size()[0], givens_sin, givens_cos, residual_norm, - residual_norm_collection, krylov_bases, hessenberg_iter.get(), - restart_iter, &final_iter_nums, &stop_status)); + // residual_norm = abs(-conj(sin(restart_iter)) * this_rnc) + // residual_norm_collection(restart_iter) = + // cos(restart_iter) * this_rnc + // residual_norm_collection(restart_iter + 1) = + // -conj(sin(restart_iter)) * this_rnc + exec->run(gmres::make_hessenberg_qr( + givens_sin, givens_cos, residual_norm, residual_norm_collection, + hessenberg_iter.get(), restart_iter, final_iter_nums.get_data(), + stop_status.get_const_data())); restart_iter++; } - // Solve x auto krylov_bases_small = krylov_bases->create_submatrix( span{0, num_rows * (restart_iter + 1)}, span{0, num_rhs}); auto hessenberg_small = hessenberg->create_submatrix( @@ -306,10 +356,14 @@ void Gmres::apply_dense_impl(const matrix::Dense* dense_b, // Solve upper triangular. // y = hessenberg \ residual_norm_collection + exec->run(gmres::make_solve_krylov( + residual_norm_collection, hessenberg_small.get(), y, + final_iter_nums.get_const_data(), stop_status.get_const_data())); // before_preconditioner = krylov_bases * y - exec->run(gmres::make_step_2( - residual_norm_collection, krylov_bases_small.get(), - hessenberg_small.get(), y, before_preconditioner, &final_iter_nums)); + exec->run(gmres::make_multi_axpy( + krylov_bases_small.get(), y, before_preconditioner, + final_iter_nums.get_const_data(), stop_status.get_data())); + // x = x + get_preconditioner() * before_preconditioner this->get_preconditioner()->apply(before_preconditioner, after_preconditioner); @@ -345,7 +399,7 @@ int workspace_traits>::num_arrays(const Solver&) template int workspace_traits>::num_vectors(const Solver&) { - return 13; + return 14; } @@ -353,21 +407,20 @@ template std::vector workspace_traits>::op_names( const Solver&) { - return { - "residual", - "preconditioned_vector", - "krylov_bases", - "hessenberg", - "givens_sin", - "givens_cos", - "residual_norm_collection", - "residual_norm", - "y", - "before_preconditioner", - "after_preconditioner", - "one", - "minus_one", - }; + return {"residual", + "preconditioned_vector", + "krylov_bases", + "hessenberg", + "givens_sin", + "givens_cos", + "residual_norm_collection", + "residual_norm", + "y", + "before_preconditioner", + "after_preconditioner", + "one", + "minus_one", + "next_krylov_norm_tmp"}; } @@ -382,8 +435,10 @@ std::vector workspace_traits>::array_names( template std::vector workspace_traits>::scalars(const Solver&) { - return {hessenberg, givens_sin, givens_cos, residual_norm_collection, - residual_norm, y}; + return {hessenberg, givens_sin, + givens_cos, residual_norm_collection, + residual_norm, y, + next_krylov_norm_tmp}; } diff --git a/core/solver/gmres_kernels.hpp b/core/solver/gmres_kernels.hpp index 462e3018b17..2e0f14a7b7e 100644 --- a/core/solver/gmres_kernels.hpp +++ b/core/solver/gmres_kernels.hpp @@ -49,55 +49,29 @@ namespace kernels { namespace gmres { -#define GKO_DECLARE_GMRES_INITIALIZE_1_KERNEL(_type) \ - void initialize_1( \ - std::shared_ptr exec, \ - const matrix::Dense<_type>* b, matrix::Dense<_type>* residual, \ - matrix::Dense<_type>* givens_sin, matrix::Dense<_type>* givens_cos, \ - array* stop_status, size_type krylov_dim) - - -#define GKO_DECLARE_GMRES_INITIALIZE_2_KERNEL(_type) \ - void initialize_2(std::shared_ptr exec, \ - const matrix::Dense<_type>* residual, \ - matrix::Dense>* residual_norm, \ - matrix::Dense<_type>* residual_norm_collection, \ - matrix::Dense<_type>* krylov_bases, \ - array* final_iter_nums, array& tmp, \ - size_type krylov_dim) - - -#define GKO_DECLARE_GMRES_STEP_1_KERNEL(_type) \ - void step_1(std::shared_ptr exec, \ - size_type num_rows, matrix::Dense<_type>* givens_sin, \ - matrix::Dense<_type>* givens_cos, \ - matrix::Dense>* residual_norm, \ - matrix::Dense<_type>* residual_norm_collection, \ - matrix::Dense<_type>* krylov_bases, \ - matrix::Dense<_type>* hessenberg_iter, size_type iter, \ - array* final_iter_nums, \ - const array* stop_status) - - -#define GKO_DECLARE_GMRES_STEP_2_KERNEL(_type) \ - void step_2(std::shared_ptr exec, \ - const matrix::Dense<_type>* residual_norm_collection, \ - const matrix::Dense<_type>* krylov_bases, \ - const matrix::Dense<_type>* hessenberg, \ - matrix::Dense<_type>* y, \ - matrix::Dense<_type>* before_preconditioner, \ - const array* final_iter_nums) - - -#define GKO_DECLARE_ALL_AS_TEMPLATES \ - template \ - GKO_DECLARE_GMRES_INITIALIZE_1_KERNEL(ValueType); \ - template \ - GKO_DECLARE_GMRES_INITIALIZE_2_KERNEL(ValueType); \ - template \ - GKO_DECLARE_GMRES_STEP_1_KERNEL(ValueType); \ - template \ - GKO_DECLARE_GMRES_STEP_2_KERNEL(ValueType) +#define GKO_DECLARE_GMRES_RESTART_KERNEL(_type) \ + void restart(std::shared_ptr exec, \ + const matrix::Dense<_type>* residual, \ + const matrix::Dense>* residual_norm, \ + matrix::Dense<_type>* residual_norm_collection, \ + matrix::Dense<_type>* krylov_bases, \ + size_type* final_iter_nums) + + +#define GKO_DECLARE_GMRES_MULTI_AXPY_KERNEL(_type) \ + void multi_axpy(std::shared_ptr exec, \ + const matrix::Dense<_type>* krylov_bases, \ + const matrix::Dense<_type>* y, \ + matrix::Dense<_type>* before_preconditioner, \ + const size_type* final_iter_nums, \ + stopping_status* stop_status) + + +#define GKO_DECLARE_ALL_AS_TEMPLATES \ + template \ + GKO_DECLARE_GMRES_RESTART_KERNEL(ValueType); \ + template \ + GKO_DECLARE_GMRES_MULTI_AXPY_KERNEL(ValueType) } // namespace gmres diff --git a/cuda/CMakeLists.txt b/cuda/CMakeLists.txt index ac4e5c5bb8c..46946b3b696 100644 --- a/cuda/CMakeLists.txt +++ b/cuda/CMakeLists.txt @@ -41,7 +41,6 @@ target_sources(ginkgo_cuda preconditioner/jacobi_kernels.cu preconditioner/jacobi_simple_apply_kernel.cu reorder/rcm_kernels.cu - solver/gmres_kernels.cu solver/cb_gmres_kernels.cu solver/idr_kernels.cu solver/lower_trs_kernels.cu diff --git a/cuda/base/kernel_launch.cuh b/cuda/base/kernel_launch.cuh index 257948b3d4d..d406e91e40a 100644 --- a/cuda/base/kernel_launch.cuh +++ b/cuda/base/kernel_launch.cuh @@ -39,6 +39,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include "accessor/cuda_helper.hpp" #include "cuda/base/types.hpp" #include "cuda/components/thread_ids.cuh" @@ -48,6 +49,27 @@ namespace kernels { namespace cuda { +template +struct to_device_type_impl&> { + using type = std::decay_t>()))>; + static type map_to_device(gko::acc::range& range) + { + return gko::acc::as_cuda_range(range); + } +}; + +template +struct to_device_type_impl&> { + using type = std::decay_t>()))>; + static type map_to_device(const gko::acc::range& range) + { + return gko::acc::as_cuda_range(range); + } +}; + + namespace device_std = thrust; diff --git a/cuda/solver/cb_gmres_kernels.cu b/cuda/solver/cb_gmres_kernels.cu index 6c7f36812df..f27e4780918 100644 --- a/cuda/solver/cb_gmres_kernels.cu +++ b/cuda/solver/cb_gmres_kernels.cu @@ -89,12 +89,12 @@ void zero_matrix(size_type m, size_type n, size_type stride, ValueType* array) template -void initialize_1(std::shared_ptr exec, - const matrix::Dense* b, - matrix::Dense* residual, - matrix::Dense* givens_sin, - matrix::Dense* givens_cos, - array* stop_status, size_type krylov_dim) +void initialize(std::shared_ptr exec, + const matrix::Dense* b, + matrix::Dense* residual, + matrix::Dense* givens_sin, + matrix::Dense* givens_cos, + array* stop_status, size_type krylov_dim) { const auto num_threads = std::max(b->get_size()[0] * b->get_stride(), krylov_dim * b->get_size()[1]); @@ -102,7 +102,7 @@ void initialize_1(std::shared_ptr exec, const auto block_dim = default_block_size; constexpr auto block_size = default_block_size; - initialize_1_kernel<<>>( + initialize_kernel<<>>( b->get_size()[0], b->get_size()[1], krylov_dim, as_cuda_type(b->get_const_values()), b->get_stride(), as_cuda_type(residual->get_values()), residual->get_stride(), @@ -111,19 +111,19 @@ void initialize_1(std::shared_ptr exec, as_cuda_type(stop_status->get_data())); } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CB_GMRES_INITIALIZE_1_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CB_GMRES_INITIALIZE_KERNEL); template -void initialize_2(std::shared_ptr exec, - const matrix::Dense* residual, - matrix::Dense>* residual_norm, - matrix::Dense* residual_norm_collection, - matrix::Dense>* arnoldi_norm, - Accessor3d krylov_bases, - matrix::Dense* next_krylov_basis, - array* final_iter_nums, array& tmp, - size_type krylov_dim) +void restart(std::shared_ptr exec, + const matrix::Dense* residual, + matrix::Dense>* residual_norm, + matrix::Dense* residual_norm_collection, + matrix::Dense>* arnoldi_norm, + Accessor3d krylov_bases, + matrix::Dense* next_krylov_basis, + array* final_iter_nums, array& reduction_tmp, + size_type krylov_dim) { constexpr bool use_scalar = gko::cb_gmres::detail::has_3d_scaled_accessor::value; @@ -138,13 +138,13 @@ void initialize_2(std::shared_ptr exec, constexpr auto block_size = default_block_size; const auto stride_arnoldi = arnoldi_norm->get_stride(); - initialize_2_1_kernel<<>>( + restart_1_kernel<<>>( residual->get_size()[0], residual->get_size()[1], krylov_dim, acc::as_cuda_range(krylov_bases), as_cuda_type(residual_norm_collection->get_values()), residual_norm_collection->get_stride()); kernels::cuda::dense::compute_norm2_dispatch(exec, residual, residual_norm, - tmp); + reduction_tmp); if (use_scalar) { components::fill_array(exec, @@ -174,7 +174,7 @@ void initialize_2(std::shared_ptr exec, const auto grid_dim_2 = ceildiv(std::max(num_rows, 1) * krylov_stride[1], default_block_size); - initialize_2_2_kernel<<>>( + restart_2_kernel<<>>( residual->get_size()[0], residual->get_size()[1], as_cuda_type(residual->get_const_values()), residual->get_stride(), as_cuda_type(residual_norm->get_const_values()), @@ -185,8 +185,7 @@ void initialize_2(std::shared_ptr exec, as_cuda_type(final_iter_nums->get_data())); } -GKO_INSTANTIATE_FOR_EACH_CB_GMRES_TYPE( - GKO_DECLARE_CB_GMRES_INITIALIZE_2_KERNEL); +GKO_INSTANTIATE_FOR_EACH_CB_GMRES_TYPE(GKO_DECLARE_CB_GMRES_RESTART_KERNEL); template @@ -395,18 +394,19 @@ void givens_rotation(std::shared_ptr exec, template -void step_1(std::shared_ptr exec, - matrix::Dense* next_krylov_basis, - matrix::Dense* givens_sin, - matrix::Dense* givens_cos, - matrix::Dense>* residual_norm, - matrix::Dense* residual_norm_collection, - Accessor3d krylov_bases, matrix::Dense* hessenberg_iter, - matrix::Dense* buffer_iter, - matrix::Dense>* arnoldi_norm, - size_type iter, array* final_iter_nums, - const array* stop_status, - array* reorth_status, array* num_reorth) +void arnoldi(std::shared_ptr exec, + matrix::Dense* next_krylov_basis, + matrix::Dense* givens_sin, + matrix::Dense* givens_cos, + matrix::Dense>* residual_norm, + matrix::Dense* residual_norm_collection, + Accessor3d krylov_bases, matrix::Dense* hessenberg_iter, + matrix::Dense* buffer_iter, + matrix::Dense>* arnoldi_norm, + size_type iter, array* final_iter_nums, + const array* stop_status, + array* reorth_status, + array* num_reorth) { increase_final_iteration_numbers_kernel<<< static_cast( @@ -422,7 +422,7 @@ void step_1(std::shared_ptr exec, residual_norm, residual_norm_collection, iter, stop_status); } -GKO_INSTANTIATE_FOR_EACH_CB_GMRES_TYPE(GKO_DECLARE_CB_GMRES_STEP_1_KERNEL); +GKO_INSTANTIATE_FOR_EACH_CB_GMRES_TYPE(GKO_DECLARE_CB_GMRES_ARNOLDI_KERNEL); template @@ -477,13 +477,13 @@ void calculate_qy(ConstAccessor3d krylov_bases, size_type num_krylov_bases, template -void step_2(std::shared_ptr exec, - const matrix::Dense* residual_norm_collection, - ConstAccessor3d krylov_bases, - const matrix::Dense* hessenberg, - matrix::Dense* y, - matrix::Dense* before_preconditioner, - const array* final_iter_nums) +void solve_krylov(std::shared_ptr exec, + const matrix::Dense* residual_norm_collection, + ConstAccessor3d krylov_bases, + const matrix::Dense* hessenberg, + matrix::Dense* y, + matrix::Dense* before_preconditioner, + const array* final_iter_nums) { if (before_preconditioner->get_size()[1] == 0) { return; @@ -500,7 +500,7 @@ void step_2(std::shared_ptr exec, } GKO_INSTANTIATE_FOR_EACH_CB_GMRES_CONST_TYPE( - GKO_DECLARE_CB_GMRES_STEP_2_KERNEL); + GKO_DECLARE_CB_GMRES_SOLVE_KRYLOV_KERNEL); } // namespace cb_gmres diff --git a/cuda/solver/gmres_kernels.cu b/cuda/solver/gmres_kernels.cu deleted file mode 100644 index 2ea37432ab3..00000000000 --- a/cuda/solver/gmres_kernels.cu +++ /dev/null @@ -1,340 +0,0 @@ -/************************************************************* -Copyright (c) 2017-2022, the Ginkgo authors -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions -are met: - -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. - -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in the -documentation and/or other materials provided with the distribution. - -3. Neither the name of the copyright holder nor the names of its -contributors may be used to endorse or promote products derived from -this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS -IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED -TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A -PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*************************************************************/ - -#include "core/solver/gmres_kernels.hpp" - - -#include - - -#include -#include -#include -#include - - -#include "core/components/fill_array_kernels.hpp" -#include "core/matrix/dense_kernels.hpp" -#include "cuda/base/config.hpp" -#include "cuda/base/cublas_bindings.hpp" -#include "cuda/base/math.hpp" -#include "cuda/base/types.hpp" -#include "cuda/components/atomic.cuh" -#include "cuda/components/cooperative_groups.cuh" -#include "cuda/components/reduction.cuh" -#include "cuda/components/thread_ids.cuh" -#include "cuda/components/uninitialized_array.hpp" - - -namespace gko { -namespace kernels { -namespace cuda { -/** - * @brief The GMRES solver namespace. - * - * @ingroup gmres - */ -namespace gmres { - - -constexpr int default_block_size = 512; -// default_dot_dim can not be 64 in hip because 64 * 64 exceeds their max block -// size limit. -constexpr int default_dot_dim = 32; -constexpr int default_dot_size = default_dot_dim * default_dot_dim; - - -#include "common/cuda_hip/solver/gmres_kernels.hpp.inc" - - -template -void initialize_1(std::shared_ptr exec, - const matrix::Dense* b, - matrix::Dense* residual, - matrix::Dense* givens_sin, - matrix::Dense* givens_cos, - array* stop_status, size_type krylov_dim) -{ - const auto num_threads = std::max(b->get_size()[0] * b->get_stride(), - krylov_dim * b->get_size()[1]); - const auto grid_dim = ceildiv(num_threads, default_block_size); - const auto block_dim = default_block_size; - constexpr auto block_size = default_block_size; - - initialize_1_kernel<<>>( - b->get_size()[0], b->get_size()[1], krylov_dim, - as_cuda_type(b->get_const_values()), b->get_stride(), - as_cuda_type(residual->get_values()), residual->get_stride(), - as_cuda_type(givens_sin->get_values()), givens_sin->get_stride(), - as_cuda_type(givens_cos->get_values()), givens_cos->get_stride(), - as_cuda_type(stop_status->get_data())); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GMRES_INITIALIZE_1_KERNEL); - - -template -void initialize_2(std::shared_ptr exec, - const matrix::Dense* residual, - matrix::Dense>* residual_norm, - matrix::Dense* residual_norm_collection, - matrix::Dense* krylov_bases, - array* final_iter_nums, array& tmp, - size_type krylov_dim) -{ - const auto num_rows = residual->get_size()[0]; - const auto num_rhs = residual->get_size()[1]; - const auto grid_dim_1 = - ceildiv(krylov_bases->get_size()[0] * krylov_bases->get_stride(), - default_block_size); - const auto block_dim = default_block_size; - constexpr auto block_size = default_block_size; - - kernels::cuda::dense::compute_norm2_dispatch(exec, residual, residual_norm, - tmp); - - const auto grid_dim_2 = - ceildiv(std::max(num_rows, 1) * num_rhs, default_block_size); - initialize_2_2_kernel<<>>( - residual->get_size()[0], residual->get_size()[1], - as_cuda_type(residual->get_const_values()), residual->get_stride(), - as_cuda_type(residual_norm->get_const_values()), - as_cuda_type(residual_norm_collection->get_values()), - as_cuda_type(krylov_bases->get_values()), krylov_bases->get_stride(), - as_cuda_type(final_iter_nums->get_data())); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GMRES_INITIALIZE_2_KERNEL); - - -template -void finish_arnoldi(std::shared_ptr exec, - size_type num_rows, matrix::Dense* krylov_bases, - matrix::Dense* hessenberg_iter, size_type iter, - const stopping_status* stop_status) -{ - if (hessenberg_iter->get_size()[1] == 0) { - return; - } - const auto stride_krylov = krylov_bases->get_stride(); - const auto stride_hessenberg = hessenberg_iter->get_stride(); - auto cublas_handle = exec->get_cublas_handle(); - const dim3 grid_size( - ceildiv(hessenberg_iter->get_size()[1], default_dot_dim), - exec->get_num_multiprocessor() * 2); - const dim3 block_size(default_dot_dim, default_dot_dim); - auto next_krylov_basis = - krylov_bases->get_values() + - (iter + 1) * num_rows * hessenberg_iter->get_size()[1]; - for (size_type k = 0; k < iter + 1; ++k) { - const auto k_krylov_bases = - krylov_bases->get_const_values() + - k * num_rows * hessenberg_iter->get_size()[1]; - if (hessenberg_iter->get_size()[1] > 1) { - // TODO: this condition should be tuned - // single rhs will use vendor's dot, otherwise, use our own - // multidot_kernel which parallelize multiple rhs. - components::fill_array( - exec, hessenberg_iter->get_values() + k * stride_hessenberg, - hessenberg_iter->get_size()[1], zero()); - multidot_kernel<<>>( - k, num_rows, hessenberg_iter->get_size()[1], - as_cuda_type(k_krylov_bases), as_cuda_type(next_krylov_basis), - stride_krylov, as_cuda_type(hessenberg_iter->get_values()), - stride_hessenberg, as_cuda_type(stop_status)); - } else { - cublas::dot(exec->get_cublas_handle(), num_rows, k_krylov_bases, - stride_krylov, next_krylov_basis, stride_krylov, - hessenberg_iter->get_values() + k * stride_hessenberg); - } - update_next_krylov_kernel - <<>>( - k, num_rows, hessenberg_iter->get_size()[1], - as_cuda_type(k_krylov_bases), as_cuda_type(next_krylov_basis), - stride_krylov, - as_cuda_type(hessenberg_iter->get_const_values()), - stride_hessenberg, as_cuda_type(stop_status)); - } - // for i in 1:iter - // hessenberg(iter, i) = next_krylov_basis' * krylov_bases(:, i) - // next_krylov_basis -= hessenberg(iter, i) * krylov_bases(:, i) - // end - - - update_hessenberg_2_kernel - <<get_size()[1], default_block_size>>>( - iter, num_rows, hessenberg_iter->get_size()[1], - as_cuda_type(next_krylov_basis), stride_krylov, - as_cuda_type(hessenberg_iter->get_values()), stride_hessenberg, - as_cuda_type(stop_status)); - - update_krylov_kernel - <<>>( - iter, num_rows, hessenberg_iter->get_size()[1], - as_cuda_type(next_krylov_basis), stride_krylov, - as_cuda_type(hessenberg_iter->get_const_values()), - stride_hessenberg, as_cuda_type(stop_status)); - // next_krylov_basis /= hessenberg(iter, iter + 1) - // End of arnoldi -} - - -template -void givens_rotation(std::shared_ptr exec, - matrix::Dense* givens_sin, - matrix::Dense* givens_cos, - matrix::Dense* hessenberg_iter, - matrix::Dense>* residual_norm, - matrix::Dense* residual_norm_collection, - size_type iter, const array* stop_status) -{ - // TODO: tune block_size for optimal performance - constexpr auto block_size = default_block_size; - const auto num_cols = hessenberg_iter->get_size()[1]; - const auto block_dim = block_size; - const auto grid_dim = - static_cast(ceildiv(num_cols, block_size)); - - givens_rotation_kernel<<>>( - hessenberg_iter->get_size()[0], hessenberg_iter->get_size()[1], iter, - as_cuda_type(hessenberg_iter->get_values()), - hessenberg_iter->get_stride(), as_cuda_type(givens_sin->get_values()), - givens_sin->get_stride(), as_cuda_type(givens_cos->get_values()), - givens_cos->get_stride(), as_cuda_type(residual_norm->get_values()), - as_cuda_type(residual_norm_collection->get_values()), - residual_norm_collection->get_stride(), - as_cuda_type(stop_status->get_const_data())); -} - - -template -void step_1(std::shared_ptr exec, size_type num_rows, - matrix::Dense* givens_sin, - matrix::Dense* givens_cos, - matrix::Dense>* residual_norm, - matrix::Dense* residual_norm_collection, - matrix::Dense* krylov_bases, - matrix::Dense* hessenberg_iter, size_type iter, - array* final_iter_nums, - const array* stop_status) -{ - increase_final_iteration_numbers_kernel<<< - static_cast( - ceildiv(final_iter_nums->get_num_elems(), default_block_size)), - default_block_size>>>(as_cuda_type(final_iter_nums->get_data()), - as_cuda_type(stop_status->get_const_data()), - final_iter_nums->get_num_elems()); - finish_arnoldi(exec, num_rows, krylov_bases, hessenberg_iter, iter, - stop_status->get_const_data()); - givens_rotation(exec, givens_sin, givens_cos, hessenberg_iter, - residual_norm, residual_norm_collection, iter, stop_status); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GMRES_STEP_1_KERNEL); - - -template -void solve_upper_triangular( - const matrix::Dense* residual_norm_collection, - const matrix::Dense* hessenberg, matrix::Dense* y, - const array* final_iter_nums) -{ - // TODO: tune block_size for optimal performance - constexpr auto block_size = default_block_size; - const auto num_rhs = residual_norm_collection->get_size()[1]; - const auto block_dim = block_size; - const auto grid_dim = - static_cast(ceildiv(num_rhs, block_size)); - - solve_upper_triangular_kernel<<>>( - hessenberg->get_size()[1], num_rhs, - as_cuda_type(residual_norm_collection->get_const_values()), - residual_norm_collection->get_stride(), - as_cuda_type(hessenberg->get_const_values()), hessenberg->get_stride(), - as_cuda_type(y->get_values()), y->get_stride(), - as_cuda_type(final_iter_nums->get_const_data())); -} - - -template -void calculate_qy(const matrix::Dense* krylov_bases, - const matrix::Dense* y, - matrix::Dense* before_preconditioner, - const array* final_iter_nums) -{ - const auto num_rows = before_preconditioner->get_size()[0]; - const auto num_cols = krylov_bases->get_size()[1]; - const auto num_rhs = before_preconditioner->get_size()[1]; - const auto stride_before_preconditioner = - before_preconditioner->get_stride(); - - constexpr auto block_size = default_block_size; - const auto grid_dim = static_cast( - ceildiv(num_rows * stride_before_preconditioner, block_size)); - const auto block_dim = block_size; - - - calculate_Qy_kernel<<>>( - num_rows, num_cols, num_rhs, - as_cuda_type(krylov_bases->get_const_values()), - krylov_bases->get_stride(), as_cuda_type(y->get_const_values()), - y->get_stride(), as_cuda_type(before_preconditioner->get_values()), - stride_before_preconditioner, - as_cuda_type(final_iter_nums->get_const_data())); - // Calculate qy - // before_preconditioner = krylov_bases * y -} - - -template -void step_2(std::shared_ptr exec, - const matrix::Dense* residual_norm_collection, - const matrix::Dense* krylov_bases, - const matrix::Dense* hessenberg, - matrix::Dense* y, - matrix::Dense* before_preconditioner, - const array* final_iter_nums) -{ - solve_upper_triangular(residual_norm_collection, hessenberg, y, - final_iter_nums); - calculate_qy(krylov_bases, y, before_preconditioner, final_iter_nums); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GMRES_STEP_2_KERNEL); - - -} // namespace gmres -} // namespace cuda -} // namespace kernels -} // namespace gko diff --git a/dpcpp/CMakeLists.txt b/dpcpp/CMakeLists.txt index c1fae0ede26..55d708a8980 100644 --- a/dpcpp/CMakeLists.txt +++ b/dpcpp/CMakeLists.txt @@ -49,7 +49,6 @@ target_sources(ginkgo_dpcpp preconditioner/jacobi_kernels.dp.cpp preconditioner/jacobi_simple_apply_kernel.dp.cpp reorder/rcm_kernels.dp.cpp - solver/gmres_kernels.dp.cpp solver/cb_gmres_kernels.dp.cpp solver/idr_kernels.dp.cpp solver/lower_trs_kernels.dp.cpp diff --git a/dpcpp/solver/cb_gmres_kernels.dp.cpp b/dpcpp/solver/cb_gmres_kernels.dp.cpp index 539f35f3c13..72b496c0eb3 100644 --- a/dpcpp/solver/cb_gmres_kernels.dp.cpp +++ b/dpcpp/solver/cb_gmres_kernels.dp.cpp @@ -99,11 +99,10 @@ GKO_ENABLE_DEFAULT_HOST(zero_matrix_kernel, zero_matrix_kernel); // Must be called with at least `num_rows * stride_krylov` threads in total. template -void initialize_2_1_kernel(size_type num_rows, size_type num_rhs, - size_type krylov_dim, Accessor3d krylov_bases, - ValueType* __restrict__ residual_norm_collection, - size_type stride_residual_nc, - sycl::nd_item<3> item_ct1) +void restart_1_kernel(size_type num_rows, size_type num_rhs, + size_type krylov_dim, Accessor3d krylov_bases, + ValueType* __restrict__ residual_norm_collection, + size_type stride_residual_nc, sycl::nd_item<3> item_ct1) { const auto global_id = thread::get_thread_id_flat(item_ct1); const auto krylov_stride = @@ -130,17 +129,16 @@ void initialize_2_1_kernel(size_type num_rows, size_type num_rhs, } template -void initialize_2_1_kernel(dim3 grid, dim3 block, - size_type dynamic_shared_memory, sycl::queue* queue, - size_type num_rows, size_type num_rhs, - size_type krylov_dim, Accessor3d krylov_bases, - ValueType* residual_norm_collection, - size_type stride_residual_nc) +void restart_1_kernel(dim3 grid, dim3 block, size_type dynamic_shared_memory, + sycl::queue* queue, size_type num_rows, size_type num_rhs, + size_type krylov_dim, Accessor3d krylov_bases, + ValueType* residual_norm_collection, + size_type stride_residual_nc) { queue->submit([&](sycl::handler& cgh) { cgh.parallel_for( sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { - initialize_2_1_kernel( + restart_1_kernel( num_rows, num_rhs, krylov_dim, krylov_bases, residual_norm_collection, stride_residual_nc, item_ct1); }); @@ -150,7 +148,7 @@ void initialize_2_1_kernel(dim3 grid, dim3 block, // Must be called with at least `num_rows * num_rhs` threads in total. template -void initialize_2_2_kernel( +void restart_2_kernel( size_type num_rows, size_type num_rhs, const ValueType* __restrict__ residual, size_type stride_residual, const remove_complex* __restrict__ residual_norm, @@ -179,18 +177,18 @@ void initialize_2_2_kernel( } template -void initialize_2_2_kernel( - dim3 grid, dim3 block, size_type dynamic_shared_memory, sycl::queue* queue, - size_type num_rows, size_type num_rhs, const ValueType* residual, - size_type stride_residual, const remove_complex* residual_norm, - ValueType* residual_norm_collection, Accessor3d krylov_bases, - ValueType* next_krylov_basis, size_type stride_next_krylov, - size_type* final_iter_nums) +void restart_2_kernel(dim3 grid, dim3 block, size_type dynamic_shared_memory, + sycl::queue* queue, size_type num_rows, size_type num_rhs, + const ValueType* residual, size_type stride_residual, + const remove_complex* residual_norm, + ValueType* residual_norm_collection, + Accessor3d krylov_bases, ValueType* next_krylov_basis, + size_type stride_next_krylov, size_type* final_iter_nums) { queue->submit([&](sycl::handler& cgh) { cgh.parallel_for( sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { - initialize_2_2_kernel( + restart_2_kernel( num_rows, num_rhs, residual, stride_residual, residual_norm, residual_norm_collection, krylov_bases, next_krylov_basis, stride_next_krylov, final_iter_nums, item_ct1); @@ -959,12 +957,12 @@ void zero_matrix(std::shared_ptr exec, size_type m, template -void initialize_1(std::shared_ptr exec, - const matrix::Dense* b, - matrix::Dense* residual, - matrix::Dense* givens_sin, - matrix::Dense* givens_cos, - array* stop_status, size_type krylov_dim) +void initialize(std::shared_ptr exec, + const matrix::Dense* b, + matrix::Dense* residual, + matrix::Dense* givens_sin, + matrix::Dense* givens_cos, + array* stop_status, size_type krylov_dim) { const auto num_threads = std::max(b->get_size()[0] * b->get_stride(), krylov_dim * b->get_size()[1]); @@ -972,7 +970,7 @@ void initialize_1(std::shared_ptr exec, const dim3 block_dim(default_block_size, 1, 1); constexpr auto block_size = default_block_size; - initialize_1_kernel( + initialize_kernel( grid_dim, block_dim, 0, exec->get_queue(), b->get_size()[0], b->get_size()[1], krylov_dim, b->get_const_values(), b->get_stride(), residual->get_values(), residual->get_stride(), @@ -981,19 +979,19 @@ void initialize_1(std::shared_ptr exec, stop_status->get_data()); } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CB_GMRES_INITIALIZE_1_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CB_GMRES_INITIALIZE_KERNEL); template -void initialize_2(std::shared_ptr exec, - const matrix::Dense* residual, - matrix::Dense>* residual_norm, - matrix::Dense* residual_norm_collection, - matrix::Dense>* arnoldi_norm, - Accessor3d krylov_bases, - matrix::Dense* next_krylov_basis, - array* final_iter_nums, array& tmp, - size_type krylov_dim) +void restart(std::shared_ptr exec, + const matrix::Dense* residual, + matrix::Dense>* residual_norm, + matrix::Dense* residual_norm_collection, + matrix::Dense>* arnoldi_norm, + Accessor3d krylov_bases, + matrix::Dense* next_krylov_basis, + array* final_iter_nums, array& reduction_tmp, + size_type krylov_dim) { constexpr bool use_scalar = gko::cb_gmres::detail::has_3d_scaled_accessor::value; @@ -1008,13 +1006,13 @@ void initialize_2(std::shared_ptr exec, constexpr auto block_size = default_block_size; const auto stride_arnoldi = arnoldi_norm->get_stride(); - initialize_2_1_kernel( + restart_1_kernel( grid_dim_1, block_dim, 0, exec->get_queue(), residual->get_size()[0], residual->get_size()[1], krylov_dim, krylov_bases, residual_norm_collection->get_values(), residual_norm_collection->get_stride()); kernels::dpcpp::dense::compute_norm2_dispatch(exec, residual, residual_norm, - tmp); + reduction_tmp); if (use_scalar) { components::fill_array(exec, @@ -1042,7 +1040,7 @@ void initialize_2(std::shared_ptr exec, ceildiv(std::max(num_rows, 1) * krylov_stride[1], default_block_size), 1, 1); - initialize_2_2_kernel( + restart_2_kernel( grid_dim_2, block_dim, 0, exec->get_queue(), residual->get_size()[0], residual->get_size()[1], residual->get_const_values(), residual->get_stride(), residual_norm->get_const_values(), @@ -1051,8 +1049,7 @@ void initialize_2(std::shared_ptr exec, final_iter_nums->get_data()); } -GKO_INSTANTIATE_FOR_EACH_CB_GMRES_TYPE( - GKO_DECLARE_CB_GMRES_INITIALIZE_2_KERNEL); +GKO_INSTANTIATE_FOR_EACH_CB_GMRES_TYPE(GKO_DECLARE_CB_GMRES_RESTART_KERNEL); template @@ -1244,18 +1241,19 @@ void givens_rotation(std::shared_ptr exec, template -void step_1(std::shared_ptr exec, - matrix::Dense* next_krylov_basis, - matrix::Dense* givens_sin, - matrix::Dense* givens_cos, - matrix::Dense>* residual_norm, - matrix::Dense* residual_norm_collection, - Accessor3d krylov_bases, matrix::Dense* hessenberg_iter, - matrix::Dense* buffer_iter, - matrix::Dense>* arnoldi_norm, - size_type iter, array* final_iter_nums, - const array* stop_status, - array* reorth_status, array* num_reorth) +void arnoldi(std::shared_ptr exec, + matrix::Dense* next_krylov_basis, + matrix::Dense* givens_sin, + matrix::Dense* givens_cos, + matrix::Dense>* residual_norm, + matrix::Dense* residual_norm_collection, + Accessor3d krylov_bases, matrix::Dense* hessenberg_iter, + matrix::Dense* buffer_iter, + matrix::Dense>* arnoldi_norm, + size_type iter, array* final_iter_nums, + const array* stop_status, + array* reorth_status, + array* num_reorth) { increase_final_iteration_numbers_kernel( static_cast( @@ -1270,7 +1268,7 @@ void step_1(std::shared_ptr exec, residual_norm, residual_norm_collection, iter, stop_status); } -GKO_INSTANTIATE_FOR_EACH_CB_GMRES_TYPE(GKO_DECLARE_CB_GMRES_STEP_1_KERNEL); +GKO_INSTANTIATE_FOR_EACH_CB_GMRES_TYPE(GKO_DECLARE_CB_GMRES_ARNOLDI_KERNEL); template @@ -1327,13 +1325,13 @@ void calculate_qy(std::shared_ptr exec, template -void step_2(std::shared_ptr exec, - const matrix::Dense* residual_norm_collection, - ConstAccessor3d krylov_bases, - const matrix::Dense* hessenberg, - matrix::Dense* y, - matrix::Dense* before_preconditioner, - const array* final_iter_nums) +void solve_krylov(std::shared_ptr exec, + const matrix::Dense* residual_norm_collection, + ConstAccessor3d krylov_bases, + const matrix::Dense* hessenberg, + matrix::Dense* y, + matrix::Dense* before_preconditioner, + const array* final_iter_nums) { if (before_preconditioner->get_size()[1] == 0) { return; @@ -1350,7 +1348,7 @@ void step_2(std::shared_ptr exec, } GKO_INSTANTIATE_FOR_EACH_CB_GMRES_CONST_TYPE( - GKO_DECLARE_CB_GMRES_STEP_2_KERNEL); + GKO_DECLARE_CB_GMRES_SOLVE_KRYLOV_KERNEL); } // namespace cb_gmres diff --git a/dpcpp/solver/common_gmres_kernels.dp.inc b/dpcpp/solver/common_gmres_kernels.dp.inc index b850e60f09b..2005cc618ce 100644 --- a/dpcpp/solver/common_gmres_kernels.dp.inc +++ b/dpcpp/solver/common_gmres_kernels.dp.inc @@ -34,7 +34,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // Must be called with at least `max(stride_b * num_rows, krylov_dim * // num_cols)` threads in total. template -void initialize_1_kernel( +void initialize_kernel( size_type num_rows, size_type num_cols, size_type krylov_dim, const ValueType *__restrict__ b, size_type stride_b, ValueType *__restrict__ residual, size_type stride_residual, @@ -66,7 +66,7 @@ void initialize_1_kernel( } template -void initialize_1_kernel(dim3 grid, dim3 block, size_type dynamic_shared_memory, +void initialize_kernel(dim3 grid, dim3 block, size_type dynamic_shared_memory, sycl::queue *queue, size_type num_rows, size_type num_cols, size_type krylov_dim, const ValueType *b, size_type stride_b, @@ -78,7 +78,7 @@ void initialize_1_kernel(dim3 grid, dim3 block, size_type dynamic_shared_memory, queue->submit([&](sycl::handler &cgh) { cgh.parallel_for( sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { - initialize_1_kernel( + initialize_kernel( num_rows, num_cols, krylov_dim, b, stride_b, residual, stride_residual, givens_sin, stride_sin, givens_cos, stride_cos, stop_status, item_ct1); diff --git a/dpcpp/solver/gmres_kernels.dp.cpp b/dpcpp/solver/gmres_kernels.dp.cpp deleted file mode 100644 index dd207fce53e..00000000000 --- a/dpcpp/solver/gmres_kernels.dp.cpp +++ /dev/null @@ -1,698 +0,0 @@ -/************************************************************* -Copyright (c) 2017-2022, the Ginkgo authors -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions -are met: - -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. - -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in the -documentation and/or other materials provided with the distribution. - -3. Neither the name of the copyright holder nor the names of its -contributors may be used to endorse or promote products derived from -this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS -IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED -TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A -PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*************************************************************/ - -#include "core/solver/gmres_kernels.hpp" - - -#include - - -#include - - -#include -#include -#include -#include - - -#include "core/components/fill_array_kernels.hpp" -#include "core/matrix/dense_kernels.hpp" -#include "dpcpp/base/config.hpp" -#include "dpcpp/base/dim3.dp.hpp" -#include "dpcpp/base/onemkl_bindings.hpp" -#include "dpcpp/components/atomic.dp.hpp" -#include "dpcpp/components/cooperative_groups.dp.hpp" -#include "dpcpp/components/reduction.dp.hpp" -#include "dpcpp/components/thread_ids.dp.hpp" -#include "dpcpp/components/uninitialized_array.hpp" - - -namespace gko { -namespace kernels { -namespace dpcpp { -/** - * @brief The GMRES solver namespace. - * - * @ingroup gmres - */ -namespace gmres { - - -constexpr int default_block_size = 256; -constexpr int default_dot_dim = 16; -constexpr int default_dot_size = default_dot_dim * default_dot_dim; - - -#include "dpcpp/solver/common_gmres_kernels.dp.inc" - - -// Must be called with at least `num_rows * num_rhs` threads in total. -template -void initialize_2_2_kernel( - size_type num_rows, size_type num_rhs, - const ValueType* __restrict__ residual, size_type stride_residual, - const remove_complex* __restrict__ residual_norm, - ValueType* __restrict__ residual_norm_collection, - ValueType* __restrict__ krylov_bases, size_type stride_krylov, - size_type* __restrict__ final_iter_nums, sycl::nd_item<3> item_ct1) -{ - const auto global_id = thread::get_thread_id_flat(item_ct1); - const auto row_idx = global_id / num_rhs; - const auto col_idx = global_id % num_rhs; - - if (global_id < num_rhs) { - residual_norm_collection[global_id] = residual_norm[global_id]; - final_iter_nums[global_id] = 0; - } - - if (row_idx < num_rows && col_idx < num_rhs) { - auto value = residual[row_idx * stride_residual + col_idx] / - residual_norm[col_idx]; - krylov_bases[row_idx * stride_krylov + col_idx] = value; - } -} - -template -void initialize_2_2_kernel(dim3 grid, dim3 block, - size_type dynamic_shared_memory, sycl::queue* queue, - size_type num_rows, size_type num_rhs, - const ValueType* residual, size_type stride_residual, - const remove_complex* residual_norm, - ValueType* residual_norm_collection, - ValueType* krylov_bases, size_type stride_krylov, - size_type* final_iter_nums) -{ - if (num_rhs == 0) { - return; - } - queue->submit([&](sycl::handler& cgh) { - cgh.parallel_for( - sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { - initialize_2_2_kernel( - num_rows, num_rhs, residual, stride_residual, residual_norm, - residual_norm_collection, krylov_bases, stride_krylov, - final_iter_nums, item_ct1); - }); - }); -} - - -void increase_final_iteration_numbers_kernel( - size_type* __restrict__ final_iter_nums, - const stopping_status* __restrict__ stop_status, size_type total_number, - sycl::nd_item<3> item_ct1) -{ - const auto global_id = thread::get_thread_id_flat(item_ct1); - if (global_id < total_number) { - final_iter_nums[global_id] += !stop_status[global_id].has_stopped(); - } -} - -GKO_ENABLE_DEFAULT_HOST(increase_final_iteration_numbers_kernel, - increase_final_iteration_numbers_kernel); - - -template -void multidot_kernel( - size_type k, size_type num_rows, size_type num_cols, - const ValueType* __restrict__ krylov_bases, - const ValueType* __restrict__ next_krylov_basis, size_type stride_krylov, - ValueType* __restrict__ hessenberg_iter, size_type stride_hessenberg, - const stopping_status* __restrict__ stop_status, sycl::nd_item<3> item_ct1, - uninitialized_array* - reduction_helper_array) -{ - const auto tidx = item_ct1.get_local_id(2); - const auto tidy = item_ct1.get_local_id(1); - const auto col_idx = item_ct1.get_group(2) * default_dot_dim + tidx; - const auto num = ceildiv(num_rows, item_ct1.get_group_range(1)); - const auto start_row = item_ct1.get_group(1) * num; - const auto end_row = ((item_ct1.get_group(1) + 1) * num > num_rows) - ? num_rows - : (item_ct1.get_group(1) + 1) * num; - - // Used that way to get around dynamic initialization warning and - // template error when using `reduction_helper_array` directly in `reduce` - ValueType* __restrict__ reduction_helper = (*reduction_helper_array); - - ValueType local_res = zero(); - if (col_idx < num_cols && !stop_status[col_idx].has_stopped()) { - for (size_type i = start_row + tidy; i < end_row; - i += default_dot_dim) { - const auto krylov_idx = i * stride_krylov + col_idx; - local_res += - conj(krylov_bases[krylov_idx]) * next_krylov_basis[krylov_idx]; - } - } - reduction_helper[tidx * (default_dot_dim + 1) + tidy] = local_res; - item_ct1.barrier(sycl::access::fence_space::local_space); - local_res = reduction_helper[tidy * (default_dot_dim + 1) + tidx]; - const auto tile_block = group::tiled_partition( - group::this_thread_block(item_ct1)); - const auto sum = ::gko::kernels::dpcpp::reduce( - tile_block, local_res, - [](const ValueType& a, const ValueType& b) { return a + b; }); - const auto new_col_idx = item_ct1.get_group(2) * default_dot_dim + tidy; - if (tidx == 0 && new_col_idx < num_cols && - !stop_status[new_col_idx].has_stopped()) { - const auto hessenberg_idx = k * stride_hessenberg + new_col_idx; - atomic_add(hessenberg_iter + hessenberg_idx, sum); - } -} - -template -void multidot_kernel(dim3 grid, dim3 block, size_type dynamic_shared_memory, - sycl::queue* queue, size_type k, size_type num_rows, - size_type num_cols, const ValueType* krylov_bases, - const ValueType* next_krylov_basis, - size_type stride_krylov, ValueType* hessenberg_iter, - size_type stride_hessenberg, - const stopping_status* stop_status) -{ - queue->submit([&](sycl::handler& cgh) { - sycl::accessor, - 0, sycl::access_mode::read_write, - sycl::access::target::local> - reduction_helper_array_acc_ct1(cgh); - - cgh.parallel_for( - sycl_nd_range(grid, block), [= - ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size( - default_dot_dim)]] { - multidot_kernel( - k, num_rows, num_cols, krylov_bases, next_krylov_basis, - stride_krylov, hessenberg_iter, stride_hessenberg, - stop_status, item_ct1, - (uninitialized_array*) - reduction_helper_array_acc_ct1.get_pointer()); - }); - }); -} - - -// Must be called with at least `num_rows * stride_next_krylov` threads in -// total. -template -void update_next_krylov_kernel( - size_type k, size_type num_rows, size_type num_cols, - const ValueType* __restrict__ krylov_bases, - ValueType* __restrict__ next_krylov_basis, size_type stride_krylov, - const ValueType* __restrict__ hessenberg_iter, size_type stride_hessenberg, - const stopping_status* __restrict__ stop_status, sycl::nd_item<3> item_ct1) -{ - const auto global_id = thread::get_thread_id_flat(item_ct1); - const auto row_idx = global_id / stride_krylov; - const auto col_idx = global_id % stride_krylov; - - if (row_idx < num_rows && col_idx < num_cols && - !stop_status[col_idx].has_stopped()) { - const auto next_krylov_idx = row_idx * stride_krylov + col_idx; - const auto krylov_idx = row_idx * stride_krylov + col_idx; - const auto hessenberg_idx = k * stride_hessenberg + col_idx; - - next_krylov_basis[next_krylov_idx] -= - hessenberg_iter[hessenberg_idx] * krylov_bases[krylov_idx]; - } -} - -template -void update_next_krylov_kernel( - dim3 grid, dim3 block, size_type dynamic_shared_memory, sycl::queue* queue, - size_type k, size_type num_rows, size_type num_cols, - const ValueType* krylov_bases, ValueType* next_krylov_basis, - size_type stride_krylov, const ValueType* hessenberg_iter, - size_type stride_hessenberg, const stopping_status* stop_status) -{ - if (stride_krylov == 0) { - return; - } - queue->submit([&](sycl::handler& cgh) { - cgh.parallel_for( - sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { - update_next_krylov_kernel( - k, num_rows, num_cols, krylov_bases, next_krylov_basis, - stride_krylov, hessenberg_iter, stride_hessenberg, - stop_status, item_ct1); - }); - }); -} - - -// Must be called with at least `num_cols` blocks, each with `block_size` -// threads. `block_size` must be a power of 2. -template -void update_hessenberg_2_kernel( - size_type iter, size_type num_rows, size_type num_cols, - const ValueType* __restrict__ next_krylov_basis, - size_type stride_next_krylov, ValueType* __restrict__ hessenberg_iter, - size_type stride_hessenberg, - const stopping_status* __restrict__ stop_status, sycl::nd_item<3> item_ct1, - uninitialized_array& reduction_helper_array) -{ - const auto tidx = item_ct1.get_local_id(2); - const auto col_idx = item_ct1.get_group(2); - - // Used that way to get around dynamic initialization warning and - // template error when using `reduction_helper_array` directly in `reduce` - - ValueType* __restrict__ reduction_helper = reduction_helper_array; - - if (col_idx < num_cols && !stop_status[col_idx].has_stopped()) { - ValueType local_res{}; - for (size_type i = tidx; i < num_rows; i += block_size) { - const auto next_krylov_idx = i * stride_next_krylov + col_idx; - const auto next_krylov_value = next_krylov_basis[next_krylov_idx]; - - local_res += next_krylov_value * next_krylov_value; - } - - reduction_helper[tidx] = local_res; - - // Perform thread block reduction. Result is in reduction_helper[0] - ::gko::kernels::dpcpp::reduce( - group::this_thread_block(item_ct1), reduction_helper, - [](const ValueType& a, const ValueType& b) { return a + b; }); - - if (tidx == 0) { - hessenberg_iter[(iter + 1) * stride_hessenberg + col_idx] = - std::sqrt(reduction_helper[0]); - } - } -} - -template -void update_hessenberg_2_kernel( - dim3 grid, dim3 block, size_type dynamic_shared_memory, sycl::queue* queue, - size_type iter, size_type num_rows, size_type num_cols, - const ValueType* next_krylov_basis, size_type stride_next_krylov, - ValueType* hessenberg_iter, size_type stride_hessenberg, - const stopping_status* stop_status) -{ - queue->submit([&](sycl::handler& cgh) { - sycl::accessor, 0, - sycl::access_mode::read_write, - sycl::access::target::local> - reduction_helper_array_acc_ct1(cgh); - - cgh.parallel_for( - sycl_nd_range(grid, block), [= - ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size( - config::warp_size)]] { - update_hessenberg_2_kernel( - iter, num_rows, num_cols, next_krylov_basis, - stride_next_krylov, hessenberg_iter, stride_hessenberg, - stop_status, item_ct1, - *reduction_helper_array_acc_ct1.get_pointer()); - }); - }); -} - - -// Must be called with at least `num_rows * stride_krylov` threads in -// total. -template -void update_krylov_kernel( - size_type iter, size_type num_rows, size_type num_cols, - ValueType* __restrict__ krylov_bases, size_type stride_krylov, - const ValueType* __restrict__ hessenberg_iter, size_type stride_hessenberg, - const stopping_status* __restrict__ stop_status, sycl::nd_item<3> item_ct1) -{ - const auto global_id = thread::get_thread_id_flat(item_ct1); - const auto row_idx = global_id / stride_krylov; - const auto col_idx = global_id % stride_krylov; - const auto hessenberg = - hessenberg_iter[(iter + 1) * stride_hessenberg + col_idx]; - - if (row_idx < num_rows && col_idx < num_cols && - !stop_status[col_idx].has_stopped()) { - const auto krylov_idx = row_idx * stride_krylov + col_idx; - - krylov_bases[krylov_idx] /= hessenberg; - } -} - -template -void update_krylov_kernel(dim3 grid, dim3 block, - size_type dynamic_shared_memory, sycl::queue* queue, - size_type iter, size_type num_rows, - size_type num_cols, ValueType* krylov_bases, - size_type stride_krylov, - const ValueType* hessenberg_iter, - size_type stride_hessenberg, - const stopping_status* stop_status) -{ - queue->submit([&](sycl::handler& cgh) { - cgh.parallel_for( - sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { - update_krylov_kernel( - iter, num_rows, num_cols, krylov_bases, stride_krylov, - hessenberg_iter, stride_hessenberg, stop_status, item_ct1); - }); - }); -} - - -// Must be called with at least `stride_preconditioner * num_rows` threads in -// total. -template -void calculate_Qy_kernel(size_type num_rows, size_type num_cols, - size_type num_rhs, - const ValueType* __restrict__ krylov_bases, - size_type stride_krylov, - const ValueType* __restrict__ y, size_type stride_y, - ValueType* __restrict__ before_preconditioner, - size_type stride_preconditioner, - const size_type* __restrict__ final_iter_nums, - sycl::nd_item<3> item_ct1) -{ - const auto global_id = thread::get_thread_id_flat(item_ct1); - const auto row_id = global_id / stride_preconditioner; - const auto col_id = global_id % stride_preconditioner; - - if (row_id < num_rows && col_id < num_cols) { - ValueType temp = zero(); - - for (size_type j = 0; j < final_iter_nums[col_id]; ++j) { - temp += - krylov_bases[(row_id + j * num_rows) * stride_krylov + col_id] * - y[j * stride_y + col_id]; - } - before_preconditioner[global_id] = temp; - } -} - -template -void calculate_Qy_kernel(dim3 grid, dim3 block, size_type dynamic_shared_memory, - sycl::queue* queue, size_type num_rows, - size_type num_cols, size_type num_rhs, - const ValueType* krylov_bases, size_type stride_krylov, - const ValueType* y, size_type stride_y, - ValueType* before_preconditioner, - size_type stride_preconditioner, - const size_type* final_iter_nums) -{ - if (stride_preconditioner == 0) { - return; - } - queue->submit([&](sycl::handler& cgh) { - cgh.parallel_for( - sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { - calculate_Qy_kernel( - num_rows, num_cols, num_rhs, krylov_bases, stride_krylov, y, - stride_y, before_preconditioner, stride_preconditioner, - final_iter_nums, item_ct1); - }); - }); -} - - -template -void initialize_1(std::shared_ptr exec, - const matrix::Dense* b, - matrix::Dense* residual, - matrix::Dense* givens_sin, - matrix::Dense* givens_cos, - array* stop_status, size_type krylov_dim) -{ - const auto num_threads = std::max(b->get_size()[0] * b->get_stride(), - krylov_dim * b->get_size()[1]); - const dim3 grid_dim(ceildiv(num_threads, default_block_size), 1, 1); - const dim3 block_dim(default_block_size, 1, 1); - constexpr auto block_size = default_block_size; - - initialize_1_kernel( - grid_dim, block_dim, 0, exec->get_queue(), b->get_size()[0], - b->get_size()[1], krylov_dim, b->get_const_values(), b->get_stride(), - residual->get_values(), residual->get_stride(), - givens_sin->get_values(), givens_sin->get_stride(), - givens_cos->get_values(), givens_cos->get_stride(), - stop_status->get_data()); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GMRES_INITIALIZE_1_KERNEL); - - -template -void initialize_2(std::shared_ptr exec, - const matrix::Dense* residual, - matrix::Dense>* residual_norm, - matrix::Dense* residual_norm_collection, - matrix::Dense* krylov_bases, - array* final_iter_nums, array& tmp, - size_type krylov_dim) -{ - const auto num_rows = residual->get_size()[0]; - const auto num_rhs = residual->get_size()[1]; - const dim3 grid_dim_1( - ceildiv(krylov_bases->get_size()[0] * krylov_bases->get_stride(), - default_block_size), - 1, 1); - const dim3 block_dim(default_block_size, 1, 1); - constexpr auto block_size = default_block_size; - - kernels::dpcpp::dense::compute_norm2_dispatch(exec, residual, residual_norm, - tmp); - - const dim3 grid_dim_2( - ceildiv(std::max(num_rows, 1) * num_rhs, default_block_size), - 1, 1); - initialize_2_2_kernel( - grid_dim_2, block_dim, 0, exec->get_queue(), residual->get_size()[0], - residual->get_size()[1], residual->get_const_values(), - residual->get_stride(), residual_norm->get_const_values(), - residual_norm_collection->get_values(), krylov_bases->get_values(), - krylov_bases->get_stride(), final_iter_nums->get_data()); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GMRES_INITIALIZE_2_KERNEL); - - -template -void finish_arnoldi(std::shared_ptr exec, - size_type num_rows, matrix::Dense* krylov_bases, - matrix::Dense* hessenberg_iter, size_type iter, - const stopping_status* stop_status) -{ - if (hessenberg_iter->get_size()[1] == 0) { - return; - } - const auto stride_krylov = krylov_bases->get_stride(); - const auto stride_hessenberg = hessenberg_iter->get_stride(); - // auto cublas_handle = exec->get_cublas_handle(); - const dim3 grid_size( - ceildiv(hessenberg_iter->get_size()[1], default_dot_dim), - exec->get_num_computing_units() * 2); - const dim3 block_size(default_dot_dim, default_dot_dim); - auto next_krylov_basis = - krylov_bases->get_values() + - (iter + 1) * num_rows * hessenberg_iter->get_size()[1]; - for (size_type k = 0; k < iter + 1; ++k) { - const auto k_krylov_bases = - krylov_bases->get_const_values() + - k * num_rows * hessenberg_iter->get_size()[1]; - if (hessenberg_iter->get_size()[1] > 1) { - // TODO: this condition should be tuned - // single rhs will use vendor's dot, otherwise, use our own - // multidot_kernel which parallelize multiple rhs. - components::fill_array( - exec, hessenberg_iter->get_values() + k * stride_hessenberg, - hessenberg_iter->get_size()[1], zero()); - multidot_kernel(grid_size, block_size, 0, exec->get_queue(), k, - num_rows, hessenberg_iter->get_size()[1], - k_krylov_bases, next_krylov_basis, stride_krylov, - hessenberg_iter->get_values(), stride_hessenberg, - stop_status); - } else { - onemkl::dot(*exec->get_queue(), num_rows, k_krylov_bases, - stride_krylov, next_krylov_basis, stride_krylov, - hessenberg_iter->get_values() + k * stride_hessenberg); - } - update_next_krylov_kernel( - ceildiv(num_rows * stride_krylov, default_block_size), - default_block_size, 0, exec->get_queue(), k, num_rows, - hessenberg_iter->get_size()[1], k_krylov_bases, next_krylov_basis, - stride_krylov, hessenberg_iter->get_const_values(), - stride_hessenberg, stop_status); - } - // for i in 1:iter - // hessenberg(iter, i) = next_krylov_basis' * krylov_bases(:, i) - // next_krylov_basis -= hessenberg(iter, i) * krylov_bases(:, i) - // end - - - update_hessenberg_2_kernel( - hessenberg_iter->get_size()[1], default_block_size, 0, - exec->get_queue(), iter, num_rows, hessenberg_iter->get_size()[1], - next_krylov_basis, stride_krylov, hessenberg_iter->get_values(), - stride_hessenberg, stop_status); - - update_krylov_kernel( - ceildiv(num_rows * stride_krylov, default_block_size), - default_block_size, 0, exec->get_queue(), iter, num_rows, - hessenberg_iter->get_size()[1], next_krylov_basis, stride_krylov, - hessenberg_iter->get_const_values(), stride_hessenberg, stop_status); - // next_krylov_basis /= hessenberg(iter, iter + 1) - // End of arnoldi -} - - -template -void givens_rotation(std::shared_ptr exec, - matrix::Dense* givens_sin, - matrix::Dense* givens_cos, - matrix::Dense* hessenberg_iter, - matrix::Dense>* residual_norm, - matrix::Dense* residual_norm_collection, - size_type iter, const array* stop_status) -{ - // TODO: tune block_size for optimal performance - constexpr auto block_size = default_block_size; - const auto num_cols = hessenberg_iter->get_size()[1]; - const dim3 block_dim{block_size, 1, 1}; - const dim3 grid_dim{ - static_cast(ceildiv(num_cols, block_size)), 1, 1}; - - givens_rotation_kernel( - grid_dim, block_dim, 0, exec->get_queue(), - hessenberg_iter->get_size()[0], hessenberg_iter->get_size()[1], iter, - hessenberg_iter->get_values(), hessenberg_iter->get_stride(), - givens_sin->get_values(), givens_sin->get_stride(), - givens_cos->get_values(), givens_cos->get_stride(), - residual_norm->get_values(), residual_norm_collection->get_values(), - residual_norm_collection->get_stride(), stop_status->get_const_data()); -} - - -template -void step_1(std::shared_ptr exec, size_type num_rows, - matrix::Dense* givens_sin, - matrix::Dense* givens_cos, - matrix::Dense>* residual_norm, - matrix::Dense* residual_norm_collection, - matrix::Dense* krylov_bases, - matrix::Dense* hessenberg_iter, size_type iter, - array* final_iter_nums, - const array* stop_status) -{ - increase_final_iteration_numbers_kernel( - static_cast( - ceildiv(final_iter_nums->get_num_elems(), default_block_size)), - default_block_size, 0, exec->get_queue(), final_iter_nums->get_data(), - stop_status->get_const_data(), final_iter_nums->get_num_elems()); - finish_arnoldi(exec, num_rows, krylov_bases, hessenberg_iter, iter, - stop_status->get_const_data()); - givens_rotation(exec, givens_sin, givens_cos, hessenberg_iter, - residual_norm, residual_norm_collection, iter, stop_status); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GMRES_STEP_1_KERNEL); - - -template -void solve_upper_triangular( - std::shared_ptr exec, - const matrix::Dense* residual_norm_collection, - const matrix::Dense* hessenberg, matrix::Dense* y, - const array* final_iter_nums) -{ - // TODO: tune block_size for optimal performance - constexpr auto block_size = default_block_size; - const auto num_rhs = residual_norm_collection->get_size()[1]; - const dim3 block_dim{block_size, 1, 1}; - const dim3 grid_dim{static_cast(ceildiv(num_rhs, block_size)), - 1, 1}; - - solve_upper_triangular_kernel( - grid_dim, block_dim, 0, exec->get_queue(), hessenberg->get_size()[1], - num_rhs, residual_norm_collection->get_const_values(), - residual_norm_collection->get_stride(), hessenberg->get_const_values(), - hessenberg->get_stride(), y->get_values(), y->get_stride(), - final_iter_nums->get_const_data()); -} - - -template -void calculate_qy(std::shared_ptr exec, - const matrix::Dense* krylov_bases, - const matrix::Dense* y, - matrix::Dense* before_preconditioner, - const array* final_iter_nums) -{ - const auto num_rows = before_preconditioner->get_size()[0]; - const auto num_cols = krylov_bases->get_size()[1]; - const auto num_rhs = before_preconditioner->get_size()[1]; - const auto stride_before_preconditioner = - before_preconditioner->get_stride(); - - constexpr auto block_size = default_block_size; - const dim3 grid_dim{ - static_cast( - ceildiv(num_rows * stride_before_preconditioner, block_size)), - 1, 1}; - const dim3 block_dim{block_size, 1, 1}; - - - calculate_Qy_kernel( - grid_dim, block_dim, 0, exec->get_queue(), num_rows, num_cols, num_rhs, - krylov_bases->get_const_values(), krylov_bases->get_stride(), - y->get_const_values(), y->get_stride(), - before_preconditioner->get_values(), stride_before_preconditioner, - final_iter_nums->get_const_data()); - // Calculate qy - // before_preconditioner = krylov_bases * y -} - - -template -void step_2(std::shared_ptr exec, - const matrix::Dense* residual_norm_collection, - const matrix::Dense* krylov_bases, - const matrix::Dense* hessenberg, - matrix::Dense* y, - matrix::Dense* before_preconditioner, - const array* final_iter_nums) -{ - solve_upper_triangular(exec, residual_norm_collection, hessenberg, y, - final_iter_nums); - calculate_qy(exec, krylov_bases, y, before_preconditioner, final_iter_nums); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GMRES_STEP_2_KERNEL); - - -} // namespace gmres -} // namespace dpcpp -} // namespace kernels -} // namespace gko diff --git a/hip/CMakeLists.txt b/hip/CMakeLists.txt index 35f19e77406..f85e0027ce1 100644 --- a/hip/CMakeLists.txt +++ b/hip/CMakeLists.txt @@ -38,7 +38,6 @@ set(GINKGO_HIP_SOURCES preconditioner/jacobi_kernels.hip.cpp preconditioner/jacobi_simple_apply_kernel.hip.cpp reorder/rcm_kernels.hip.cpp - solver/gmres_kernels.hip.cpp solver/cb_gmres_kernels.hip.cpp solver/idr_kernels.hip.cpp solver/lower_trs_kernels.hip.cpp diff --git a/hip/base/kernel_launch.hip.hpp b/hip/base/kernel_launch.hip.hpp index 7d5c0a33303..fa594e4bb2d 100644 --- a/hip/base/kernel_launch.hip.hpp +++ b/hip/base/kernel_launch.hip.hpp @@ -40,6 +40,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include "accessor/hip_helper.hpp" #include "hip/base/types.hip.hpp" #include "hip/components/thread_ids.hip.hpp" @@ -49,6 +50,27 @@ namespace kernels { namespace hip { +template +struct to_device_type_impl&> { + using type = std::decay_t>()))>; + static type map_to_device(gko::acc::range& range) + { + return gko::acc::as_hip_range(range); + } +}; + +template +struct to_device_type_impl&> { + using type = std::decay_t>()))>; + static type map_to_device(const gko::acc::range& range) + { + return gko::acc::as_hip_range(range); + } +}; + + namespace device_std = thrust; diff --git a/hip/solver/cb_gmres_kernels.hip.cpp b/hip/solver/cb_gmres_kernels.hip.cpp index 3bac14b3b13..c0182ceeb2f 100644 --- a/hip/solver/cb_gmres_kernels.hip.cpp +++ b/hip/solver/cb_gmres_kernels.hip.cpp @@ -91,12 +91,12 @@ void zero_matrix(size_type m, size_type n, size_type stride, ValueType* array) template -void initialize_1(std::shared_ptr exec, - const matrix::Dense* b, - matrix::Dense* residual, - matrix::Dense* givens_sin, - matrix::Dense* givens_cos, - array* stop_status, size_type krylov_dim) +void initialize(std::shared_ptr exec, + const matrix::Dense* b, + matrix::Dense* residual, + matrix::Dense* givens_sin, + matrix::Dense* givens_cos, + array* stop_status, size_type krylov_dim) { const auto num_threads = std::max(b->get_size()[0] * b->get_stride(), krylov_dim * b->get_size()[1]); @@ -105,7 +105,7 @@ void initialize_1(std::shared_ptr exec, constexpr auto block_size = default_block_size; hipLaunchKernelGGL( - initialize_1_kernel, grid_dim, block_dim, 0, 0, + initialize_kernel, grid_dim, block_dim, 0, 0, b->get_size()[0], b->get_size()[1], krylov_dim, as_hip_type(b->get_const_values()), b->get_stride(), as_hip_type(residual->get_values()), residual->get_stride(), @@ -114,19 +114,19 @@ void initialize_1(std::shared_ptr exec, as_hip_type(stop_status->get_data())); } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CB_GMRES_INITIALIZE_1_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CB_GMRES_INITIALIZE_KERNEL); template -void initialize_2(std::shared_ptr exec, - const matrix::Dense* residual, - matrix::Dense>* residual_norm, - matrix::Dense* residual_norm_collection, - matrix::Dense>* arnoldi_norm, - Accessor3d krylov_bases, - matrix::Dense* next_krylov_basis, - array* final_iter_nums, array& tmp, - size_type krylov_dim) +void restart(std::shared_ptr exec, + const matrix::Dense* residual, + matrix::Dense>* residual_norm, + matrix::Dense* residual_norm_collection, + matrix::Dense>* arnoldi_norm, + Accessor3d krylov_bases, + matrix::Dense* next_krylov_basis, + array* final_iter_nums, array& reduction_tmp, + size_type krylov_dim) { constexpr bool use_scalar = gko::cb_gmres::detail::has_3d_scaled_accessor::value; @@ -141,13 +141,13 @@ void initialize_2(std::shared_ptr exec, constexpr auto block_size = default_block_size; const auto stride_arnoldi = arnoldi_norm->get_stride(); - hipLaunchKernelGGL(initialize_2_1_kernel, grid_dim_1, block_dim, - 0, 0, residual->get_size()[0], residual->get_size()[1], + hipLaunchKernelGGL(restart_1_kernel, grid_dim_1, block_dim, 0, + 0, residual->get_size()[0], residual->get_size()[1], krylov_dim, acc::as_hip_range(krylov_bases), as_hip_type(residual_norm_collection->get_values()), residual_norm_collection->get_stride()); kernels::hip::dense::compute_norm2_dispatch(exec, residual, residual_norm, - tmp); + reduction_tmp); if (use_scalar) { components::fill_array(exec, @@ -177,8 +177,8 @@ void initialize_2(std::shared_ptr exec, const auto grid_dim_2 = ceildiv(std::max(num_rows, 1) * krylov_stride[1], default_block_size); - hipLaunchKernelGGL(initialize_2_2_kernel, grid_dim_2, block_dim, - 0, 0, residual->get_size()[0], residual->get_size()[1], + hipLaunchKernelGGL(restart_2_kernel, grid_dim_2, block_dim, 0, + 0, residual->get_size()[0], residual->get_size()[1], as_hip_type(residual->get_const_values()), residual->get_stride(), as_hip_type(residual_norm->get_const_values()), @@ -189,8 +189,7 @@ void initialize_2(std::shared_ptr exec, as_hip_type(final_iter_nums->get_data())); } -GKO_INSTANTIATE_FOR_EACH_CB_GMRES_TYPE( - GKO_DECLARE_CB_GMRES_INITIALIZE_2_KERNEL); +GKO_INSTANTIATE_FOR_EACH_CB_GMRES_TYPE(GKO_DECLARE_CB_GMRES_RESTART_KERNEL); template @@ -408,18 +407,19 @@ void givens_rotation(std::shared_ptr exec, template -void step_1(std::shared_ptr exec, - matrix::Dense* next_krylov_basis, - matrix::Dense* givens_sin, - matrix::Dense* givens_cos, - matrix::Dense>* residual_norm, - matrix::Dense* residual_norm_collection, - Accessor3d krylov_bases, matrix::Dense* hessenberg_iter, - matrix::Dense* buffer_iter, - matrix::Dense>* arnoldi_norm, - size_type iter, array* final_iter_nums, - const array* stop_status, - array* reorth_status, array* num_reorth) +void arnoldi(std::shared_ptr exec, + matrix::Dense* next_krylov_basis, + matrix::Dense* givens_sin, + matrix::Dense* givens_cos, + matrix::Dense>* residual_norm, + matrix::Dense* residual_norm_collection, + Accessor3d krylov_bases, matrix::Dense* hessenberg_iter, + matrix::Dense* buffer_iter, + matrix::Dense>* arnoldi_norm, + size_type iter, array* final_iter_nums, + const array* stop_status, + array* reorth_status, + array* num_reorth) { hipLaunchKernelGGL( increase_final_iteration_numbers_kernel, @@ -436,7 +436,7 @@ void step_1(std::shared_ptr exec, residual_norm, residual_norm_collection, iter, stop_status); } -GKO_INSTANTIATE_FOR_EACH_CB_GMRES_TYPE(GKO_DECLARE_CB_GMRES_STEP_1_KERNEL); +GKO_INSTANTIATE_FOR_EACH_CB_GMRES_TYPE(GKO_DECLARE_CB_GMRES_ARNOLDI_KERNEL); template @@ -492,13 +492,13 @@ void calculate_qy(ConstAccessor3d krylov_bases, size_type num_krylov_bases, template -void step_2(std::shared_ptr exec, - const matrix::Dense* residual_norm_collection, - ConstAccessor3d krylov_bases, - const matrix::Dense* hessenberg, - matrix::Dense* y, - matrix::Dense* before_preconditioner, - const array* final_iter_nums) +void solve_krylov(std::shared_ptr exec, + const matrix::Dense* residual_norm_collection, + ConstAccessor3d krylov_bases, + const matrix::Dense* hessenberg, + matrix::Dense* y, + matrix::Dense* before_preconditioner, + const array* final_iter_nums) { if (before_preconditioner->get_size()[1] == 0) { return; @@ -516,7 +516,7 @@ void step_2(std::shared_ptr exec, GKO_INSTANTIATE_FOR_EACH_CB_GMRES_CONST_TYPE( - GKO_DECLARE_CB_GMRES_STEP_2_KERNEL); + GKO_DECLARE_CB_GMRES_SOLVE_KRYLOV_KERNEL); } // namespace cb_gmres diff --git a/hip/solver/gmres_kernels.hip.cpp b/hip/solver/gmres_kernels.hip.cpp deleted file mode 100644 index 0eede6387f6..00000000000 --- a/hip/solver/gmres_kernels.hip.cpp +++ /dev/null @@ -1,352 +0,0 @@ -/************************************************************* -Copyright (c) 2017-2022, the Ginkgo authors -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions -are met: - -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. - -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in the -documentation and/or other materials provided with the distribution. - -3. Neither the name of the copyright holder nor the names of its -contributors may be used to endorse or promote products derived from -this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS -IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED -TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A -PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*************************************************************/ - -#include "core/solver/gmres_kernels.hpp" - - -#include - - -#include - - -#include -#include -#include -#include - - -#include "core/components/fill_array_kernels.hpp" -#include "core/matrix/dense_kernels.hpp" -#include "hip/base/config.hip.hpp" -#include "hip/base/hipblas_bindings.hip.hpp" -#include "hip/base/math.hip.hpp" -#include "hip/base/types.hip.hpp" -#include "hip/components/atomic.hip.hpp" -#include "hip/components/cooperative_groups.hip.hpp" -#include "hip/components/reduction.hip.hpp" -#include "hip/components/thread_ids.hip.hpp" -#include "hip/components/uninitialized_array.hip.hpp" - - -namespace gko { -namespace kernels { -namespace hip { -/** - * @brief The GMRES solver namespace. - * - * @ingroup gmres - */ -namespace gmres { - - -constexpr int default_block_size = 512; -// default_dot_dim can not be 64 in hip because 64 * 64 exceeds their max block -// size limit. -constexpr int default_dot_dim = 32; -constexpr int default_dot_size = default_dot_dim * default_dot_dim; - - -#include "common/cuda_hip/solver/gmres_kernels.hpp.inc" - - -template -void initialize_1(std::shared_ptr exec, - const matrix::Dense* b, - matrix::Dense* residual, - matrix::Dense* givens_sin, - matrix::Dense* givens_cos, - array* stop_status, size_type krylov_dim) -{ - const auto num_threads = std::max(b->get_size()[0] * b->get_stride(), - krylov_dim * b->get_size()[1]); - const auto grid_dim = ceildiv(num_threads, default_block_size); - const auto block_dim = default_block_size; - constexpr auto block_size = default_block_size; - - hipLaunchKernelGGL( - HIP_KERNEL_NAME(initialize_1_kernel), grid_dim, block_dim, - 0, 0, b->get_size()[0], b->get_size()[1], krylov_dim, - as_hip_type(b->get_const_values()), b->get_stride(), - as_hip_type(residual->get_values()), residual->get_stride(), - as_hip_type(givens_sin->get_values()), givens_sin->get_stride(), - as_hip_type(givens_cos->get_values()), givens_cos->get_stride(), - as_hip_type(stop_status->get_data())); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GMRES_INITIALIZE_1_KERNEL); - - -template -void initialize_2(std::shared_ptr exec, - const matrix::Dense* residual, - matrix::Dense>* residual_norm, - matrix::Dense* residual_norm_collection, - matrix::Dense* krylov_bases, - array* final_iter_nums, array& tmp, - size_type krylov_dim) -{ - const auto num_rows = residual->get_size()[0]; - const auto num_rhs = residual->get_size()[1]; - const auto grid_dim_1 = - ceildiv(krylov_bases->get_size()[0] * krylov_bases->get_stride(), - default_block_size); - const auto block_dim = default_block_size; - constexpr auto block_size = default_block_size; - - kernels::hip::dense::compute_norm2_dispatch(exec, residual, residual_norm, - tmp); - - const auto grid_dim_2 = - ceildiv(std::max(num_rows, 1) * num_rhs, default_block_size); - hipLaunchKernelGGL( - HIP_KERNEL_NAME(initialize_2_2_kernel), grid_dim_2, - block_dim, 0, 0, residual->get_size()[0], residual->get_size()[1], - as_hip_type(residual->get_const_values()), residual->get_stride(), - as_hip_type(residual_norm->get_const_values()), - as_hip_type(residual_norm_collection->get_values()), - as_hip_type(krylov_bases->get_values()), krylov_bases->get_stride(), - as_hip_type(final_iter_nums->get_data())); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GMRES_INITIALIZE_2_KERNEL); - - -template -void finish_arnoldi(std::shared_ptr exec, size_type num_rows, - matrix::Dense* krylov_bases, - matrix::Dense* hessenberg_iter, size_type iter, - const stopping_status* stop_status) -{ - if (hessenberg_iter->get_size()[1] == 0) { - return; - } - const auto stride_krylov = krylov_bases->get_stride(); - const auto stride_hessenberg = hessenberg_iter->get_stride(); - auto hipblas_handle = exec->get_hipblas_handle(); - const dim3 grid_size( - ceildiv(hessenberg_iter->get_size()[1], default_dot_dim), - exec->get_num_multiprocessor() * 2); - const dim3 block_size(default_dot_dim, default_dot_dim); - auto next_krylov_basis = - krylov_bases->get_values() + - (iter + 1) * num_rows * hessenberg_iter->get_size()[1]; - for (size_type k = 0; k < iter + 1; ++k) { - const auto k_krylov_bases = - krylov_bases->get_const_values() + - k * num_rows * hessenberg_iter->get_size()[1]; - if (hessenberg_iter->get_size()[1] > 1) { - // TODO: this condition should be tuned - // single rhs will use vendor's dot, otherwise, use our own - // multidot_kernel which parallelize multiple rhs. - components::fill_array( - exec, hessenberg_iter->get_values() + k * stride_hessenberg, - hessenberg_iter->get_size()[1], zero()); - hipLaunchKernelGGL(multidot_kernel, grid_size, block_size, 0, 0, k, - num_rows, hessenberg_iter->get_size()[1], - as_hip_type(k_krylov_bases), - as_hip_type(next_krylov_basis), stride_krylov, - as_hip_type(hessenberg_iter->get_values()), - stride_hessenberg, as_hip_type(stop_status)); - } else { - hipblas::dot(exec->get_hipblas_handle(), num_rows, k_krylov_bases, - stride_krylov, next_krylov_basis, stride_krylov, - hessenberg_iter->get_values() + k * stride_hessenberg); - } - hipLaunchKernelGGL( - HIP_KERNEL_NAME(update_next_krylov_kernel), - ceildiv(num_rows * stride_krylov, default_block_size), - default_block_size, 0, 0, k, num_rows, - hessenberg_iter->get_size()[1], as_hip_type(k_krylov_bases), - as_hip_type(next_krylov_basis), stride_krylov, - as_hip_type(hessenberg_iter->get_const_values()), stride_hessenberg, - as_hip_type(stop_status)); - } - // for i in 1:iter - // hessenberg(iter, i) = next_krylov_basis' * krylov_bases(:, i) - // next_krylov_basis -= hessenberg(iter, i) * krylov_bases(:, i) - // end - - - hipLaunchKernelGGL( - HIP_KERNEL_NAME(update_hessenberg_2_kernel), - hessenberg_iter->get_size()[1], default_block_size, 0, 0, iter, - num_rows, hessenberg_iter->get_size()[1], - as_hip_type(next_krylov_basis), stride_krylov, - as_hip_type(hessenberg_iter->get_values()), stride_hessenberg, - as_hip_type(stop_status)); - - hipLaunchKernelGGL( - HIP_KERNEL_NAME(update_krylov_kernel), - ceildiv(num_rows * stride_krylov, default_block_size), - default_block_size, 0, 0, iter, num_rows, - hessenberg_iter->get_size()[1], as_hip_type(next_krylov_basis), - stride_krylov, as_hip_type(hessenberg_iter->get_const_values()), - stride_hessenberg, as_hip_type(stop_status)); - // next_krylov_basis /= hessenberg(iter, iter + 1) - // End of arnoldi -} - - -template -void givens_rotation(std::shared_ptr exec, - matrix::Dense* givens_sin, - matrix::Dense* givens_cos, - matrix::Dense* hessenberg_iter, - matrix::Dense>* residual_norm, - matrix::Dense* residual_norm_collection, - size_type iter, const array* stop_status) -{ - // TODO: tune block_size for optimal performance - constexpr auto block_size = default_block_size; - const auto num_cols = hessenberg_iter->get_size()[1]; - const auto block_dim = block_size; - const auto grid_dim = - static_cast(ceildiv(num_cols, block_size)); - - hipLaunchKernelGGL( - HIP_KERNEL_NAME(givens_rotation_kernel), grid_dim, - block_dim, 0, 0, hessenberg_iter->get_size()[0], - hessenberg_iter->get_size()[1], iter, - as_hip_type(hessenberg_iter->get_values()), - hessenberg_iter->get_stride(), as_hip_type(givens_sin->get_values()), - givens_sin->get_stride(), as_hip_type(givens_cos->get_values()), - givens_cos->get_stride(), as_hip_type(residual_norm->get_values()), - as_hip_type(residual_norm_collection->get_values()), - residual_norm_collection->get_stride(), - as_hip_type(stop_status->get_const_data())); -} - - -template -void step_1(std::shared_ptr exec, size_type num_rows, - matrix::Dense* givens_sin, - matrix::Dense* givens_cos, - matrix::Dense>* residual_norm, - matrix::Dense* residual_norm_collection, - matrix::Dense* krylov_bases, - matrix::Dense* hessenberg_iter, size_type iter, - array* final_iter_nums, - const array* stop_status) -{ - hipLaunchKernelGGL( - increase_final_iteration_numbers_kernel, - static_cast( - ceildiv(final_iter_nums->get_num_elems(), default_block_size)), - default_block_size, 0, 0, as_hip_type(final_iter_nums->get_data()), - as_hip_type(stop_status->get_const_data()), - final_iter_nums->get_num_elems()); - finish_arnoldi(exec, num_rows, krylov_bases, hessenberg_iter, iter, - stop_status->get_const_data()); - givens_rotation(exec, givens_sin, givens_cos, hessenberg_iter, - residual_norm, residual_norm_collection, iter, stop_status); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GMRES_STEP_1_KERNEL); - - -template -void solve_upper_triangular( - const matrix::Dense* residual_norm_collection, - const matrix::Dense* hessenberg, matrix::Dense* y, - const array* final_iter_nums) -{ - // TODO: tune block_size for optimal performance - constexpr auto block_size = default_block_size; - const auto num_rhs = residual_norm_collection->get_size()[1]; - const auto block_dim = block_size; - const auto grid_dim = - static_cast(ceildiv(num_rhs, block_size)); - - hipLaunchKernelGGL( - HIP_KERNEL_NAME(solve_upper_triangular_kernel), grid_dim, - block_dim, 0, 0, hessenberg->get_size()[1], num_rhs, - as_hip_type(residual_norm_collection->get_const_values()), - residual_norm_collection->get_stride(), - as_hip_type(hessenberg->get_const_values()), hessenberg->get_stride(), - as_hip_type(y->get_values()), y->get_stride(), - as_hip_type(final_iter_nums->get_const_data())); -} - - -template -void calculate_qy(const matrix::Dense* krylov_bases, - const matrix::Dense* y, - matrix::Dense* before_preconditioner, - const array* final_iter_nums) -{ - const auto num_rows = before_preconditioner->get_size()[0]; - const auto num_cols = krylov_bases->get_size()[1]; - const auto num_rhs = before_preconditioner->get_size()[1]; - const auto stride_before_preconditioner = - before_preconditioner->get_stride(); - - constexpr auto block_size = default_block_size; - const auto grid_dim = static_cast( - ceildiv(num_rows * stride_before_preconditioner, block_size)); - const auto block_dim = block_size; - - - hipLaunchKernelGGL(HIP_KERNEL_NAME(calculate_Qy_kernel), - grid_dim, block_dim, 0, 0, num_rows, num_cols, num_rhs, - as_hip_type(krylov_bases->get_const_values()), - krylov_bases->get_stride(), - as_hip_type(y->get_const_values()), y->get_stride(), - as_hip_type(before_preconditioner->get_values()), - stride_before_preconditioner, - as_hip_type(final_iter_nums->get_const_data())); - // Calculate qy - // before_preconditioner = krylov_bases * y -} - - -template -void step_2(std::shared_ptr exec, - const matrix::Dense* residual_norm_collection, - const matrix::Dense* krylov_bases, - const matrix::Dense* hessenberg, - matrix::Dense* y, - matrix::Dense* before_preconditioner, - const array* final_iter_nums) -{ - solve_upper_triangular(residual_norm_collection, hessenberg, y, - final_iter_nums); - calculate_qy(krylov_bases, y, before_preconditioner, final_iter_nums); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GMRES_STEP_2_KERNEL); - - -} // namespace gmres -} // namespace hip -} // namespace kernels -} // namespace gko diff --git a/include/ginkgo/core/solver/gmres.hpp b/include/ginkgo/core/solver/gmres.hpp index c880d2e6f99..2cb362898c7 100644 --- a/include/ginkgo/core/solver/gmres.hpp +++ b/include/ginkgo/core/solver/gmres.hpp @@ -206,6 +206,8 @@ struct workspace_traits> { constexpr static int one = 11; // constant -1.0 scalar constexpr static int minus_one = 12; + // temporary norm vector of next_krylov to copy into hessenberg matrix + constexpr static int next_krylov_norm_tmp = 13; // stopping status array constexpr static int stop = 0; diff --git a/omp/CMakeLists.txt b/omp/CMakeLists.txt index d7e88b57c86..2f8c38f5946 100644 --- a/omp/CMakeLists.txt +++ b/omp/CMakeLists.txt @@ -31,7 +31,6 @@ target_sources(ginkgo_omp preconditioner/isai_kernels.cpp preconditioner/jacobi_kernels.cpp reorder/rcm_kernels.cpp - solver/gmres_kernels.cpp solver/cb_gmres_kernels.cpp solver/idr_kernels.cpp solver/lower_trs_kernels.cpp diff --git a/omp/solver/cb_gmres_kernels.cpp b/omp/solver/cb_gmres_kernels.cpp index 00a9757bdd9..c8b6a1396aa 100644 --- a/omp/solver/cb_gmres_kernels.cpp +++ b/omp/solver/cb_gmres_kernels.cpp @@ -318,12 +318,12 @@ void calculate_qy(ConstAccessor3d krylov_bases, template -void initialize_1(std::shared_ptr exec, - const matrix::Dense* b, - matrix::Dense* residual, - matrix::Dense* givens_sin, - matrix::Dense* givens_cos, - array* stop_status, size_type krylov_dim) +void initialize(std::shared_ptr exec, + const matrix::Dense* b, + matrix::Dense* residual, + matrix::Dense* givens_sin, + matrix::Dense* givens_cos, + array* stop_status, size_type krylov_dim) { using rc_vtype = remove_complex; @@ -342,19 +342,19 @@ void initialize_1(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CB_GMRES_INITIALIZE_1_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CB_GMRES_INITIALIZE_KERNEL); template -void initialize_2(std::shared_ptr exec, - const matrix::Dense* residual, - matrix::Dense>* residual_norm, - matrix::Dense* residual_norm_collection, - matrix::Dense>* arnoldi_norm, - Accessor3d krylov_bases, - matrix::Dense* next_krylov_basis, - array* final_iter_nums, array&, - size_type krylov_dim) +void restart(std::shared_ptr exec, + const matrix::Dense* residual, + matrix::Dense>* residual_norm, + matrix::Dense* residual_norm_collection, + matrix::Dense>* arnoldi_norm, + Accessor3d krylov_bases, + matrix::Dense* next_krylov_basis, + array* final_iter_nums, array&, + size_type krylov_dim) { using rc_vtype = remove_complex; constexpr bool has_scalar = @@ -417,23 +417,22 @@ void initialize_2(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_CB_GMRES_TYPE( - GKO_DECLARE_CB_GMRES_INITIALIZE_2_KERNEL); +GKO_INSTANTIATE_FOR_EACH_CB_GMRES_TYPE(GKO_DECLARE_CB_GMRES_RESTART_KERNEL); template -void step_1(std::shared_ptr exec, - matrix::Dense* next_krylov_basis, - matrix::Dense* givens_sin, - matrix::Dense* givens_cos, - matrix::Dense>* residual_norm, - matrix::Dense* residual_norm_collection, - Accessor3d krylov_bases, matrix::Dense* hessenberg_iter, - matrix::Dense* buffer_iter, - matrix::Dense>* arnoldi_norm, - size_type iter, array* final_iter_nums, - const array* stop_status, array*, - array*) +void arnoldi(std::shared_ptr exec, + matrix::Dense* next_krylov_basis, + matrix::Dense* givens_sin, + matrix::Dense* givens_cos, + matrix::Dense>* residual_norm, + matrix::Dense* residual_norm_collection, + Accessor3d krylov_bases, matrix::Dense* hessenberg_iter, + matrix::Dense* buffer_iter, + matrix::Dense>* arnoldi_norm, + size_type iter, array* final_iter_nums, + const array* stop_status, array*, + array*) { #pragma omp parallel for for (size_type i = 0; i < final_iter_nums->get_num_elems(); ++i) { @@ -451,17 +450,17 @@ void step_1(std::shared_ptr exec, stop_status->get_const_data()); } -GKO_INSTANTIATE_FOR_EACH_CB_GMRES_TYPE(GKO_DECLARE_CB_GMRES_STEP_1_KERNEL); +GKO_INSTANTIATE_FOR_EACH_CB_GMRES_TYPE(GKO_DECLARE_CB_GMRES_ARNOLDI_KERNEL); template -void step_2(std::shared_ptr exec, - const matrix::Dense* residual_norm_collection, - ConstAccessor3d krylov_bases, - const matrix::Dense* hessenberg, - matrix::Dense* y, - matrix::Dense* before_preconditioner, - const array* final_iter_nums) +void solve_krylov(std::shared_ptr exec, + const matrix::Dense* residual_norm_collection, + ConstAccessor3d krylov_bases, + const matrix::Dense* hessenberg, + matrix::Dense* y, + matrix::Dense* before_preconditioner, + const array* final_iter_nums) { solve_upper_triangular(residual_norm_collection, hessenberg, y, final_iter_nums->get_const_data()); @@ -470,7 +469,7 @@ void step_2(std::shared_ptr exec, } GKO_INSTANTIATE_FOR_EACH_CB_GMRES_CONST_TYPE( - GKO_DECLARE_CB_GMRES_STEP_2_KERNEL); + GKO_DECLARE_CB_GMRES_SOLVE_KRYLOV_KERNEL); } // namespace cb_gmres diff --git a/omp/solver/gmres_kernels.cpp b/omp/solver/gmres_kernels.cpp deleted file mode 100644 index 97d97c384c1..00000000000 --- a/omp/solver/gmres_kernels.cpp +++ /dev/null @@ -1,356 +0,0 @@ -/************************************************************* -Copyright (c) 2017-2022, the Ginkgo authors -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions -are met: - -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. - -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in the -documentation and/or other materials provided with the distribution. - -3. Neither the name of the copyright holder nor the names of its -contributors may be used to endorse or promote products derived from -this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS -IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED -TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A -PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*************************************************************/ - -#include "core/solver/gmres_kernels.hpp" - - -#include - - -#include -#include -#include -#include -#include - - -namespace gko { -namespace kernels { -namespace omp { -/** - * @brief The GMRES solver namespace. - * - * @ingroup gmres - */ -namespace gmres { - - -namespace { - - -template -void finish_arnoldi(size_type num_rows, matrix::Dense* krylov_bases, - matrix::Dense* hessenberg_iter, size_type iter, - const stopping_status* stop_status) -{ - const auto krylov_bases_rowoffset = num_rows; - const auto next_krylov_rowoffset = (iter + 1) * krylov_bases_rowoffset; -#pragma omp declare reduction(add:ValueType : omp_out = omp_out + omp_in) - for (size_type i = 0; i < hessenberg_iter->get_size()[1]; ++i) { - if (stop_status[i].has_stopped()) { - continue; - } - for (size_type k = 0; k < iter + 1; ++k) { - ValueType hessenberg_iter_entry = zero(); - -#pragma omp parallel for reduction(add : hessenberg_iter_entry) - for (size_type j = 0; j < num_rows; ++j) { - hessenberg_iter_entry += - krylov_bases->at(j + next_krylov_rowoffset, i) * - krylov_bases->at(j + k * krylov_bases_rowoffset, i); - } - hessenberg_iter->at(k, i) = hessenberg_iter_entry; -#pragma omp parallel for - for (size_type j = 0; j < num_rows; ++j) { - krylov_bases->at(j + next_krylov_rowoffset, i) -= - hessenberg_iter->at(k, i) * - krylov_bases->at(j + k * krylov_bases_rowoffset, i); - } - } - // for i in 1:iter - // hessenberg(iter, i) = next_krylov_basis' * krylov_bases(:, i) - // next_krylov_basis -= hessenberg(iter, i) * krylov_bases(:, i) - // end - - ValueType hessenberg_iter_entry = zero(); - -#pragma omp parallel for reduction(add : hessenberg_iter_entry) - for (size_type j = 0; j < num_rows; ++j) { - hessenberg_iter_entry += - krylov_bases->at(j + next_krylov_rowoffset, i) * - krylov_bases->at(j + next_krylov_rowoffset, i); - } - hessenberg_iter->at(iter + 1, i) = sqrt(hessenberg_iter_entry); -// hessenberg(iter + 1, iter) = norm(krylov_bases) -#pragma omp parallel for - for (size_type j = 0; j < num_rows; ++j) { - krylov_bases->at(j + next_krylov_rowoffset, i) /= - hessenberg_iter->at(iter + 1, i); - } - // next_krylov_basis /= hessenberg(iter, iter + 1) - // End of arnoldi - } -} - - -template -void calculate_sin_and_cos(matrix::Dense* givens_sin, - matrix::Dense* givens_cos, - matrix::Dense* hessenberg_iter, - size_type iter, const size_type rhs) -{ - if (is_zero(hessenberg_iter->at(iter, rhs))) { - givens_cos->at(iter, rhs) = zero(); - givens_sin->at(iter, rhs) = one(); - } else { - auto this_hess = hessenberg_iter->at(iter, rhs); - auto next_hess = hessenberg_iter->at(iter + 1, rhs); - const auto scale = abs(this_hess) + abs(next_hess); - const auto hypotenuse = - scale * sqrt(abs(this_hess / scale) * abs(this_hess / scale) + - abs(next_hess / scale) * abs(next_hess / scale)); - givens_cos->at(iter, rhs) = conj(this_hess) / hypotenuse; - givens_sin->at(iter, rhs) = conj(next_hess) / hypotenuse; - } -} - - -template -void givens_rotation(matrix::Dense* givens_sin, - matrix::Dense* givens_cos, - matrix::Dense* hessenberg_iter, size_type iter, - const stopping_status* stop_status) -{ -#pragma omp parallel for - for (size_type i = 0; i < hessenberg_iter->get_size()[1]; ++i) { - if (stop_status[i].has_stopped()) { - continue; - } - for (size_type j = 0; j < iter; ++j) { - auto temp = givens_cos->at(j, i) * hessenberg_iter->at(j, i) + - givens_sin->at(j, i) * hessenberg_iter->at(j + 1, i); - hessenberg_iter->at(j + 1, i) = - -conj(givens_sin->at(j, i)) * hessenberg_iter->at(j, i) + - conj(givens_cos->at(j, i)) * hessenberg_iter->at(j + 1, i); - hessenberg_iter->at(j, i) = temp; - // temp = cos(j)*hessenberg(j) + - // sin(j)*hessenberg(j+1) - // hessenberg(j+1) = -conj(sin(j))*hessenberg(j) + - // conj(cos(j))*hessenberg(j+1) - // hessenberg(j) = temp; - } - - calculate_sin_and_cos(givens_sin, givens_cos, hessenberg_iter, iter, i); - - hessenberg_iter->at(iter, i) = - givens_cos->at(iter, i) * hessenberg_iter->at(iter, i) + - givens_sin->at(iter, i) * hessenberg_iter->at(iter + 1, i); - hessenberg_iter->at(iter + 1, i) = zero(); - // hessenberg(iter) = cos(iter)*hessenberg(iter) + - // sin(iter)*hessenberg(iter + 1) - // hessenberg(iter+1) = 0 - } -} - - -template -void calculate_next_residual_norm( - matrix::Dense* givens_sin, matrix::Dense* givens_cos, - matrix::Dense>* residual_norm, - matrix::Dense* residual_norm_collection, size_type iter, - const stopping_status* stop_status) -{ -#pragma omp parallel for - for (size_type i = 0; i < residual_norm->get_size()[1]; ++i) { - if (stop_status[i].has_stopped()) { - continue; - } - residual_norm_collection->at(iter + 1, i) = - -conj(givens_sin)->at(iter, i) * - residual_norm_collection->at(iter, i); - residual_norm_collection->at(iter, i) = - givens_cos->at(iter, i) * residual_norm_collection->at(iter, i); - residual_norm->at(0, i) = - abs(residual_norm_collection->at(iter + 1, i)); - } -} - - -template -void solve_upper_triangular( - const matrix::Dense* residual_norm_collection, - const matrix::Dense* hessenberg, matrix::Dense* y, - const size_type* final_iter_nums) -{ -#pragma omp parallel for - for (size_type k = 0; k < residual_norm_collection->get_size()[1]; ++k) { - for (int i = final_iter_nums[k] - 1; i >= 0; --i) { - auto temp = residual_norm_collection->at(i, k); - for (size_type j = i + 1; j < final_iter_nums[k]; ++j) { - temp -= - hessenberg->at( - i, j * residual_norm_collection->get_size()[1] + k) * - y->at(j, k); - } - y->at(i, k) = - temp / hessenberg->at( - i, i * residual_norm_collection->get_size()[1] + k); - } - } -} - - -template -void calculate_qy(const matrix::Dense* krylov_bases, - const matrix::Dense* y, - matrix::Dense* before_preconditioner, - const size_type* final_iter_nums) -{ - const auto krylov_bases_rowoffset = before_preconditioner->get_size()[0]; -#pragma omp parallel for - for (size_type i = 0; i < before_preconditioner->get_size()[0]; ++i) { - for (size_type k = 0; k < before_preconditioner->get_size()[1]; ++k) { - before_preconditioner->at(i, k) = zero(); - for (size_type j = 0; j < final_iter_nums[k]; ++j) { - before_preconditioner->at(i, k) += - krylov_bases->at(i + j * krylov_bases_rowoffset, k) * - y->at(j, k); - } - } - } -} - - -} // namespace - - -template -void initialize_1(std::shared_ptr exec, - const matrix::Dense* b, - matrix::Dense* residual, - matrix::Dense* givens_sin, - matrix::Dense* givens_cos, - array* stop_status, size_type krylov_dim) -{ - using norm_type = remove_complex; - for (size_type j = 0; j < b->get_size()[1]; ++j) { -#pragma omp parallel for - for (size_type i = 0; i < b->get_size()[0]; ++i) { - residual->at(i, j) = b->at(i, j); - } - -#pragma omp parallel for - for (size_type i = 0; i < krylov_dim; ++i) { - givens_sin->at(i, j) = zero(); - givens_cos->at(i, j) = zero(); - } - stop_status->get_data()[j].reset(); - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GMRES_INITIALIZE_1_KERNEL); - - -template -void initialize_2(std::shared_ptr exec, - const matrix::Dense* residual, - matrix::Dense>* residual_norm, - matrix::Dense* residual_norm_collection, - matrix::Dense* krylov_bases, - array* final_iter_nums, array&, - size_type krylov_dim) -{ - using norm_type = remove_complex; - for (size_type j = 0; j < residual->get_size()[1]; ++j) { - // Calculate residual norm - norm_type res_norm = zero(); -#pragma omp declare reduction(add:norm_type : omp_out = omp_out + omp_in) - -#pragma omp parallel for reduction(add : res_norm) - for (size_type i = 0; i < residual->get_size()[0]; ++i) { - res_norm += squared_norm(residual->at(i, j)); - } - residual_norm->at(0, j) = sqrt(res_norm); - residual_norm_collection->at(0, j) = residual_norm->at(0, j); -#pragma omp parallel for - for (size_type i = 0; i < residual->get_size()[0]; ++i) { - krylov_bases->at(i, j) = - residual->at(i, j) / residual_norm->at(0, j); - } - final_iter_nums->get_data()[j] = 0; - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GMRES_INITIALIZE_2_KERNEL); - - -template -void step_1(std::shared_ptr exec, size_type num_rows, - matrix::Dense* givens_sin, - matrix::Dense* givens_cos, - matrix::Dense>* residual_norm, - matrix::Dense* residual_norm_collection, - matrix::Dense* krylov_bases, - matrix::Dense* hessenberg_iter, size_type iter, - array* final_iter_nums, - const array* stop_status) -{ -#pragma omp parallel for - for (size_type i = 0; i < final_iter_nums->get_num_elems(); ++i) { - final_iter_nums->get_data()[i] += - (1 - stop_status->get_const_data()[i].has_stopped()); - } - - finish_arnoldi(num_rows, krylov_bases, hessenberg_iter, iter, - stop_status->get_const_data()); - givens_rotation(givens_sin, givens_cos, hessenberg_iter, iter, - stop_status->get_const_data()); - calculate_next_residual_norm(givens_sin, givens_cos, residual_norm, - residual_norm_collection, iter, - stop_status->get_const_data()); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GMRES_STEP_1_KERNEL); - - -template -void step_2(std::shared_ptr exec, - const matrix::Dense* residual_norm_collection, - const matrix::Dense* krylov_bases, - const matrix::Dense* hessenberg, - matrix::Dense* y, - matrix::Dense* before_preconditioner, - const array* final_iter_nums) -{ - solve_upper_triangular(residual_norm_collection, hessenberg, y, - final_iter_nums->get_const_data()); - calculate_qy(krylov_bases, y, before_preconditioner, - final_iter_nums->get_const_data()); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GMRES_STEP_2_KERNEL); - - -} // namespace gmres -} // namespace omp -} // namespace kernels -} // namespace gko diff --git a/reference/CMakeLists.txt b/reference/CMakeLists.txt index a18dbc49296..03a6463baf7 100644 --- a/reference/CMakeLists.txt +++ b/reference/CMakeLists.txt @@ -44,6 +44,7 @@ target_sources(ginkgo_reference solver/fcg_kernels.cpp solver/gmres_kernels.cpp solver/cb_gmres_kernels.cpp + solver/common_gmres_kernels.cpp solver/idr_kernels.cpp solver/ir_kernels.cpp solver/lower_trs_kernels.cpp diff --git a/reference/solver/cb_gmres_kernels.cpp b/reference/solver/cb_gmres_kernels.cpp index a8c376fb123..717a44a1a44 100644 --- a/reference/solver/cb_gmres_kernels.cpp +++ b/reference/solver/cb_gmres_kernels.cpp @@ -306,12 +306,12 @@ void calculate_qy(ConstAccessor3d krylov_bases, template -void initialize_1(std::shared_ptr exec, - const matrix::Dense* b, - matrix::Dense* residual, - matrix::Dense* givens_sin, - matrix::Dense* givens_cos, - array* stop_status, size_type krylov_dim) +void initialize(std::shared_ptr exec, + const matrix::Dense* b, + matrix::Dense* residual, + matrix::Dense* givens_sin, + matrix::Dense* givens_cos, + array* stop_status, size_type krylov_dim) { for (size_type j = 0; j < b->get_size()[1]; ++j) { for (size_type i = 0; i < b->get_size()[0]; ++i) { @@ -325,19 +325,19 @@ void initialize_1(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CB_GMRES_INITIALIZE_1_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CB_GMRES_INITIALIZE_KERNEL); template -void initialize_2(std::shared_ptr exec, - const matrix::Dense* residual, - matrix::Dense>* residual_norm, - matrix::Dense* residual_norm_collection, - matrix::Dense>* arnoldi_norm, - Accessor3d krylov_bases, - matrix::Dense* next_krylov_basis, - array* final_iter_nums, array&, - size_type krylov_dim) +void restart(std::shared_ptr exec, + const matrix::Dense* residual, + matrix::Dense>* residual_norm, + matrix::Dense* residual_norm_collection, + matrix::Dense>* arnoldi_norm, + Accessor3d krylov_bases, + matrix::Dense* next_krylov_basis, + array* final_iter_nums, array&, + size_type krylov_dim) { static_assert( std::is_same exec, } } -GKO_INSTANTIATE_FOR_EACH_CB_GMRES_TYPE( - GKO_DECLARE_CB_GMRES_INITIALIZE_2_KERNEL); +GKO_INSTANTIATE_FOR_EACH_CB_GMRES_TYPE(GKO_DECLARE_CB_GMRES_RESTART_KERNEL); template -void step_1(std::shared_ptr exec, - matrix::Dense* next_krylov_basis, - matrix::Dense* givens_sin, - matrix::Dense* givens_cos, - matrix::Dense>* residual_norm, - matrix::Dense* residual_norm_collection, - Accessor3d krylov_bases, matrix::Dense* hessenberg_iter, - matrix::Dense* buffer_iter, - matrix::Dense>* arnoldi_norm, - size_type iter, array* final_iter_nums, - const array* stop_status, array*, - array*) +void arnoldi(std::shared_ptr exec, + matrix::Dense* next_krylov_basis, + matrix::Dense* givens_sin, + matrix::Dense* givens_cos, + matrix::Dense>* residual_norm, + matrix::Dense* residual_norm_collection, + Accessor3d krylov_bases, matrix::Dense* hessenberg_iter, + matrix::Dense* buffer_iter, + matrix::Dense>* arnoldi_norm, + size_type iter, array* final_iter_nums, + const array* stop_status, array*, + array*) { static_assert( std::is_same exec, stop_status->get_const_data()); } -GKO_INSTANTIATE_FOR_EACH_CB_GMRES_TYPE(GKO_DECLARE_CB_GMRES_STEP_1_KERNEL); +GKO_INSTANTIATE_FOR_EACH_CB_GMRES_TYPE(GKO_DECLARE_CB_GMRES_ARNOLDI_KERNEL); template -void step_2(std::shared_ptr exec, - const matrix::Dense* residual_norm_collection, - ConstAccessor3d krylov_bases, - const matrix::Dense* hessenberg, - matrix::Dense* y, - matrix::Dense* before_preconditioner, - const array* final_iter_nums) +void solve_krylov(std::shared_ptr exec, + const matrix::Dense* residual_norm_collection, + ConstAccessor3d krylov_bases, + const matrix::Dense* hessenberg, + matrix::Dense* y, + matrix::Dense* before_preconditioner, + const array* final_iter_nums) { solve_upper_triangular(residual_norm_collection, hessenberg, y, final_iter_nums->get_const_data()); @@ -450,7 +449,7 @@ void step_2(std::shared_ptr exec, } GKO_INSTANTIATE_FOR_EACH_CB_GMRES_CONST_TYPE( - GKO_DECLARE_CB_GMRES_STEP_2_KERNEL); + GKO_DECLARE_CB_GMRES_SOLVE_KRYLOV_KERNEL); } // namespace cb_gmres diff --git a/reference/solver/common_gmres_kernels.cpp b/reference/solver/common_gmres_kernels.cpp new file mode 100644 index 00000000000..8169df85f40 --- /dev/null +++ b/reference/solver/common_gmres_kernels.cpp @@ -0,0 +1,226 @@ +/************************************************************* +Copyright (c) 2017-2022, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include "core/solver/common_gmres_kernels.hpp" + + +#include +#include +#include +#include +#include +#include + + +#include "core/solver/cb_gmres_kernels.hpp" + + +namespace gko { +namespace kernels { +namespace reference { +/** + * @brief The common GMRES solver namespace. + * + * @ingroup gmres + */ +namespace common_gmres { +namespace { + + +template +void calculate_sin_and_cos(matrix::Dense* givens_sin, + matrix::Dense* givens_cos, + matrix::Dense* hessenberg_iter, + size_type iter, const size_type rhs) +{ + if (is_zero(hessenberg_iter->at(iter, rhs))) { + givens_cos->at(iter, rhs) = zero(); + givens_sin->at(iter, rhs) = one(); + } else { + auto this_hess = hessenberg_iter->at(iter, rhs); + auto next_hess = hessenberg_iter->at(iter + 1, rhs); + const auto scale = abs(this_hess) + abs(next_hess); + const auto hypotenuse = + scale * sqrt(abs(this_hess / scale) * abs(this_hess / scale) + + abs(next_hess / scale) * abs(next_hess / scale)); + givens_cos->at(iter, rhs) = conj(this_hess) / hypotenuse; + givens_sin->at(iter, rhs) = conj(next_hess) / hypotenuse; + } +} + + +template +void givens_rotation(matrix::Dense* givens_sin, + matrix::Dense* givens_cos, + matrix::Dense* hessenberg_iter, size_type iter, + const stopping_status* stop_status) +{ + for (size_type i = 0; i < hessenberg_iter->get_size()[1]; ++i) { + if (stop_status[i].has_stopped()) { + continue; + } + for (size_type j = 0; j < iter; ++j) { + auto temp = givens_cos->at(j, i) * hessenberg_iter->at(j, i) + + givens_sin->at(j, i) * hessenberg_iter->at(j + 1, i); + hessenberg_iter->at(j + 1, i) = + -conj(givens_sin->at(j, i)) * hessenberg_iter->at(j, i) + + conj(givens_cos->at(j, i)) * hessenberg_iter->at(j + 1, i); + hessenberg_iter->at(j, i) = temp; + // temp = cos(j)*hessenberg(j) + + // sin(j)*hessenberg(j+1) + // hessenberg(j+1) = -conj(sin(j))*hessenberg(j) + + // conj(cos(j))*hessenberg(j+1) + // hessenberg(j) = temp; + } + + calculate_sin_and_cos(givens_sin, givens_cos, hessenberg_iter, iter, i); + + hessenberg_iter->at(iter, i) = + givens_cos->at(iter, i) * hessenberg_iter->at(iter, i) + + givens_sin->at(iter, i) * hessenberg_iter->at(iter + 1, i); + hessenberg_iter->at(iter + 1, i) = zero(); + // hessenberg(iter) = cos(iter)*hessenberg(iter) + + // sin(iter)*hessenberg(iter + 1) + // hessenberg(iter+1) = 0 + } +} + + +template +void calculate_next_residual_norm( + matrix::Dense* givens_sin, matrix::Dense* givens_cos, + matrix::Dense>* residual_norm, + matrix::Dense* residual_norm_collection, size_type iter, + const stopping_status* stop_status) +{ + for (size_type i = 0; i < residual_norm->get_size()[1]; ++i) { + if (stop_status[i].has_stopped()) { + continue; + } + residual_norm_collection->at(iter + 1, i) = + -conj(givens_sin->at(iter, i)) * + residual_norm_collection->at(iter, i); + residual_norm_collection->at(iter, i) = + givens_cos->at(iter, i) * residual_norm_collection->at(iter, i); + residual_norm->at(0, i) = + abs(residual_norm_collection->at(iter + 1, i)); + } +} + + +} // namespace + + +template +void initialize(std::shared_ptr exec, + const matrix::Dense* b, + matrix::Dense* residual, + matrix::Dense* givens_sin, + matrix::Dense* givens_cos, + stopping_status* stop_status) +{ + const auto krylov_dim = givens_sin->get_size()[0]; + using NormValueType = remove_complex; + for (size_type j = 0; j < b->get_size()[1]; ++j) { + for (size_type i = 0; i < b->get_size()[0]; ++i) { + residual->at(i, j) = b->at(i, j); + } + for (size_type i = 0; i < krylov_dim; ++i) { + givens_sin->at(i, j) = zero(); + givens_cos->at(i, j) = zero(); + } + stop_status[j].reset(); + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_COMMON_GMRES_INITIALIZE_KERNEL); + + +template +void hessenberg_qr(std::shared_ptr exec, + matrix::Dense* givens_sin, + matrix::Dense* givens_cos, + matrix::Dense>* residual_norm, + matrix::Dense* residual_norm_collection, + matrix::Dense* hessenberg_iter, size_type iter, + size_type* final_iter_nums, + const stopping_status* stop_status) +{ + for (size_type i = 0; i < givens_sin->get_size()[1]; ++i) { + if (!stop_status[i].has_stopped()) { + final_iter_nums[i]++; + } + } + + givens_rotation(givens_sin, givens_cos, hessenberg_iter, iter, stop_status); + calculate_next_residual_norm(givens_sin, givens_cos, residual_norm, + residual_norm_collection, iter, stop_status); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_COMMON_GMRES_HESSENBERG_QR_KERNEL); + + +template +void solve_krylov(std::shared_ptr exec, + const matrix::Dense* residual_norm_collection, + const matrix::Dense* hessenberg, + matrix::Dense* y, const size_type* final_iter_nums, + const stopping_status* stop_status) +{ + for (size_type k = 0; k < residual_norm_collection->get_size()[1]; ++k) { + if (stop_status[k].is_finalized()) { + continue; + } + for (int i = final_iter_nums[k] - 1; i >= 0; --i) { + auto temp = residual_norm_collection->at(i, k); + for (size_type j = i + 1; j < final_iter_nums[k]; ++j) { + temp -= + hessenberg->at( + i, j * residual_norm_collection->get_size()[1] + k) * + y->at(j, k); + } + y->at(i, k) = + temp / hessenberg->at( + i, i * residual_norm_collection->get_size()[1] + k); + } + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_COMMON_GMRES_SOLVE_KRYLOV_KERNEL); + + +} // namespace common_gmres +} // namespace reference +} // namespace kernels +} // namespace gko diff --git a/reference/solver/gmres_kernels.cpp b/reference/solver/gmres_kernels.cpp index 9bb1ac99a5d..5e46275f5ce 100644 --- a/reference/solver/gmres_kernels.cpp +++ b/reference/solver/gmres_kernels.cpp @@ -38,6 +38,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include namespace gko { @@ -51,169 +52,38 @@ namespace reference { namespace gmres { -namespace { - - -template -void finish_arnoldi(size_type num_rows, matrix::Dense* krylov_bases, - matrix::Dense* hessenberg_iter, size_type iter, - const stopping_status* stop_status) -{ - const auto krylov_bases_rowoffset = num_rows; - const auto next_krylov_rowoffset = (iter + 1) * krylov_bases_rowoffset; - for (size_type i = 0; i < hessenberg_iter->get_size()[1]; ++i) { - if (stop_status[i].has_stopped()) { - continue; - } - for (size_type k = 0; k < iter + 1; ++k) { - hessenberg_iter->at(k, i) = 0; - for (size_type j = 0; j < num_rows; ++j) { - hessenberg_iter->at(k, i) += - krylov_bases->at(j + next_krylov_rowoffset, i) * - conj(krylov_bases->at(j + k * krylov_bases_rowoffset, i)); - } - for (size_type j = 0; j < num_rows; ++j) { - krylov_bases->at(j + next_krylov_rowoffset, i) -= - hessenberg_iter->at(k, i) * - krylov_bases->at(j + k * krylov_bases_rowoffset, i); - } - } - // for i in 1:iter - // hessenberg(iter, i) = next_krylov_basis' * krylov_bases(:, i) - // next_krylov_basis -= hessenberg(iter, i) * krylov_bases(:, i) - // end - - hessenberg_iter->at(iter + 1, i) = 0; - for (size_type j = 0; j < num_rows; ++j) { - hessenberg_iter->at(iter + 1, i) += - krylov_bases->at(j + next_krylov_rowoffset, i) * - krylov_bases->at(j + next_krylov_rowoffset, i); - } - hessenberg_iter->at(iter + 1, i) = - sqrt(hessenberg_iter->at(iter + 1, i)); - // hessenberg(iter + 1, iter) = norm(krylov_bases) - for (size_type j = 0; j < num_rows; ++j) { - krylov_bases->at(j + next_krylov_rowoffset, i) /= - hessenberg_iter->at(iter + 1, i); - } - // next_krylov_basis /= hessenberg(iter, iter + 1) - // End of arnoldi - } -} - - -template -void calculate_sin_and_cos(matrix::Dense* givens_sin, - matrix::Dense* givens_cos, - matrix::Dense* hessenberg_iter, - size_type iter, const size_type rhs) -{ - if (is_zero(hessenberg_iter->at(iter, rhs))) { - givens_cos->at(iter, rhs) = zero(); - givens_sin->at(iter, rhs) = one(); - } else { - auto this_hess = hessenberg_iter->at(iter, rhs); - auto next_hess = hessenberg_iter->at(iter + 1, rhs); - const auto scale = abs(this_hess) + abs(next_hess); - const auto hypotenuse = - scale * sqrt(abs(this_hess / scale) * abs(this_hess / scale) + - abs(next_hess / scale) * abs(next_hess / scale)); - givens_cos->at(iter, rhs) = conj(this_hess) / hypotenuse; - givens_sin->at(iter, rhs) = conj(next_hess) / hypotenuse; - } -} - - template -void givens_rotation(matrix::Dense* givens_sin, - matrix::Dense* givens_cos, - matrix::Dense* hessenberg_iter, size_type iter, - const stopping_status* stop_status) +void restart(std::shared_ptr exec, + const matrix::Dense* residual, + const matrix::Dense>* residual_norm, + matrix::Dense* residual_norm_collection, + matrix::Dense* krylov_bases, size_type* final_iter_nums) { - for (size_type i = 0; i < hessenberg_iter->get_size()[1]; ++i) { - if (stop_status[i].has_stopped()) { - continue; - } - for (size_type j = 0; j < iter; ++j) { - auto temp = givens_cos->at(j, i) * hessenberg_iter->at(j, i) + - givens_sin->at(j, i) * hessenberg_iter->at(j + 1, i); - hessenberg_iter->at(j + 1, i) = - -conj(givens_sin->at(j, i)) * hessenberg_iter->at(j, i) + - conj(givens_cos->at(j, i)) * hessenberg_iter->at(j + 1, i); - hessenberg_iter->at(j, i) = temp; - // temp = cos(j)*hessenberg(j) + - // sin(j)*hessenberg(j+1) - // hessenberg(j+1) = -conj(sin(j))*hessenberg(j) + - // conj(cos(j))*hessenberg(j+1) - // hessenberg(j) = temp; + for (size_type j = 0; j < residual->get_size()[1]; ++j) { + residual_norm_collection->at(0, j) = residual_norm->at(0, j); + for (size_type i = 0; i < residual->get_size()[0]; ++i) { + krylov_bases->at(i, j) = + residual->at(i, j) / residual_norm->at(0, j); } - - calculate_sin_and_cos(givens_sin, givens_cos, hessenberg_iter, iter, i); - - hessenberg_iter->at(iter, i) = - givens_cos->at(iter, i) * hessenberg_iter->at(iter, i) + - givens_sin->at(iter, i) * hessenberg_iter->at(iter + 1, i); - hessenberg_iter->at(iter + 1, i) = zero(); - // hessenberg(iter) = cos(iter)*hessenberg(iter) + - // sin(iter)*hessenberg(iter + 1) - // hessenberg(iter+1) = 0 + final_iter_nums[j] = 0; } } - -template -void calculate_next_residual_norm( - matrix::Dense* givens_sin, matrix::Dense* givens_cos, - matrix::Dense>* residual_norm, - matrix::Dense* residual_norm_collection, size_type iter, - const stopping_status* stop_status) -{ - for (size_type i = 0; i < residual_norm->get_size()[1]; ++i) { - if (stop_status[i].has_stopped()) { - continue; - } - residual_norm_collection->at(iter + 1, i) = - -conj(givens_sin->at(iter, i)) * - residual_norm_collection->at(iter, i); - residual_norm_collection->at(iter, i) = - givens_cos->at(iter, i) * residual_norm_collection->at(iter, i); - residual_norm->at(0, i) = - abs(residual_norm_collection->at(iter + 1, i)); - } -} +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GMRES_RESTART_KERNEL); template -void solve_upper_triangular( - const matrix::Dense* residual_norm_collection, - const matrix::Dense* hessenberg, matrix::Dense* y, - const size_type* final_iter_nums) -{ - for (size_type k = 0; k < residual_norm_collection->get_size()[1]; ++k) { - for (int i = final_iter_nums[k] - 1; i >= 0; --i) { - auto temp = residual_norm_collection->at(i, k); - for (size_type j = i + 1; j < final_iter_nums[k]; ++j) { - temp -= - hessenberg->at( - i, j * residual_norm_collection->get_size()[1] + k) * - y->at(j, k); - } - y->at(i, k) = - temp / hessenberg->at( - i, i * residual_norm_collection->get_size()[1] + k); - } - } -} - - -template -void calculate_qy(const matrix::Dense* krylov_bases, - const matrix::Dense* y, - matrix::Dense* before_preconditioner, - const size_type* final_iter_nums) +void multi_axpy(std::shared_ptr exec, + const matrix::Dense* krylov_bases, + const matrix::Dense* y, + matrix::Dense* before_preconditioner, + const size_type* final_iter_nums, stopping_status* stop_status) { const auto krylov_bases_rowoffset = before_preconditioner->get_size()[0]; for (size_type k = 0; k < before_preconditioner->get_size()[1]; ++k) { + if (stop_status[k].is_finalized()) { + continue; + } for (size_type i = 0; i < before_preconditioner->get_size()[0]; ++i) { before_preconditioner->at(i, k) = zero(); for (size_type j = 0; j < final_iter_nums[k]; ++j) { @@ -222,109 +92,13 @@ void calculate_qy(const matrix::Dense* krylov_bases, y->at(j, k); } } - } -} - - -} // namespace - - -template -void initialize_1(std::shared_ptr exec, - const matrix::Dense* b, - matrix::Dense* residual, - matrix::Dense* givens_sin, - matrix::Dense* givens_cos, - array* stop_status, size_type krylov_dim) -{ - using NormValueType = remove_complex; - for (size_type j = 0; j < b->get_size()[1]; ++j) { - for (size_type i = 0; i < b->get_size()[0]; ++i) { - residual->at(i, j) = b->at(i, j); - } - for (size_type i = 0; i < krylov_dim; ++i) { - givens_sin->at(i, j) = zero(); - givens_cos->at(i, j) = zero(); + if (stop_status[k].has_stopped()) { + stop_status[k].finalize(); } - stop_status->get_data()[j].reset(); } } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GMRES_INITIALIZE_1_KERNEL); - - -template -void initialize_2(std::shared_ptr exec, - const matrix::Dense* residual, - matrix::Dense>* residual_norm, - matrix::Dense* residual_norm_collection, - matrix::Dense* krylov_bases, - array* final_iter_nums, array&, - size_type krylov_dim) -{ - for (size_type j = 0; j < residual->get_size()[1]; ++j) { - // Calculate residual norm - residual_norm->at(0, j) = 0; - for (size_type i = 0; i < residual->get_size()[0]; ++i) { - residual_norm->at(0, j) += squared_norm(residual->at(i, j)); - } - residual_norm->at(0, j) = sqrt(residual_norm->at(0, j)); - residual_norm_collection->at(0, j) = residual_norm->at(0, j); - for (size_type i = 0; i < residual->get_size()[0]; ++i) { - krylov_bases->at(i, j) = - residual->at(i, j) / residual_norm->at(0, j); - } - final_iter_nums->get_data()[j] = 0; - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GMRES_INITIALIZE_2_KERNEL); - - -template -void step_1(std::shared_ptr exec, size_type num_rows, - matrix::Dense* givens_sin, - matrix::Dense* givens_cos, - matrix::Dense>* residual_norm, - matrix::Dense* residual_norm_collection, - matrix::Dense* krylov_bases, - matrix::Dense* hessenberg_iter, size_type iter, - array* final_iter_nums, - const array* stop_status) -{ - for (size_type i = 0; i < final_iter_nums->get_num_elems(); ++i) { - final_iter_nums->get_data()[i] += - (1 - stop_status->get_const_data()[i].has_stopped()); - } - - finish_arnoldi(num_rows, krylov_bases, hessenberg_iter, iter, - stop_status->get_const_data()); - givens_rotation(givens_sin, givens_cos, hessenberg_iter, iter, - stop_status->get_const_data()); - calculate_next_residual_norm(givens_sin, givens_cos, residual_norm, - residual_norm_collection, iter, - stop_status->get_const_data()); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GMRES_STEP_1_KERNEL); - - -template -void step_2(std::shared_ptr exec, - const matrix::Dense* residual_norm_collection, - const matrix::Dense* krylov_bases, - const matrix::Dense* hessenberg, - matrix::Dense* y, - matrix::Dense* before_preconditioner, - const array* final_iter_nums) -{ - solve_upper_triangular(residual_norm_collection, hessenberg, y, - final_iter_nums->get_const_data()); - calculate_qy(krylov_bases, y, before_preconditioner, - final_iter_nums->get_const_data()); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GMRES_STEP_2_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GMRES_MULTI_AXPY_KERNEL); } // namespace gmres diff --git a/reference/test/solver/gmres_kernels.cpp b/reference/test/solver/gmres_kernels.cpp index 4d7306a4ae7..927d106bdb5 100644 --- a/reference/test/solver/gmres_kernels.cpp +++ b/reference/test/solver/gmres_kernels.cpp @@ -33,6 +33,10 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include +#include + + #include @@ -47,6 +51,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include "core/solver/common_gmres_kernels.hpp" +#include "core/solver/gmres_kernels.hpp" #include "core/test/utils.hpp" @@ -57,10 +63,14 @@ template class Gmres : public ::testing::Test { protected: using value_type = T; + using rc_value_type = gko::remove_complex; using Mtx = gko::matrix::Dense; + using rc_Mtx = gko::matrix::Dense; using Solver = gko::solver::Gmres; Gmres() : exec(gko::ReferenceExecutor::create()), + stopped{}, + non_stopped{}, mtx(gko::initialize( {{1.0, 2.0, 3.0}, {3.0, 2.0, -1.0}, {0.0, -1.0, 2}}, exec)), gmres_factory( @@ -73,6 +83,7 @@ class Gmres : public ::testing::Test { gko::stop::ResidualNorm::build() .with_reduction_factor(r::value) .on(exec)) + .with_krylov_dim(3u) .on(exec)), mtx_big(gko::initialize( {{2295.7, -764.8, 1166.5, 428.9, 291.7, -774.5}, @@ -107,9 +118,55 @@ class Gmres : public ::testing::Test { {-111.40, -22.60, 110.10, -106.20, 88.90}, {-0.70, 111.70, 154.40, 235.00, -76.50}}, exec)) - {} + { + auto small_size = gko::dim<2>{3, 2}; + constexpr gko::size_type small_restart{2}; + small_b = gko::initialize( + {I{1., 2.}, I{3., 4.}, I{5., 6.}}, exec); + small_x = Mtx::create(exec, small_size); + small_residual = Mtx::create(exec, small_size); + small_residual_norm = + rc_Mtx::create(exec, gko::dim<2>{1, small_size[1]}); + small_residual_norm_collection = + Mtx::create(exec, gko::dim<2>{small_restart + 1, small_size[1]}); + small_krylov_bases = Mtx::create( + exec, + gko::dim<2>{small_size[0] * (small_restart + 1), small_size[1]}); + + small_givens_sin = + Mtx::create(exec, gko::dim<2>{small_restart, small_size[1]}); + small_givens_cos = + Mtx::create(exec, gko::dim<2>{small_restart, small_size[1]}); + small_y = Mtx::create(exec, gko::dim<2>{small_restart, small_size[1]}); + small_hessenberg = Mtx::create( + exec, + gko::dim<2>{small_restart + 1, small_restart * small_size[1]}); + small_hessenberg->fill(gko::zero()); + + stopped.converge(1, true); + non_stopped.reset(); + small_stop = gko::array(exec, small_size[1]); + std::fill_n(small_stop.get_data(), small_stop.get_num_elems(), + non_stopped); + small_final_iter_nums = gko::array(exec, small_size[1]); + } - std::shared_ptr exec; + std::shared_ptr exec; + std::unique_ptr small_x; + std::unique_ptr small_b; + std::unique_ptr small_residual; + std::unique_ptr small_residual_norm; + std::unique_ptr small_residual_norm_collection; + std::unique_ptr small_krylov_bases; + std::unique_ptr small_givens_sin; + std::unique_ptr small_givens_cos; + std::unique_ptr small_y; + std::unique_ptr small_hessenberg; + gko::array small_final_iter_nums; + gko::array small_stop; + + gko::stopping_status stopped; + gko::stopping_status non_stopped; std::shared_ptr mtx; std::shared_ptr mtx_medium; std::shared_ptr mtx_big; @@ -121,6 +178,234 @@ class Gmres : public ::testing::Test { TYPED_TEST_SUITE(Gmres, gko::test::ValueTypes, TypenameNameGenerator); +TYPED_TEST(Gmres, KernelInitialize) +{ + using Mtx = typename TestFixture::Mtx; + using T = typename TestFixture::value_type; + const T nan = std::numeric_limits>::quiet_NaN(); + this->small_residual->fill(nan); + this->small_givens_sin->fill(nan); + this->small_givens_cos->fill(nan); + std::fill_n(this->small_stop.get_data(), this->small_stop.get_num_elems(), + this->stopped); + auto expected_sin_cos = + Mtx::create(this->exec, this->small_givens_sin->get_size()); + expected_sin_cos->fill(gko::zero()); + + gko::kernels::reference::common_gmres::initialize( + this->exec, this->small_b.get(), this->small_residual.get(), + this->small_givens_sin.get(), this->small_givens_cos.get(), + this->small_stop.get_data()); + + GKO_ASSERT_MTX_NEAR(this->small_residual, this->small_b, 0); + GKO_ASSERT_MTX_NEAR(this->small_givens_sin, expected_sin_cos, 0); + GKO_ASSERT_MTX_NEAR(this->small_givens_cos, expected_sin_cos, 0); + for (int i = 0; i < this->small_stop.get_num_elems(); ++i) { + ASSERT_EQ(this->small_stop.get_data()[i], this->non_stopped); + } +} + + +TYPED_TEST(Gmres, KernelRestart) +{ + using value_type = typename TestFixture::value_type; + using Mtx = typename TestFixture::Mtx; + const value_type nan = + std::numeric_limits>::quiet_NaN(); + this->small_residual->copy_from(this->small_b.get()); + this->small_residual->compute_norm2(this->small_residual_norm.get()); + this->small_residual_norm_collection->fill(nan); + this->small_krylov_bases->fill(9999); + std::fill_n(this->small_final_iter_nums.get_data(), + this->small_final_iter_nums.get_num_elems(), 999); + auto expected_krylov = gko::clone(this->exec, this->small_krylov_bases); + const auto small_size = this->small_residual->get_size(); + for (int i = 0; i < small_size[0]; ++i) { + for (int j = 0; j < small_size[1]; ++j) { + expected_krylov + ->get_values()[(0 * small_size[0] + i) * small_size[1] + j] = + this->small_residual + ->get_const_values()[i * small_size[1] + j] / + this->small_residual_norm->get_const_values()[j]; + } + } + + gko::kernels::reference::gmres::restart( + this->exec, this->small_residual.get(), this->small_residual_norm.get(), + this->small_residual_norm_collection.get(), + this->small_krylov_bases.get(), this->small_final_iter_nums.get_data()); + + ASSERT_EQ(this->small_final_iter_nums.get_num_elems(), + this->small_residual_norm_collection->get_size()[1]); + for (int i = 0; i < this->small_final_iter_nums.get_num_elems(); ++i) { + ASSERT_EQ(this->small_final_iter_nums.get_const_data()[i], 0); + ASSERT_EQ(this->small_residual_norm_collection->get_const_values()[i], + this->small_residual_norm->get_const_values()[i]); + } + GKO_ASSERT_MTX_NEAR(this->small_krylov_bases, expected_krylov, + r::value); +} + + +TYPED_TEST(Gmres, KernelHessenbergQrIter0) +{ + using T = typename TestFixture::value_type; + using Mtx = typename TestFixture::Mtx; + using std::sqrt; + const auto nan = std::numeric_limits>::quiet_NaN(); + const gko::size_type iteration{0}; + this->small_givens_cos = + gko::initialize({I{-0.5, 1.}, I{70., -71}}, this->exec); + this->small_givens_sin = + gko::initialize({I{1., 0.}, I{-72., 73.}}, this->exec); + this->small_residual_norm->fill(nan); + this->small_residual_norm_collection = gko::initialize( + {I{1.25, 1.5}, I{nan, nan}, I{95., 94.}}, this->exec); + this->small_hessenberg = gko::initialize( + {I{0.5, -0.75}, I{-0.5, 1}, I{97., 96.}}, this->exec); + this->small_final_iter_nums.get_data()[0] = 0; + this->small_final_iter_nums.get_data()[1] = 0; + + gko::kernels::reference::common_gmres::hessenberg_qr( + this->exec, this->small_givens_sin.get(), this->small_givens_cos.get(), + this->small_residual_norm.get(), + this->small_residual_norm_collection.get(), + this->small_hessenberg.get(), iteration, + this->small_final_iter_nums.get_data(), + this->small_stop.get_const_data()); + + ASSERT_EQ(this->small_final_iter_nums.get_data()[0], 1); + ASSERT_EQ(this->small_final_iter_nums.get_data()[1], 1); + GKO_EXPECT_MTX_NEAR(this->small_givens_cos, + l({{0.5 * sqrt(2.), -0.6}, {70., -71.}}), r::value); + GKO_EXPECT_MTX_NEAR(this->small_givens_sin, + l({{-0.5 * sqrt(2.), 0.8}, {-72., 73.}}), r::value); + GKO_EXPECT_MTX_NEAR(this->small_hessenberg, + l({{0.5 * sqrt(2.), 1.25}, {0., 0.}, {97., 96.}}), + r::value); + GKO_EXPECT_MTX_NEAR( + this->small_residual_norm_collection, + l({{0.625 * sqrt(2.), -0.9}, {0.625 * sqrt(2.), -1.2}, {95., 94.}}), + r::value); + GKO_EXPECT_MTX_NEAR(this->small_residual_norm, l({{0.625 * sqrt(2.), 1.2}}), + r::value); +} + + +TYPED_TEST(Gmres, KernelHessenbergQrIter1) +{ + using T = typename TestFixture::value_type; + using Mtx = typename TestFixture::Mtx; + using std::sqrt; + const auto nan = std::numeric_limits>::quiet_NaN(); + const gko::size_type iteration{1}; + this->small_givens_cos = + gko::initialize({I{1., 0.5}, I{-0.5, 1.}}, this->exec); + this->small_givens_sin = + gko::initialize({I{0.5, 0.25}, I{1., 0.}}, this->exec); + this->small_residual_norm->fill(nan); + this->small_residual_norm_collection = gko::initialize( + {I{95., 94.}, I{1.25, 1.5}, I{nan, nan}}, this->exec); + this->small_hessenberg = gko::initialize( + {I{-0.5, 4}, I{0.25, 0.5}, I{-0.5, 1}}, this->exec); + this->small_final_iter_nums.get_data()[0] = 1; + this->small_final_iter_nums.get_data()[1] = 1; + + gko::kernels::reference::common_gmres::hessenberg_qr( + this->exec, this->small_givens_sin.get(), this->small_givens_cos.get(), + this->small_residual_norm.get(), + this->small_residual_norm_collection.get(), + this->small_hessenberg.get(), iteration, + this->small_final_iter_nums.get_data(), + this->small_stop.get_const_data()); + + ASSERT_EQ(this->small_final_iter_nums.get_data()[0], 2); + ASSERT_EQ(this->small_final_iter_nums.get_data()[1], 2); + GKO_EXPECT_MTX_NEAR(this->small_givens_cos, + l({{1., 0.5}, {0.5 * sqrt(2.), -0.6}}), r::value); + GKO_EXPECT_MTX_NEAR(this->small_givens_sin, + l({{0.5, 0.25}, {-0.5 * sqrt(2.), 0.8}}), r::value); + GKO_EXPECT_MTX_NEAR(this->small_hessenberg, + l({{-0.375, 2.125}, {0.5 * sqrt(2.), 1.25}, {0., 0.}}), + r::value); + GKO_EXPECT_MTX_NEAR( + this->small_residual_norm_collection, + l({{95., 94.}, {0.625 * sqrt(2.), -0.9}, {0.625 * sqrt(2.), -1.2}}), + r::value); + GKO_EXPECT_MTX_NEAR(this->small_residual_norm, l({{0.625 * sqrt(2.), 1.2}}), + r::value); +} + + +TYPED_TEST(Gmres, KernelSolveKrylov) +{ + using T = typename TestFixture::value_type; + using Mtx = typename TestFixture::Mtx; + const T nan = std::numeric_limits>::quiet_NaN(); + const auto restart = this->small_givens_sin->get_size()[0]; + this->small_y->fill(nan); + this->small_final_iter_nums.get_data()[0] = restart; + this->small_final_iter_nums.get_data()[1] = restart; + this->small_hessenberg = gko::initialize( + // clang-format off + {{-1, 3, 2, -4}, + {0, 0, 1, 5}, + {nan, nan, nan, nan}}, + // clang-format on + this->exec); + this->small_residual_norm_collection = + gko::initialize({I{12, 3}, I{-3, 15}}, this->exec); + + gko::kernels::reference::common_gmres::solve_krylov( + this->exec, this->small_residual_norm_collection.get(), + this->small_hessenberg.get(), this->small_y.get(), + this->small_final_iter_nums.get_const_data(), + this->small_stop.get_const_data()); + + GKO_ASSERT_MTX_NEAR(this->small_y, l({{-18., 5.}, {-3., 3.}}), r::value); +} + + +TYPED_TEST(Gmres, KernelMultiAxpy) +{ + using T = typename TestFixture::value_type; + using Mtx = typename TestFixture::Mtx; + const T nan = std::numeric_limits>::quiet_NaN(); + const auto restart = this->small_givens_sin->get_size()[0]; + this->small_x->fill(nan); + this->small_y = + gko::initialize({I{1., 2.}, I{3., -1.}}, this->exec); + this->small_final_iter_nums.get_data()[0] = restart; + this->small_final_iter_nums.get_data()[1] = restart; + this->small_krylov_bases = gko::initialize( // restart+1 x rows x #rhs + { + I{1, 10}, // 0, 0, x + I{2, 11}, // 0, 1, x + I{3, 12}, // 0, 2, x + I{4, 13}, // 1, 0, x + I{5, 14}, // 1, 1, x + I{6, 15}, // 1, 2, x + I{nan, nan}, // 2, 0, x + I{nan, nan}, // 2, 1, x + I{nan, nan}, // 2, 2, x + }, + this->exec); + this->small_stop.get_data()[0].stop(7, false); + gko::stopping_status expected_stop{}; + expected_stop.stop(7, true); + + gko::kernels::reference::gmres::multi_axpy( + this->exec, this->small_krylov_bases.get(), this->small_y.get(), + this->small_x.get(), this->small_final_iter_nums.get_const_data(), + this->small_stop.get_data()); + + ASSERT_EQ(this->small_stop.get_const_data()[0], expected_stop); + ASSERT_EQ(this->small_stop.get_const_data()[1], this->non_stopped); + GKO_ASSERT_MTX_NEAR(this->small_x, l({{13., 7.}, {17., 8.}, {21., 9.}}), + r::value); +} + + TYPED_TEST(Gmres, SolvesStencilSystem) { using Mtx = typename TestFixture::Mtx; @@ -418,12 +703,9 @@ TYPED_TEST(Gmres, SolvesMultipleDenseSystemForDivergenceCheck) auto alpha = gko::initialize({1.0}, this->exec); auto beta = gko::initialize({-1.0}, this->exec); - auto residual1 = Mtx::create(this->exec, b1->get_size()); - residual1->copy_from(b1.get()); - auto residual2 = Mtx::create(this->exec, b2->get_size()); - residual2->copy_from(b2.get()); - auto residualC = Mtx::create(this->exec, bc->get_size()); - residualC->copy_from(bc.get()); + auto residual1 = gko::clone(this->exec, b1); + auto residual2 = gko::clone(this->exec, b2); + auto residualC = gko::clone(this->exec, bc); this->mtx_big->apply(alpha.get(), x1.get(), beta.get(), residual1.get()); this->mtx_big->apply(alpha.get(), x2.get(), beta.get(), residual2.get()); @@ -441,8 +723,8 @@ TYPED_TEST(Gmres, SolvesMultipleDenseSystemForDivergenceCheck) ASSERT_LE(normC1 / normB1, normS1 / normB1 + r::value); ASSERT_LE(normC2 / normB2, normS2 / normB2 + r::value); - // Not sure if this is necessary, the assertions above should cover what is - // needed. + // Not sure if this is necessary, the assertions above should cover what + // is needed. GKO_ASSERT_MTX_NEAR(xc, mergedRes, r::value); } diff --git a/test/solver/cb_gmres_kernels.cpp b/test/solver/cb_gmres_kernels.cpp index 9947d3bd5f6..d78f839e26f 100644 --- a/test/solver/cb_gmres_kernels.cpp +++ b/test/solver/cb_gmres_kernels.cpp @@ -234,10 +234,10 @@ TEST_F(CbGmres, CbGmresInitialize1IsEquivalentToRef) { initialize_data(); - gko::kernels::reference::cb_gmres::initialize_1( + gko::kernels::reference::cb_gmres::initialize( ref, b.get(), residual.get(), givens_sin.get(), givens_cos.get(), stop_status.get(), default_krylov_dim_mixed); - gko::kernels::EXEC_NAMESPACE::cb_gmres::initialize_1( + gko::kernels::EXEC_NAMESPACE::cb_gmres::initialize( exec, d_b.get(), d_residual.get(), d_givens_sin.get(), d_givens_cos.get(), d_stop_status.get(), default_krylov_dim_mixed); @@ -253,12 +253,12 @@ TEST_F(CbGmres, CbGmresInitialize2IsEquivalentToRef) gko::array tmp{ref}; gko::array dtmp{exec}; - gko::kernels::reference::cb_gmres::initialize_2( + gko::kernels::reference::cb_gmres::restart( ref, residual.get(), residual_norm.get(), residual_norm_collection.get(), arnoldi_norm.get(), range_helper.get_range(), next_krylov_basis.get(), final_iter_nums.get(), tmp, default_krylov_dim_mixed); - gko::kernels::EXEC_NAMESPACE::cb_gmres::initialize_2( + gko::kernels::EXEC_NAMESPACE::cb_gmres::restart( exec, d_residual.get(), d_residual_norm.get(), d_residual_norm_collection.get(), d_arnoldi_norm.get(), d_range_helper.get_range(), d_next_krylov_basis.get(), @@ -277,13 +277,13 @@ TEST_F(CbGmres, CbGmresStep1IsEquivalentToRef) initialize_data(); int iter = 5; - gko::kernels::reference::cb_gmres::step_1( + gko::kernels::reference::cb_gmres::arnoldi( ref, next_krylov_basis.get(), givens_sin.get(), givens_cos.get(), residual_norm.get(), residual_norm_collection.get(), range_helper.get_range(), hessenberg_iter.get(), buffer_iter.get(), arnoldi_norm.get(), iter, final_iter_nums.get(), stop_status.get(), reorth_status.get(), num_reorth.get()); - gko::kernels::EXEC_NAMESPACE::cb_gmres::step_1( + gko::kernels::EXEC_NAMESPACE::cb_gmres::arnoldi( exec, d_next_krylov_basis.get(), d_givens_sin.get(), d_givens_cos.get(), d_residual_norm.get(), d_residual_norm_collection.get(), d_range_helper.get_range(), d_hessenberg_iter.get(), @@ -309,11 +309,11 @@ TEST_F(CbGmres, CbGmresStep2IsEquivalentToRef) { initialize_data(); - gko::kernels::reference::cb_gmres::step_2( + gko::kernels::reference::cb_gmres::solve_krylov( ref, residual_norm_collection.get(), range_helper.get_range().get_accessor().to_const(), hessenberg.get(), y.get(), before_preconditioner.get(), final_iter_nums.get()); - gko::kernels::EXEC_NAMESPACE::cb_gmres::step_2( + gko::kernels::EXEC_NAMESPACE::cb_gmres::solve_krylov( exec, d_residual_norm_collection.get(), d_range_helper.get_range().get_accessor().to_const(), d_hessenberg.get(), d_y.get(), d_before_preconditioner.get(), diff --git a/test/solver/gmres_kernels.cpp b/test/solver/gmres_kernels.cpp index 4740d946bce..9f55adf75c4 100644 --- a/test/solver/gmres_kernels.cpp +++ b/test/solver/gmres_kernels.cpp @@ -49,6 +49,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include "core/solver/common_gmres_kernels.hpp" #include "core/test/utils.hpp" #include "test/utils/executor.hpp" @@ -115,15 +116,13 @@ class Gmres : public CommonTestFixture { gen_mtx(gko::solver::default_krylov_dim + 1, nrhs); givens_sin = gen_mtx(gko::solver::default_krylov_dim, nrhs); givens_cos = gen_mtx(gko::solver::default_krylov_dim, nrhs); - stop_status = - std::make_unique>(ref, nrhs); - for (size_t i = 0; i < stop_status->get_num_elems(); ++i) { - stop_status->get_data()[i].reset(); + stop_status = gko::array(ref, nrhs); + for (size_t i = 0; i < stop_status.get_num_elems(); ++i) { + stop_status.get_data()[i].reset(); } - final_iter_nums = - std::make_unique>(ref, nrhs); - for (size_t i = 0; i < final_iter_nums->get_num_elems(); ++i) { - final_iter_nums->get_data()[i] = 5; + final_iter_nums = gko::array(ref, nrhs); + for (size_t i = 0; i < final_iter_nums.get_num_elems(); ++i) { + final_iter_nums.get_data()[i] = 5; } d_x = gko::clone(exec, x); @@ -138,10 +137,8 @@ class Gmres : public CommonTestFixture { d_residual_norm_collection = gko::clone(exec, residual_norm_collection); d_givens_sin = gko::clone(exec, givens_sin); d_givens_cos = gko::clone(exec, givens_cos); - d_stop_status = std::make_unique>( - exec, *stop_status); - d_final_iter_nums = std::make_unique>( - exec, *final_iter_nums); + d_stop_status = gko::array(exec, stop_status); + d_final_iter_nums = gko::array(exec, final_iter_nums); } std::default_random_engine rand_engine; @@ -163,8 +160,8 @@ class Gmres : public CommonTestFixture { std::unique_ptr residual_norm_collection; std::unique_ptr givens_sin; std::unique_ptr givens_cos; - std::unique_ptr> stop_status; - std::unique_ptr> final_iter_nums; + gko::array stop_status; + gko::array final_iter_nums; std::unique_ptr d_x; std::unique_ptr d_before_preconditioner; @@ -178,67 +175,65 @@ class Gmres : public CommonTestFixture { std::unique_ptr d_residual_norm_collection; std::unique_ptr d_givens_sin; std::unique_ptr d_givens_cos; - std::unique_ptr> d_stop_status; - std::unique_ptr> d_final_iter_nums; + gko::array d_stop_status; + gko::array d_final_iter_nums; }; -TEST_F(Gmres, GmresInitialize1IsEquivalentToRef) +TEST_F(Gmres, GmresKernelInitializeIsEquivalentToRef) { initialize_data(); - gko::kernels::reference::gmres::initialize_1( + gko::kernels::reference::common_gmres::initialize( ref, b.get(), residual.get(), givens_sin.get(), givens_cos.get(), - stop_status.get(), gko::solver::default_krylov_dim); - gko::kernels::EXEC_NAMESPACE::gmres::initialize_1( + stop_status.get_data()); + gko::kernels::EXEC_NAMESPACE::common_gmres::initialize( exec, d_b.get(), d_residual.get(), d_givens_sin.get(), - d_givens_cos.get(), d_stop_status.get(), - gko::solver::default_krylov_dim); + d_givens_cos.get(), d_stop_status.get_data()); GKO_ASSERT_MTX_NEAR(d_residual, residual, r::value); GKO_ASSERT_MTX_NEAR(d_givens_sin, givens_sin, r::value); GKO_ASSERT_MTX_NEAR(d_givens_cos, givens_cos, r::value); - GKO_ASSERT_ARRAY_EQ(*d_stop_status, *stop_status); + GKO_ASSERT_ARRAY_EQ(d_stop_status, stop_status); } -TEST_F(Gmres, GmresInitialize2IsEquivalentToRef) +TEST_F(Gmres, GmresKernelRestartIsEquivalentToRef) { initialize_data(); - gko::array tmp{ref}; - gko::array dtmp{exec}; + residual->compute_norm2(residual_norm.get()); + d_residual_norm->copy_from(residual_norm.get()); - gko::kernels::reference::gmres::initialize_2( + gko::kernels::reference::gmres::restart( ref, residual.get(), residual_norm.get(), residual_norm_collection.get(), krylov_bases.get(), - final_iter_nums.get(), tmp, gko::solver::default_krylov_dim); - gko::kernels::EXEC_NAMESPACE::gmres::initialize_2( + final_iter_nums.get_data()); + gko::kernels::EXEC_NAMESPACE::gmres::restart( exec, d_residual.get(), d_residual_norm.get(), d_residual_norm_collection.get(), d_krylov_bases.get(), - d_final_iter_nums.get(), dtmp, gko::solver::default_krylov_dim); + d_final_iter_nums.get_data()); GKO_ASSERT_MTX_NEAR(d_residual_norm, residual_norm, r::value); GKO_ASSERT_MTX_NEAR(d_residual_norm_collection, residual_norm_collection, r::value); GKO_ASSERT_MTX_NEAR(d_krylov_bases, krylov_bases, r::value); - GKO_ASSERT_ARRAY_EQ(*d_final_iter_nums, *final_iter_nums); + GKO_ASSERT_ARRAY_EQ(d_final_iter_nums, final_iter_nums); } -TEST_F(Gmres, GmresStep1IsEquivalentToRef) +TEST_F(Gmres, GmresKernelHessenbergQRIsEquivalentToRef) { initialize_data(); int iter = 5; - gko::kernels::reference::gmres::step_1( - ref, x->get_size()[0], givens_sin.get(), givens_cos.get(), - residual_norm.get(), residual_norm_collection.get(), krylov_bases.get(), - hessenberg_iter.get(), iter, final_iter_nums.get(), stop_status.get()); - gko::kernels::EXEC_NAMESPACE::gmres::step_1( - exec, d_x->get_size()[0], d_givens_sin.get(), d_givens_cos.get(), - d_residual_norm.get(), d_residual_norm_collection.get(), - d_krylov_bases.get(), d_hessenberg_iter.get(), iter, - d_final_iter_nums.get(), d_stop_status.get()); + gko::kernels::reference::common_gmres::hessenberg_qr( + ref, givens_sin.get(), givens_cos.get(), residual_norm.get(), + residual_norm_collection.get(), hessenberg_iter.get(), iter, + final_iter_nums.get_data(), stop_status.get_const_data()); + gko::kernels::EXEC_NAMESPACE::common_gmres::hessenberg_qr( + exec, d_givens_sin.get(), d_givens_cos.get(), d_residual_norm.get(), + d_residual_norm_collection.get(), d_hessenberg_iter.get(), iter, + d_final_iter_nums.get_data(), d_stop_status.get_const_data()); GKO_ASSERT_MTX_NEAR(d_givens_sin, givens_sin, r::value); GKO_ASSERT_MTX_NEAR(d_givens_cos, givens_cos, r::value); @@ -246,26 +241,25 @@ TEST_F(Gmres, GmresStep1IsEquivalentToRef) GKO_ASSERT_MTX_NEAR(d_residual_norm_collection, residual_norm_collection, r::value); GKO_ASSERT_MTX_NEAR(d_hessenberg_iter, hessenberg_iter, - 2 * r::value); + r::value); GKO_ASSERT_MTX_NEAR(d_krylov_bases, krylov_bases, r::value); - GKO_ASSERT_ARRAY_EQ(*d_final_iter_nums, *final_iter_nums); + GKO_ASSERT_ARRAY_EQ(d_final_iter_nums, final_iter_nums); } -TEST_F(Gmres, GmresStep1OnSingleRHSIsEquivalentToRef) +TEST_F(Gmres, GmresKernelHessenbergQROnSingleRHSIsEquivalentToRef) { initialize_data(1); int iter = 5; - gko::kernels::reference::gmres::step_1( - ref, x->get_size()[0], givens_sin.get(), givens_cos.get(), - residual_norm.get(), residual_norm_collection.get(), krylov_bases.get(), - hessenberg_iter.get(), iter, final_iter_nums.get(), stop_status.get()); - gko::kernels::EXEC_NAMESPACE::gmres::step_1( - exec, d_x->get_size()[0], d_givens_sin.get(), d_givens_cos.get(), - d_residual_norm.get(), d_residual_norm_collection.get(), - d_krylov_bases.get(), d_hessenberg_iter.get(), iter, - d_final_iter_nums.get(), d_stop_status.get()); + gko::kernels::reference::common_gmres::hessenberg_qr( + ref, givens_sin.get(), givens_cos.get(), residual_norm.get(), + residual_norm_collection.get(), hessenberg_iter.get(), iter, + final_iter_nums.get_data(), stop_status.get_const_data()); + gko::kernels::EXEC_NAMESPACE::common_gmres::hessenberg_qr( + exec, d_givens_sin.get(), d_givens_cos.get(), d_residual_norm.get(), + d_residual_norm_collection.get(), d_hessenberg_iter.get(), iter, + d_final_iter_nums.get_data(), d_stop_status.get_const_data()); GKO_ASSERT_MTX_NEAR(d_givens_sin, givens_sin, r::value); GKO_ASSERT_MTX_NEAR(d_givens_cos, givens_cos, r::value); @@ -273,27 +267,41 @@ TEST_F(Gmres, GmresStep1OnSingleRHSIsEquivalentToRef) GKO_ASSERT_MTX_NEAR(d_residual_norm_collection, residual_norm_collection, r::value); GKO_ASSERT_MTX_NEAR(d_hessenberg_iter, hessenberg_iter, - 2 * r::value); + r::value); GKO_ASSERT_MTX_NEAR(d_krylov_bases, krylov_bases, r::value); - GKO_ASSERT_ARRAY_EQ(*d_final_iter_nums, *final_iter_nums); + GKO_ASSERT_ARRAY_EQ(d_final_iter_nums, final_iter_nums); } -TEST_F(Gmres, GmresStep2IsEquivalentToRef) +TEST_F(Gmres, GmresKernelSolveKrylovIsEquivalentToRef) { initialize_data(); - gko::kernels::reference::gmres::step_2(ref, residual_norm_collection.get(), - krylov_bases.get(), hessenberg.get(), - y.get(), before_preconditioner.get(), - final_iter_nums.get()); - gko::kernels::EXEC_NAMESPACE::gmres::step_2( - exec, d_residual_norm_collection.get(), d_krylov_bases.get(), - d_hessenberg.get(), d_y.get(), d_before_preconditioner.get(), - d_final_iter_nums.get()); + gko::kernels::reference::common_gmres::solve_krylov( + ref, residual_norm_collection.get(), hessenberg.get(), y.get(), + final_iter_nums.get_const_data(), stop_status.get_const_data()); + gko::kernels::EXEC_NAMESPACE::common_gmres::solve_krylov( + exec, d_residual_norm_collection.get(), d_hessenberg.get(), d_y.get(), + d_final_iter_nums.get_const_data(), d_stop_status.get_const_data()); GKO_ASSERT_MTX_NEAR(d_y, y, r::value); - GKO_ASSERT_MTX_NEAR(d_x, x, r::value); +} + + +TEST_F(Gmres, GmresKernelMultiAxpyIsEquivalentToRef) +{ + initialize_data(); + + gko::kernels::reference::gmres::multi_axpy( + ref, krylov_bases.get(), y.get(), before_preconditioner.get(), + final_iter_nums.get_const_data(), stop_status.get_data()); + gko::kernels::EXEC_NAMESPACE::gmres::multi_axpy( + exec, d_krylov_bases.get(), d_y.get(), d_before_preconditioner.get(), + d_final_iter_nums.get_const_data(), d_stop_status.get_data()); + + GKO_ASSERT_MTX_NEAR(d_before_preconditioner, before_preconditioner, + r::value); + GKO_ASSERT_ARRAY_EQ(stop_status, d_stop_status); } @@ -311,5 +319,25 @@ TEST_F(Gmres, GmresApplyOneRHSIsEquivalentToRef) ref_solver->apply(b.get(), x.get()); exec_solver->apply(d_b.get(), d_x.get()); - GKO_ASSERT_MTX_NEAR(d_x, x, r::value * 100); + GKO_ASSERT_MTX_NEAR(d_b, b, 0); + GKO_ASSERT_MTX_NEAR(d_x, x, r::value * 1e2); +} + + +TEST_F(Gmres, GmresApplyMultipleRHSIsEquivalentToRef) +{ + int m = 123; + int n = 5; + auto ref_solver = ref_gmres_factory->generate(mtx); + auto exec_solver = exec_gmres_factory->generate(d_mtx); + auto b = gen_mtx(m, n); + auto x = gen_mtx(m, n); + auto d_b = gko::clone(exec, b); + auto d_x = gko::clone(exec, x); + + ref_solver->apply(b.get(), x.get()); + exec_solver->apply(d_b.get(), d_x.get()); + + GKO_ASSERT_MTX_NEAR(d_b, b, 0); + GKO_ASSERT_MTX_NEAR(d_x, x, r::value * 1e3); }