From 96b16778c7ddcc86785e341ed23e48809fe24a14 Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. Tsai" Date: Wed, 24 Nov 2021 17:43:59 +0800 Subject: [PATCH 01/11] script port --- ...cobi_advanced_apply_instantiate.inc.dp.cpp | 237 ++++++++ .../jacobi_advanced_apply_kernel.dp.cpp | 106 ++++ dpcpp/preconditioner/jacobi_common.hpp.in | 71 +++ .../jacobi_generate_instantiate.inc.dp.cpp | 364 ++++++++++++ .../jacobi_generate_kernel.dp.cpp | 103 ++++ dpcpp/preconditioner/jacobi_kernels.dp.cpp | 558 +++++++++++++----- ...jacobi_simple_apply_instantiate.inc.dp.cpp | 229 +++++++ .../jacobi_simple_apply_kernel.dp.cpp | 102 ++++ 8 files changed, 1616 insertions(+), 154 deletions(-) create mode 100644 dpcpp/preconditioner/jacobi_advanced_apply_instantiate.inc.dp.cpp create mode 100644 dpcpp/preconditioner/jacobi_advanced_apply_kernel.dp.cpp create mode 100644 dpcpp/preconditioner/jacobi_common.hpp.in create mode 100644 dpcpp/preconditioner/jacobi_generate_instantiate.inc.dp.cpp create mode 100644 dpcpp/preconditioner/jacobi_generate_kernel.dp.cpp create mode 100644 dpcpp/preconditioner/jacobi_simple_apply_instantiate.inc.dp.cpp create mode 100644 dpcpp/preconditioner/jacobi_simple_apply_kernel.dp.cpp diff --git a/dpcpp/preconditioner/jacobi_advanced_apply_instantiate.inc.dp.cpp b/dpcpp/preconditioner/jacobi_advanced_apply_instantiate.inc.dp.cpp new file mode 100644 index 00000000000..4425f8e277b --- /dev/null +++ b/dpcpp/preconditioner/jacobi_advanced_apply_instantiate.inc.dp.cpp @@ -0,0 +1,237 @@ +/************************************************************* +Copyright (c) 2017-2021, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#include "core/preconditioner/jacobi_kernels.hpp" + + +#include + + +#include + + +#include + + +#include "core/base/extended_float.hpp" +#include "core/matrix/dense_kernels.hpp" +#include "core/preconditioner/jacobi_utils.hpp" +#include "core/synthesizer/implementation_selection.hpp" +#include "dpcpp/base/config.hpp" +#include "dpcpp/base/dim3.dp.hpp" +#include "dpcpp/base/math.hpp" +#include "dpcpp/components/cooperative_groups.dp.hpp" +#include "dpcpp/components/thread_ids.dp.hpp" +#include "dpcpp/components/warp_blas.dp.hpp" + + +namespace gko { +namespace kernels { +namespace dpcpp { +/** + * @brief The Jacobi preconditioner namespace. + * @ref Jacobi + * @ingroup jacobi + */ +namespace jacobi { + + +namespace kernel { + + +template +void advanced_apply( + const ValueType* __restrict__ blocks, + preconditioner::block_interleaved_storage_scheme storage_scheme, + const IndexType* __restrict__ block_ptrs, size_type num_blocks, + const ValueType* __restrict__ alpha, const ValueType* __restrict__ b, + int32 b_stride, ValueType* __restrict__ x, int32 x_stride, + sycl::nd_item<3> item_ct1) +{ + const auto block_id = + thread::get_subwarp_id(item_ct1); + const auto subwarp = group::tiled_partition( + group::this_thread_block(item_ct1)); + if (block_id >= num_blocks) { + return; + } + const auto block_size = block_ptrs[block_id + 1] - block_ptrs[block_id]; + ValueType v = zero(); + if (subwarp.thread_rank() < block_size) { + v = alpha[0] * + b[(block_ptrs[block_id] + subwarp.thread_rank()) * b_stride]; + } + multiply_vec( + subwarp, block_size, v, + blocks + storage_scheme.get_global_block_offset(block_id) + + subwarp.thread_rank(), + storage_scheme.get_stride(), x + block_ptrs[block_id] * x_stride, + x_stride, + [](ValueType& result, const ValueType& out) { result += out; }); +} + +template +void advanced_apply( + dim3 grid, dim3 block, size_type dynamic_shared_memory, sycl::queue* queue, + const ValueType* blocks, + preconditioner::block_interleaved_storage_scheme storage_scheme, + const IndexType* block_ptrs, size_type num_blocks, const ValueType* alpha, + const ValueType* b, int32 b_stride, ValueType* x, int32 x_stride) +{ + queue->parallel_for( + sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + advanced_apply( + blocks, storage_scheme, block_ptrs, num_blocks, alpha, b, + b_stride, x, x_stride, item_ct1); + }); +} + + +template +void advanced_adaptive_apply( + const ValueType* __restrict__ blocks, + preconditioner::block_interleaved_storage_scheme storage_scheme, + const precision_reduction* __restrict__ block_precisions, + const IndexType* __restrict__ block_ptrs, size_type num_blocks, + const ValueType* __restrict__ alpha, const ValueType* __restrict__ b, + int32 b_stride, ValueType* __restrict__ x, int32 x_stride, + sycl::nd_item<3> item_ct1) +{ + const auto block_id = + thread::get_subwarp_id(item_ct1); + const auto subwarp = group::tiled_partition( + group::this_thread_block(item_ct1)); + if (block_id >= num_blocks) { + return; + } + const auto block_size = block_ptrs[block_id + 1] - block_ptrs[block_id]; + auto alpha_val = alpha == nullptr ? 
one() : alpha[0]; + ValueType v = zero(); + if (subwarp.thread_rank() < block_size) { + v = alpha[0] * + b[(block_ptrs[block_id] + subwarp.thread_rank()) * b_stride]; + } + GKO_PRECONDITIONER_JACOBI_RESOLVE_PRECISION( + ValueType, block_precisions[block_id], + multiply_vec( + subwarp, block_size, v, + reinterpret_cast( + blocks + storage_scheme.get_group_offset(block_id)) + + storage_scheme.get_block_offset(block_id) + + subwarp.thread_rank(), + storage_scheme.get_stride(), x + block_ptrs[block_id] * x_stride, + x_stride, + [](ValueType& result, const ValueType& out) { result += out; })); +} + +template +void advanced_adaptive_apply( + dim3 grid, dim3 block, size_type dynamic_shared_memory, sycl::queue* queue, + const ValueType* blocks, + preconditioner::block_interleaved_storage_scheme storage_scheme, + const precision_reduction* block_precisions, const IndexType* block_ptrs, + size_type num_blocks, const ValueType* alpha, const ValueType* b, + int32 b_stride, ValueType* x, int32 x_stride) +{ + queue->parallel_for(sycl_nd_range(grid, block), [=](sycl::nd_item<3> + item_ct1) { + advanced_adaptive_apply( + blocks, storage_scheme, block_precisions, block_ptrs, num_blocks, + alpha, b, b_stride, x, x_stride, item_ct1); + }); +} + + +} // namespace kernel + + +// clang-format off +//#cmakedefine GKO_JACOBI_BLOCK_SIZE @GKO_JACOBI_BLOCK_SIZE@ +// clang-format on +// make things easier for IDEs +#ifndef GKO_JACOBI_BLOCK_SIZE +#define GKO_JACOBI_BLOCK_SIZE 1 +#endif + + +template +void advanced_apply( + syn::value_list, size_type num_blocks, + const precision_reduction* block_precisions, + const IndexType* block_pointers, const ValueType* blocks, + const preconditioner::block_interleaved_storage_scheme& + storage_scheme, + const ValueType* alpha, const ValueType* b, size_type b_stride, + ValueType* x, size_type x_stride) +{ + constexpr int subwarp_size = get_larger_power(max_block_size); + constexpr int blocks_per_warp = config::warp_size / subwarp_size; + const dim3 grid_size(ceildiv(num_blocks, warps_per_block * blocks_per_warp), + 1, 1); + const dim3 block_size(subwarp_size, blocks_per_warp, warps_per_block); + + if (block_precisions) { + kernel::advanced_adaptive_apply( + grid_size, block_size, 0, exec->get_queue(), blocks, storage_scheme, + block_precisions, block_pointers, num_blocks, alpha, b, b_stride, x, + x_stride); + } else { + kernel::advanced_apply( + grid_size, block_size, 0, exec->get_queue(), blocks, storage_scheme, + block_pointers, num_blocks, alpha, b, b_stride, x, x_stride); + } +} + + +#define DECLARE_JACOBI_ADVANCED_APPLY_INSTANTIATION(ValueType, IndexType) \ + void advanced_apply( \ + syn::value_list, size_type, \ + const precision_reduction*, const IndexType* block_pointers, \ + const ValueType*, \ + const preconditioner::block_interleaved_storage_scheme&, \ + const ValueType*, const ValueType*, size_type, ValueType*, size_type) + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + DECLARE_JACOBI_ADVANCED_APPLY_INSTANTIATION); + + +} // namespace jacobi +} // namespace dpcpp +} // namespace kernels +} // namespace gko diff --git a/dpcpp/preconditioner/jacobi_advanced_apply_kernel.dp.cpp b/dpcpp/preconditioner/jacobi_advanced_apply_kernel.dp.cpp new file mode 100644 index 00000000000..1db62f49db3 --- /dev/null +++ b/dpcpp/preconditioner/jacobi_advanced_apply_kernel.dp.cpp @@ -0,0 +1,106 @@ +/************************************************************* +Copyright (c) 2017-2021, the Ginkgo authors +All rights reserved. 
+ +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include "core/preconditioner/jacobi_kernels.hpp" + + +#include + + +#include + + +#include "core/matrix/dense_kernels.hpp" +#include "core/synthesizer/implementation_selection.hpp" + + +namespace gko { +namespace kernels { +namespace dpcpp { +/** + * @brief The Jacobi preconditioner namespace. 
+ * @ref Jacobi + * @ingroup jacobi + */ +namespace jacobi { + + +template +void advanced_apply( + syn::value_list, size_type num_blocks, + const precision_reduction* block_precisions, + const IndexType* block_pointers, const ValueType* blocks, + const preconditioner::block_interleaved_storage_scheme& + storage_scheme, + const ValueType* alpha, const ValueType* b, size_type b_stride, + ValueType* x, size_type x_stride); + +GKO_ENABLE_IMPLEMENTATION_SELECTION(select_advanced_apply, advanced_apply); + + +template +void apply(std::shared_ptr exec, size_type num_blocks, + uint32 max_block_size, + const preconditioner::block_interleaved_storage_scheme& + storage_scheme, + const Array& block_precisions, + const Array& block_pointers, + const Array& blocks, + const matrix::Dense* alpha, + const matrix::Dense* b, + const matrix::Dense* beta, matrix::Dense* x) +{ + // TODO: write a special kernel for multiple RHS + dense::scale(exec, beta, x); + for (size_type col = 0; col < b->get_size()[1]; ++col) { + select_advanced_apply( + compiled_kernels(), + [&](int compiled_block_size) { + return max_block_size <= compiled_block_size; + }, + syn::value_list(), + syn::type_list<>(), num_blocks, block_precisions.get_const_data(), + block_pointers.get_const_data(), blocks.get_const_data(), + storage_scheme, alpha->get_const_values(), + b->get_const_values() + col, b->get_stride(), x->get_values() + col, + x->get_stride()); + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_JACOBI_APPLY_KERNEL); + + +} // namespace jacobi +} // namespace dpcpp +} // namespace kernels +} // namespace gko diff --git a/dpcpp/preconditioner/jacobi_common.hpp.in b/dpcpp/preconditioner/jacobi_common.hpp.in new file mode 100644 index 00000000000..93925a50342 --- /dev/null +++ b/dpcpp/preconditioner/jacobi_common.hpp.in @@ -0,0 +1,71 @@ +/************************************************************* +Copyright (c) 2017-2021, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#include +#include + + +#include "dpcpp/base/config.hpp" + +namespace gko { +namespace kernels { +namespace dpcpp { +namespace jacobi { + + +/** + * A compile-time list of block sizes for which dedicated generate and apply + * kernels should be compiled. + */ +// clang-format off +#cmakedefine GKO_DPCPP_JACOBI_BLOCK_SIZES_CODE @GKO_DPCPP_JACOBI_BLOCK_SIZES_CODE@ +// clang-format on +// make things easier for IDEs +#ifndef GKO_DPCPP_JACOBI_BLOCK_SIZES_CODE +#define GKO_DPCPP_JACOBI_BLOCK_SIZES_CODE 1 +#endif + + +using compiled_kernels = + syn::value_list; + + +constexpr int get_larger_power(int value, int guess = 1) +{ + return guess >= value ? guess : get_larger_power(value, guess << 1); +} + + +} // namespace jacobi +} // namespace dpcpp +} // namespace kernels +} // namespace gko diff --git a/dpcpp/preconditioner/jacobi_generate_instantiate.inc.dp.cpp b/dpcpp/preconditioner/jacobi_generate_instantiate.inc.dp.cpp new file mode 100644 index 00000000000..2a759b743d8 --- /dev/null +++ b/dpcpp/preconditioner/jacobi_generate_instantiate.inc.dp.cpp @@ -0,0 +1,364 @@ +/************************************************************* +Copyright (c) 2017-2021, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#include "core/preconditioner/jacobi_kernels.hpp" + + +#include +#include + + +#include + + +#include +#include + + +#include "core/base/extended_float.hpp" +#include "core/components/fill_array_kernels.hpp" +#include "core/preconditioner/jacobi_utils.hpp" +#include "core/synthesizer/implementation_selection.hpp" +#include "dpcpp/base/config.hpp" +#include "dpcpp/base/dim3.dp.hpp" +#include "dpcpp/base/dpct.hpp" +#include "dpcpp/base/math.hpp" +#include "dpcpp/components/cooperative_groups.dp.hpp" +#include "dpcpp/components/thread_ids.dp.hpp" +#include "dpcpp/components/uninitialized_array.hpp" +#include "dpcpp/components/warp_blas.dp.hpp" + + +namespace gko { +namespace kernels { +namespace dpcpp { +/** + * @brief The Jacobi preconditioner namespace. + * @ref Jacobi + * @ingroup jacobi + */ +namespace jacobi { + + +namespace kernel { + + +template +__dpct_inline__ bool validate_precision_reduction_feasibility( + Group& __restrict__ group, IndexType block_size, + ValueType* __restrict__ row, ValueType* __restrict__ work, size_type stride) +{ + using gko::detail::float_traits; + // save original data and reduce precision + if (group.thread_rank() < block_size) { +#pragma unroll + for (auto i = 0u; i < max_block_size; ++i) { + if (i < block_size) { + work[i * stride + group.thread_rank()] = row[i]; + row[i] = + static_cast(static_cast(row[i])); + } + } + } + + // compute the condition number + uint32 perm = group.thread_rank(); + uint32 trans_perm = perm; + auto block_cond = compute_infinity_norm(group, block_size, + block_size, row); + auto succeeded = invert_block( + group, static_cast(block_size), row, perm, trans_perm); + block_cond *= compute_infinity_norm(group, block_size, + block_size, row); + + // restore original data + if (group.thread_rank() < block_size) { +#pragma unroll + for (auto i = 0u; i < max_block_size; ++i) { + if (i < block_size) { + row[i] = work[i * stride + group.thread_rank()]; + } + } + } + + return succeeded && block_cond >= 1.0 && + block_cond * float_traits>::eps < 1e-3; +} + + +template +void generate( + size_type num_rows, const IndexType* __restrict__ row_ptrs, + const IndexType* __restrict__ col_idxs, + const ValueType* __restrict__ values, ValueType* __restrict__ block_data, + preconditioner::block_interleaved_storage_scheme storage_scheme, + const IndexType* __restrict__ block_ptrs, size_type num_blocks, + sycl::nd_item<3> item_ct1, + UninitializedArray* workspace) +{ + const auto block_id = + thread::get_subwarp_id(item_ct1); + const auto block = group::this_thread_block(item_ct1); + ValueType row[max_block_size]; + + csr::extract_transposed_diag_blocks( + block, config::warp_size / subwarp_size, row_ptrs, col_idxs, values, + block_ptrs, num_blocks, row, 1, + *workspace + item_ct1.get_local_id(0) * max_block_size); + const auto subwarp = group::tiled_partition(block); + if (block_id < num_blocks) { + const auto block_size = block_ptrs[block_id + 1] - block_ptrs[block_id]; + uint32 perm = subwarp.thread_rank(); + uint32 trans_perm = subwarp.thread_rank(); + invert_block(subwarp, static_cast(block_size), + row, perm, trans_perm); + copy_matrix( + subwarp, block_size, row, 1, perm, trans_perm, + block_data + storage_scheme.get_global_block_offset(block_id), + storage_scheme.get_stride()); + } +} + +template +void generate( + dim3 grid, dim3 block, size_type dynamic_shared_memory, sycl::queue* queue, + size_type num_rows, const IndexType* row_ptrs, const IndexType* col_idxs, + const 
ValueType* values, ValueType* block_data, + preconditioner::block_interleaved_storage_scheme storage_scheme, + const IndexType* block_ptrs, size_type num_blocks) +{ + queue->submit([&](sycl::handler& cgh) { + sycl::accessor< + UninitializedArray, 0, + sycl::access_mode::read_write, sycl::access::target::local> + workspace_acc_ct1(cgh); + + cgh.parallel_for( + sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + generate( + num_rows, row_ptrs, col_idxs, values, block_data, + storage_scheme, block_ptrs, num_blocks, item_ct1, + workspace_acc_ct1.get_pointer().get()); + }); + }); +} + + +template +void adaptive_generate( + size_type num_rows, const IndexType* __restrict__ row_ptrs, + const IndexType* __restrict__ col_idxs, + const ValueType* __restrict__ values, remove_complex accuracy, + ValueType* __restrict__ block_data, + preconditioner::block_interleaved_storage_scheme storage_scheme, + remove_complex* __restrict__ conditioning, + precision_reduction* __restrict__ block_precisions, + const IndexType* __restrict__ block_ptrs, size_type num_blocks, + sycl::nd_item<3> item_ct1, + UninitializedArray* workspace) +{ + // extract blocks + const auto block_id = + thread::get_subwarp_id(item_ct1); + const auto block = group::this_thread_block(item_ct1); + ValueType row[max_block_size]; + + csr::extract_transposed_diag_blocks( + block, config::warp_size / subwarp_size, row_ptrs, col_idxs, values, + block_ptrs, num_blocks, row, 1, + *workspace + item_ct1.get_local_id(0) * max_block_size); + + // compute inverse and figure out the correct precision + const auto subwarp = group::tiled_partition(block); + const uint32 block_size = + block_id < num_blocks ? block_ptrs[block_id + 1] - block_ptrs[block_id] + : 0; + uint32 perm = subwarp.thread_rank(); + uint32 trans_perm = subwarp.thread_rank(); + auto prec_descriptor = ~uint32{}; + if (block_id < num_blocks) { + auto block_cond = compute_infinity_norm( + subwarp, block_size, block_size, row); + invert_block(subwarp, block_size, row, perm, + trans_perm); + block_cond *= compute_infinity_norm(subwarp, block_size, + block_size, row); + conditioning[block_id] = block_cond; + const auto prec = block_precisions[block_id]; + prec_descriptor = + preconditioner::detail::precision_reduction_descriptor::singleton( + prec); + if (prec == precision_reduction::autodetect()) { + using preconditioner::detail::get_supported_storage_reductions; + prec_descriptor = get_supported_storage_reductions( + accuracy, block_cond, + [&subwarp, &block_size, &row, &block_data, &storage_scheme, + &block_id] { + using target = reduce_precision; + return validate_precision_reduction_feasibility< + max_block_size, target>( + subwarp, block_size, row, + block_data + + storage_scheme.get_global_block_offset(block_id), + storage_scheme.get_stride()); + }, + [&subwarp, &block_size, &row, &block_data, &storage_scheme, + &block_id] { + using target = + reduce_precision>; + return validate_precision_reduction_feasibility< + max_block_size, target>( + subwarp, block_size, row, + block_data + + storage_scheme.get_global_block_offset(block_id), + storage_scheme.get_stride()); + }); + } + } + + // make sure all blocks in the group have the same precision + const auto warp = group::tiled_partition(block); + const auto prec = + preconditioner::detail::get_optimal_storage_reduction(reduce( + warp, prec_descriptor, [](uint32 x, uint32 y) { return x & y; })); + + // store the block back into memory + if (block_id < num_blocks) { + block_precisions[block_id] = prec; + 
GKO_PRECONDITIONER_JACOBI_RESOLVE_PRECISION( + ValueType, prec, + copy_matrix( + subwarp, block_size, row, 1, perm, trans_perm, + reinterpret_cast( + block_data + storage_scheme.get_group_offset(block_id)) + + storage_scheme.get_block_offset(block_id), + storage_scheme.get_stride())); + } +} + +template +void adaptive_generate( + dim3 grid, dim3 block, size_type dynamic_shared_memory, sycl::queue* queue, + size_type num_rows, const IndexType* row_ptrs, const IndexType* col_idxs, + const ValueType* values, remove_complex accuracy, + ValueType* block_data, + preconditioner::block_interleaved_storage_scheme storage_scheme, + remove_complex* conditioning, + precision_reduction* block_precisions, const IndexType* block_ptrs, + size_type num_blocks) +{ + queue->submit([&](sycl::handler& cgh) { + sycl::accessor< + UninitializedArray, 0, + sycl::access_mode::read_write, sycl::access::target::local> + workspace_acc_ct1(cgh); + + cgh.parallel_for(sycl_nd_range(grid, block), [=](sycl::nd_item<3> + item_ct1) { + adaptive_generate( + num_rows, row_ptrs, col_idxs, values, accuracy, block_data, + storage_scheme, conditioning, block_precisions, block_ptrs, + num_blocks, item_ct1, workspace_acc_ct1.get_pointer().get()); + }); + }); +} + + +} // namespace kernel + + +// clang-format off +//#cmakedefine GKO_JACOBI_BLOCK_SIZE @GKO_JACOBI_BLOCK_SIZE@ +// clang-format on +// make things easier for IDEs +#ifndef GKO_JACOBI_BLOCK_SIZE +#define GKO_JACOBI_BLOCK_SIZE 1 +#endif + + +template +void generate(syn::value_list, + const matrix::Csr* mtx, + remove_complex accuracy, ValueType* block_data, + const preconditioner::block_interleaved_storage_scheme& + storage_scheme, + remove_complex* conditioning, + precision_reduction* block_precisions, + const IndexType* block_ptrs, size_type num_blocks) +{ + constexpr int subwarp_size = get_larger_power(max_block_size); + constexpr int blocks_per_warp = config::warp_size / subwarp_size; + const dim3 grid_size(ceildiv(num_blocks, warps_per_block * blocks_per_warp), + 1, 1); + const dim3 block_size(subwarp_size, blocks_per_warp, warps_per_block); + + if (block_precisions) { + kernel::adaptive_generate( + grid_size, block_size, 0, exec->get_queue(), mtx->get_size()[0], + mtx->get_const_row_ptrs(), mtx->get_const_col_idxs(), + mtx->get_const_values(), accuracy, block_data, storage_scheme, + conditioning, block_precisions, block_ptrs, num_blocks); + } else { + kernel::generate( + grid_size, block_size, 0, exec->get_queue(), mtx->get_size()[0], + mtx->get_const_row_ptrs(), mtx->get_const_col_idxs(), + mtx->get_const_values(), block_data, storage_scheme, block_ptrs, + num_blocks); + } +} + + +#define DECLARE_JACOBI_GENERATE_INSTANTIATION(ValueType, IndexType) \ + void generate( \ + syn::value_list, \ + const matrix::Csr*, remove_complex, \ + ValueType*, \ + const preconditioner::block_interleaved_storage_scheme&, \ + remove_complex*, precision_reduction*, const IndexType*, \ + size_type) + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + DECLARE_JACOBI_GENERATE_INSTANTIATION); + + +} // namespace jacobi +} // namespace dpcpp +} // namespace kernels +} // namespace gko diff --git a/dpcpp/preconditioner/jacobi_generate_kernel.dp.cpp b/dpcpp/preconditioner/jacobi_generate_kernel.dp.cpp new file mode 100644 index 00000000000..22e3f587748 --- /dev/null +++ b/dpcpp/preconditioner/jacobi_generate_kernel.dp.cpp @@ -0,0 +1,103 @@ +/************************************************************* +Copyright (c) 2017-2021, the Ginkgo authors +All rights reserved. 
+ +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include "core/preconditioner/jacobi_kernels.hpp" + + +#include + + +#include +#include + + +#include "core/components/fill_array_kernels.hpp" +#include "core/synthesizer/implementation_selection.hpp" + + +namespace gko { +namespace kernels { +namespace dpcpp { +/** + * @brief The Jacobi preconditioner namespace. 
+ * @ref Jacobi + * @ingroup jacobi + */ +namespace jacobi { + + +template +void generate(syn::value_list, + const matrix::Csr* mtx, + remove_complex accuracy, ValueType* block_data, + const preconditioner::block_interleaved_storage_scheme& + storage_scheme, + remove_complex* conditioning, + precision_reduction* block_precisions, + const IndexType* block_ptrs, size_type num_blocks); + +GKO_ENABLE_IMPLEMENTATION_SELECTION(select_generate, generate); + + +template +void generate(std::shared_ptr exec, + const matrix::Csr* system_matrix, + size_type num_blocks, uint32 max_block_size, + remove_complex accuracy, + const preconditioner::block_interleaved_storage_scheme& + storage_scheme, + Array>& conditioning, + Array& block_precisions, + const Array& block_pointers, Array& blocks) +{ + components::fill_array(exec, blocks.get_data(), blocks.get_num_elems(), + zero()); + select_generate( + compiled_kernels(), + [&](int compiled_block_size) { + return max_block_size <= compiled_block_size; + }, + syn::value_list(), syn::type_list<>(), + system_matrix, accuracy, blocks.get_data(), storage_scheme, + conditioning.get_data(), block_precisions.get_data(), + block_pointers.get_const_data(), num_blocks); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_JACOBI_GENERATE_KERNEL); + + +} // namespace jacobi +} // namespace dpcpp +} // namespace kernels +} // namespace gko diff --git a/dpcpp/preconditioner/jacobi_kernels.dp.cpp b/dpcpp/preconditioner/jacobi_kernels.dp.cpp index 5406eaaf6cf..71c8101a8ee 100644 --- a/dpcpp/preconditioner/jacobi_kernels.dp.cpp +++ b/dpcpp/preconditioner/jacobi_kernels.dp.cpp @@ -33,26 +33,23 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/preconditioner/jacobi_kernels.hpp" -#include -#include -#include -#include -#include +#include #include #include -#include -#include -#include -#include "core/base/allocator.hpp" #include "core/base/extended_float.hpp" #include "core/preconditioner/jacobi_utils.hpp" -#include "dpcpp/components/matrix_operations.dp.hpp" +#include "core/synthesizer/implementation_selection.hpp" +#include "dpcpp/base/config.hpp" +#include "dpcpp/base/dim3.dp.hpp" +#include "dpcpp/base/math.hpp" +#include "dpcpp/components/cooperative_groups.dp.hpp" +#include "dpcpp/components/thread_ids.dp.hpp" namespace gko { @@ -60,185 +57,416 @@ namespace kernels { namespace dpcpp { /** * @brief The Jacobi preconditioner namespace. 
- * + * @ref Jacobi * @ingroup jacobi */ namespace jacobi { - - -void initialize_precisions(std::shared_ptr exec, - const array& source, - array& precisions) - GKO_NOT_IMPLEMENTED; - - namespace { -template -inline bool has_same_nonzero_pattern( - const IndexType* prev_row_ptr, const IndexType* curr_row_ptr, - const IndexType* next_row_ptr) GKO_NOT_IMPLEMENTED; - - -template -size_type find_natural_blocks(const matrix::Csr* mtx, - uint32 max_block_size, - IndexType* block_ptrs) GKO_NOT_IMPLEMENTED; +// a total of 32 warps (1024 threads) +constexpr int default_num_warps = 32; +// with current architectures, at most 32 warps can be scheduled per SM (and +// current GPUs have at most 84 SMs) +constexpr int default_grid_size = 32 * 32 * 128; + + +template +void duplicate_array(const precision_reduction* __restrict__ source, + size_type source_size, + precision_reduction* __restrict__ dest, + size_type dest_size, sycl::nd_item<3> item_ct1) +{ + auto grid = group::this_grid(item_ct1); + if (grid.thread_rank() >= dest_size) { + return; + } + for (auto i = grid.thread_rank(); i < dest_size; i += grid.size()) { + dest[i] = source[i % source_size]; + } +} + +template +void duplicate_array(dim3 grid, dim3 block, size_type dynamic_shared_memory, + sycl::queue* queue, const precision_reduction* source, + size_type source_size, precision_reduction* dest, + size_type dest_size) +{ + queue->parallel_for(sycl_nd_range(grid, block), + [=](sycl::nd_item<3> item_ct1) { + duplicate_array( + source, source_size, dest, dest_size, item_ct1); + }); +} template -inline size_type agglomerate_supervariables( - uint32 max_block_size, size_type num_natural_blocks, - IndexType* block_ptrs) GKO_NOT_IMPLEMENTED; - - -} // namespace - - -template -void find_blocks(std::shared_ptr exec, - const matrix::Csr* system_matrix, - uint32 max_block_size, size_type& num_blocks, - array& block_pointers) GKO_NOT_IMPLEMENTED; +void compare_adjacent_rows(size_type num_rows, int32 max_block_size, + const IndexType* __restrict__ row_ptrs, + const IndexType* __restrict__ col_idx, + bool* __restrict__ matching_next_row, + sycl::nd_item<3> item_ct1) +{ + const auto warp = group::tiled_partition( + group::this_thread_block(item_ct1)); + const auto local_tid = warp.thread_rank(); + const auto warp_id = + thread::get_subwarp_id_flat(item_ct1); + + if (warp_id >= num_rows - 1) { + return; + } + + const auto curr_row_start = row_ptrs[warp_id]; + const auto next_row_start = row_ptrs[warp_id + 1]; + const auto next_row_end = row_ptrs[warp_id + 2]; + + const auto nz_this_row = next_row_end - next_row_start; + const auto nz_prev_row = next_row_start - curr_row_start; + + if (nz_this_row != nz_prev_row) { + matching_next_row[warp_id] = false; + return; + } + size_type steps = ceildiv(nz_this_row, config::warp_size); + for (size_type i = 0; i < steps; i++) { + auto j = local_tid + i * config::warp_size; + auto prev_col = (curr_row_start + j < next_row_start) + ? col_idx[curr_row_start + j] + : 0; + auto this_col = (curr_row_start + j < next_row_start) + ? 
col_idx[next_row_start + j] + : 0; + if (warp.any(prev_col != this_col)) { + matching_next_row[warp_id] = false; + return; + } + } + matching_next_row[warp_id] = true; +} -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_JACOBI_FIND_BLOCKS_KERNEL); +template +void compare_adjacent_rows(dim3 grid, dim3 block, + size_type dynamic_shared_memory, sycl::queue* queue, + size_type num_rows, int32 max_block_size, + const IndexType* row_ptrs, const IndexType* col_idx, + bool* matching_next_row) +{ + queue->parallel_for( + sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + compare_adjacent_rows(num_rows, max_block_size, row_ptrs, col_idx, + matching_next_row, item_ct1); + }); +} -namespace { +template +void generate_natural_block_pointer(size_type num_rows, int32 max_block_size, + const bool* __restrict__ matching_next_row, + IndexType* __restrict__ block_ptrs, + size_type* __restrict__ num_blocks_arr) +{ + block_ptrs[0] = 0; + if (num_rows == 0) { + return; + } + size_type num_blocks = 1; + int32 current_block_size = 1; + for (size_type i = 0; i < num_rows - 1; ++i) { + if ((matching_next_row[i]) && (current_block_size < max_block_size)) { + ++current_block_size; + } else { + block_ptrs[num_blocks] = + block_ptrs[num_blocks - 1] + current_block_size; + ++num_blocks; + current_block_size = 1; + } + } + block_ptrs[num_blocks] = block_ptrs[num_blocks - 1] + current_block_size; + num_blocks_arr[0] = num_blocks; +} +template +void generate_natural_block_pointer( + dim3 grid, dim3 block, size_type dynamic_shared_memory, sycl::queue* queue, + size_type num_rows, int32 max_block_size, const bool* matching_next_row, + IndexType* block_ptrs, size_type* num_blocks_arr) +{ + queue->parallel_for( + sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + generate_natural_block_pointer(num_rows, max_block_size, + matching_next_row, block_ptrs, + num_blocks_arr); + }); +} -template -inline void extract_block(const matrix::Csr* mtx, - IndexType block_size, IndexType block_start, - ValueType* block, - size_type stride) GKO_NOT_IMPLEMENTED; +template +void agglomerate_supervariables_kernel(int32 max_block_size, + size_type num_natural_blocks, + IndexType* __restrict__ block_ptrs, + size_type* __restrict__ num_blocks_arr) +{ + num_blocks_arr[0] = 0; + if (num_natural_blocks == 0) { + return; + } + size_type num_blocks = 1; + int32 current_block_size = block_ptrs[1] - block_ptrs[0]; + for (size_type i = 1; i < num_natural_blocks; ++i) { + const int32 block_size = block_ptrs[i + 1] - block_ptrs[i]; + if (current_block_size + block_size <= max_block_size) { + current_block_size += block_size; + } else { + block_ptrs[num_blocks] = block_ptrs[i]; + ++num_blocks; + current_block_size = block_size; + } + } + block_ptrs[num_blocks] = block_ptrs[num_natural_blocks]; + num_blocks_arr[0] = num_blocks; +} -template -inline IndexType choose_pivot(IndexType block_size, const ValueType* block, - size_type stride) GKO_NOT_IMPLEMENTED; +template +void agglomerate_supervariables_kernel(dim3 grid, dim3 block, + size_type dynamic_shared_memory, + sycl::queue* queue, int32 max_block_size, + size_type num_natural_blocks, + IndexType* block_ptrs, + size_type* num_blocks_arr) +{ + queue->parallel_for( + sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + agglomerate_supervariables_kernel( + max_block_size, num_natural_blocks, block_ptrs, num_blocks_arr); + }); +} + + +template +void transpose_jacobi( + const ValueType* __restrict__ blocks, + preconditioner::block_interleaved_storage_scheme 
storage_scheme, + const IndexType* __restrict__ block_ptrs, size_type num_blocks, + ValueType* __restrict__ out_blocks, sycl::nd_item<3> item_ct1) +{ + const auto block_id = + thread::get_subwarp_id(item_ct1); + const auto subwarp = group::tiled_partition( + group::this_thread_block(item_ct1)); + if (block_id >= num_blocks) { + return; + } + const auto block_size = block_ptrs[block_id + 1] - block_ptrs[block_id]; + + const auto block_ofs = storage_scheme.get_global_block_offset(block_id); + const auto block_stride = storage_scheme.get_stride(); + const auto rank = subwarp.thread_rank(); + if (rank < block_size) { + for (IndexType i = 0; i < block_size; ++i) { + auto val = blocks[block_ofs + i * block_stride + rank]; + /* + DPCT1007:4: Migration of this CUDA API is not supported by the + Intel(R) DPC++ Compatibility Tool. + */ + out_blocks[block_ofs + i + rank * block_stride] = + conjugate ? conj(val) : val; + } + } +} + +template +void transpose_jacobi( + dim3 grid, dim3 block, size_type dynamic_shared_memory, sycl::queue* queue, + const ValueType* blocks, + preconditioner::block_interleaved_storage_scheme storage_scheme, + const IndexType* block_ptrs, size_type num_blocks, ValueType* out_blocks) +{ + queue->parallel_for( + sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + transpose_jacobi(blocks, storage_scheme, + block_ptrs, num_blocks, + out_blocks, item_ct1); + }); +} + + +template +void adaptive_transpose_jacobi( + const ValueType* __restrict__ blocks, + preconditioner::block_interleaved_storage_scheme storage_scheme, + const precision_reduction* __restrict__ block_precisions, + const IndexType* __restrict__ block_ptrs, size_type num_blocks, + ValueType* __restrict__ out_blocks, sycl::nd_item<3> item_ct1) +{ + const auto block_id = + thread::get_subwarp_id(item_ct1); + const auto subwarp = group::tiled_partition( + group::this_thread_block(item_ct1)); + if (block_id >= num_blocks) { + return; + } + const auto block_size = block_ptrs[block_id + 1] - block_ptrs[block_id]; + + const auto block_stride = storage_scheme.get_stride(); + const auto rank = subwarp.thread_rank(); + if (rank < block_size) { + /* + DPCT1007:6: Migration of this CUDA API is not supported by the Intel(R) + DPC++ Compatibility Tool. + */ + GKO_PRECONDITIONER_JACOBI_RESOLVE_PRECISION( + ValueType, block_precisions[block_id], + auto local_block = + reinterpret_cast( + blocks + storage_scheme.get_group_offset(block_id)) + + storage_scheme.get_block_offset(block_id); + auto local_out_block = + reinterpret_cast( + out_blocks + storage_scheme.get_group_offset(block_id)) + + storage_scheme.get_block_offset(block_id); + for (IndexType i = 0; i < block_size; ++i) { + auto val = local_block[i * block_stride + rank]; + local_out_block[i + rank * block_stride] = + conjugate ? 
conj(val) : val; + }); + } +} + +template +void adaptive_transpose_jacobi( + dim3 grid, dim3 block, size_type dynamic_shared_memory, sycl::queue* queue, + const ValueType* blocks, + preconditioner::block_interleaved_storage_scheme storage_scheme, + const precision_reduction* block_precisions, const IndexType* block_ptrs, + size_type num_blocks, ValueType* out_blocks) +{ + queue->parallel_for( + sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + adaptive_transpose_jacobi( + blocks, storage_scheme, block_precisions, block_ptrs, + num_blocks, out_blocks, item_ct1); + }); +} template -inline void swap_rows(IndexType row1, IndexType row2, IndexType block_size, - ValueType* block, size_type stride) GKO_NOT_IMPLEMENTED; +size_type find_natural_blocks(std::shared_ptr exec, + const matrix::Csr* mtx, + int32 max_block_size, + IndexType* __restrict__ block_ptrs) +{ + Array nums(exec, 1); + + Array matching_next_row(exec, mtx->get_size()[0] - 1); + + const dim3 block_size(config::warp_size, 1, 1); + const dim3 grid_size( + ceildiv(mtx->get_size()[0] * config::warp_size, block_size.x), 1, 1); + compare_adjacent_rows(grid_size, block_size, 0, exec->get_queue(), + mtx->get_size()[0], max_block_size, + mtx->get_const_row_ptrs(), mtx->get_const_col_idxs(), + matching_next_row.get_data()); + generate_natural_block_pointer( + 1, 1, 0, exec->get_queue(), mtx->get_size()[0], max_block_size, + matching_next_row.get_const_data(), block_ptrs, nums.get_data()); + nums.set_executor(exec->get_master()); + return nums.get_const_data()[0]; +} -template -inline bool apply_gauss_jordan_transform(IndexType row, IndexType col, - IndexType block_size, ValueType* block, - size_type stride) GKO_NOT_IMPLEMENTED; - - -template > -inline void transpose_block( - IndexType block_size, const SourceValueType* from, size_type from_stride, - ResultValueType* to, size_type to_stride, - ValueConverter converter = {}) noexcept GKO_NOT_IMPLEMENTED; - - -template > -inline void conj_transpose_block( - IndexType block_size, const SourceValueType* from, size_type from_stride, - ResultValueType* to, size_type to_stride, - ValueConverter converter = {}) noexcept GKO_NOT_IMPLEMENTED; - - -template > -inline void permute_and_transpose_block( - IndexType block_size, const IndexType* col_perm, - const SourceValueType* source, size_type source_stride, - ResultValueType* result, size_type result_stride, - ValueConverter converter = {}) GKO_NOT_IMPLEMENTED; +template +inline size_type agglomerate_supervariables( + std::shared_ptr exec, int32 max_block_size, + size_type num_natural_blocks, IndexType* block_ptrs) +{ + Array nums(exec, 1); + agglomerate_supervariables_kernel(1, 1, 0, exec->get_queue(), + max_block_size, num_natural_blocks, + block_ptrs, nums.get_data()); -template -inline bool invert_block(IndexType block_size, IndexType* perm, - ValueType* block, - size_type stride) GKO_NOT_IMPLEMENTED; + nums.set_executor(exec->get_master()); + return nums.get_const_data()[0]; +} -template -inline bool validate_precision_reduction_feasibility( - std::shared_ptr exec, IndexType block_size, - const ValueType* block, size_type stride) GKO_NOT_IMPLEMENTED; +} // namespace -} // namespace +void initialize_precisions(std::shared_ptr exec, + const Array& source, + Array& precisions) +{ + const auto block_size = default_num_warps * config::warp_size; + const auto grid_size = min( + default_grid_size, + static_cast(ceildiv(precisions.get_num_elems(), block_size))); + duplicate_array( + grid_size, block_size, 0, exec->get_queue(), 
source.get_const_data(), + source.get_num_elems(), precisions.get_data(), + precisions.get_num_elems()); +} template -void generate(std::shared_ptr exec, - const matrix::Csr* system_matrix, - size_type num_blocks, uint32 max_block_size, - remove_complex accuracy, - const preconditioner::block_interleaved_storage_scheme& - storage_scheme, - array>& conditioning, - array& block_precisions, - const array& block_pointers, - array& blocks) GKO_NOT_IMPLEMENTED; +void find_blocks(std::shared_ptr exec, + const matrix::Csr* system_matrix, + uint32 max_block_size, size_type& num_blocks, + Array& block_pointers) +{ + auto num_natural_blocks = find_natural_blocks( + exec, system_matrix, max_block_size, block_pointers.get_data()); + num_blocks = agglomerate_supervariables( + exec, max_block_size, num_natural_blocks, block_pointers.get_data()); +} GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_JACOBI_GENERATE_KERNEL); + GKO_DECLARE_JACOBI_FIND_BLOCKS_KERNEL); namespace { -template < - typename ValueType, typename BlockValueType, - typename ValueConverter = default_converter> -inline void apply_block(size_type block_size, size_type num_rhs, - const BlockValueType* block, size_type stride, - ValueType alpha, const ValueType* b, size_type stride_b, - ValueType beta, ValueType* x, size_type stride_x, - ValueConverter converter = {}) GKO_NOT_IMPLEMENTED; - - -} // namespace - - -template -void apply(std::shared_ptr exec, size_type num_blocks, - uint32 max_block_size, - const preconditioner::block_interleaved_storage_scheme& - storage_scheme, - const array& block_precisions, - const array& block_pointers, - const array& blocks, - const matrix::Dense* alpha, - const matrix::Dense* b, - const matrix::Dense* beta, - matrix::Dense* x) GKO_NOT_IMPLEMENTED; - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_JACOBI_APPLY_KERNEL); - - -template -void simple_apply( - std::shared_ptr exec, size_type num_blocks, - uint32 max_block_size, +template +void transpose_jacobi( + syn::value_list, size_type num_blocks, + const precision_reduction* block_precisions, + const IndexType* block_pointers, const ValueType* blocks, const preconditioner::block_interleaved_storage_scheme& storage_scheme, - const array& block_precisions, - const array& block_pointers, const array& blocks, - const matrix::Dense* b, - matrix::Dense* x) GKO_NOT_IMPLEMENTED; + ValueType* out_blocks) +{ + constexpr int subwarp_size = get_larger_power(max_block_size); + constexpr int blocks_per_warp = config::warp_size / subwarp_size; + const dim3 grid_size(ceildiv(num_blocks, warps_per_block * blocks_per_warp), + 1, 1); + const dim3 block_size(subwarp_size, blocks_per_warp, warps_per_block); + + if (block_precisions) { + adaptive_transpose_jacobi( + grid_size, block_size, 0, exec->get_queue(), blocks, storage_scheme, + block_precisions, block_pointers, num_blocks, out_blocks); + } else { + transpose_jacobi( + grid_size, block_size, 0, exec->get_queue(), blocks, storage_scheme, + block_pointers, num_blocks, out_blocks); + } +} + +GKO_ENABLE_IMPLEMENTATION_SELECTION(select_transpose_jacobi, transpose_jacobi); -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_JACOBI_SIMPLE_APPLY_KERNEL); + +} // namespace template @@ -248,7 +476,18 @@ void transpose_jacobi( const array& block_pointers, const array& blocks, const preconditioner::block_interleaved_storage_scheme& storage_scheme, - array& out_blocks) GKO_NOT_IMPLEMENTED; + Array& out_blocks) +{ + select_transpose_jacobi( + compiled_kernels(), + [&](int compiled_block_size) { + 
return max_block_size <= compiled_block_size; + }, + syn::value_list(), + syn::type_list<>(), num_blocks, block_precisions.get_const_data(), + block_pointers.get_const_data(), blocks.get_const_data(), + storage_scheme, out_blocks.get_data()); +} GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_JACOBI_TRANSPOSE_KERNEL); @@ -261,7 +500,18 @@ void conj_transpose_jacobi( const array& block_pointers, const array& blocks, const preconditioner::block_interleaved_storage_scheme& storage_scheme, - array& out_blocks) GKO_NOT_IMPLEMENTED; + Array& out_blocks) +{ + select_transpose_jacobi( + compiled_kernels(), + [&](int compiled_block_size) { + return max_block_size <= compiled_block_size; + }, + syn::value_list(), + syn::type_list<>(), num_blocks, block_precisions.get_const_data(), + block_pointers.get_const_data(), blocks.get_const_data(), + storage_scheme, out_blocks.get_data()); +} GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_JACOBI_CONJ_TRANSPOSE_KERNEL); @@ -269,9 +519,9 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( template void convert_to_dense( - std::shared_ptr exec, size_type num_blocks, - const array& block_precisions, - const array& block_pointers, const array& blocks, + std::shared_ptr exec, size_type num_blocks, + const Array& block_precisions, + const Array& block_pointers, const Array& blocks, const preconditioner::block_interleaved_storage_scheme& storage_scheme, ValueType* result_values, size_type result_stride) GKO_NOT_IMPLEMENTED; diff --git a/dpcpp/preconditioner/jacobi_simple_apply_instantiate.inc.dp.cpp b/dpcpp/preconditioner/jacobi_simple_apply_instantiate.inc.dp.cpp new file mode 100644 index 00000000000..9edeb6f552c --- /dev/null +++ b/dpcpp/preconditioner/jacobi_simple_apply_instantiate.inc.dp.cpp @@ -0,0 +1,229 @@ +/************************************************************* +Copyright (c) 2017-2021, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#include "core/preconditioner/jacobi_kernels.hpp" + + +#include + + +#include + + +#include + + +#include "core/base/extended_float.hpp" +#include "core/matrix/dense_kernels.hpp" +#include "core/preconditioner/jacobi_utils.hpp" +#include "core/synthesizer/implementation_selection.hpp" +#include "dpcpp/base/config.hpp" +#include "dpcpp/base/dim3.dp.hpp" +#include "dpcpp/base/math.hpp" +#include "dpcpp/components/cooperative_groups.dp.hpp" +#include "dpcpp/components/thread_ids.dp.hpp" +#include "dpcpp/components/warp_blas.dp.hpp" + + +namespace gko { +namespace kernels { +namespace dpcpp { +/** + * @brief The Jacobi preconditioner namespace. + * @ref Jacobi + * @ingroup jacobi + */ +namespace jacobi { + + +namespace kernel { + + +template +void apply( + const ValueType* __restrict__ blocks, + preconditioner::block_interleaved_storage_scheme storage_scheme, + const IndexType* __restrict__ block_ptrs, size_type num_blocks, + const ValueType* __restrict__ b, int32 b_stride, ValueType* __restrict__ x, + int32 x_stride, sycl::nd_item<3> item_ct1) +{ + const auto block_id = + thread::get_subwarp_id(item_ct1); + const auto subwarp = group::tiled_partition( + group::this_thread_block(item_ct1)); + if (block_id >= num_blocks) { + return; + } + const auto block_size = block_ptrs[block_id + 1] - block_ptrs[block_id]; + ValueType v = zero(); + if (subwarp.thread_rank() < block_size) { + v = b[(block_ptrs[block_id] + subwarp.thread_rank()) * b_stride]; + } + multiply_vec( + subwarp, block_size, v, + blocks + storage_scheme.get_global_block_offset(block_id) + + subwarp.thread_rank(), + storage_scheme.get_stride(), x + block_ptrs[block_id] * x_stride, + x_stride, + [](ValueType& result, const ValueType& out) { result = out; }); +} + +template +void apply( + dim3 grid, dim3 block, size_type dynamic_shared_memory, sycl::queue* queue, + const ValueType* blocks, + preconditioner::block_interleaved_storage_scheme storage_scheme, + const IndexType* block_ptrs, size_type num_blocks, const ValueType* b, + int32 b_stride, ValueType* x, int32 x_stride) +{ + queue->parallel_for( + sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + apply( + blocks, storage_scheme, block_ptrs, num_blocks, b, b_stride, x, + x_stride, item_ct1); + }); +} + + +template +void adaptive_apply( + const ValueType* __restrict__ blocks, + preconditioner::block_interleaved_storage_scheme storage_scheme, + const precision_reduction* __restrict__ block_precisions, + const IndexType* __restrict__ block_ptrs, size_type num_blocks, + const ValueType* __restrict__ b, int32 b_stride, ValueType* __restrict__ x, + int32 x_stride, sycl::nd_item<3> item_ct1) +{ + const auto block_id = + thread::get_subwarp_id(item_ct1); + const auto subwarp = group::tiled_partition( + group::this_thread_block(item_ct1)); + if (block_id >= num_blocks) { + return; + } + const auto block_size = block_ptrs[block_id + 1] - block_ptrs[block_id]; + ValueType v = zero(); + if (subwarp.thread_rank() < block_size) { + v = b[(block_ptrs[block_id] + subwarp.thread_rank()) * b_stride]; + } + GKO_PRECONDITIONER_JACOBI_RESOLVE_PRECISION( + ValueType, block_precisions[block_id], + multiply_vec( + subwarp, block_size, v, + reinterpret_cast( + blocks + storage_scheme.get_group_offset(block_id)) + + storage_scheme.get_block_offset(block_id) + + subwarp.thread_rank(), + storage_scheme.get_stride(), x + block_ptrs[block_id] * x_stride, + x_stride, + [](ValueType& result, const ValueType& out) { result = out; })); +} 
+ +template +void adaptive_apply( + dim3 grid, dim3 block, size_type dynamic_shared_memory, sycl::queue* queue, + const ValueType* blocks, + preconditioner::block_interleaved_storage_scheme storage_scheme, + const precision_reduction* block_precisions, const IndexType* block_ptrs, + size_type num_blocks, const ValueType* b, int32 b_stride, ValueType* x, + int32 x_stride) +{ + queue->parallel_for( + sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + adaptive_apply( + blocks, storage_scheme, block_precisions, block_ptrs, + num_blocks, b, b_stride, x, x_stride, item_ct1); + }); +} + + +} // namespace kernel + + +// clang-format off +//#cmakedefine GKO_JACOBI_BLOCK_SIZE @GKO_JACOBI_BLOCK_SIZE@ +// clang-format on +// make things easier for IDEs +#ifndef GKO_JACOBI_BLOCK_SIZE +#define GKO_JACOBI_BLOCK_SIZE 1 +#endif + + +template +void apply(syn::value_list, size_type num_blocks, + const precision_reduction* block_precisions, + const IndexType* block_pointers, const ValueType* blocks, + const preconditioner::block_interleaved_storage_scheme& + storage_scheme, + const ValueType* b, size_type b_stride, ValueType* x, + size_type x_stride) +{ + constexpr int subwarp_size = get_larger_power(max_block_size); + constexpr int blocks_per_warp = config::warp_size / subwarp_size; + const dim3 grid_size(ceildiv(num_blocks, warps_per_block * blocks_per_warp), + 1, 1); + const dim3 block_size(subwarp_size, blocks_per_warp, warps_per_block); + + if (block_precisions) { + kernel::adaptive_apply( + grid_size, block_size, 0, exec->get_queue(), blocks, storage_scheme, + block_precisions, block_pointers, num_blocks, b, b_stride, x, + x_stride); + } else { + kernel::apply( + grid_size, block_size, 0, exec->get_queue(), blocks, storage_scheme, + block_pointers, num_blocks, b, b_stride, x, x_stride); + } +} + + +#define DECLARE_JACOBI_SIMPLE_APPLY_INSTANTIATION(ValueType, IndexType) \ + void apply( \ + syn::value_list, size_type, \ + const precision_reduction*, const IndexType*, const ValueType*, \ + const preconditioner::block_interleaved_storage_scheme&, \ + const ValueType*, size_type, ValueType*, size_type) + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + DECLARE_JACOBI_SIMPLE_APPLY_INSTANTIATION); + + +} // namespace jacobi +} // namespace dpcpp +} // namespace kernels +} // namespace gko diff --git a/dpcpp/preconditioner/jacobi_simple_apply_kernel.dp.cpp b/dpcpp/preconditioner/jacobi_simple_apply_kernel.dp.cpp new file mode 100644 index 00000000000..d5ee2fbf92a --- /dev/null +++ b/dpcpp/preconditioner/jacobi_simple_apply_kernel.dp.cpp @@ -0,0 +1,102 @@ +/************************************************************* +Copyright (c) 2017-2021, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include "core/preconditioner/jacobi_kernels.hpp" + + +#include + + +#include + + +#include "core/preconditioner/jacobi_utils.hpp" +#include "core/synthesizer/implementation_selection.hpp" + + +namespace gko { +namespace kernels { +namespace dpcpp { +/** + * @brief The Jacobi preconditioner namespace. + * @ref Jacobi + * @ingroup jacobi + */ +namespace jacobi { + + +template +void apply(syn::value_list, size_type num_blocks, + const precision_reduction* block_precisions, + const IndexType* block_pointers, const ValueType* blocks, + const preconditioner::block_interleaved_storage_scheme& + storage_scheme, + const ValueType* b, size_type b_stride, ValueType* x, + size_type x_stride); + +GKO_ENABLE_IMPLEMENTATION_SELECTION(select_apply, apply); + + +template +void simple_apply( + std::shared_ptr exec, size_type num_blocks, + uint32 max_block_size, + const preconditioner::block_interleaved_storage_scheme& + storage_scheme, + const Array& block_precisions, + const Array& block_pointers, const Array& blocks, + const matrix::Dense* b, matrix::Dense* x) +{ + // TODO: write a special kernel for multiple RHS + for (size_type col = 0; col < b->get_size()[1]; ++col) { + select_apply( + compiled_kernels(), + [&](int compiled_block_size) { + return max_block_size <= compiled_block_size; + }, + syn::value_list(), + syn::type_list<>(), num_blocks, block_precisions.get_const_data(), + block_pointers.get_const_data(), blocks.get_const_data(), + storage_scheme, b->get_const_values() + col, b->get_stride(), + x->get_values() + col, x->get_stride()); + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_JACOBI_SIMPLE_APPLY_KERNEL); + + +} // namespace jacobi +} // namespace dpcpp +} // namespace kernels +} // namespace gko From e3ea1faef5d51a52a434278ef683c1e78c1176a3 Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. Tsai" Date: Wed, 24 Nov 2021 18:02:36 +0800 Subject: [PATCH 02/11] script port --- .../diagonal_block_manipulation.dp.hpp | 128 ++++++++++++++++++ 1 file changed, 128 insertions(+) create mode 100644 dpcpp/components/diagonal_block_manipulation.dp.hpp diff --git a/dpcpp/components/diagonal_block_manipulation.dp.hpp b/dpcpp/components/diagonal_block_manipulation.dp.hpp new file mode 100644 index 00000000000..32e3e26739c --- /dev/null +++ b/dpcpp/components/diagonal_block_manipulation.dp.hpp @@ -0,0 +1,128 @@ +/************************************************************* +Copyright (c) 2017-2021, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GKO_DPCPP_COMPONENTS_DIAGONAL_BLOCK_MANIPULATION_DP_HPP_ +#define GKO_DPCPP_COMPONENTS_DIAGONAL_BLOCK_MANIPULATION_DP_HPP_ + + +#include + + +#include + + +#include "dpcpp/base/config.hpp" +#include "dpcpp/base/dpct.hpp" +#include "dpcpp/components/cooperative_groups.dp.hpp" + + +namespace gko { +namespace kernels { +namespace dpcpp { +namespace csr { + + +/** + * @internal + * + * @note assumes that block dimensions are in "standard format": + * (subwarp_size, config::warp_size / subwarp_size, z) + */ +template < + int max_block_size, int warps_per_block, typename Group, typename ValueType, + typename IndexType, + typename = std::enable_if_t::value>> +__dpct_inline__ void extract_transposed_diag_blocks( + const Group& group, int processed_blocks, + const IndexType* __restrict__ row_ptrs, + const IndexType* __restrict__ col_idxs, + const ValueType* __restrict__ values, + const IndexType* __restrict__ block_ptrs, size_type num_blocks, + ValueType* __restrict__ block_row, int increment, + ValueType* __restrict__ workspace, sycl::nd_item<3> item_ct1) +{ + const int tid = + item_ct1.get_local_id(1) * item_ct1.get_local_range().get(2) + + item_ct1.get_local_id(2); + const auto warp = group::tiled_partition(group); + auto bid = static_cast(item_ct1.get_group(2)) * warps_per_block * + processed_blocks + + item_ct1.get_local_id(0) * processed_blocks; + auto bstart = (bid < num_blocks) ? 
block_ptrs[bid] : zero(); + IndexType bsize = 0; +#pragma unroll + for (int b = 0; b < processed_blocks; ++b, ++bid) { + if (bid < num_blocks) { + bstart += bsize; + bsize = block_ptrs[bid + 1] - bstart; +#pragma unroll + for (int i = 0; i < max_block_size; ++i) { + if (i < bsize) { + if (item_ct1.get_local_id(1) == b && + item_ct1.get_local_id(2) < max_block_size) { + workspace[item_ct1.get_local_id(2)] = zero(); + } + warp.sync(); + const auto row = bstart + i; + const auto rstart = row_ptrs[row] + tid; + const auto rend = row_ptrs[row + 1]; + // use the entire warp to ensure coalesced memory access + for (auto j = rstart; j < rend; j += config::warp_size) { + const auto col = col_idxs[j] - bstart; + if (col >= bsize) { + break; + } + if (col >= 0) { + workspace[col] = values[j]; + } + } + warp.sync(); + if (item_ct1.get_local_id(1) == b && + item_ct1.get_local_id(2) < bsize) { + block_row[i * increment] = + workspace[item_ct1.get_local_id(2)]; + } + warp.sync(); + } + } + } + } +} + + +} // namespace csr +} // namespace dpcpp +} // namespace kernels +} // namespace gko + + +#endif // GKO_DPCPP_COMPONENTS_DIAGONAL_BLOCK_MANIPULATION_DP_HPP_ From abc1eeb38bf14c73849431da48a1592246d62506 Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. Tsai" Date: Wed, 24 Nov 2021 23:02:05 +0800 Subject: [PATCH 03/11] test script port --- dpcpp/test/CMakeLists.txt | 1 + dpcpp/test/preconditioner/CMakeLists.txt | 1 + dpcpp/test/preconditioner/jacobi_kernels.cpp | 900 +++++++++++++++++++ 3 files changed, 902 insertions(+) create mode 100644 dpcpp/test/preconditioner/CMakeLists.txt create mode 100644 dpcpp/test/preconditioner/jacobi_kernels.cpp diff --git a/dpcpp/test/CMakeLists.txt b/dpcpp/test/CMakeLists.txt index 00be1e71e35..b882f57715e 100644 --- a/dpcpp/test/CMakeLists.txt +++ b/dpcpp/test/CMakeLists.txt @@ -4,3 +4,4 @@ set(GINKGO_COMPILING_DPCPP_TEST ON) add_subdirectory(base) add_subdirectory(components) add_subdirectory(matrix) +add_subdirectory(preconditioner) diff --git a/dpcpp/test/preconditioner/CMakeLists.txt b/dpcpp/test/preconditioner/CMakeLists.txt new file mode 100644 index 00000000000..a0ca5a2e38a --- /dev/null +++ b/dpcpp/test/preconditioner/CMakeLists.txt @@ -0,0 +1 @@ +ginkgo_create_test(jacobi_kernels) diff --git a/dpcpp/test/preconditioner/jacobi_kernels.cpp b/dpcpp/test/preconditioner/jacobi_kernels.cpp new file mode 100644 index 00000000000..0302ae7c24e --- /dev/null +++ b/dpcpp/test/preconditioner/jacobi_kernels.cpp @@ -0,0 +1,900 @@ +/************************************************************* +Copyright (c) 2017-2022, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include + + +#include + + +#include + + +#include +#include + + +#include "core/test/utils.hpp" +#include "core/test/utils/unsort_matrix.hpp" + + +namespace { + + +class Jacobi : public ::testing::Test { +protected: + using Bj = gko::preconditioner::Jacobi<>; + using Mtx = gko::matrix::Csr<>; + using Vec = gko::matrix::Dense<>; + using mtx_data = gko::matrix_data<>; + + void SetUp() + { + ASSERT_GT(gko::DpcppExecutor::get_num_devices(), 0); + ref = gko::ReferenceExecutor::create(); + dpcpp = gko::DpcppExecutor::create(0, ref); + } + + void TearDown() + { + if (dpcpp != nullptr) { + ASSERT_NO_THROW(dpcpp->synchronize()); + } + } + + void initialize_data( + std::initializer_list block_pointers, + std::initializer_list block_precisions, + std::initializer_list condition_numbers, + gko::uint32 max_block_size, int min_nnz, int max_nnz, int num_rhs = 1, + double accuracy = 0.1, bool skip_sorting = true) + { + std::ranlux48 engine(42); + const auto dim = *(end(block_pointers) - 1); + if (condition_numbers.size() == 0) { + mtx = gko::test::generate_random_matrix( + dim, dim, std::uniform_int_distribution<>(min_nnz, max_nnz), + std::normal_distribution<>(0.0, 1.0), engine, ref); + } else { + std::vector blocks; + for (gko::size_type i = 0; i < block_pointers.size() - 1; ++i) { + const auto size = + begin(block_pointers)[i + 1] - begin(block_pointers)[i]; + const auto cond = begin(condition_numbers)[i]; + blocks.push_back(mtx_data::cond( + size, cond, std::normal_distribution<>(-1, 1), engine)); + } + mtx = Mtx::create(ref); + mtx->read(mtx_data::diag(begin(blocks), end(blocks))); + } + gko::Array block_ptrs(ref, block_pointers); + gko::Array block_prec(ref, block_precisions); + if (block_prec.get_num_elems() == 0) { + bj_factory = Bj::build() + .with_max_block_size(max_block_size) + .with_block_pointers(block_ptrs) + .with_skip_sorting(skip_sorting) + .on(ref); + d_bj_factory = Bj::build() + .with_max_block_size(max_block_size) + .with_block_pointers(block_ptrs) + .with_skip_sorting(skip_sorting) + .on(dpcpp); + } else { + bj_factory = Bj::build() + .with_max_block_size(max_block_size) + .with_block_pointers(block_ptrs) + .with_storage_optimization(block_prec) + .with_accuracy(accuracy) + .with_skip_sorting(skip_sorting) + .on(ref); + d_bj_factory = Bj::build() + .with_max_block_size(max_block_size) + .with_block_pointers(block_ptrs) + .with_storage_optimization(block_prec) + .with_accuracy(accuracy) + .with_skip_sorting(skip_sorting) + .on(dpcpp); + } + b = gko::test::generate_random_matrix( + dim, num_rhs, std::uniform_int_distribution<>(num_rhs, num_rhs), + std::normal_distribution<>(0.0, 1.0), engine, ref); + d_b = gko::clone(dpcpp, b); + x = gko::test::generate_random_matrix( + dim, num_rhs, std::uniform_int_distribution<>(num_rhs, num_rhs), + std::normal_distribution<>(0.0, 1.0), engine, ref); + d_x = gko::clone(dpcpp, x); + } + + const gko::precision_reduction dp{}; + const gko::precision_reduction sp{0, 1}; + 
const gko::precision_reduction hp{0, 2}; + const gko::precision_reduction tp{1, 0}; + const gko::precision_reduction qp{2, 0}; + const gko::precision_reduction up{1, 1}; + const gko::precision_reduction ap{gko::precision_reduction::autodetect()}; + + std::shared_ptr ref; + std::shared_ptr dpcpp; + std::shared_ptr mtx; + std::unique_ptr x; + std::unique_ptr b; + std::unique_ptr d_x; + std::unique_ptr d_b; + + std::unique_ptr bj_factory; + std::unique_ptr d_bj_factory; +}; + + +TEST_F(Jacobi, DpcppFindNaturalBlocksEquivalentToRef) +{ + /* example matrix: + 1 1 + 1 1 + 1 1 + 1 1 + */ + auto mtx = share(Mtx::create(ref)); + mtx->read({{4, 4}, + {{0, 0, 1.0}, + {0, 1, 1.0}, + {1, 0, 1.0}, + {1, 1, 1.0}, + {2, 0, 1.0}, + {2, 2, 1.0}, + {3, 0, 1.0}, + {3, 2, 1.0}}}); + + auto bj = Bj::build().with_max_block_size(3u).on(ref)->generate(mtx); + auto d_bj = Bj::build().with_max_block_size(3u).on(dpcpp)->generate(mtx); + + ASSERT_EQ(d_bj->get_num_blocks(), bj->get_num_blocks()); + // TODO: actually check if the results are the same +} + + +TEST_F(Jacobi, DpcppExecutesSupervariableAgglomerationEquivalentToRef) +{ + /* example matrix: + 1 1 + 1 1 + 1 1 + 1 1 + 1 + */ + auto mtx = share(Mtx::create(ref)); + mtx->read({{5, 5}, + {{0, 0, 1.0}, + {0, 1, 1.0}, + {1, 0, 1.0}, + {1, 1, 1.0}, + {2, 2, 1.0}, + {2, 3, 1.0}, + {3, 2, 1.0}, + {3, 3, 1.0}, + {4, 4, 1.0}}}); + + auto bj = Bj::build().with_max_block_size(3u).on(ref)->generate(mtx); + auto d_bj = Bj::build().with_max_block_size(3u).on(dpcpp)->generate(mtx); + + ASSERT_EQ(d_bj->get_num_blocks(), bj->get_num_blocks()); + // TODO: actually check if the results are the same +} + + +TEST_F(Jacobi, DpcppFindNaturalBlocksInLargeMatrixEquivalentToRef) +{ + /* example matrix: + 1 1 + 1 1 + 1 1 + 1 1 + 1 1 + 1 1 + */ + using data = gko::matrix_data; + auto mtx = share(Mtx::create(ref)); + mtx->read(data::diag({550, 550}, {{1.0, 1.0, 0.0, 0.0, 0.0, 0.0}, + {1.0, 1.0, 0.0, 0.0, 0.0, 0.0}, + {1.0, 0.0, 1.0, 0.0, 0.0, 0.0}, + {1.0, 0.0, 1.0, 0.0, 0.0, 0.0}, + {1.0, 0.0, 1.0, 0.0, 0.0, 0.0}, + {1.0, 0.0, 1.0, 0.0, 0.0, 0.0}})); + + auto bj = Bj::build().with_max_block_size(3u).on(ref)->generate(mtx); + auto d_bj = Bj::build().with_max_block_size(3u).on(dpcpp)->generate(mtx); + + ASSERT_EQ(d_bj->get_num_blocks(), bj->get_num_blocks()); + // TODO: actually check if the results are the same +} + + +TEST_F(Jacobi, + DpcppExecutesSupervariableAgglomerationInLargeMatrixEquivalentToRef) +{ + /* example matrix: + 1 1 + 1 1 + 1 1 + 1 1 + 1 + */ + using data = gko::matrix_data; + auto mtx = share(Mtx::create(ref)); + mtx->read(data::diag({550, 550}, {{1.0, 1.0, 0.0, 0.0, 0.0}, + {1.0, 1.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 1.0, 1.0, 0.0}, + {0.0, 0.0, 1.0, 1.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 1.0}})); + + auto bj = Bj::build().with_max_block_size(3u).on(ref)->generate(mtx); + auto d_bj = Bj::build().with_max_block_size(3u).on(dpcpp)->generate(mtx); + + ASSERT_EQ(d_bj->get_num_blocks(), bj->get_num_blocks()); + // TODO: actually check if the results are the same +} + + +TEST_F(Jacobi, + DpcppExecutesSupervarAgglomerationEquivalentToRefFor150NonzerowsPerRow) +{ + /* example matrix duplicated 50 times: + 1 1 1 + 1 1 1 + 1 1 1 + 1 1 1 + 1 1 + */ + using data = gko::matrix_data; + auto mtx = share(Mtx::create(ref)); + mtx->read({{50, 50}, + {{1.0, 1.0, 0.0, 1.0, 0.0}, + {1.0, 1.0, 0.0, 1.0, 0.0}, + {1.0, 0.0, 1.0, 1.0, 0.0}, + {1.0, 0.0, 1.0, 1.0, 0.0}, + {0.0, 0.0, 1.0, 0.0, 1.0}}}); + + + auto bj = Bj::build().with_max_block_size(3u).on(ref)->generate(mtx); + auto d_bj = 
Bj::build().with_max_block_size(3u).on(dpcpp)->generate(mtx); + + ASSERT_EQ(d_bj->get_num_blocks(), bj->get_num_blocks()); + // TODO: actually check if the results are the same +} + + +TEST_F(Jacobi, DpcppPreconditionerEquivalentToRefWithBlockSize32Sorted) +{ + initialize_data({0, 32, 64, 96, 128}, {}, {}, 32, 100, 110); + + auto bj = bj_factory->generate(mtx); + auto d_bj = d_bj_factory->generate(mtx); + + GKO_ASSERT_MTX_NEAR(gko::as(d_bj.get()), gko::as(bj.get()), 1e-13); +} + + +TEST_F(Jacobi, DpcppPreconditionerEquivalentToRefWithBlockSize32Unsorted) +{ + std::ranlux48 engine(42); + initialize_data({0, 32, 64, 96, 128}, {}, {}, 32, 100, 110, 1, 0.1, false); + gko::test::unsort_matrix(mtx.get(), engine); + + auto bj = bj_factory->generate(mtx); + auto d_bj = d_bj_factory->generate(mtx); + + GKO_ASSERT_MTX_NEAR(gko::as(d_bj.get()), gko::as(bj.get()), 1e-13); +} + + +TEST_F(Jacobi, DpcppPreconditionerEquivalentToRefWithDifferentBlockSize) +{ + initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, {}, {}, 32, + 97, 99); + + auto bj = bj_factory->generate(mtx); + auto d_bj = d_bj_factory->generate(mtx); + + GKO_ASSERT_MTX_NEAR(gko::as(d_bj.get()), gko::as(bj.get()), 1e-13); +} + + +TEST_F(Jacobi, DpcppPreconditionerEquivalentToRefWithMPW) +{ + initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, {}, {}, 13, + 97, 99); + + auto bj = bj_factory->generate(mtx); + auto d_bj = d_bj_factory->generate(mtx); + + GKO_ASSERT_MTX_NEAR(gko::as(d_bj.get()), gko::as(bj.get()), 1e-13); +} + + +TEST_F(Jacobi, DpcppTransposedPreconditionerEquivalentToRefWithMPW) +{ + initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, {}, {}, 13, + 97, 99); + + auto bj = bj_factory->generate(mtx); + auto d_bj = d_bj_factory->generate(mtx); + d_bj->copy_from(bj.get()); + + GKO_ASSERT_MTX_NEAR(gko::as(d_bj->transpose()), + gko::as(bj->transpose()), 1e-14); +} + + +TEST_F(Jacobi, DpcppConjTransposedPreconditionerEquivalentToRefWithMPW) +{ + initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, {}, {}, 13, + 97, 99); + + auto bj = bj_factory->generate(mtx); + auto d_bj = d_bj_factory->generate(mtx); + d_bj->copy_from(bj.get()); + + GKO_ASSERT_MTX_NEAR(gko::as(d_bj->conj_transpose()), + gko::as(bj->conj_transpose()), 1e-14); +} + + +TEST_F(Jacobi, DpcppApplyEquivalentToRefWithBlockSize32) +{ + initialize_data({0, 32, 64, 96, 128}, {}, {}, 32, 100, 111); + auto bj = bj_factory->generate(mtx); + auto d_bj = d_bj_factory->generate(mtx); + + bj->apply(b.get(), x.get()); + d_bj->apply(d_b.get(), d_x.get()); + + GKO_ASSERT_MTX_NEAR(d_x, x, 1e-12); +} + + +TEST_F(Jacobi, DpcppApplyEquivalentToRefWithDifferentBlockSize) +{ + initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, {}, {}, 32, + 97, 99); + auto bj = bj_factory->generate(mtx); + auto d_bj = d_bj_factory->generate(mtx); + + bj->apply(b.get(), x.get()); + d_bj->apply(d_b.get(), d_x.get()); + + GKO_ASSERT_MTX_NEAR(d_x, x, 1e-12); +} + + +TEST_F(Jacobi, DpcppApplyEquivalentToRef) +{ + initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, {}, {}, 13, + 97, 99); + auto bj = bj_factory->generate(mtx); + auto d_bj = d_bj_factory->generate(mtx); + + bj->apply(b.get(), x.get()); + d_bj->apply(d_b.get(), d_x.get()); + + GKO_ASSERT_MTX_NEAR(d_x, x, 1e-12); +} + + +TEST_F(Jacobi, DpcppScalarApplyEquivalentToRef) +{ + gko::size_type dim = 313; + std::ranlux48 engine(42); + auto dense_smtx = gko::share(gko::test::generate_random_matrix( + dim, dim, std::uniform_int_distribution<>(1, dim), + std::normal_distribution<>(1.0, 2.0), engine, 
ref)); + gko::test::make_diag_dominant(dense_smtx.get()); + auto smtx = gko::share(Mtx::create(ref)); + smtx->copy_from(dense_smtx.get()); + auto sb = gko::share(gko::test::generate_random_matrix( + dim, 3, std::uniform_int_distribution<>(1, 1), + std::normal_distribution<>(0.0, 1.0), engine, ref)); + auto sx = Vec::create(ref, sb->get_size()); + + auto d_smtx = gko::share(Mtx::create(dpcpp)); + auto d_sb = gko::share(Vec::create(dpcpp)); + auto d_sx = gko::share(Vec::create(dpcpp, sb->get_size())); + d_smtx->copy_from(smtx.get()); + d_sb->copy_from(sb.get()); + + auto sj = Bj::build().with_max_block_size(1u).on(ref)->generate(smtx); + auto d_sj = Bj::build().with_max_block_size(1u).on(dpcpp)->generate(d_smtx); + + sj->apply(sb.get(), sx.get()); + d_sj->apply(d_sb.get(), d_sx.get()); + + GKO_ASSERT_MTX_NEAR(sx.get(), d_sx.get(), 1e-12); +} + + +TEST_F(Jacobi, DpcppLinearCombinationApplyEquivalentToRef) +{ + initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, {}, {}, 13, + 97, 99); + auto alpha = gko::initialize({2.0}, ref); + auto d_alpha = gko::initialize({2.0}, dpcpp); + auto beta = gko::initialize({-1.0}, ref); + auto d_beta = gko::initialize({-1.0}, dpcpp); + auto bj = bj_factory->generate(mtx); + auto d_bj = d_bj_factory->generate(mtx); + + bj->apply(alpha.get(), b.get(), beta.get(), x.get()); + d_bj->apply(d_alpha.get(), d_b.get(), d_beta.get(), d_x.get()); + + GKO_ASSERT_MTX_NEAR(d_x, x, 1e-12); +} + + +TEST_F(Jacobi, DpcppScalarLinearCombinationApplyEquivalentToRef) +{ + gko::size_type dim = 313; + std::ranlux48 engine(42); + auto dense_smtx = gko::share(gko::test::generate_random_matrix( + dim, dim, std::uniform_int_distribution<>(1, dim), + std::normal_distribution<>(1.0, 2.0), engine, ref)); + gko::test::make_diag_dominant(dense_smtx.get()); + auto smtx = gko::share(Mtx::create(ref)); + smtx->copy_from(dense_smtx.get()); + auto sb = gko::share(gko::test::generate_random_matrix( + dim, 3, std::uniform_int_distribution<>(1, 1), + std::normal_distribution<>(0.0, 1.0), engine, ref, gko::dim<2>(dim, 3), + 4)); + auto sx = gko::share(gko::test::generate_random_matrix( + dim, 3, std::uniform_int_distribution<>(1, 1), + std::normal_distribution<>(0.0, 1.0), engine, ref, gko::dim<2>(dim, 3), + 4)); + + auto d_smtx = gko::share(gko::clone(dpcpp, smtx)); + auto d_sb = gko::share(gko::clone(dpcpp, sb)); + auto d_sx = gko::share(gko::clone(dpcpp, sx)); + auto alpha = gko::initialize({2.0}, ref); + auto d_alpha = gko::initialize({2.0}, dpcpp); + auto beta = gko::initialize({-1.0}, ref); + auto d_beta = gko::initialize({-1.0}, dpcpp); + + auto sj = Bj::build().with_max_block_size(1u).on(ref)->generate(smtx); + auto d_sj = Bj::build().with_max_block_size(1u).on(dpcpp)->generate(d_smtx); + + sj->apply(alpha.get(), sb.get(), beta.get(), sx.get()); + d_sj->apply(d_alpha.get(), d_sb.get(), d_beta.get(), d_sx.get()); + + GKO_ASSERT_MTX_NEAR(sx.get(), d_sx.get(), 1e-12); +} + + +TEST_F(Jacobi, DpcppApplyToMultipleVectorsEquivalentToRef) +{ + initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, {}, {}, 13, + 97, 99, 5); + auto bj = bj_factory->generate(mtx); + auto d_bj = d_bj_factory->generate(mtx); + + bj->apply(b.get(), x.get()); + d_bj->apply(d_b.get(), d_x.get()); + + GKO_ASSERT_MTX_NEAR(d_x, x, 1e-12); +} + + +TEST_F(Jacobi, DpcppLinearCombinationApplyToMultipleVectorsEquivalentToRef) +{ + initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, {}, {}, 13, + 97, 99, 5); + auto alpha = gko::initialize({2.0}, ref); + auto d_alpha = gko::initialize({2.0}, dpcpp); + auto 
beta = gko::initialize({-1.0}, ref); + auto d_beta = gko::initialize({-1.0}, dpcpp); + auto bj = bj_factory->generate(mtx); + auto d_bj = d_bj_factory->generate(mtx); + + bj->apply(alpha.get(), b.get(), beta.get(), x.get()); + d_bj->apply(d_alpha.get(), d_b.get(), d_beta.get(), d_x.get()); + + GKO_ASSERT_MTX_NEAR(d_x, x, 1e-12); +} + + +TEST_F(Jacobi, ComputesTheSameConditionNumberAsRef) +{ + initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, + {dp, dp, dp, dp, dp, dp, dp, dp, dp, dp}, {}, 13, 97, 99); + + auto bj = bj_factory->generate(mtx); + auto d_bj = clone(ref, d_bj_factory->generate(mtx)); + + for (int i = 0; i < gko::as(bj.get())->get_num_blocks(); ++i) { + EXPECT_NEAR(bj->get_conditioning()[i], d_bj->get_conditioning()[i], + 1e-9); + } +} + + +TEST_F(Jacobi, SelectsTheSamePrecisionsAsRef) +{ + initialize_data( + {0, 2, 14, 27, 40, 51, 61, 70, 80, 92, 100}, + {ap, ap, ap, ap, ap, ap, ap, ap, ap, ap}, + {1e+0, 1e+0, 1e+2, 1e+3, 1e+4, 1e+4, 1e+6, 1e+7, 1e+8, 1e+9}, 13, 97, + 99, 1, 0.2); + + auto bj = bj_factory->generate(mtx); + auto d_bj = gko::clone(ref, d_bj_factory->generate(mtx)); + + auto bj_prec = + bj->get_parameters().storage_optimization.block_wise.get_const_data(); + auto d_bj_prec = + d_bj->get_parameters().storage_optimization.block_wise.get_const_data(); + for (int i = 0; i < gko::as(bj.get())->get_num_blocks(); ++i) { + EXPECT_EQ(bj_prec[i], d_bj_prec[i]); + } +} + + +TEST_F(Jacobi, AvoidsPrecisionsThatOverflow) +{ + auto mtx = gko::matrix::Csr<>::create(dpcpp); + // clang-format off + mtx->read(mtx_data::diag({ + // perfectly conditioned block, small value difference, + // can use fp16 (5, 10) + {{2.0, 1.0}, + {1.0, 2.0}}, + // perfectly conditioned block (scaled orthogonal), + // with large value difference, need fp16 (7, 8) + {{1e-8, -1e-16}, + {1e-16, 1e-8}} + })); + // clang-format on + + auto bj = + Bj::build() + .with_max_block_size(13u) + .with_block_pointers(gko::Array(dpcpp, {0, 2, 4})) + .with_storage_optimization(gko::precision_reduction::autodetect()) + .with_accuracy(0.1) + .on(dpcpp) + ->generate(give(mtx)); + + // both blocks are in the same group, both need (7, 8) + auto h_bj = clone(ref, bj); + auto prec = + h_bj->get_parameters().storage_optimization.block_wise.get_const_data(); + EXPECT_EQ(prec[0], gko::precision_reduction(1, 1)); + ASSERT_EQ(prec[1], gko::precision_reduction(1, 1)); +} + + +TEST_F(Jacobi, DpcppPreconditionerEquivalentToRefWithFullPrecision) +{ + initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, + {dp, dp, dp, dp, dp, dp, dp, dp, dp, dp}, {}, 13, 97, 99); + + auto bj = bj_factory->generate(mtx); + auto d_bj = d_bj_factory->generate(mtx); + + GKO_ASSERT_MTX_NEAR(lend(d_bj), lend(bj), 1e-13); +} + + +TEST_F(Jacobi, DpcppPreconditionerEquivalentToRefWithReducedPrecision) +{ + initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, + {sp, sp, sp, sp, sp, sp, sp, sp, sp, sp, sp}, {}, 13, 97, + 99); + + auto bj = bj_factory->generate(mtx); + auto d_bj = d_bj_factory->generate(mtx); + + GKO_ASSERT_MTX_NEAR(lend(d_bj), lend(bj), 1e-7); +} + + +TEST_F(Jacobi, DpcppPreconditionerEquivalentToRefWithCustomReducedPrecision) +{ + initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, + {tp, tp, tp, tp, tp, tp, tp, tp, tp, tp, tp}, {}, 13, 97, + 99); + + auto bj = bj_factory->generate(mtx); + auto d_bj = d_bj_factory->generate(mtx); + + GKO_ASSERT_MTX_NEAR(lend(d_bj), lend(bj), 1e-6); +} + + +TEST_F(Jacobi, DpcppPreconditionerEquivalentToRefWithQuarteredPrecision) +{ + initialize_data({0, 11, 24, 33, 45, 55, 
67, 70, 80, 92, 100}, + {hp, hp, hp, hp, hp, hp, hp, hp, hp, hp, hp}, {}, 13, 97, + 99); + + auto bj = bj_factory->generate(mtx); + auto d_bj = d_bj_factory->generate(mtx); + + GKO_ASSERT_MTX_NEAR(lend(d_bj), lend(bj), 1e-3); +} + + +TEST_F(Jacobi, DpcppPreconditionerEquivalentToRefWithCustomQuarteredPrecision) +{ + initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, + {qp, qp, qp, qp, qp, qp, qp, qp, qp, qp, qp}, {}, 13, 97, + 99); + + auto bj = bj_factory->generate(mtx); + auto d_bj = d_bj_factory->generate(mtx); + + GKO_ASSERT_MTX_NEAR(lend(d_bj), lend(bj), 1e-1); +} + + +TEST_F(Jacobi, DpcppPreconditionerEquivalentToRefWithAdaptivePrecision) +{ + initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, + {sp, sp, dp, dp, tp, tp, qp, qp, hp, dp, up}, {}, 13, 97, + 99); + + auto bj = bj_factory->generate(mtx); + auto d_bj = d_bj_factory->generate(mtx); + + GKO_ASSERT_MTX_NEAR(lend(d_bj), lend(bj), 1e-1); +} + + +TEST_F(Jacobi, + DpcppTransposedPreconditionerEquivalentToRefWithAdaptivePrecision) +{ + initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, + {sp, sp, dp, dp, tp, tp, qp, qp, hp, dp, up}, {}, 13, 97, + 99); + + auto bj = bj_factory->generate(mtx); + auto d_bj = d_bj_factory->generate(mtx); + d_bj->copy_from(bj.get()); + + GKO_ASSERT_MTX_NEAR(gko::as(d_bj->transpose()), + gko::as(bj->transpose()), 1e-14); +} + + +TEST_F(Jacobi, + DpcppConjTransposedPreconditionerEquivalentToRefWithAdaptivePrecision) +{ + initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, + {sp, sp, dp, dp, tp, tp, qp, qp, hp, dp, up}, {}, 13, 97, + 99); + + auto bj = bj_factory->generate(mtx); + auto d_bj = d_bj_factory->generate(mtx); + d_bj->copy_from(bj.get()); + + GKO_ASSERT_MTX_NEAR(gko::as(d_bj->conj_transpose()), + gko::as(bj->conj_transpose()), 1e-14); +} + + +TEST_F(Jacobi, DpcppApplyEquivalentToRefWithFullPrecision) +{ + initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, + {dp, dp, dp, dp, dp, dp, dp, dp, dp, dp, dp}, {}, 13, 97, + 99); + auto bj = bj_factory->generate(mtx); + auto d_bj = d_bj_factory->generate(mtx); + + bj->apply(b.get(), x.get()); + d_bj->apply(d_b.get(), d_x.get()); + + GKO_ASSERT_MTX_NEAR(d_x, x, 1e-12); +} + + +TEST_F(Jacobi, DpcppApplyEquivalentToRefWithReducedPrecision) +{ + initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, + {sp, sp, sp, sp, sp, sp, sp, sp, sp, sp, sp}, {}, 13, 97, + 99); + auto bj = bj_factory->generate(mtx); + auto d_bj = d_bj_factory->generate(mtx); + + bj->apply(b.get(), x.get()); + d_bj->apply(d_b.get(), d_x.get()); + + GKO_ASSERT_MTX_NEAR(d_x, x, 1e-6); +} + + +TEST_F(Jacobi, DpcppApplyEquivalentToRefWithCustomReducedPrecision) +{ + initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, + {tp, tp, tp, tp, tp, tp, tp, tp, tp, tp, tp}, {}, 13, 97, + 99); + auto bj = bj_factory->generate(mtx); + auto d_bj = d_bj_factory->generate(mtx); + + bj->apply(b.get(), x.get()); + d_bj->apply(d_b.get(), d_x.get()); + + GKO_ASSERT_MTX_NEAR(d_x, x, 1e-5); +} + + +TEST_F(Jacobi, DpcppApplyEquivalentToRefWithQuarteredPrecision) +{ + initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, + {hp, hp, hp, hp, hp, hp, hp, hp, hp, hp, hp}, {}, 13, 97, + 99); + auto bj = bj_factory->generate(mtx); + auto d_bj = d_bj_factory->generate(mtx); + + bj->apply(b.get(), x.get()); + d_bj->apply(d_b.get(), d_x.get()); + + GKO_ASSERT_MTX_NEAR(d_x, x, 1e-2); +} + + +TEST_F(Jacobi, DpcppApplyEquivalentToRefWithCustomReducedAndReducedPrecision) +{ + initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, + {up, up, 
up, up, up, up, up, up, up, up, up}, {}, 13, 97, + 99); + auto bj = bj_factory->generate(mtx); + auto d_bj = d_bj_factory->generate(mtx); + + bj->apply(b.get(), x.get()); + d_bj->apply(d_b.get(), d_x.get()); + + GKO_ASSERT_MTX_NEAR(d_x, x, 1e-2); +} + + +TEST_F(Jacobi, DpcppApplyEquivalentToRefWithCustomQuarteredPrecision) +{ + initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, + {qp, qp, qp, qp, qp, qp, qp, qp, qp, qp, qp}, {}, 13, 97, + 99); + auto bj = bj_factory->generate(mtx); + auto d_bj = d_bj_factory->generate(mtx); + + bj->apply(b.get(), x.get()); + d_bj->apply(d_b.get(), d_x.get()); + + GKO_ASSERT_MTX_NEAR(d_x, x, 1e-6); +} + + +TEST_F(Jacobi, DpcppApplyEquivalentToRefWithAdaptivePrecision) +{ + initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, + {sp, sp, dp, dp, tp, tp, qp, qp, hp, dp, up}, {}, 13, 97, + 99); + auto bj = bj_factory->generate(mtx); + auto d_bj = d_bj_factory->generate(mtx); + + bj->apply(b.get(), x.get()); + d_bj->apply(d_b.get(), d_x.get()); + + GKO_ASSERT_MTX_NEAR(d_x, x, 1e-1); +} + + +TEST_F(Jacobi, DpcppLinearCombinationApplyEquivalentToRefWithAdaptivePrecision) +{ + initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, + {sp, dp, dp, sp, sp, sp, dp, dp, sp, dp, sp}, {}, 13, 97, + 99); + auto alpha = gko::initialize({2.0}, ref); + auto d_alpha = gko::initialize({2.0}, dpcpp); + auto beta = gko::initialize({-1.0}, ref); + auto d_beta = gko::initialize({-1.0}, dpcpp); + auto bj = bj_factory->generate(mtx); + auto d_bj = d_bj_factory->generate(mtx); + + bj->apply(b.get(), x.get()); + d_bj->apply(d_b.get(), d_x.get()); + + GKO_ASSERT_MTX_NEAR(d_x, x, 1e-6); +} + + +TEST_F(Jacobi, DpcppApplyToMultipleVectorsEquivalentToRefWithFullPrecision) +{ + initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, + {dp, dp, dp, dp, dp, dp, dp, dp, dp, dp, dp}, {}, 13, 97, + 99, 5); + auto bj = bj_factory->generate(mtx); + auto d_bj = d_bj_factory->generate(mtx); + + bj->apply(b.get(), x.get()); + d_bj->apply(d_b.get(), d_x.get()); + + GKO_ASSERT_MTX_NEAR(d_x, x, 1e-12); +} + + +TEST_F(Jacobi, DpcppApplyToMultipleVectorsEquivalentToRefWithReducedPrecision) +{ + initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, + {sp, sp, sp, sp, sp, sp, sp, sp, sp, sp, sp}, {}, 13, 97, + 99, 5); + auto bj = bj_factory->generate(mtx); + auto d_bj = d_bj_factory->generate(mtx); + + bj->apply(b.get(), x.get()); + d_bj->apply(d_b.get(), d_x.get()); + + GKO_ASSERT_MTX_NEAR(d_x, x, 1e-6); +} + + +TEST_F(Jacobi, DpcppApplyToMultipleVectorsEquivalentToRefWithAdaptivePrecision) +{ + initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, + {sp, sp, dp, dp, tp, tp, qp, qp, hp, dp, up}, {}, 13, 97, + 99, 5); + auto bj = bj_factory->generate(mtx); + auto d_bj = d_bj_factory->generate(mtx); + + bj->apply(b.get(), x.get()); + d_bj->apply(d_b.get(), d_x.get()); + + GKO_ASSERT_MTX_NEAR(d_x, x, 1e-1); +} + + +TEST_F( + Jacobi, + DpcppLinearCombinationApplyToMultipleVectorsEquivalentToRefWithAdaptivePrecision) +{ + initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, + {sp, dp, dp, sp, sp, sp, dp, dp, sp, dp, sp}, {}, 13, 97, + 99, 5); + auto alpha = gko::initialize({2.0}, ref); + auto d_alpha = gko::initialize({2.0}, dpcpp); + auto beta = gko::initialize({-1.0}, ref); + auto d_beta = gko::initialize({-1.0}, dpcpp); + auto bj = bj_factory->generate(mtx); + auto d_bj = d_bj_factory->generate(mtx); + + bj->apply(b.get(), x.get()); + d_bj->apply(d_b.get(), d_x.get()); + + GKO_ASSERT_MTX_NEAR(d_x, x, 1e-6); +} + + +} // namespace From 
8f25a9af2921667b97d45afd72b2adb3937f6743 Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. Tsai" Date: Thu, 25 Nov 2021 17:27:30 +0800 Subject: [PATCH 04/11] manual fix and adjust the error bound --- .../jacobi_generate_kernel.hpp.inc | 4 +- core/preconditioner/jacobi.cpp | 3 - core/preconditioner/jacobi_utils.hpp | 20 ++-- dpcpp/CMakeLists.txt | 34 ++++++ ...cobi_advanced_apply_instantiate.inc.dp.cpp | 10 +- .../jacobi_advanced_apply_kernel.dp.cpp | 9 +- .../jacobi_generate_instantiate.inc.dp.cpp | 18 +-- .../jacobi_generate_kernel.dp.cpp | 3 +- dpcpp/preconditioner/jacobi_kernels.dp.cpp | 13 +- ...jacobi_simple_apply_instantiate.inc.dp.cpp | 8 +- .../jacobi_simple_apply_kernel.dp.cpp | 12 +- dpcpp/test/preconditioner/jacobi_kernels.cpp | 112 +++++++++++------- omp/preconditioner/jacobi_kernels.cpp | 4 +- reference/preconditioner/jacobi_kernels.cpp | 4 +- 14 files changed, 160 insertions(+), 94 deletions(-) diff --git a/common/cuda_hip/preconditioner/jacobi_generate_kernel.hpp.inc b/common/cuda_hip/preconditioner/jacobi_generate_kernel.hpp.inc index d48fc65fa11..208bd33d7dc 100644 --- a/common/cuda_hip/preconditioner/jacobi_generate_kernel.hpp.inc +++ b/common/cuda_hip/preconditioner/jacobi_generate_kernel.hpp.inc @@ -73,7 +73,9 @@ __device__ __forceinline__ bool validate_precision_reduction_feasibility( } return succeeded && block_cond >= 1.0 && - block_cond * float_traits>::eps < 1e-3; + block_cond * static_cast>( + float_traits>::eps) < + remove_complex{1e-3}; } diff --git a/core/preconditioner/jacobi.cpp b/core/preconditioner/jacobi.cpp index 1125e71d22e..4309edede0c 100644 --- a/core/preconditioner/jacobi.cpp +++ b/core/preconditioner/jacobi.cpp @@ -333,11 +333,9 @@ void Jacobi::generate(const LinOp* system_matrix, } else { auto csr_mtx = convert_to_with_sorting(exec, system_matrix, skip_sorting); - if (parameters_.block_pointers.get_data() == nullptr) { this->detect_blocks(csr_mtx.get()); } - const auto all_block_opt = parameters_.storage_optimization.of_all_blocks; auto& precisions = parameters_.storage_optimization.block_wise; @@ -355,7 +353,6 @@ void Jacobi::generate(const LinOp* system_matrix, precisions = std::move(tmp); conditioning_.resize_and_reset(num_blocks_); } - exec->run(jacobi::make_generate( csr_mtx.get(), num_blocks_, parameters_.max_block_size, parameters_.accuracy, storage_scheme_, conditioning_, precisions, diff --git a/core/preconditioner/jacobi_utils.hpp b/core/preconditioner/jacobi_utils.hpp index 738defdf54c..302ec84f4ee 100644 --- a/core/preconditioner/jacobi_utils.hpp +++ b/core/preconditioner/jacobi_utils.hpp @@ -89,18 +89,12 @@ struct precision_reduction_descriptor { static constexpr GKO_ATTRIBUTES uint32 singleton(const precision_reduction& pr) { - return pr == precision_reduction(0, 0) - ? p0n0 - : pr == precision_reduction(0, 1) - ? p0n1 - : pr == precision_reduction(0, 2) - ? p0n2 - : pr == precision_reduction(1, 0) - ? p1n0 - : pr == precision_reduction(1, 1) - ? p1n1 - : pr == precision_reduction(2, 0) - ? p2n0 + return pr == precision_reduction(0, 0) ? p0n0 + : pr == precision_reduction(0, 1) ? p0n1 + : pr == precision_reduction(0, 2) ? p0n2 + : pr == precision_reduction(1, 0) ? p1n0 + : pr == precision_reduction(1, 1) ? p1n1 + : pr == precision_reduction(2, 0) ? 
p2n0 : p0n0; } }; @@ -141,7 +135,7 @@ GKO_ATTRIBUTES GKO_INLINE uint32 get_supported_storage_reductions( using gko::detail::float_traits; using type = remove_complex; using prd = precision_reduction_descriptor; - auto accurate = [&cond, &accuracy](double eps) { + auto accurate = [&cond, &accuracy](type eps) { return cond * eps < accuracy; }; uint8 is_verified1 = 2; diff --git a/dpcpp/CMakeLists.txt b/dpcpp/CMakeLists.txt index 55b8dd87022..dd28a79adeb 100644 --- a/dpcpp/CMakeLists.txt +++ b/dpcpp/CMakeLists.txt @@ -41,7 +41,10 @@ target_sources(ginkgo_dpcpp matrix/sparsity_csr_kernels.dp.cpp multigrid/pgm_kernels.dp.cpp preconditioner/isai_kernels.dp.cpp + preconditioner/jacobi_advanced_apply_kernel.dp.cpp + preconditioner/jacobi_generate_kernel.dp.cpp preconditioner/jacobi_kernels.dp.cpp + preconditioner/jacobi_simple_apply_kernel.dp.cpp reorder/rcm_kernels.dp.cpp solver/gmres_kernels.dp.cpp solver/cb_gmres_kernels.dp.cpp @@ -54,6 +57,34 @@ target_sources(ginkgo_dpcpp ${GKO_UNIFIED_COMMON_SOURCES} ) +# if(GINKGO_JACOBI_FULL_OPTIMIZATIONS) +# set(GKO_DPCPP_JACOBI_BLOCK_SIZES) +# foreach(blocksize RANGE 1 32) +# list(APPEND GKO_DPCPP_JACOBI_BLOCK_SIZES ${blocksize}) +# endforeach() +# else() + set(GKO_DPCPP_JACOBI_BLOCK_SIZES 32) +# endif() +set(GKO_DPCPP_JACOBI_SOURCES) +foreach(GKO_JACOBI_BLOCK_SIZE IN LISTS GKO_DPCPP_JACOBI_BLOCK_SIZES) + configure_file( + preconditioner/jacobi_generate_instantiate.inc.dp.cpp + preconditioner/jacobi_generate_instantiate.${GKO_JACOBI_BLOCK_SIZE}.dp.cpp) + configure_file( + preconditioner/jacobi_simple_apply_instantiate.inc.dp.cpp + preconditioner/jacobi_simple_apply_instantiate.${GKO_JACOBI_BLOCK_SIZE}.dp.cpp) + configure_file( + preconditioner/jacobi_advanced_apply_instantiate.inc.dp.cpp + preconditioner/jacobi_advanced_apply_instantiate.${GKO_JACOBI_BLOCK_SIZE}.dp.cpp) + list(APPEND GKO_DPCPP_JACOBI_SOURCES + ${CMAKE_CURRENT_BINARY_DIR}/preconditioner/jacobi_generate_instantiate.${GKO_JACOBI_BLOCK_SIZE}.dp.cpp + ${CMAKE_CURRENT_BINARY_DIR}/preconditioner/jacobi_simple_apply_instantiate.${GKO_JACOBI_BLOCK_SIZE}.dp.cpp + ${CMAKE_CURRENT_BINARY_DIR}/preconditioner/jacobi_advanced_apply_instantiate.${GKO_JACOBI_BLOCK_SIZE}.dp.cpp) +endforeach() +target_sources(ginkgo_dpcpp PRIVATE ${GKO_DPCPP_JACOBI_SOURCES}) +string(REPLACE ";" "," GKO_DPCPP_JACOBI_BLOCK_SIZES_CODE "${GKO_DPCPP_JACOBI_BLOCK_SIZES}") +configure_file(preconditioner/jacobi_common.hpp.in preconditioner/jacobi_common.hpp) + ginkgo_compile_features(ginkgo_dpcpp) target_compile_definitions(ginkgo_dpcpp PRIVATE GKO_COMPILING_DPCPP _ONEDPL_COMPILE_KERNEL=0) @@ -72,6 +103,9 @@ if (BUILD_SHARED_LIBS) else () target_link_options(ginkgo_dpcpp PUBLIC -fsycl-device-code-split=per_kernel) endif() +# include path for generated headers like jacobi_common.hpp +target_include_directories(ginkgo_dpcpp + PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/..) 
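# ---------------------------------------------------------------------------
# Editor's note (not part of the upstream patch): the block above follows the
# usual "one configured translation unit per compiled block size" pattern --
# each jacobi_*_instantiate.<N>.dp.cpp copy is compiled with its own
# GKO_JACOBI_BLOCK_SIZE, and the generated jacobi_common.hpp presumably exposes
# the full GKO_DPCPP_JACOBI_BLOCK_SIZES_CODE list (e.g. as the compiled_kernels
# value list consumed by select_apply/select_generate in the kernel files).
# With the GINKGO_JACOBI_FULL_OPTIMIZATIONS branch still commented out, only
# block size 32 is instantiated here, so kernel selection always falls back to
# the max_block_size <= 32 instantiation.
# ---------------------------------------------------------------------------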
target_link_libraries(ginkgo_dpcpp PUBLIC ginkgo_device) target_link_libraries(ginkgo_dpcpp PRIVATE MKL::MKL_DPCPP oneDPL) if (GINKGO_DPCPP_SINGLE_MODE) diff --git a/dpcpp/preconditioner/jacobi_advanced_apply_instantiate.inc.dp.cpp b/dpcpp/preconditioner/jacobi_advanced_apply_instantiate.inc.dp.cpp index 4425f8e277b..ffaef7e7650 100644 --- a/dpcpp/preconditioner/jacobi_advanced_apply_instantiate.inc.dp.cpp +++ b/dpcpp/preconditioner/jacobi_advanced_apply_instantiate.inc.dp.cpp @@ -63,8 +63,6 @@ namespace dpcpp { * @ingroup jacobi */ namespace jacobi { - - namespace kernel { @@ -179,7 +177,7 @@ void advanced_adaptive_apply( // clang-format off -//#cmakedefine GKO_JACOBI_BLOCK_SIZE @GKO_JACOBI_BLOCK_SIZE@ +#cmakedefine GKO_JACOBI_BLOCK_SIZE @GKO_JACOBI_BLOCK_SIZE@ // clang-format on // make things easier for IDEs #ifndef GKO_JACOBI_BLOCK_SIZE @@ -190,7 +188,8 @@ void advanced_adaptive_apply( template void advanced_apply( - syn::value_list, size_type num_blocks, + syn::value_list, + std::shared_ptr exec, size_type num_blocks, const precision_reduction* block_precisions, const IndexType* block_pointers, const ValueType* blocks, const preconditioner::block_interleaved_storage_scheme& @@ -221,7 +220,8 @@ void advanced_apply( #define DECLARE_JACOBI_ADVANCED_APPLY_INSTANTIATION(ValueType, IndexType) \ void advanced_apply( \ - syn::value_list, size_type, \ + syn::value_list, \ + std::shared_ptr, size_type, \ const precision_reduction*, const IndexType* block_pointers, \ const ValueType*, \ const preconditioner::block_interleaved_storage_scheme&, \ diff --git a/dpcpp/preconditioner/jacobi_advanced_apply_kernel.dp.cpp b/dpcpp/preconditioner/jacobi_advanced_apply_kernel.dp.cpp index 1db62f49db3..d90342f7575 100644 --- a/dpcpp/preconditioner/jacobi_advanced_apply_kernel.dp.cpp +++ b/dpcpp/preconditioner/jacobi_advanced_apply_kernel.dp.cpp @@ -57,7 +57,8 @@ namespace jacobi { template void advanced_apply( - syn::value_list, size_type num_blocks, + syn::value_list, + std::shared_ptr exec, size_type num_blocks, const precision_reduction* block_precisions, const IndexType* block_pointers, const ValueType* blocks, const preconditioner::block_interleaved_storage_scheme& @@ -89,9 +90,9 @@ void apply(std::shared_ptr exec, size_type num_blocks, return max_block_size <= compiled_block_size; }, syn::value_list(), - syn::type_list<>(), num_blocks, block_precisions.get_const_data(), - block_pointers.get_const_data(), blocks.get_const_data(), - storage_scheme, alpha->get_const_values(), + syn::type_list<>(), exec, num_blocks, + block_precisions.get_const_data(), block_pointers.get_const_data(), + blocks.get_const_data(), storage_scheme, alpha->get_const_values(), b->get_const_values() + col, b->get_stride(), x->get_values() + col, x->get_stride()); } diff --git a/dpcpp/preconditioner/jacobi_generate_instantiate.inc.dp.cpp b/dpcpp/preconditioner/jacobi_generate_instantiate.inc.dp.cpp index 2a759b743d8..5d27a02c65b 100644 --- a/dpcpp/preconditioner/jacobi_generate_instantiate.inc.dp.cpp +++ b/dpcpp/preconditioner/jacobi_generate_instantiate.inc.dp.cpp @@ -33,7 +33,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/preconditioner/jacobi_kernels.hpp" -#include #include @@ -53,6 +52,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "dpcpp/base/dpct.hpp" #include "dpcpp/base/math.hpp" #include "dpcpp/components/cooperative_groups.dp.hpp" +#include "dpcpp/components/diagonal_block_manipulation.dp.hpp" #include "dpcpp/components/thread_ids.dp.hpp" #include "dpcpp/components/uninitialized_array.hpp" #include "dpcpp/components/warp_blas.dp.hpp" @@ -112,7 +112,9 @@ __dpct_inline__ bool validate_precision_reduction_feasibility( } return succeeded && block_cond >= 1.0 && - block_cond * float_traits>::eps < 1e-3; + block_cond * static_cast>( + float_traits>::eps) < + remove_complex{1e-3}; } @@ -135,7 +137,7 @@ void generate( csr::extract_transposed_diag_blocks( block, config::warp_size / subwarp_size, row_ptrs, col_idxs, values, block_ptrs, num_blocks, row, 1, - *workspace + item_ct1.get_local_id(0) * max_block_size); + *workspace + item_ct1.get_local_id(0) * max_block_size, item_ct1); const auto subwarp = group::tiled_partition(block); if (block_id < num_blocks) { const auto block_size = block_ptrs[block_id + 1] - block_ptrs[block_id]; @@ -199,7 +201,7 @@ void adaptive_generate( csr::extract_transposed_diag_blocks( block, config::warp_size / subwarp_size, row_ptrs, col_idxs, values, block_ptrs, num_blocks, row, 1, - *workspace + item_ct1.get_local_id(0) * max_block_size); + *workspace + item_ct1.get_local_id(0) * max_block_size, item_ct1); // compute inverse and figure out the correct precision const auto subwarp = group::tiled_partition(block); @@ -251,8 +253,8 @@ void adaptive_generate( // make sure all blocks in the group have the same precision const auto warp = group::tiled_partition(block); - const auto prec = - preconditioner::detail::get_optimal_storage_reduction(reduce( + const auto prec = preconditioner::detail::get_optimal_storage_reduction( + ::gko::kernels::dpcpp::reduce( warp, prec_descriptor, [](uint32 x, uint32 y) { return x & y; })); // store the block back into memory @@ -302,7 +304,7 @@ void adaptive_generate( // clang-format off -//#cmakedefine GKO_JACOBI_BLOCK_SIZE @GKO_JACOBI_BLOCK_SIZE@ +#cmakedefine GKO_JACOBI_BLOCK_SIZE @GKO_JACOBI_BLOCK_SIZE@ // clang-format on // make things easier for IDEs #ifndef GKO_JACOBI_BLOCK_SIZE @@ -313,6 +315,7 @@ void adaptive_generate( template void generate(syn::value_list, + std::shared_ptr exec, const matrix::Csr* mtx, remove_complex accuracy, ValueType* block_data, const preconditioner::block_interleaved_storage_scheme& @@ -348,6 +351,7 @@ void generate(syn::value_list, void generate( \ syn::value_list, \ + std::shared_ptr, \ const matrix::Csr*, remove_complex, \ ValueType*, \ const preconditioner::block_interleaved_storage_scheme&, \ diff --git a/dpcpp/preconditioner/jacobi_generate_kernel.dp.cpp b/dpcpp/preconditioner/jacobi_generate_kernel.dp.cpp index 22e3f587748..91e7768fd54 100644 --- a/dpcpp/preconditioner/jacobi_generate_kernel.dp.cpp +++ b/dpcpp/preconditioner/jacobi_generate_kernel.dp.cpp @@ -58,6 +58,7 @@ namespace jacobi { template void generate(syn::value_list, + std::shared_ptr exec, const matrix::Csr* mtx, remove_complex accuracy, ValueType* block_data, const preconditioner::block_interleaved_storage_scheme& @@ -88,7 +89,7 @@ void generate(std::shared_ptr exec, return max_block_size <= compiled_block_size; }, syn::value_list(), syn::type_list<>(), - system_matrix, accuracy, blocks.get_data(), storage_scheme, + exec, system_matrix, accuracy, blocks.get_data(), storage_scheme, conditioning.get_data(), block_precisions.get_data(), block_pointers.get_const_data(), num_blocks); } diff --git a/dpcpp/preconditioner/jacobi_kernels.dp.cpp 
b/dpcpp/preconditioner/jacobi_kernels.dp.cpp index 71c8101a8ee..105f1c6f96a 100644 --- a/dpcpp/preconditioner/jacobi_kernels.dp.cpp +++ b/dpcpp/preconditioner/jacobi_kernels.dp.cpp @@ -65,7 +65,7 @@ namespace { // a total of 32 warps (1024 threads) -constexpr int default_num_warps = 32; +constexpr int default_num_warps = 16; // with current architectures, at most 32 warps can be scheduled per SM (and // current GPUs have at most 84 SMs) constexpr int default_grid_size = 32 * 32 * 128; @@ -266,10 +266,6 @@ void transpose_jacobi( if (rank < block_size) { for (IndexType i = 0; i < block_size; ++i) { auto val = blocks[block_ofs + i * block_stride + rank]; - /* - DPCT1007:4: Migration of this CUDA API is not supported by the - Intel(R) DPC++ Compatibility Tool. - */ out_blocks[block_ofs + i + rank * block_stride] = conjugate ? conj(val) : val; } @@ -437,7 +433,8 @@ namespace { template void transpose_jacobi( - syn::value_list, size_type num_blocks, + syn::value_list, + std::shared_ptr exec, size_type num_blocks, const precision_reduction* block_precisions, const IndexType* block_pointers, const ValueType* blocks, const preconditioner::block_interleaved_storage_scheme& @@ -484,7 +481,7 @@ void transpose_jacobi( return max_block_size <= compiled_block_size; }, syn::value_list(), - syn::type_list<>(), num_blocks, block_precisions.get_const_data(), + syn::type_list<>(), exec, num_blocks, block_precisions.get_const_data(), block_pointers.get_const_data(), blocks.get_const_data(), storage_scheme, out_blocks.get_data()); } @@ -508,7 +505,7 @@ void conj_transpose_jacobi( return max_block_size <= compiled_block_size; }, syn::value_list(), - syn::type_list<>(), num_blocks, block_precisions.get_const_data(), + syn::type_list<>(), exec, num_blocks, block_precisions.get_const_data(), block_pointers.get_const_data(), blocks.get_const_data(), storage_scheme, out_blocks.get_data()); } diff --git a/dpcpp/preconditioner/jacobi_simple_apply_instantiate.inc.dp.cpp b/dpcpp/preconditioner/jacobi_simple_apply_instantiate.inc.dp.cpp index 9edeb6f552c..8c3787cf721 100644 --- a/dpcpp/preconditioner/jacobi_simple_apply_instantiate.inc.dp.cpp +++ b/dpcpp/preconditioner/jacobi_simple_apply_instantiate.inc.dp.cpp @@ -174,7 +174,7 @@ void adaptive_apply( // clang-format off -//#cmakedefine GKO_JACOBI_BLOCK_SIZE @GKO_JACOBI_BLOCK_SIZE@ +#cmakedefine GKO_JACOBI_BLOCK_SIZE @GKO_JACOBI_BLOCK_SIZE@ // clang-format on // make things easier for IDEs #ifndef GKO_JACOBI_BLOCK_SIZE @@ -184,7 +184,8 @@ void adaptive_apply( template -void apply(syn::value_list, size_type num_blocks, +void apply(syn::value_list, + std::shared_ptr exec, size_type num_blocks, const precision_reduction* block_precisions, const IndexType* block_pointers, const ValueType* blocks, const preconditioner::block_interleaved_storage_scheme& @@ -214,7 +215,8 @@ void apply(syn::value_list, size_type num_blocks, #define DECLARE_JACOBI_SIMPLE_APPLY_INSTANTIATION(ValueType, IndexType) \ void apply( \ - syn::value_list, size_type, \ + syn::value_list, \ + std::shared_ptr, size_type, \ const precision_reduction*, const IndexType*, const ValueType*, \ const preconditioner::block_interleaved_storage_scheme&, \ const ValueType*, size_type, ValueType*, size_type) diff --git a/dpcpp/preconditioner/jacobi_simple_apply_kernel.dp.cpp b/dpcpp/preconditioner/jacobi_simple_apply_kernel.dp.cpp index d5ee2fbf92a..651a4ba194a 100644 --- a/dpcpp/preconditioner/jacobi_simple_apply_kernel.dp.cpp +++ b/dpcpp/preconditioner/jacobi_simple_apply_kernel.dp.cpp @@ -56,7 +56,8 @@ namespace 
jacobi { template -void apply(syn::value_list, size_type num_blocks, +void apply(syn::value_list, + std::shared_ptr exec, size_type num_blocks, const precision_reduction* block_precisions, const IndexType* block_pointers, const ValueType* blocks, const preconditioner::block_interleaved_storage_scheme& @@ -85,10 +86,11 @@ void simple_apply( return max_block_size <= compiled_block_size; }, syn::value_list(), - syn::type_list<>(), num_blocks, block_precisions.get_const_data(), - block_pointers.get_const_data(), blocks.get_const_data(), - storage_scheme, b->get_const_values() + col, b->get_stride(), - x->get_values() + col, x->get_stride()); + syn::type_list<>(), exec, num_blocks, + block_precisions.get_const_data(), block_pointers.get_const_data(), + blocks.get_const_data(), storage_scheme, + b->get_const_values() + col, b->get_stride(), x->get_values() + col, + x->get_stride()); } } diff --git a/dpcpp/test/preconditioner/jacobi_kernels.cpp b/dpcpp/test/preconditioner/jacobi_kernels.cpp index 0302ae7c24e..e5ec9bc604c 100644 --- a/dpcpp/test/preconditioner/jacobi_kernels.cpp +++ b/dpcpp/test/preconditioner/jacobi_kernels.cpp @@ -45,6 +45,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/test/utils.hpp" #include "core/test/utils/unsort_matrix.hpp" +#include "dpcpp/test/utils.hpp" namespace { @@ -52,14 +53,19 @@ namespace { class Jacobi : public ::testing::Test { protected: - using Bj = gko::preconditioner::Jacobi<>; - using Mtx = gko::matrix::Csr<>; - using Vec = gko::matrix::Dense<>; - using mtx_data = gko::matrix_data<>; +#if GINKGO_DPCPP_SINGLE_MODE + using value_type = float; +#else + using value_type = double; +#endif + using Bj = gko::preconditioner::Jacobi; + using Mtx = gko::matrix::Csr; + using Vec = gko::matrix::Dense; + using mtx_data = gko::matrix_data; void SetUp() { - ASSERT_GT(gko::DpcppExecutor::get_num_devices(), 0); + ASSERT_GT(gko::DpcppExecutor::get_num_devices("all"), 0); ref = gko::ReferenceExecutor::create(); dpcpp = gko::DpcppExecutor::create(0, ref); } @@ -74,16 +80,16 @@ class Jacobi : public ::testing::Test { void initialize_data( std::initializer_list block_pointers, std::initializer_list block_precisions, - std::initializer_list condition_numbers, + std::initializer_list condition_numbers, gko::uint32 max_block_size, int min_nnz, int max_nnz, int num_rhs = 1, - double accuracy = 0.1, bool skip_sorting = true) + value_type accuracy = 0.1, bool skip_sorting = true) { std::ranlux48 engine(42); const auto dim = *(end(block_pointers) - 1); if (condition_numbers.size() == 0) { mtx = gko::test::generate_random_matrix( dim, dim, std::uniform_int_distribution<>(min_nnz, max_nnz), - std::normal_distribution<>(0.0, 1.0), engine, ref); + std::normal_distribution(0.0, 1.0), engine, ref); } else { std::vector blocks; for (gko::size_type i = 0; i < block_pointers.size() - 1; ++i) { @@ -91,7 +97,8 @@ class Jacobi : public ::testing::Test { begin(block_pointers)[i + 1] - begin(block_pointers)[i]; const auto cond = begin(condition_numbers)[i]; blocks.push_back(mtx_data::cond( - size, cond, std::normal_distribution<>(-1, 1), engine)); + size, cond, std::normal_distribution(-1, 1), + engine)); } mtx = Mtx::create(ref); mtx->read(mtx_data::diag(begin(blocks), end(blocks))); @@ -127,11 +134,11 @@ class Jacobi : public ::testing::Test { } b = gko::test::generate_random_matrix( dim, num_rhs, std::uniform_int_distribution<>(num_rhs, num_rhs), - std::normal_distribution<>(0.0, 1.0), engine, ref); + std::normal_distribution(0.0, 1.0), engine, ref); d_b = 
gko::clone(dpcpp, b); x = gko::test::generate_random_matrix( dim, num_rhs, std::uniform_int_distribution<>(num_rhs, num_rhs), - std::normal_distribution<>(0.0, 1.0), engine, ref); + std::normal_distribution(0.0, 1.0), engine, ref); d_x = gko::clone(dpcpp, x); } @@ -222,7 +229,7 @@ TEST_F(Jacobi, DpcppFindNaturalBlocksInLargeMatrixEquivalentToRef) 1 1 1 1 */ - using data = gko::matrix_data; + using data = gko::matrix_data; auto mtx = share(Mtx::create(ref)); mtx->read(data::diag({550, 550}, {{1.0, 1.0, 0.0, 0.0, 0.0, 0.0}, {1.0, 1.0, 0.0, 0.0, 0.0, 0.0}, @@ -249,7 +256,7 @@ TEST_F(Jacobi, 1 1 1 */ - using data = gko::matrix_data; + using data = gko::matrix_data; auto mtx = share(Mtx::create(ref)); mtx->read(data::diag({550, 550}, {{1.0, 1.0, 0.0, 0.0, 0.0}, {1.0, 1.0, 0.0, 0.0, 0.0}, @@ -275,7 +282,7 @@ TEST_F(Jacobi, 1 1 1 1 1 */ - using data = gko::matrix_data; + using data = gko::matrix_data; auto mtx = share(Mtx::create(ref)); mtx->read({{50, 50}, {{1.0, 1.0, 0.0, 1.0, 0.0}, @@ -300,7 +307,8 @@ TEST_F(Jacobi, DpcppPreconditionerEquivalentToRefWithBlockSize32Sorted) auto bj = bj_factory->generate(mtx); auto d_bj = d_bj_factory->generate(mtx); - GKO_ASSERT_MTX_NEAR(gko::as(d_bj.get()), gko::as(bj.get()), 1e-13); + GKO_ASSERT_MTX_NEAR(gko::as(d_bj.get()), gko::as(bj.get()), + 50 * r::value); } @@ -313,7 +321,8 @@ TEST_F(Jacobi, DpcppPreconditionerEquivalentToRefWithBlockSize32Unsorted) auto bj = bj_factory->generate(mtx); auto d_bj = d_bj_factory->generate(mtx); - GKO_ASSERT_MTX_NEAR(gko::as(d_bj.get()), gko::as(bj.get()), 1e-13); + GKO_ASSERT_MTX_NEAR(gko::as(d_bj.get()), gko::as(bj.get()), + 50 * r::value); } @@ -325,7 +334,8 @@ TEST_F(Jacobi, DpcppPreconditionerEquivalentToRefWithDifferentBlockSize) auto bj = bj_factory->generate(mtx); auto d_bj = d_bj_factory->generate(mtx); - GKO_ASSERT_MTX_NEAR(gko::as(d_bj.get()), gko::as(bj.get()), 1e-13); + GKO_ASSERT_MTX_NEAR(gko::as(d_bj.get()), gko::as(bj.get()), + 100 * r::value); } @@ -337,7 +347,8 @@ TEST_F(Jacobi, DpcppPreconditionerEquivalentToRefWithMPW) auto bj = bj_factory->generate(mtx); auto d_bj = d_bj_factory->generate(mtx); - GKO_ASSERT_MTX_NEAR(gko::as(d_bj.get()), gko::as(bj.get()), 1e-13); + GKO_ASSERT_MTX_NEAR(gko::as(d_bj.get()), gko::as(bj.get()), + 100 * r::value); } @@ -378,7 +389,7 @@ TEST_F(Jacobi, DpcppApplyEquivalentToRefWithBlockSize32) bj->apply(b.get(), x.get()); d_bj->apply(d_b.get(), d_x.get()); - GKO_ASSERT_MTX_NEAR(d_x, x, 1e-12); + GKO_ASSERT_MTX_NEAR(d_x, x, 100 * r::value); } @@ -392,7 +403,7 @@ TEST_F(Jacobi, DpcppApplyEquivalentToRefWithDifferentBlockSize) bj->apply(b.get(), x.get()); d_bj->apply(d_b.get(), d_x.get()); - GKO_ASSERT_MTX_NEAR(d_x, x, 1e-12); + GKO_ASSERT_MTX_NEAR(d_x, x, 100 * r::value); } @@ -406,7 +417,7 @@ TEST_F(Jacobi, DpcppApplyEquivalentToRef) bj->apply(b.get(), x.get()); d_bj->apply(d_b.get(), d_x.get()); - GKO_ASSERT_MTX_NEAR(d_x, x, 1e-12); + GKO_ASSERT_MTX_NEAR(d_x, x, 100 * r::value); } @@ -416,13 +427,13 @@ TEST_F(Jacobi, DpcppScalarApplyEquivalentToRef) std::ranlux48 engine(42); auto dense_smtx = gko::share(gko::test::generate_random_matrix( dim, dim, std::uniform_int_distribution<>(1, dim), - std::normal_distribution<>(1.0, 2.0), engine, ref)); + std::normal_distribution(1.0, 2.0), engine, ref)); gko::test::make_diag_dominant(dense_smtx.get()); auto smtx = gko::share(Mtx::create(ref)); smtx->copy_from(dense_smtx.get()); auto sb = gko::share(gko::test::generate_random_matrix( dim, 3, std::uniform_int_distribution<>(1, 1), - std::normal_distribution<>(0.0, 1.0), engine, ref)); + 
std::normal_distribution(0.0, 1.0), engine, ref)); auto sx = Vec::create(ref, sb->get_size()); auto d_smtx = gko::share(Mtx::create(dpcpp)); @@ -437,7 +448,7 @@ TEST_F(Jacobi, DpcppScalarApplyEquivalentToRef) sj->apply(sb.get(), sx.get()); d_sj->apply(d_sb.get(), d_sx.get()); - GKO_ASSERT_MTX_NEAR(sx.get(), d_sx.get(), 1e-12); + GKO_ASSERT_MTX_NEAR(sx.get(), d_sx.get(), 100 * r::value); } @@ -455,7 +466,7 @@ TEST_F(Jacobi, DpcppLinearCombinationApplyEquivalentToRef) bj->apply(alpha.get(), b.get(), beta.get(), x.get()); d_bj->apply(d_alpha.get(), d_b.get(), d_beta.get(), d_x.get()); - GKO_ASSERT_MTX_NEAR(d_x, x, 1e-12); + GKO_ASSERT_MTX_NEAR(d_x, x, 100 * r::value); } @@ -465,18 +476,18 @@ TEST_F(Jacobi, DpcppScalarLinearCombinationApplyEquivalentToRef) std::ranlux48 engine(42); auto dense_smtx = gko::share(gko::test::generate_random_matrix( dim, dim, std::uniform_int_distribution<>(1, dim), - std::normal_distribution<>(1.0, 2.0), engine, ref)); + std::normal_distribution(1.0, 2.0), engine, ref)); gko::test::make_diag_dominant(dense_smtx.get()); auto smtx = gko::share(Mtx::create(ref)); smtx->copy_from(dense_smtx.get()); auto sb = gko::share(gko::test::generate_random_matrix( dim, 3, std::uniform_int_distribution<>(1, 1), - std::normal_distribution<>(0.0, 1.0), engine, ref, gko::dim<2>(dim, 3), - 4)); + std::normal_distribution(0.0, 1.0), engine, ref, + gko::dim<2>(dim, 3), 4)); auto sx = gko::share(gko::test::generate_random_matrix( dim, 3, std::uniform_int_distribution<>(1, 1), - std::normal_distribution<>(0.0, 1.0), engine, ref, gko::dim<2>(dim, 3), - 4)); + std::normal_distribution(0.0, 1.0), engine, ref, + gko::dim<2>(dim, 3), 4)); auto d_smtx = gko::share(gko::clone(dpcpp, smtx)); auto d_sb = gko::share(gko::clone(dpcpp, sb)); @@ -492,7 +503,7 @@ TEST_F(Jacobi, DpcppScalarLinearCombinationApplyEquivalentToRef) sj->apply(alpha.get(), sb.get(), beta.get(), sx.get()); d_sj->apply(d_alpha.get(), d_sb.get(), d_beta.get(), d_sx.get()); - GKO_ASSERT_MTX_NEAR(sx.get(), d_sx.get(), 1e-12); + GKO_ASSERT_MTX_NEAR(sx.get(), d_sx.get(), 100 * r::value); } @@ -506,7 +517,7 @@ TEST_F(Jacobi, DpcppApplyToMultipleVectorsEquivalentToRef) bj->apply(b.get(), x.get()); d_bj->apply(d_b.get(), d_x.get()); - GKO_ASSERT_MTX_NEAR(d_x, x, 1e-12); + GKO_ASSERT_MTX_NEAR(d_x, x, 100 * r::value); } @@ -524,12 +535,13 @@ TEST_F(Jacobi, DpcppLinearCombinationApplyToMultipleVectorsEquivalentToRef) bj->apply(alpha.get(), b.get(), beta.get(), x.get()); d_bj->apply(d_alpha.get(), d_b.get(), d_beta.get(), d_x.get()); - GKO_ASSERT_MTX_NEAR(d_x, x, 1e-12); + GKO_ASSERT_MTX_NEAR(d_x, x, 100 * r::value); } TEST_F(Jacobi, ComputesTheSameConditionNumberAsRef) { + SKIP_IF_SINGLE_MODE; initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, {dp, dp, dp, dp, dp, dp, dp, dp, dp, dp}, {}, 13, 97, 99); @@ -548,7 +560,7 @@ TEST_F(Jacobi, SelectsTheSamePrecisionsAsRef) initialize_data( {0, 2, 14, 27, 40, 51, 61, 70, 80, 92, 100}, {ap, ap, ap, ap, ap, ap, ap, ap, ap, ap}, - {1e+0, 1e+0, 1e+2, 1e+3, 1e+4, 1e+4, 1e+6, 1e+7, 1e+8, 1e+9}, 13, 97, + {1e+0, 1e+0, 1e+2, 1e+3, 1e+4, 1e+4, 1e+6, 1e+7, 1e+8, 1e+9}, 32, 97, 99, 1, 0.2); auto bj = bj_factory->generate(mtx); @@ -566,7 +578,7 @@ TEST_F(Jacobi, SelectsTheSamePrecisionsAsRef) TEST_F(Jacobi, AvoidsPrecisionsThatOverflow) { - auto mtx = gko::matrix::Csr<>::create(dpcpp); + auto mtx = gko::matrix::Csr::create(dpcpp); // clang-format off mtx->read(mtx_data::diag({ // perfectly conditioned block, small value difference, @@ -582,26 +594,31 @@ TEST_F(Jacobi, 
AvoidsPrecisionsThatOverflow) auto bj = Bj::build() - .with_max_block_size(13u) + .with_max_block_size(32u) .with_block_pointers(gko::Array(dpcpp, {0, 2, 4})) .with_storage_optimization(gko::precision_reduction::autodetect()) - .with_accuracy(0.1) + .with_accuracy(value_type{0.1}) .on(dpcpp) ->generate(give(mtx)); - // both blocks are in the same group, both need (7, 8) auto h_bj = clone(ref, bj); auto prec = h_bj->get_parameters().storage_optimization.block_wise.get_const_data(); - EXPECT_EQ(prec[0], gko::precision_reduction(1, 1)); - ASSERT_EQ(prec[1], gko::precision_reduction(1, 1)); + EXPECT_EQ(prec[0], gko::precision_reduction(0, 2)); +// 2 - 0 is same as 1 - 1 when it is float +#if GINKGO_DPCPP_SINGLE_MODE + ASSERT_EQ(prec[1], gko::precision_reduction(2, 0)); +#else + ASSERT_EQ(prec[1], gko::precision_reduction(2, 0)); +#endif } TEST_F(Jacobi, DpcppPreconditionerEquivalentToRefWithFullPrecision) { + SKIP_IF_SINGLE_MODE; initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, - {dp, dp, dp, dp, dp, dp, dp, dp, dp, dp}, {}, 13, 97, 99); + {dp, dp, dp, dp, dp, dp, dp, dp, dp, dp}, {}, 32, 97, 99); auto bj = bj_factory->generate(mtx); auto d_bj = d_bj_factory->generate(mtx); @@ -619,7 +636,7 @@ TEST_F(Jacobi, DpcppPreconditionerEquivalentToRefWithReducedPrecision) auto bj = bj_factory->generate(mtx); auto d_bj = d_bj_factory->generate(mtx); - GKO_ASSERT_MTX_NEAR(lend(d_bj), lend(bj), 1e-7); + GKO_ASSERT_MTX_NEAR(lend(d_bj), lend(bj), 2e-7); } @@ -664,6 +681,7 @@ TEST_F(Jacobi, DpcppPreconditionerEquivalentToRefWithCustomQuarteredPrecision) TEST_F(Jacobi, DpcppPreconditionerEquivalentToRefWithAdaptivePrecision) { + SKIP_IF_SINGLE_MODE; initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, {sp, sp, dp, dp, tp, tp, qp, qp, hp, dp, up}, {}, 13, 97, 99); @@ -678,6 +696,7 @@ TEST_F(Jacobi, DpcppPreconditionerEquivalentToRefWithAdaptivePrecision) TEST_F(Jacobi, DpcppTransposedPreconditionerEquivalentToRefWithAdaptivePrecision) { + SKIP_IF_SINGLE_MODE; initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, {sp, sp, dp, dp, tp, tp, qp, qp, hp, dp, up}, {}, 13, 97, 99); @@ -694,6 +713,7 @@ TEST_F(Jacobi, TEST_F(Jacobi, DpcppConjTransposedPreconditionerEquivalentToRefWithAdaptivePrecision) { + SKIP_IF_SINGLE_MODE; initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, {sp, sp, dp, dp, tp, tp, qp, qp, hp, dp, up}, {}, 13, 97, 99); @@ -709,6 +729,7 @@ TEST_F(Jacobi, TEST_F(Jacobi, DpcppApplyEquivalentToRefWithFullPrecision) { + SKIP_IF_SINGLE_MODE; initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, {dp, dp, dp, dp, dp, dp, dp, dp, dp, dp, dp}, {}, 13, 97, 99); @@ -724,6 +745,7 @@ TEST_F(Jacobi, DpcppApplyEquivalentToRefWithFullPrecision) TEST_F(Jacobi, DpcppApplyEquivalentToRefWithReducedPrecision) { + SKIP_IF_SINGLE_MODE; initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, {sp, sp, sp, sp, sp, sp, sp, sp, sp, sp, sp}, {}, 13, 97, 99); @@ -739,6 +761,7 @@ TEST_F(Jacobi, DpcppApplyEquivalentToRefWithReducedPrecision) TEST_F(Jacobi, DpcppApplyEquivalentToRefWithCustomReducedPrecision) { + SKIP_IF_SINGLE_MODE; initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, {tp, tp, tp, tp, tp, tp, tp, tp, tp, tp, tp}, {}, 13, 97, 99); @@ -799,6 +822,7 @@ TEST_F(Jacobi, DpcppApplyEquivalentToRefWithCustomQuarteredPrecision) TEST_F(Jacobi, DpcppApplyEquivalentToRefWithAdaptivePrecision) { + SKIP_IF_SINGLE_MODE; initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, {sp, sp, dp, dp, tp, tp, qp, qp, hp, dp, up}, {}, 13, 97, 99); @@ -814,6 +838,7 @@ 
TEST_F(Jacobi, DpcppApplyEquivalentToRefWithAdaptivePrecision) TEST_F(Jacobi, DpcppLinearCombinationApplyEquivalentToRefWithAdaptivePrecision) { + SKIP_IF_SINGLE_MODE; initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, {sp, dp, dp, sp, sp, sp, dp, dp, sp, dp, sp}, {}, 13, 97, 99); @@ -833,6 +858,7 @@ TEST_F(Jacobi, DpcppLinearCombinationApplyEquivalentToRefWithAdaptivePrecision) TEST_F(Jacobi, DpcppApplyToMultipleVectorsEquivalentToRefWithFullPrecision) { + SKIP_IF_SINGLE_MODE; initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, {dp, dp, dp, dp, dp, dp, dp, dp, dp, dp, dp}, {}, 13, 97, 99, 5); @@ -863,6 +889,7 @@ TEST_F(Jacobi, DpcppApplyToMultipleVectorsEquivalentToRefWithReducedPrecision) TEST_F(Jacobi, DpcppApplyToMultipleVectorsEquivalentToRefWithAdaptivePrecision) { + SKIP_IF_SINGLE_MODE; initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, {sp, sp, dp, dp, tp, tp, qp, qp, hp, dp, up}, {}, 13, 97, 99, 5); @@ -880,6 +907,7 @@ TEST_F( Jacobi, DpcppLinearCombinationApplyToMultipleVectorsEquivalentToRefWithAdaptivePrecision) { + SKIP_IF_SINGLE_MODE; initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, {sp, dp, dp, sp, sp, sp, dp, dp, sp, dp, sp}, {}, 13, 97, 99, 5); diff --git a/omp/preconditioner/jacobi_kernels.cpp b/omp/preconditioner/jacobi_kernels.cpp index 074168047d3..2966a70eb78 100644 --- a/omp/preconditioner/jacobi_kernels.cpp +++ b/omp/preconditioner/jacobi_kernels.cpp @@ -348,7 +348,9 @@ inline bool validate_precision_reduction_feasibility(IndexType block_size, } cond *= compute_inf_norm(block_size, block_size, tmp_buffer, block_size); return cond >= 1.0 && - cond * float_traits>::eps < 1e-3; + cond * static_cast>( + float_traits>::eps) < + remove_complex{1e-3}; } diff --git a/reference/preconditioner/jacobi_kernels.cpp b/reference/preconditioner/jacobi_kernels.cpp index a2a63a0e896..ccbe1653990 100644 --- a/reference/preconditioner/jacobi_kernels.cpp +++ b/reference/preconditioner/jacobi_kernels.cpp @@ -332,7 +332,9 @@ inline bool validate_precision_reduction_feasibility( } cond *= compute_inf_norm(block_size, block_size, tmp.data(), block_size); return cond >= 1.0 && - cond * float_traits>::eps < 1e-3; + cond * static_cast>( + float_traits>::eps) < + remove_complex{1e-3}; } From 3e15d98411741bb61f0b6ba3e45cb5d49545eb16 Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. Tsai" Date: Mon, 4 Jul 2022 09:47:35 +0200 Subject: [PATCH 05/11] remove dpcpp math --- dpcpp/components/warp_blas.dp.hpp | 1 - .../preconditioner/jacobi_advanced_apply_instantiate.inc.dp.cpp | 1 - dpcpp/preconditioner/jacobi_generate_instantiate.inc.dp.cpp | 1 - dpcpp/preconditioner/jacobi_kernels.dp.cpp | 1 - dpcpp/preconditioner/jacobi_simple_apply_instantiate.inc.dp.cpp | 1 - 5 files changed, 5 deletions(-) diff --git a/dpcpp/components/warp_blas.dp.hpp b/dpcpp/components/warp_blas.dp.hpp index b31fd0ef4fd..756421c54f8 100644 --- a/dpcpp/components/warp_blas.dp.hpp +++ b/dpcpp/components/warp_blas.dp.hpp @@ -35,7 +35,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include -// #include #include diff --git a/dpcpp/preconditioner/jacobi_advanced_apply_instantiate.inc.dp.cpp b/dpcpp/preconditioner/jacobi_advanced_apply_instantiate.inc.dp.cpp index ffaef7e7650..9df2ca3741a 100644 --- a/dpcpp/preconditioner/jacobi_advanced_apply_instantiate.inc.dp.cpp +++ b/dpcpp/preconditioner/jacobi_advanced_apply_instantiate.inc.dp.cpp @@ -48,7 +48,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "core/synthesizer/implementation_selection.hpp" #include "dpcpp/base/config.hpp" #include "dpcpp/base/dim3.dp.hpp" -#include "dpcpp/base/math.hpp" #include "dpcpp/components/cooperative_groups.dp.hpp" #include "dpcpp/components/thread_ids.dp.hpp" #include "dpcpp/components/warp_blas.dp.hpp" diff --git a/dpcpp/preconditioner/jacobi_generate_instantiate.inc.dp.cpp b/dpcpp/preconditioner/jacobi_generate_instantiate.inc.dp.cpp index 5d27a02c65b..a8dcba503f1 100644 --- a/dpcpp/preconditioner/jacobi_generate_instantiate.inc.dp.cpp +++ b/dpcpp/preconditioner/jacobi_generate_instantiate.inc.dp.cpp @@ -50,7 +50,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "dpcpp/base/config.hpp" #include "dpcpp/base/dim3.dp.hpp" #include "dpcpp/base/dpct.hpp" -#include "dpcpp/base/math.hpp" #include "dpcpp/components/cooperative_groups.dp.hpp" #include "dpcpp/components/diagonal_block_manipulation.dp.hpp" #include "dpcpp/components/thread_ids.dp.hpp" diff --git a/dpcpp/preconditioner/jacobi_kernels.dp.cpp b/dpcpp/preconditioner/jacobi_kernels.dp.cpp index 105f1c6f96a..15b0ab51085 100644 --- a/dpcpp/preconditioner/jacobi_kernels.dp.cpp +++ b/dpcpp/preconditioner/jacobi_kernels.dp.cpp @@ -47,7 +47,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/synthesizer/implementation_selection.hpp" #include "dpcpp/base/config.hpp" #include "dpcpp/base/dim3.dp.hpp" -#include "dpcpp/base/math.hpp" #include "dpcpp/components/cooperative_groups.dp.hpp" #include "dpcpp/components/thread_ids.dp.hpp" diff --git a/dpcpp/preconditioner/jacobi_simple_apply_instantiate.inc.dp.cpp b/dpcpp/preconditioner/jacobi_simple_apply_instantiate.inc.dp.cpp index 8c3787cf721..92fd39c2f7e 100644 --- a/dpcpp/preconditioner/jacobi_simple_apply_instantiate.inc.dp.cpp +++ b/dpcpp/preconditioner/jacobi_simple_apply_instantiate.inc.dp.cpp @@ -48,7 +48,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/synthesizer/implementation_selection.hpp" #include "dpcpp/base/config.hpp" #include "dpcpp/base/dim3.dp.hpp" -#include "dpcpp/base/math.hpp" #include "dpcpp/components/cooperative_groups.dp.hpp" #include "dpcpp/components/thread_ids.dp.hpp" #include "dpcpp/components/warp_blas.dp.hpp" From 56956d4e7cc879b91ebb00f40b8e887b1a7e9db5 Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. 
Tsai" Date: Mon, 4 Jul 2022 17:58:59 +0200 Subject: [PATCH 06/11] fix Array and UninitializedArray naming --- .../jacobi_advanced_apply_kernel.dp.cpp | 6 +++--- .../jacobi_generate_instantiate.inc.dp.cpp | 8 ++++---- .../jacobi_generate_kernel.dp.cpp | 6 +++--- dpcpp/preconditioner/jacobi_kernels.dp.cpp | 20 +++++++++---------- .../jacobi_simple_apply_kernel.dp.cpp | 4 ++-- dpcpp/test/preconditioner/jacobi_kernels.cpp | 6 +++--- 6 files changed, 25 insertions(+), 25 deletions(-) diff --git a/dpcpp/preconditioner/jacobi_advanced_apply_kernel.dp.cpp b/dpcpp/preconditioner/jacobi_advanced_apply_kernel.dp.cpp index d90342f7575..ece82241ac8 100644 --- a/dpcpp/preconditioner/jacobi_advanced_apply_kernel.dp.cpp +++ b/dpcpp/preconditioner/jacobi_advanced_apply_kernel.dp.cpp @@ -74,9 +74,9 @@ void apply(std::shared_ptr exec, size_type num_blocks, uint32 max_block_size, const preconditioner::block_interleaved_storage_scheme& storage_scheme, - const Array& block_precisions, - const Array& block_pointers, - const Array& blocks, + const array& block_precisions, + const array& block_pointers, + const array& blocks, const matrix::Dense* alpha, const matrix::Dense* b, const matrix::Dense* beta, matrix::Dense* x) diff --git a/dpcpp/preconditioner/jacobi_generate_instantiate.inc.dp.cpp b/dpcpp/preconditioner/jacobi_generate_instantiate.inc.dp.cpp index a8dcba503f1..4a0c03432aa 100644 --- a/dpcpp/preconditioner/jacobi_generate_instantiate.inc.dp.cpp +++ b/dpcpp/preconditioner/jacobi_generate_instantiate.inc.dp.cpp @@ -126,7 +126,7 @@ void generate( preconditioner::block_interleaved_storage_scheme storage_scheme, const IndexType* __restrict__ block_ptrs, size_type num_blocks, sycl::nd_item<3> item_ct1, - UninitializedArray* workspace) + uninitialized_array* workspace) { const auto block_id = thread::get_subwarp_id(item_ct1); @@ -162,7 +162,7 @@ void generate( { queue->submit([&](sycl::handler& cgh) { sycl::accessor< - UninitializedArray, 0, + uninitialized_array, 0, sycl::access_mode::read_write, sycl::access::target::local> workspace_acc_ct1(cgh); @@ -189,7 +189,7 @@ void adaptive_generate( precision_reduction* __restrict__ block_precisions, const IndexType* __restrict__ block_ptrs, size_type num_blocks, sycl::nd_item<3> item_ct1, - UninitializedArray* workspace) + uninitialized_array* workspace) { // extract blocks const auto block_id = @@ -284,7 +284,7 @@ void adaptive_generate( { queue->submit([&](sycl::handler& cgh) { sycl::accessor< - UninitializedArray, 0, + uninitialized_array, 0, sycl::access_mode::read_write, sycl::access::target::local> workspace_acc_ct1(cgh); diff --git a/dpcpp/preconditioner/jacobi_generate_kernel.dp.cpp b/dpcpp/preconditioner/jacobi_generate_kernel.dp.cpp index 91e7768fd54..fdd97a50749 100644 --- a/dpcpp/preconditioner/jacobi_generate_kernel.dp.cpp +++ b/dpcpp/preconditioner/jacobi_generate_kernel.dp.cpp @@ -77,9 +77,9 @@ void generate(std::shared_ptr exec, remove_complex accuracy, const preconditioner::block_interleaved_storage_scheme& storage_scheme, - Array>& conditioning, - Array& block_precisions, - const Array& block_pointers, Array& blocks) + array>& conditioning, + array& block_precisions, + const array& block_pointers, array& blocks) { components::fill_array(exec, blocks.get_data(), blocks.get_num_elems(), zero()); diff --git a/dpcpp/preconditioner/jacobi_kernels.dp.cpp b/dpcpp/preconditioner/jacobi_kernels.dp.cpp index 15b0ab51085..a6e35da67db 100644 --- a/dpcpp/preconditioner/jacobi_kernels.dp.cpp +++ b/dpcpp/preconditioner/jacobi_kernels.dp.cpp @@ -357,9 +357,9 @@ 
size_type find_natural_blocks(std::shared_ptr exec, int32 max_block_size, IndexType* __restrict__ block_ptrs) { - Array nums(exec, 1); + array nums(exec, 1); - Array matching_next_row(exec, mtx->get_size()[0] - 1); + array matching_next_row(exec, mtx->get_size()[0] - 1); const dim3 block_size(config::warp_size, 1, 1); const dim3 grid_size( @@ -381,7 +381,7 @@ inline size_type agglomerate_supervariables( std::shared_ptr exec, int32 max_block_size, size_type num_natural_blocks, IndexType* block_ptrs) { - Array nums(exec, 1); + array nums(exec, 1); agglomerate_supervariables_kernel(1, 1, 0, exec->get_queue(), max_block_size, num_natural_blocks, @@ -396,8 +396,8 @@ inline size_type agglomerate_supervariables( void initialize_precisions(std::shared_ptr exec, - const Array& source, - Array& precisions) + const array& source, + array& precisions) { const auto block_size = default_num_warps * config::warp_size; const auto grid_size = min( @@ -414,7 +414,7 @@ template void find_blocks(std::shared_ptr exec, const matrix::Csr* system_matrix, uint32 max_block_size, size_type& num_blocks, - Array& block_pointers) + array& block_pointers) { auto num_natural_blocks = find_natural_blocks( exec, system_matrix, max_block_size, block_pointers.get_data()); @@ -472,7 +472,7 @@ void transpose_jacobi( const array& block_pointers, const array& blocks, const preconditioner::block_interleaved_storage_scheme& storage_scheme, - Array& out_blocks) + array& out_blocks) { select_transpose_jacobi( compiled_kernels(), @@ -496,7 +496,7 @@ void conj_transpose_jacobi( const array& block_pointers, const array& blocks, const preconditioner::block_interleaved_storage_scheme& storage_scheme, - Array& out_blocks) + array& out_blocks) { select_transpose_jacobi( compiled_kernels(), @@ -516,8 +516,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( template void convert_to_dense( std::shared_ptr exec, size_type num_blocks, - const Array& block_precisions, - const Array& block_pointers, const Array& blocks, + const array& block_precisions, + const array& block_pointers, const array& blocks, const preconditioner::block_interleaved_storage_scheme& storage_scheme, ValueType* result_values, size_type result_stride) GKO_NOT_IMPLEMENTED; diff --git a/dpcpp/preconditioner/jacobi_simple_apply_kernel.dp.cpp b/dpcpp/preconditioner/jacobi_simple_apply_kernel.dp.cpp index 651a4ba194a..ff3be086a91 100644 --- a/dpcpp/preconditioner/jacobi_simple_apply_kernel.dp.cpp +++ b/dpcpp/preconditioner/jacobi_simple_apply_kernel.dp.cpp @@ -74,8 +74,8 @@ void simple_apply( uint32 max_block_size, const preconditioner::block_interleaved_storage_scheme& storage_scheme, - const Array& block_precisions, - const Array& block_pointers, const Array& blocks, + const array& block_precisions, + const array& block_pointers, const array& blocks, const matrix::Dense* b, matrix::Dense* x) { // TODO: write a special kernel for multiple RHS diff --git a/dpcpp/test/preconditioner/jacobi_kernels.cpp b/dpcpp/test/preconditioner/jacobi_kernels.cpp index e5ec9bc604c..8894b26a346 100644 --- a/dpcpp/test/preconditioner/jacobi_kernels.cpp +++ b/dpcpp/test/preconditioner/jacobi_kernels.cpp @@ -103,8 +103,8 @@ class Jacobi : public ::testing::Test { mtx = Mtx::create(ref); mtx->read(mtx_data::diag(begin(blocks), end(blocks))); } - gko::Array block_ptrs(ref, block_pointers); - gko::Array block_prec(ref, block_precisions); + gko::array block_ptrs(ref, block_pointers); + gko::array block_prec(ref, block_precisions); if (block_prec.get_num_elems() == 0) { bj_factory = Bj::build() 
.with_max_block_size(max_block_size) @@ -595,7 +595,7 @@ TEST_F(Jacobi, AvoidsPrecisionsThatOverflow) auto bj = Bj::build() .with_max_block_size(32u) - .with_block_pointers(gko::Array(dpcpp, {0, 2, 4})) + .with_block_pointers(gko::array(dpcpp, {0, 2, 4})) .with_storage_optimization(gko::precision_reduction::autodetect()) .with_accuracy(value_type{0.1}) .on(dpcpp) From f029c517b403e4b72c601db97953b9f6529905f5 Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. Tsai" Date: Mon, 11 Jul 2022 16:23:26 +0200 Subject: [PATCH 07/11] update subgroup setting --- .../diagonal_block_manipulation.dp.hpp | 2 +- ...cobi_advanced_apply_instantiate.inc.dp.cpp | 19 ++++---- .../jacobi_advanced_apply_kernel.dp.cpp | 2 +- .../jacobi_generate_instantiate.inc.dp.cpp | 24 ++++++---- .../jacobi_generate_kernel.dp.cpp | 2 +- dpcpp/preconditioner/jacobi_kernels.dp.cpp | 14 +++--- ...jacobi_simple_apply_instantiate.inc.dp.cpp | 8 ++-- .../jacobi_simple_apply_kernel.dp.cpp | 2 +- dpcpp/test/preconditioner/jacobi_kernels.cpp | 46 +++++++++++-------- 9 files changed, 70 insertions(+), 49 deletions(-) diff --git a/dpcpp/components/diagonal_block_manipulation.dp.hpp b/dpcpp/components/diagonal_block_manipulation.dp.hpp index 32e3e26739c..4d684f2c357 100644 --- a/dpcpp/components/diagonal_block_manipulation.dp.hpp +++ b/dpcpp/components/diagonal_block_manipulation.dp.hpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2021, the Ginkgo authors +Copyright (c) 2017-2022, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/dpcpp/preconditioner/jacobi_advanced_apply_instantiate.inc.dp.cpp b/dpcpp/preconditioner/jacobi_advanced_apply_instantiate.inc.dp.cpp index 9df2ca3741a..2235d886234 100644 --- a/dpcpp/preconditioner/jacobi_advanced_apply_instantiate.inc.dp.cpp +++ b/dpcpp/preconditioner/jacobi_advanced_apply_instantiate.inc.dp.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2021, the Ginkgo authors +Copyright (c) 2017-2022, the Ginkgo authors All rights reserved. 
Redistribution and use in source and binary forms, with or without @@ -107,7 +107,8 @@ void advanced_apply( const ValueType* b, int32 b_stride, ValueType* x, int32 x_stride) { queue->parallel_for( - sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + sycl_nd_range(grid, block), [= + ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(subwarp_size)]] { advanced_apply( blocks, storage_scheme, block_ptrs, num_blocks, alpha, b, b_stride, x, x_stride, item_ct1); @@ -163,12 +164,14 @@ void advanced_adaptive_apply( size_type num_blocks, const ValueType* alpha, const ValueType* b, int32 b_stride, ValueType* x, int32 x_stride) { - queue->parallel_for(sycl_nd_range(grid, block), [=](sycl::nd_item<3> - item_ct1) { - advanced_adaptive_apply( - blocks, storage_scheme, block_precisions, block_ptrs, num_blocks, - alpha, b, b_stride, x, x_stride, item_ct1); - }); + queue->parallel_for( + sycl_nd_range(grid, block), [= + ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(subwarp_size)]] { + advanced_adaptive_apply( + blocks, storage_scheme, block_precisions, block_ptrs, + num_blocks, alpha, b, b_stride, x, x_stride, item_ct1); + }); } diff --git a/dpcpp/preconditioner/jacobi_advanced_apply_kernel.dp.cpp b/dpcpp/preconditioner/jacobi_advanced_apply_kernel.dp.cpp index ece82241ac8..93565a3f16f 100644 --- a/dpcpp/preconditioner/jacobi_advanced_apply_kernel.dp.cpp +++ b/dpcpp/preconditioner/jacobi_advanced_apply_kernel.dp.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2021, the Ginkgo authors +Copyright (c) 2017-2022, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/dpcpp/preconditioner/jacobi_generate_instantiate.inc.dp.cpp b/dpcpp/preconditioner/jacobi_generate_instantiate.inc.dp.cpp index 4a0c03432aa..eab5ec118e4 100644 --- a/dpcpp/preconditioner/jacobi_generate_instantiate.inc.dp.cpp +++ b/dpcpp/preconditioner/jacobi_generate_instantiate.inc.dp.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2021, the Ginkgo authors +Copyright (c) 2017-2022, the Ginkgo authors All rights reserved. 
Redistribution and use in source and binary forms, with or without @@ -167,7 +167,9 @@ void generate( workspace_acc_ct1(cgh); cgh.parallel_for( - sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + sycl_nd_range(grid, block), [= + ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size( + subwarp_size)]] { generate( num_rows, row_ptrs, col_idxs, values, block_data, storage_scheme, block_ptrs, num_blocks, item_ct1, @@ -288,13 +290,17 @@ void adaptive_generate( sycl::access_mode::read_write, sycl::access::target::local> workspace_acc_ct1(cgh); - cgh.parallel_for(sycl_nd_range(grid, block), [=](sycl::nd_item<3> - item_ct1) { - adaptive_generate( - num_rows, row_ptrs, col_idxs, values, accuracy, block_data, - storage_scheme, conditioning, block_precisions, block_ptrs, - num_blocks, item_ct1, workspace_acc_ct1.get_pointer().get()); - }); + cgh.parallel_for( + sycl_nd_range(grid, block), [= + ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size( + subwarp_size)]] { + adaptive_generate( + num_rows, row_ptrs, col_idxs, values, accuracy, block_data, + storage_scheme, conditioning, block_precisions, block_ptrs, + num_blocks, item_ct1, + workspace_acc_ct1.get_pointer().get()); + }); }); } diff --git a/dpcpp/preconditioner/jacobi_generate_kernel.dp.cpp b/dpcpp/preconditioner/jacobi_generate_kernel.dp.cpp index fdd97a50749..e580719a4b8 100644 --- a/dpcpp/preconditioner/jacobi_generate_kernel.dp.cpp +++ b/dpcpp/preconditioner/jacobi_generate_kernel.dp.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2021, the Ginkgo authors +Copyright (c) 2017-2022, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/dpcpp/preconditioner/jacobi_kernels.dp.cpp b/dpcpp/preconditioner/jacobi_kernels.dp.cpp index a6e35da67db..8095fe9fe24 100644 --- a/dpcpp/preconditioner/jacobi_kernels.dp.cpp +++ b/dpcpp/preconditioner/jacobi_kernels.dp.cpp @@ -152,7 +152,9 @@ void compare_adjacent_rows(dim3 grid, dim3 block, bool* matching_next_row) { queue->parallel_for( - sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + sycl_nd_range(grid, block), [= + ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size( + config::warp_size)]] { compare_adjacent_rows(num_rows, max_block_size, row_ptrs, col_idx, matching_next_row, item_ct1); }); @@ -280,7 +282,8 @@ void transpose_jacobi( const IndexType* block_ptrs, size_type num_blocks, ValueType* out_blocks) { queue->parallel_for( - sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + sycl_nd_range(grid, block), [= + ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(subwarp_size)]] { transpose_jacobi(blocks, storage_scheme, block_ptrs, num_blocks, @@ -310,10 +313,6 @@ void adaptive_transpose_jacobi( const auto block_stride = storage_scheme.get_stride(); const auto rank = subwarp.thread_rank(); if (rank < block_size) { - /* - DPCT1007:6: Migration of this CUDA API is not supported by the Intel(R) - DPC++ Compatibility Tool. 
- */ GKO_PRECONDITIONER_JACOBI_RESOLVE_PRECISION( ValueType, block_precisions[block_id], auto local_block = @@ -342,7 +341,8 @@ void adaptive_transpose_jacobi( size_type num_blocks, ValueType* out_blocks) { queue->parallel_for( - sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + sycl_nd_range(grid, block), [= + ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(subwarp_size)]] { adaptive_transpose_jacobi( blocks, storage_scheme, block_precisions, block_ptrs, diff --git a/dpcpp/preconditioner/jacobi_simple_apply_instantiate.inc.dp.cpp b/dpcpp/preconditioner/jacobi_simple_apply_instantiate.inc.dp.cpp index 92fd39c2f7e..f7c7aa2573b 100644 --- a/dpcpp/preconditioner/jacobi_simple_apply_instantiate.inc.dp.cpp +++ b/dpcpp/preconditioner/jacobi_simple_apply_instantiate.inc.dp.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2021, the Ginkgo authors +Copyright (c) 2017-2022, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -107,7 +107,8 @@ void apply( int32 b_stride, ValueType* x, int32 x_stride) { queue->parallel_for( - sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + sycl_nd_range(grid, block), [= + ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(subwarp_size)]] { apply( blocks, storage_scheme, block_ptrs, num_blocks, b, b_stride, x, x_stride, item_ct1); @@ -161,7 +162,8 @@ void adaptive_apply( int32 x_stride) { queue->parallel_for( - sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + sycl_nd_range(grid, block), [= + ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(subwarp_size)]] { adaptive_apply( blocks, storage_scheme, block_precisions, block_ptrs, num_blocks, b, b_stride, x, x_stride, item_ct1); diff --git a/dpcpp/preconditioner/jacobi_simple_apply_kernel.dp.cpp b/dpcpp/preconditioner/jacobi_simple_apply_kernel.dp.cpp index ff3be086a91..37519d96779 100644 --- a/dpcpp/preconditioner/jacobi_simple_apply_kernel.dp.cpp +++ b/dpcpp/preconditioner/jacobi_simple_apply_kernel.dp.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2021, the Ginkgo authors +Copyright (c) 2017-2022, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/dpcpp/test/preconditioner/jacobi_kernels.cpp b/dpcpp/test/preconditioner/jacobi_kernels.cpp index 8894b26a346..617a7b8b31c 100644 --- a/dpcpp/test/preconditioner/jacobi_kernels.cpp +++ b/dpcpp/test/preconditioner/jacobi_kernels.cpp @@ -45,6 +45,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "core/test/utils.hpp" #include "core/test/utils/unsort_matrix.hpp" +#include "core/utils/matrix_utils.hpp" #include "dpcpp/test/utils.hpp" @@ -53,6 +54,7 @@ namespace { class Jacobi : public ::testing::Test { protected: + using index_type = int32_t; #if GINKGO_DPCPP_SINGLE_MODE using value_type = float; #else @@ -308,13 +310,13 @@ TEST_F(Jacobi, DpcppPreconditionerEquivalentToRefWithBlockSize32Sorted) auto d_bj = d_bj_factory->generate(mtx); GKO_ASSERT_MTX_NEAR(gko::as(d_bj.get()), gko::as(bj.get()), - 50 * r::value); + 100 * r::value); } TEST_F(Jacobi, DpcppPreconditionerEquivalentToRefWithBlockSize32Unsorted) { - std::ranlux48 engine(42); + std::default_random_engine engine(42); initialize_data({0, 32, 64, 96, 128}, {}, {}, 32, 100, 110, 1, 0.1, false); gko::test::unsort_matrix(mtx.get(), engine); @@ -322,7 +324,7 @@ TEST_F(Jacobi, DpcppPreconditionerEquivalentToRefWithBlockSize32Unsorted) auto d_bj = d_bj_factory->generate(mtx); GKO_ASSERT_MTX_NEAR(gko::as(d_bj.get()), gko::as(bj.get()), - 50 * r::value); + 100 * r::value); } @@ -424,11 +426,14 @@ TEST_F(Jacobi, DpcppApplyEquivalentToRef) TEST_F(Jacobi, DpcppScalarApplyEquivalentToRef) { gko::size_type dim = 313; - std::ranlux48 engine(42); - auto dense_smtx = gko::share(gko::test::generate_random_matrix( - dim, dim, std::uniform_int_distribution<>(1, dim), - std::normal_distribution(1.0, 2.0), engine, ref)); - gko::test::make_diag_dominant(dense_smtx.get()); + std::default_random_engine engine(42); + auto dense_data = + gko::test::generate_random_matrix_data( + dim, dim, std::uniform_int_distribution<>(1, dim), + std::normal_distribution<>(1.0, 2.0), engine); + gko::utils::make_diag_dominant(dense_data); + auto dense_smtx = gko::share(Vec::create(ref)); + dense_smtx->read(dense_data); auto smtx = gko::share(Mtx::create(ref)); smtx->copy_from(dense_smtx.get()); auto sb = gko::share(gko::test::generate_random_matrix( @@ -473,11 +478,14 @@ TEST_F(Jacobi, DpcppLinearCombinationApplyEquivalentToRef) TEST_F(Jacobi, DpcppScalarLinearCombinationApplyEquivalentToRef) { gko::size_type dim = 313; - std::ranlux48 engine(42); - auto dense_smtx = gko::share(gko::test::generate_random_matrix( - dim, dim, std::uniform_int_distribution<>(1, dim), - std::normal_distribution(1.0, 2.0), engine, ref)); - gko::test::make_diag_dominant(dense_smtx.get()); + std::default_random_engine engine(42); + auto dense_data = + gko::test::generate_random_matrix_data( + dim, dim, std::uniform_int_distribution<>(1, dim), + std::normal_distribution(1.0, 2.0), engine); + gko::utils::make_diag_dominant(dense_data); + auto dense_smtx = gko::share(Vec::create(ref)); + dense_smtx->read(dense_data); auto smtx = gko::share(Mtx::create(ref)); smtx->copy_from(dense_smtx.get()); auto sb = gko::share(gko::test::generate_random_matrix( @@ -601,16 +609,18 @@ TEST_F(Jacobi, AvoidsPrecisionsThatOverflow) .on(dpcpp) ->generate(give(mtx)); + // both blocks are in the same group, both need (7, 8) auto h_bj = clone(ref, bj); auto prec = h_bj->get_parameters().storage_optimization.block_wise.get_const_data(); - EXPECT_EQ(prec[0], gko::precision_reduction(0, 2)); -// 2 - 0 is same as 1 - 1 when it is float + // 2 - 0 is same as 1 - 1 when it is float #if GINKGO_DPCPP_SINGLE_MODE + EXPECT_EQ(prec[0], gko::precision_reduction(2, 0)); ASSERT_EQ(prec[1], gko::precision_reduction(2, 0)); #else - ASSERT_EQ(prec[1], gko::precision_reduction(2, 0)); -#endif + EXPECT_EQ(prec[0], gko::precision_reduction(1, 1)); + ASSERT_EQ(prec[1], gko::precision_reduction(1, 1)); +#endif // 
GINKGO_DPCPP_SINGLE_MODE } @@ -636,7 +646,7 @@ TEST_F(Jacobi, DpcppPreconditionerEquivalentToRefWithReducedPrecision) auto bj = bj_factory->generate(mtx); auto d_bj = d_bj_factory->generate(mtx); - GKO_ASSERT_MTX_NEAR(lend(d_bj), lend(bj), 2e-7); + GKO_ASSERT_MTX_NEAR(lend(d_bj), lend(bj), 1e-6); } From 1b0476c433c443d46ac27c78d1a1f2deb7ff6d28 Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. Tsai" Date: Thu, 14 Jul 2022 16:24:51 +0200 Subject: [PATCH 08/11] adapt the answer with single precision --- dpcpp/test/preconditioner/jacobi_kernels.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/dpcpp/test/preconditioner/jacobi_kernels.cpp b/dpcpp/test/preconditioner/jacobi_kernels.cpp index 617a7b8b31c..a4e31d11d71 100644 --- a/dpcpp/test/preconditioner/jacobi_kernels.cpp +++ b/dpcpp/test/preconditioner/jacobi_kernels.cpp @@ -609,16 +609,16 @@ TEST_F(Jacobi, AvoidsPrecisionsThatOverflow) .on(dpcpp) ->generate(give(mtx)); - // both blocks are in the same group, both need (7, 8) + // dpcpp considers all block separately auto h_bj = clone(ref, bj); auto prec = h_bj->get_parameters().storage_optimization.block_wise.get_const_data(); - // 2 - 0 is same as 1 - 1 when it is float #if GINKGO_DPCPP_SINGLE_MODE - EXPECT_EQ(prec[0], gko::precision_reduction(2, 0)); + ASSERT_EQ(prec[0], gko::precision_reduction(0, 2)); + // In single value, precision_reduction(1, 1) == precision_reduction(2, 0) ASSERT_EQ(prec[1], gko::precision_reduction(2, 0)); #else - EXPECT_EQ(prec[0], gko::precision_reduction(1, 1)); + ASSERT_EQ(prec[0], gko::precision_reduction(0, 2)); ASSERT_EQ(prec[1], gko::precision_reduction(1, 1)); #endif // GINKGO_DPCPP_SINGLE_MODE } @@ -646,7 +646,7 @@ TEST_F(Jacobi, DpcppPreconditionerEquivalentToRefWithReducedPrecision) auto bj = bj_factory->generate(mtx); auto d_bj = d_bj_factory->generate(mtx); - GKO_ASSERT_MTX_NEAR(lend(d_bj), lend(bj), 1e-6); + GKO_ASSERT_MTX_NEAR(lend(d_bj), lend(bj), 1e-5); } @@ -893,7 +893,7 @@ TEST_F(Jacobi, DpcppApplyToMultipleVectorsEquivalentToRefWithReducedPrecision) bj->apply(b.get(), x.get()); d_bj->apply(d_b.get(), d_x.get()); - GKO_ASSERT_MTX_NEAR(d_x, x, 1e-6); + GKO_ASSERT_MTX_NEAR(d_x, x, 1e-5); } From b8837fa5d5017bc63b06dfefc81d8a79194f5209 Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. Tsai" Date: Thu, 21 Jul 2022 11:51:52 +0200 Subject: [PATCH 09/11] workaround for cpu issue --- core/preconditioner/jacobi_utils.hpp | 4 +- .../jacobi_generate_instantiate.inc.dp.cpp | 65 ++++++++++++++++++- dpcpp/preconditioner/jacobi_kernels.dp.cpp | 5 +- dpcpp/test/preconditioner/jacobi_kernels.cpp | 57 ++++++++-------- 4 files changed, 97 insertions(+), 34 deletions(-) diff --git a/core/preconditioner/jacobi_utils.hpp b/core/preconditioner/jacobi_utils.hpp index 302ec84f4ee..4534eca3660 100644 --- a/core/preconditioner/jacobi_utils.hpp +++ b/core/preconditioner/jacobi_utils.hpp @@ -89,13 +89,15 @@ struct precision_reduction_descriptor { static constexpr GKO_ATTRIBUTES uint32 singleton(const precision_reduction& pr) { + // clang-format off return pr == precision_reduction(0, 0) ? p0n0 : pr == precision_reduction(0, 1) ? p0n1 : pr == precision_reduction(0, 2) ? p0n2 : pr == precision_reduction(1, 0) ? p1n0 : pr == precision_reduction(1, 1) ? p1n1 : pr == precision_reduction(2, 0) ? 
p2n0 - : p0n0; + : p0n0; + // clang-format on } }; diff --git a/dpcpp/preconditioner/jacobi_generate_instantiate.inc.dp.cpp b/dpcpp/preconditioner/jacobi_generate_instantiate.inc.dp.cpp index eab5ec118e4..2f49d924f7b 100644 --- a/dpcpp/preconditioner/jacobi_generate_instantiate.inc.dp.cpp +++ b/dpcpp/preconditioner/jacobi_generate_instantiate.inc.dp.cpp @@ -178,6 +178,57 @@ void generate( }); } +namespace detail { + + +/** + * TODO: Fewer threads are involved with verificator1 when the function tries + * to call verificator1 twice on some CPUs. This needs further investigation + * and a reproducer. As the current workaround, we pass the same lambda + * function as verificator1 to verificator3. The rest is copied from + * core/preconditioner/jacobi_utils.hpp. + */ +template +GKO_ATTRIBUTES GKO_INLINE uint32 get_supported_storage_reductions_dpcpp( AccuracyType accuracy, CondType cond, Predicate1 verificator1, + Predicate2 verificator2, Predicate3 verificator3) +{ + using gko::detail::float_traits; + using type = remove_complex; + using prd = preconditioner::detail::precision_reduction_descriptor; + auto accurate = [&cond, &accuracy](type eps) { + return cond * eps < accuracy; + }; + uint8 is_verified1 = 2; + auto supported = static_cast(prd::p0n0); + // the following code uses short-circuiting to avoid calling possibly + // expensive verificators multiple times + if (accurate(float_traits>>::eps)) { + supported |= prd::p2n0; + } + if (accurate(float_traits>>::eps) && + (is_verified1 = verificator1())) { + supported |= prd::p1n1; + } + if (accurate(float_traits>>::eps) && + is_verified1 != 0 && verificator2()) { + supported |= prd::p0n2; + } + if (accurate(float_traits>::eps)) { + supported |= prd::p1n0; + } + if (accurate(float_traits>::eps) && + (is_verified1 == 1 || + (is_verified1 == 2 && (is_verified1 = verificator3())))) { + supported |= prd::p0n1; + } + return supported; +} + + +} // namespace detail + template @@ -225,8 +276,8 @@ void adaptive_generate( preconditioner::detail::precision_reduction_descriptor::singleton( prec); if (prec == precision_reduction::autodetect()) { - using preconditioner::detail::get_supported_storage_reductions; - prec_descriptor = get_supported_storage_reductions( + using detail::get_supported_storage_reductions_dpcpp; + prec_descriptor = get_supported_storage_reductions_dpcpp( accuracy, block_cond, [&subwarp, &block_size, &row, &block_data, &storage_scheme, &block_id] { @@ -248,6 +299,16 @@ void adaptive_generate( block_data + storage_scheme.get_global_block_offset(block_id), storage_scheme.get_stride()); + }, + [&subwarp, &block_size, &row, &block_data, &storage_scheme, + &block_id] { + using target = reduce_precision; + return validate_precision_reduction_feasibility< + max_block_size, target>( + subwarp, block_size, row, + block_data + + storage_scheme.get_global_block_offset(block_id), + storage_scheme.get_stride()); }); } } diff --git a/dpcpp/preconditioner/jacobi_kernels.dp.cpp b/dpcpp/preconditioner/jacobi_kernels.dp.cpp index 8095fe9fe24..c90a3452969 100644 --- a/dpcpp/preconditioner/jacobi_kernels.dp.cpp +++ b/dpcpp/preconditioner/jacobi_kernels.dp.cpp @@ -63,8 +63,9 @@ namespace jacobi { namespace { -// a total of 32 warps (1024 threads) -constexpr int default_num_warps = 16; +// a total of 8 subgroups of size 32 (256 threads) +constexpr int default_num_warps = 8; +// TODO: get a default_grid_size for dpcpp // with current architectures, at most 32 warps can be scheduled per SM (and // current GPUs have at most 84 SMs) constexpr int default_grid_size = 32 * 
32 * 128; diff --git a/dpcpp/test/preconditioner/jacobi_kernels.cpp b/dpcpp/test/preconditioner/jacobi_kernels.cpp index a4e31d11d71..e4d63617eb7 100644 --- a/dpcpp/test/preconditioner/jacobi_kernels.cpp +++ b/dpcpp/test/preconditioner/jacobi_kernels.cpp @@ -343,7 +343,7 @@ TEST_F(Jacobi, DpcppPreconditionerEquivalentToRefWithDifferentBlockSize) TEST_F(Jacobi, DpcppPreconditionerEquivalentToRefWithMPW) { - initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, {}, {}, 13, + initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, {}, {}, 32, 97, 99); auto bj = bj_factory->generate(mtx); @@ -356,7 +356,7 @@ TEST_F(Jacobi, DpcppPreconditionerEquivalentToRefWithMPW) TEST_F(Jacobi, DpcppTransposedPreconditionerEquivalentToRefWithMPW) { - initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, {}, {}, 13, + initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, {}, {}, 32, 97, 99); auto bj = bj_factory->generate(mtx); @@ -370,7 +370,7 @@ TEST_F(Jacobi, DpcppTransposedPreconditionerEquivalentToRefWithMPW) TEST_F(Jacobi, DpcppConjTransposedPreconditionerEquivalentToRefWithMPW) { - initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, {}, {}, 13, + initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, {}, {}, 32, 97, 99); auto bj = bj_factory->generate(mtx); @@ -411,7 +411,7 @@ TEST_F(Jacobi, DpcppApplyEquivalentToRefWithDifferentBlockSize) TEST_F(Jacobi, DpcppApplyEquivalentToRef) { - initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, {}, {}, 13, + initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, {}, {}, 32, 97, 99); auto bj = bj_factory->generate(mtx); auto d_bj = d_bj_factory->generate(mtx); @@ -459,7 +459,7 @@ TEST_F(Jacobi, DpcppScalarApplyEquivalentToRef) TEST_F(Jacobi, DpcppLinearCombinationApplyEquivalentToRef) { - initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, {}, {}, 13, + initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, {}, {}, 32, 97, 99); auto alpha = gko::initialize({2.0}, ref); auto d_alpha = gko::initialize({2.0}, dpcpp); @@ -517,7 +517,7 @@ TEST_F(Jacobi, DpcppScalarLinearCombinationApplyEquivalentToRef) TEST_F(Jacobi, DpcppApplyToMultipleVectorsEquivalentToRef) { - initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, {}, {}, 13, + initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, {}, {}, 32, 97, 99, 5); auto bj = bj_factory->generate(mtx); auto d_bj = d_bj_factory->generate(mtx); @@ -531,7 +531,7 @@ TEST_F(Jacobi, DpcppApplyToMultipleVectorsEquivalentToRef) TEST_F(Jacobi, DpcppLinearCombinationApplyToMultipleVectorsEquivalentToRef) { - initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, {}, {}, 13, + initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, {}, {}, 32, 97, 99, 5); auto alpha = gko::initialize({2.0}, ref); auto d_alpha = gko::initialize({2.0}, dpcpp); @@ -551,7 +551,7 @@ TEST_F(Jacobi, ComputesTheSameConditionNumberAsRef) { SKIP_IF_SINGLE_MODE; initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, - {dp, dp, dp, dp, dp, dp, dp, dp, dp, dp}, {}, 13, 97, 99); + {dp, dp, dp, dp, dp, dp, dp, dp, dp, dp}, {}, 32, 97, 99); auto bj = bj_factory->generate(mtx); auto d_bj = clone(ref, d_bj_factory->generate(mtx)); @@ -613,12 +613,11 @@ TEST_F(Jacobi, AvoidsPrecisionsThatOverflow) auto h_bj = clone(ref, bj); auto prec = h_bj->get_parameters().storage_optimization.block_wise.get_const_data(); -#if GINKGO_DPCPP_SINGLE_MODE ASSERT_EQ(prec[0], gko::precision_reduction(0, 2)); +#if GINKGO_DPCPP_SINGLE_MODE // In single value, 
precision_reduction(1, 1) == precision_reduction(2, 0) ASSERT_EQ(prec[1], gko::precision_reduction(2, 0)); #else - ASSERT_EQ(prec[0], gko::precision_reduction(0, 2)); ASSERT_EQ(prec[1], gko::precision_reduction(1, 1)); #endif // GINKGO_DPCPP_SINGLE_MODE } @@ -640,7 +639,7 @@ TEST_F(Jacobi, DpcppPreconditionerEquivalentToRefWithFullPrecision) TEST_F(Jacobi, DpcppPreconditionerEquivalentToRefWithReducedPrecision) { initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, - {sp, sp, sp, sp, sp, sp, sp, sp, sp, sp, sp}, {}, 13, 97, + {sp, sp, sp, sp, sp, sp, sp, sp, sp, sp, sp}, {}, 32, 97, 99); auto bj = bj_factory->generate(mtx); @@ -653,7 +652,7 @@ TEST_F(Jacobi, DpcppPreconditionerEquivalentToRefWithReducedPrecision) TEST_F(Jacobi, DpcppPreconditionerEquivalentToRefWithCustomReducedPrecision) { initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, - {tp, tp, tp, tp, tp, tp, tp, tp, tp, tp, tp}, {}, 13, 97, + {tp, tp, tp, tp, tp, tp, tp, tp, tp, tp, tp}, {}, 32, 97, 99); auto bj = bj_factory->generate(mtx); @@ -666,7 +665,7 @@ TEST_F(Jacobi, DpcppPreconditionerEquivalentToRefWithCustomReducedPrecision) TEST_F(Jacobi, DpcppPreconditionerEquivalentToRefWithQuarteredPrecision) { initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, - {hp, hp, hp, hp, hp, hp, hp, hp, hp, hp, hp}, {}, 13, 97, + {hp, hp, hp, hp, hp, hp, hp, hp, hp, hp, hp}, {}, 32, 97, 99); auto bj = bj_factory->generate(mtx); @@ -679,7 +678,7 @@ TEST_F(Jacobi, DpcppPreconditionerEquivalentToRefWithQuarteredPrecision) TEST_F(Jacobi, DpcppPreconditionerEquivalentToRefWithCustomQuarteredPrecision) { initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, - {qp, qp, qp, qp, qp, qp, qp, qp, qp, qp, qp}, {}, 13, 97, + {qp, qp, qp, qp, qp, qp, qp, qp, qp, qp, qp}, {}, 32, 97, 99); auto bj = bj_factory->generate(mtx); @@ -693,7 +692,7 @@ TEST_F(Jacobi, DpcppPreconditionerEquivalentToRefWithAdaptivePrecision) { SKIP_IF_SINGLE_MODE; initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, - {sp, sp, dp, dp, tp, tp, qp, qp, hp, dp, up}, {}, 13, 97, + {sp, sp, dp, dp, tp, tp, qp, qp, hp, dp, up}, {}, 32, 97, 99); auto bj = bj_factory->generate(mtx); @@ -708,7 +707,7 @@ TEST_F(Jacobi, { SKIP_IF_SINGLE_MODE; initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, - {sp, sp, dp, dp, tp, tp, qp, qp, hp, dp, up}, {}, 13, 97, + {sp, sp, dp, dp, tp, tp, qp, qp, hp, dp, up}, {}, 32, 97, 99); auto bj = bj_factory->generate(mtx); @@ -725,7 +724,7 @@ TEST_F(Jacobi, { SKIP_IF_SINGLE_MODE; initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, - {sp, sp, dp, dp, tp, tp, qp, qp, hp, dp, up}, {}, 13, 97, + {sp, sp, dp, dp, tp, tp, qp, qp, hp, dp, up}, {}, 32, 97, 99); auto bj = bj_factory->generate(mtx); @@ -741,7 +740,7 @@ TEST_F(Jacobi, DpcppApplyEquivalentToRefWithFullPrecision) { SKIP_IF_SINGLE_MODE; initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, - {dp, dp, dp, dp, dp, dp, dp, dp, dp, dp, dp}, {}, 13, 97, + {dp, dp, dp, dp, dp, dp, dp, dp, dp, dp, dp}, {}, 32, 97, 99); auto bj = bj_factory->generate(mtx); auto d_bj = d_bj_factory->generate(mtx); @@ -757,7 +756,7 @@ TEST_F(Jacobi, DpcppApplyEquivalentToRefWithReducedPrecision) { SKIP_IF_SINGLE_MODE; initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, - {sp, sp, sp, sp, sp, sp, sp, sp, sp, sp, sp}, {}, 13, 97, + {sp, sp, sp, sp, sp, sp, sp, sp, sp, sp, sp}, {}, 32, 97, 99); auto bj = bj_factory->generate(mtx); auto d_bj = d_bj_factory->generate(mtx); @@ -773,7 +772,7 @@ TEST_F(Jacobi, 
DpcppApplyEquivalentToRefWithCustomReducedPrecision) { SKIP_IF_SINGLE_MODE; initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, - {tp, tp, tp, tp, tp, tp, tp, tp, tp, tp, tp}, {}, 13, 97, + {tp, tp, tp, tp, tp, tp, tp, tp, tp, tp, tp}, {}, 32, 97, 99); auto bj = bj_factory->generate(mtx); auto d_bj = d_bj_factory->generate(mtx); @@ -788,7 +787,7 @@ TEST_F(Jacobi, DpcppApplyEquivalentToRefWithCustomReducedPrecision) TEST_F(Jacobi, DpcppApplyEquivalentToRefWithQuarteredPrecision) { initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, - {hp, hp, hp, hp, hp, hp, hp, hp, hp, hp, hp}, {}, 13, 97, + {hp, hp, hp, hp, hp, hp, hp, hp, hp, hp, hp}, {}, 32, 97, 99); auto bj = bj_factory->generate(mtx); auto d_bj = d_bj_factory->generate(mtx); @@ -803,7 +802,7 @@ TEST_F(Jacobi, DpcppApplyEquivalentToRefWithQuarteredPrecision) TEST_F(Jacobi, DpcppApplyEquivalentToRefWithCustomReducedAndReducedPrecision) { initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, - {up, up, up, up, up, up, up, up, up, up, up}, {}, 13, 97, + {up, up, up, up, up, up, up, up, up, up, up}, {}, 32, 97, 99); auto bj = bj_factory->generate(mtx); auto d_bj = d_bj_factory->generate(mtx); @@ -818,7 +817,7 @@ TEST_F(Jacobi, DpcppApplyEquivalentToRefWithCustomReducedAndReducedPrecision) TEST_F(Jacobi, DpcppApplyEquivalentToRefWithCustomQuarteredPrecision) { initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, - {qp, qp, qp, qp, qp, qp, qp, qp, qp, qp, qp}, {}, 13, 97, + {qp, qp, qp, qp, qp, qp, qp, qp, qp, qp, qp}, {}, 32, 97, 99); auto bj = bj_factory->generate(mtx); auto d_bj = d_bj_factory->generate(mtx); @@ -834,7 +833,7 @@ TEST_F(Jacobi, DpcppApplyEquivalentToRefWithAdaptivePrecision) { SKIP_IF_SINGLE_MODE; initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, - {sp, sp, dp, dp, tp, tp, qp, qp, hp, dp, up}, {}, 13, 97, + {sp, sp, dp, dp, tp, tp, qp, qp, hp, dp, up}, {}, 32, 97, 99); auto bj = bj_factory->generate(mtx); auto d_bj = d_bj_factory->generate(mtx); @@ -850,7 +849,7 @@ TEST_F(Jacobi, DpcppLinearCombinationApplyEquivalentToRefWithAdaptivePrecision) { SKIP_IF_SINGLE_MODE; initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, - {sp, dp, dp, sp, sp, sp, dp, dp, sp, dp, sp}, {}, 13, 97, + {sp, dp, dp, sp, sp, sp, dp, dp, sp, dp, sp}, {}, 32, 97, 99); auto alpha = gko::initialize({2.0}, ref); auto d_alpha = gko::initialize({2.0}, dpcpp); @@ -870,7 +869,7 @@ TEST_F(Jacobi, DpcppApplyToMultipleVectorsEquivalentToRefWithFullPrecision) { SKIP_IF_SINGLE_MODE; initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, - {dp, dp, dp, dp, dp, dp, dp, dp, dp, dp, dp}, {}, 13, 97, + {dp, dp, dp, dp, dp, dp, dp, dp, dp, dp, dp}, {}, 32, 97, 99, 5); auto bj = bj_factory->generate(mtx); auto d_bj = d_bj_factory->generate(mtx); @@ -885,7 +884,7 @@ TEST_F(Jacobi, DpcppApplyToMultipleVectorsEquivalentToRefWithFullPrecision) TEST_F(Jacobi, DpcppApplyToMultipleVectorsEquivalentToRefWithReducedPrecision) { initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, - {sp, sp, sp, sp, sp, sp, sp, sp, sp, sp, sp}, {}, 13, 97, + {sp, sp, sp, sp, sp, sp, sp, sp, sp, sp, sp}, {}, 32, 97, 99, 5); auto bj = bj_factory->generate(mtx); auto d_bj = d_bj_factory->generate(mtx); @@ -901,7 +900,7 @@ TEST_F(Jacobi, DpcppApplyToMultipleVectorsEquivalentToRefWithAdaptivePrecision) { SKIP_IF_SINGLE_MODE; initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, - {sp, sp, dp, dp, tp, tp, qp, qp, hp, dp, up}, {}, 13, 97, + {sp, sp, dp, dp, tp, tp, qp, qp, hp, dp, up}, {}, 32, 97, 99, 5); auto bj = 
bj_factory->generate(mtx); auto d_bj = d_bj_factory->generate(mtx); @@ -919,7 +918,7 @@ TEST_F( { SKIP_IF_SINGLE_MODE; initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, - {sp, dp, dp, sp, sp, sp, dp, dp, sp, dp, sp}, {}, 13, 97, + {sp, dp, dp, sp, sp, sp, dp, dp, sp, dp, sp}, {}, 32, 97, 99, 5); auto alpha = gko::initialize({2.0}, ref); auto d_alpha = gko::initialize({2.0}, dpcpp); From 4ed16c63f32ce81fcaa8eca92c7e1484ee9988a3 Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. Tsai" Date: Thu, 1 Sep 2022 09:51:38 -0400 Subject: [PATCH 10/11] remove unused parameter, update doc Co-authored-by: Terry Cojean --- .../preconditioner/jacobi_kernels.hpp.inc | 3 +-- cuda/preconditioner/jacobi_kernels.cu | 2 +- dpcpp/CMakeLists.txt | 10 ++-------- dpcpp/preconditioner/jacobi_kernels.dp.cpp | 20 +++++++++---------- hip/preconditioner/jacobi_kernels.hip.cpp | 7 +++---- 5 files changed, 16 insertions(+), 26 deletions(-) diff --git a/common/cuda_hip/preconditioner/jacobi_kernels.hpp.inc b/common/cuda_hip/preconditioner/jacobi_kernels.hpp.inc index bb1ac5f62bb..9f2d7f043d0 100644 --- a/common/cuda_hip/preconditioner/jacobi_kernels.hpp.inc +++ b/common/cuda_hip/preconditioner/jacobi_kernels.hpp.inc @@ -30,9 +30,8 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ -template __global__ -__launch_bounds__(warps_per_block* config::warp_size) void duplicate_array( +__launch_bounds__(default_num_warps* config::warp_size) void duplicate_array( const precision_reduction* __restrict__ source, size_type source_size, precision_reduction* __restrict__ dest, size_type dest_size) { diff --git a/cuda/preconditioner/jacobi_kernels.cu b/cuda/preconditioner/jacobi_kernels.cu index b9eddae340f..2b5a887043c 100644 --- a/cuda/preconditioner/jacobi_kernels.cu +++ b/cuda/preconditioner/jacobi_kernels.cu @@ -124,7 +124,7 @@ void initialize_precisions(std::shared_ptr exec, default_grid_size, static_cast(ceildiv(precisions.get_num_elems(), block_size))); if (grid_size > 0) { - duplicate_array<<>>( + duplicate_array<<>>( source.get_const_data(), source.get_num_elems(), precisions.get_data(), precisions.get_num_elems()); } diff --git a/dpcpp/CMakeLists.txt b/dpcpp/CMakeLists.txt index dd28a79adeb..e0f9806cf96 100644 --- a/dpcpp/CMakeLists.txt +++ b/dpcpp/CMakeLists.txt @@ -57,14 +57,8 @@ target_sources(ginkgo_dpcpp ${GKO_UNIFIED_COMMON_SOURCES} ) -# if(GINKGO_JACOBI_FULL_OPTIMIZATIONS) -# set(GKO_DPCPP_JACOBI_BLOCK_SIZES) -# foreach(blocksize RANGE 1 32) -# list(APPEND GKO_DPCPP_JACOBI_BLOCK_SIZES ${blocksize}) -# endforeach() -# else() - set(GKO_DPCPP_JACOBI_BLOCK_SIZES 32) -# endif() +# TODO: adjust this once the dpcpp jacobi supports more block sizes +set(GKO_DPCPP_JACOBI_BLOCK_SIZES 32) set(GKO_DPCPP_JACOBI_SOURCES) foreach(GKO_JACOBI_BLOCK_SIZE IN LISTS GKO_DPCPP_JACOBI_BLOCK_SIZES) configure_file( diff --git a/dpcpp/preconditioner/jacobi_kernels.dp.cpp b/dpcpp/preconditioner/jacobi_kernels.dp.cpp index c90a3452969..2b9d64da510 100644 --- a/dpcpp/preconditioner/jacobi_kernels.dp.cpp +++ b/dpcpp/preconditioner/jacobi_kernels.dp.cpp @@ -71,7 +71,6 @@ constexpr int default_grid_size = 32 * 32 * 128; -template void duplicate_array(const precision_reduction* __restrict__ source, size_type source_size, precision_reduction* __restrict__ dest, @@ -86,17 +85,15 @@ void duplicate_array(const precision_reduction* __restrict__ source, } } -template void 
duplicate_array(dim3 grid, dim3 block, size_type dynamic_shared_memory, sycl::queue* queue, const precision_reduction* source, size_type source_size, precision_reduction* dest, size_type dest_size) { - queue->parallel_for(sycl_nd_range(grid, block), - [=](sycl::nd_item<3> item_ct1) { - duplicate_array( - source, source_size, dest, dest_size, item_ct1); - }); + queue->parallel_for( + sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + duplicate_array(source, source_size, dest, dest_size, item_ct1); + }); } @@ -404,10 +401,11 @@ void initialize_precisions(std::shared_ptr exec, const auto grid_size = min( default_grid_size, static_cast(ceildiv(precisions.get_num_elems(), block_size))); - duplicate_array( - grid_size, block_size, 0, exec->get_queue(), source.get_const_data(), - source.get_num_elems(), precisions.get_data(), - precisions.get_num_elems()); + if (grid_size > 0) { + duplicate_array(grid_size, block_size, 0, exec->get_queue(), + source.get_const_data(), source.get_num_elems(), + precisions.get_data(), precisions.get_num_elems()); + } } diff --git a/hip/preconditioner/jacobi_kernels.hip.cpp b/hip/preconditioner/jacobi_kernels.hip.cpp index d8fc7677e8e..595a3e564b8 100644 --- a/hip/preconditioner/jacobi_kernels.hip.cpp +++ b/hip/preconditioner/jacobi_kernels.hip.cpp @@ -133,10 +133,9 @@ void initialize_precisions(std::shared_ptr exec, default_grid_size, static_cast(ceildiv(precisions.get_num_elems(), block_size))); if (grid_size > 0) { - hipLaunchKernelGGL(HIP_KERNEL_NAME(duplicate_array), - grid_size, block_size, 0, 0, source.get_const_data(), - source.get_num_elems(), precisions.get_data(), - precisions.get_num_elems()); + hipLaunchKernelGGL(duplicate_array, grid_size, block_size, 0, 0, + source.get_const_data(), source.get_num_elems(), + precisions.get_data(), precisions.get_num_elems()); } } From ce70658f7c87492988094880a049973ab1f10e6e Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. 
Tsai" Date: Mon, 31 Oct 2022 00:22:26 +0800 Subject: [PATCH 11/11] fix header and type issue, and format_header MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Thomas Grützmacher --- .../preconditioner/jacobi_generate_kernel.hpp.inc | 4 ++-- dev_tools/scripts/format_header.sh | 4 ++-- .../jacobi_advanced_apply_instantiate.inc.dp.cpp | 4 +--- dpcpp/preconditioner/jacobi_advanced_apply_kernel.dp.cpp | 4 +--- dpcpp/preconditioner/jacobi_common.hpp.in | 2 +- .../preconditioner/jacobi_generate_instantiate.inc.dp.cpp | 8 +++----- dpcpp/preconditioner/jacobi_generate_kernel.dp.cpp | 4 +--- dpcpp/preconditioner/jacobi_kernels.dp.cpp | 4 +--- .../jacobi_simple_apply_instantiate.inc.dp.cpp | 4 +--- dpcpp/preconditioner/jacobi_simple_apply_kernel.dp.cpp | 4 +--- dpcpp/test/preconditioner/jacobi_kernels.cpp | 4 ++++ 11 files changed, 18 insertions(+), 28 deletions(-) diff --git a/common/cuda_hip/preconditioner/jacobi_generate_kernel.hpp.inc b/common/cuda_hip/preconditioner/jacobi_generate_kernel.hpp.inc index 208bd33d7dc..e863a9631a1 100644 --- a/common/cuda_hip/preconditioner/jacobi_generate_kernel.hpp.inc +++ b/common/cuda_hip/preconditioner/jacobi_generate_kernel.hpp.inc @@ -43,7 +43,7 @@ __device__ __forceinline__ bool validate_precision_reduction_feasibility( // save original data and reduce precision if (group.thread_rank() < block_size) { #pragma unroll - for (auto i = 0u; i < max_block_size; ++i) { + for (int i = 0; i < max_block_size; ++i) { if (i < block_size) { work[i * stride + group.thread_rank()] = row[i]; row[i] = @@ -65,7 +65,7 @@ __device__ __forceinline__ bool validate_precision_reduction_feasibility( // restore original data if (group.thread_rank() < block_size) { #pragma unroll - for (auto i = 0u; i < max_block_size; ++i) { + for (int i = 0; i < max_block_size; ++i) { if (i < block_size) { row[i] = work[i * stride + group.thread_rank()]; } diff --git a/dev_tools/scripts/format_header.sh b/dev_tools/scripts/format_header.sh index 05860d301e3..a501b6f97d2 100755 --- a/dev_tools/scripts/format_header.sh +++ b/dev_tools/scripts/format_header.sh @@ -4,7 +4,7 @@ CLANG_FORMAT=${CLANG_FORMAT:="clang-format"} convert_header () { local regex="^(#include )(<|\")(.*)(\"|>)$" - local jacobi_regex="^(cuda|hip)\/preconditioner\/jacobi_common(\.hip)?\.hpp" + local jacobi_regex="^(cuda|hip|dpcpp)\/preconditioner\/jacobi_common(\.hip)?\.hpp" if [[ $@ =~ ${regex} ]]; then header_file="${BASH_REMATCH[3]}" if [ -f "${header_file}" ]; then @@ -15,7 +15,7 @@ convert_header () { fi elif [ "${header_file}" = "matrices/config.hpp" ]; then echo "#include \"${header_file}\"" - elif [[ "${header_file}" =~ ${jacobi_regex} ]]; then + elif [[ "${header_file}" =~ ${jacobi_regex} ]]; then echo "#include \"${header_file}\"" else echo "#include <${header_file}>" diff --git a/dpcpp/preconditioner/jacobi_advanced_apply_instantiate.inc.dp.cpp b/dpcpp/preconditioner/jacobi_advanced_apply_instantiate.inc.dp.cpp index 2235d886234..29a7cb76f10 100644 --- a/dpcpp/preconditioner/jacobi_advanced_apply_instantiate.inc.dp.cpp +++ b/dpcpp/preconditioner/jacobi_advanced_apply_instantiate.inc.dp.cpp @@ -33,9 +33,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/preconditioner/jacobi_kernels.hpp" -#include - - #include @@ -51,6 +48,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "dpcpp/components/cooperative_groups.dp.hpp" #include "dpcpp/components/thread_ids.dp.hpp" #include "dpcpp/components/warp_blas.dp.hpp" +#include "dpcpp/preconditioner/jacobi_common.hpp" namespace gko { diff --git a/dpcpp/preconditioner/jacobi_advanced_apply_kernel.dp.cpp b/dpcpp/preconditioner/jacobi_advanced_apply_kernel.dp.cpp index 93565a3f16f..57ee2e36263 100644 --- a/dpcpp/preconditioner/jacobi_advanced_apply_kernel.dp.cpp +++ b/dpcpp/preconditioner/jacobi_advanced_apply_kernel.dp.cpp @@ -33,14 +33,12 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/preconditioner/jacobi_kernels.hpp" -#include - - #include #include "core/matrix/dense_kernels.hpp" #include "core/synthesizer/implementation_selection.hpp" +#include "dpcpp/preconditioner/jacobi_common.hpp" namespace gko { diff --git a/dpcpp/preconditioner/jacobi_common.hpp.in b/dpcpp/preconditioner/jacobi_common.hpp.in index 93925a50342..fa5170100c3 100644 --- a/dpcpp/preconditioner/jacobi_common.hpp.in +++ b/dpcpp/preconditioner/jacobi_common.hpp.in @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2021, the Ginkgo authors +Copyright (c) 2017-2022, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/dpcpp/preconditioner/jacobi_generate_instantiate.inc.dp.cpp b/dpcpp/preconditioner/jacobi_generate_instantiate.inc.dp.cpp index 2f49d924f7b..da6d2bf2825 100644 --- a/dpcpp/preconditioner/jacobi_generate_instantiate.inc.dp.cpp +++ b/dpcpp/preconditioner/jacobi_generate_instantiate.inc.dp.cpp @@ -33,9 +33,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/preconditioner/jacobi_kernels.hpp" -#include - - #include @@ -55,6 +52,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "dpcpp/components/thread_ids.dp.hpp" #include "dpcpp/components/uninitialized_array.hpp" #include "dpcpp/components/warp_blas.dp.hpp" +#include "dpcpp/preconditioner/jacobi_common.hpp" namespace gko { @@ -81,7 +79,7 @@ __dpct_inline__ bool validate_precision_reduction_feasibility( // save original data and reduce precision if (group.thread_rank() < block_size) { #pragma unroll - for (auto i = 0u; i < max_block_size; ++i) { + for (int i = 0; i < max_block_size; ++i) { if (i < block_size) { work[i * stride + group.thread_rank()] = row[i]; row[i] = @@ -103,7 +101,7 @@ __dpct_inline__ bool validate_precision_reduction_feasibility( // restore original data if (group.thread_rank() < block_size) { #pragma unroll - for (auto i = 0u; i < max_block_size; ++i) { + for (int i = 0; i < max_block_size; ++i) { if (i < block_size) { row[i] = work[i * stride + group.thread_rank()]; } diff --git a/dpcpp/preconditioner/jacobi_generate_kernel.dp.cpp b/dpcpp/preconditioner/jacobi_generate_kernel.dp.cpp index e580719a4b8..f3ef6902e1a 100644 --- a/dpcpp/preconditioner/jacobi_generate_kernel.dp.cpp +++ b/dpcpp/preconditioner/jacobi_generate_kernel.dp.cpp @@ -33,15 +33,13 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "core/preconditioner/jacobi_kernels.hpp" -#include - - #include #include #include "core/components/fill_array_kernels.hpp" #include "core/synthesizer/implementation_selection.hpp" +#include "dpcpp/preconditioner/jacobi_common.hpp" namespace gko { diff --git a/dpcpp/preconditioner/jacobi_kernels.dp.cpp b/dpcpp/preconditioner/jacobi_kernels.dp.cpp index 2b9d64da510..54bb19728e6 100644 --- a/dpcpp/preconditioner/jacobi_kernels.dp.cpp +++ b/dpcpp/preconditioner/jacobi_kernels.dp.cpp @@ -33,9 +33,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/preconditioner/jacobi_kernels.hpp" -#include - - #include @@ -49,6 +46,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "dpcpp/base/dim3.dp.hpp" #include "dpcpp/components/cooperative_groups.dp.hpp" #include "dpcpp/components/thread_ids.dp.hpp" +#include "dpcpp/preconditioner/jacobi_common.hpp" namespace gko { diff --git a/dpcpp/preconditioner/jacobi_simple_apply_instantiate.inc.dp.cpp b/dpcpp/preconditioner/jacobi_simple_apply_instantiate.inc.dp.cpp index f7c7aa2573b..aaffb1b6583 100644 --- a/dpcpp/preconditioner/jacobi_simple_apply_instantiate.inc.dp.cpp +++ b/dpcpp/preconditioner/jacobi_simple_apply_instantiate.inc.dp.cpp @@ -33,9 +33,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/preconditioner/jacobi_kernels.hpp" -#include - - #include @@ -51,6 +48,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "dpcpp/components/cooperative_groups.dp.hpp" #include "dpcpp/components/thread_ids.dp.hpp" #include "dpcpp/components/warp_blas.dp.hpp" +#include "dpcpp/preconditioner/jacobi_common.hpp" namespace gko { diff --git a/dpcpp/preconditioner/jacobi_simple_apply_kernel.dp.cpp b/dpcpp/preconditioner/jacobi_simple_apply_kernel.dp.cpp index 37519d96779..d260298783c 100644 --- a/dpcpp/preconditioner/jacobi_simple_apply_kernel.dp.cpp +++ b/dpcpp/preconditioner/jacobi_simple_apply_kernel.dp.cpp @@ -33,14 +33,12 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/preconditioner/jacobi_kernels.hpp" -#include - - #include #include "core/preconditioner/jacobi_utils.hpp" #include "core/synthesizer/implementation_selection.hpp" +#include "dpcpp/preconditioner/jacobi_common.hpp" namespace gko { diff --git a/dpcpp/test/preconditioner/jacobi_kernels.cpp b/dpcpp/test/preconditioner/jacobi_kernels.cpp index e4d63617eb7..889548fd2e1 100644 --- a/dpcpp/test/preconditioner/jacobi_kernels.cpp +++ b/dpcpp/test/preconditioner/jacobi_kernels.cpp @@ -33,12 +33,16 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include +#include #include +#include +#include #include #include