From 96b16778c7ddcc86785e341ed23e48809fe24a14 Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. Tsai" Date: Wed, 24 Nov 2021 17:43:59 +0800 Subject: [PATCH 01/11] script port --- ...cobi_advanced_apply_instantiate.inc.dp.cpp | 237 ++++++++ .../jacobi_advanced_apply_kernel.dp.cpp | 106 ++++ dpcpp/preconditioner/jacobi_common.hpp.in | 71 +++ .../jacobi_generate_instantiate.inc.dp.cpp | 364 ++++++++++++ .../jacobi_generate_kernel.dp.cpp | 103 ++++ dpcpp/preconditioner/jacobi_kernels.dp.cpp | 558 +++++++++++++----- ...jacobi_simple_apply_instantiate.inc.dp.cpp | 229 +++++++ .../jacobi_simple_apply_kernel.dp.cpp | 102 ++++ 8 files changed, 1616 insertions(+), 154 deletions(-) create mode 100644 dpcpp/preconditioner/jacobi_advanced_apply_instantiate.inc.dp.cpp create mode 100644 dpcpp/preconditioner/jacobi_advanced_apply_kernel.dp.cpp create mode 100644 dpcpp/preconditioner/jacobi_common.hpp.in create mode 100644 dpcpp/preconditioner/jacobi_generate_instantiate.inc.dp.cpp create mode 100644 dpcpp/preconditioner/jacobi_generate_kernel.dp.cpp create mode 100644 dpcpp/preconditioner/jacobi_simple_apply_instantiate.inc.dp.cpp create mode 100644 dpcpp/preconditioner/jacobi_simple_apply_kernel.dp.cpp diff --git a/dpcpp/preconditioner/jacobi_advanced_apply_instantiate.inc.dp.cpp b/dpcpp/preconditioner/jacobi_advanced_apply_instantiate.inc.dp.cpp new file mode 100644 index 00000000000..4425f8e277b --- /dev/null +++ b/dpcpp/preconditioner/jacobi_advanced_apply_instantiate.inc.dp.cpp @@ -0,0 +1,237 @@ +/************************************************************* +Copyright (c) 2017-2021, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#include "core/preconditioner/jacobi_kernels.hpp" + + +#include + + +#include + + +#include + + +#include "core/base/extended_float.hpp" +#include "core/matrix/dense_kernels.hpp" +#include "core/preconditioner/jacobi_utils.hpp" +#include "core/synthesizer/implementation_selection.hpp" +#include "dpcpp/base/config.hpp" +#include "dpcpp/base/dim3.dp.hpp" +#include "dpcpp/base/math.hpp" +#include "dpcpp/components/cooperative_groups.dp.hpp" +#include "dpcpp/components/thread_ids.dp.hpp" +#include "dpcpp/components/warp_blas.dp.hpp" + + +namespace gko { +namespace kernels { +namespace dpcpp { +/** + * @brief The Jacobi preconditioner namespace. + * @ref Jacobi + * @ingroup jacobi + */ +namespace jacobi { + + +namespace kernel { + + +template +void advanced_apply( + const ValueType* __restrict__ blocks, + preconditioner::block_interleaved_storage_scheme storage_scheme, + const IndexType* __restrict__ block_ptrs, size_type num_blocks, + const ValueType* __restrict__ alpha, const ValueType* __restrict__ b, + int32 b_stride, ValueType* __restrict__ x, int32 x_stride, + sycl::nd_item<3> item_ct1) +{ + const auto block_id = + thread::get_subwarp_id(item_ct1); + const auto subwarp = group::tiled_partition( + group::this_thread_block(item_ct1)); + if (block_id >= num_blocks) { + return; + } + const auto block_size = block_ptrs[block_id + 1] - block_ptrs[block_id]; + ValueType v = zero(); + if (subwarp.thread_rank() < block_size) { + v = alpha[0] * + b[(block_ptrs[block_id] + subwarp.thread_rank()) * b_stride]; + } + multiply_vec( + subwarp, block_size, v, + blocks + storage_scheme.get_global_block_offset(block_id) + + subwarp.thread_rank(), + storage_scheme.get_stride(), x + block_ptrs[block_id] * x_stride, + x_stride, + [](ValueType& result, const ValueType& out) { result += out; }); +} + +template +void advanced_apply( + dim3 grid, dim3 block, size_type dynamic_shared_memory, sycl::queue* queue, + const ValueType* blocks, + preconditioner::block_interleaved_storage_scheme storage_scheme, + const IndexType* block_ptrs, size_type num_blocks, const ValueType* alpha, + const ValueType* b, int32 b_stride, ValueType* x, int32 x_stride) +{ + queue->parallel_for( + sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + advanced_apply( + blocks, storage_scheme, block_ptrs, num_blocks, alpha, b, + b_stride, x, x_stride, item_ct1); + }); +} + + +template +void advanced_adaptive_apply( + const ValueType* __restrict__ blocks, + preconditioner::block_interleaved_storage_scheme storage_scheme, + const precision_reduction* __restrict__ block_precisions, + const IndexType* __restrict__ block_ptrs, size_type num_blocks, + const ValueType* __restrict__ alpha, const ValueType* __restrict__ b, + int32 b_stride, ValueType* __restrict__ x, int32 x_stride, + sycl::nd_item<3> item_ct1) +{ + const auto block_id = + thread::get_subwarp_id(item_ct1); + const auto subwarp = group::tiled_partition( + group::this_thread_block(item_ct1)); + if (block_id >= num_blocks) { + return; + } + const auto block_size = block_ptrs[block_id + 1] - block_ptrs[block_id]; + auto alpha_val = alpha == nullptr ? 
one() : alpha[0]; + ValueType v = zero(); + if (subwarp.thread_rank() < block_size) { + v = alpha[0] * + b[(block_ptrs[block_id] + subwarp.thread_rank()) * b_stride]; + } + GKO_PRECONDITIONER_JACOBI_RESOLVE_PRECISION( + ValueType, block_precisions[block_id], + multiply_vec( + subwarp, block_size, v, + reinterpret_cast( + blocks + storage_scheme.get_group_offset(block_id)) + + storage_scheme.get_block_offset(block_id) + + subwarp.thread_rank(), + storage_scheme.get_stride(), x + block_ptrs[block_id] * x_stride, + x_stride, + [](ValueType& result, const ValueType& out) { result += out; })); +} + +template +void advanced_adaptive_apply( + dim3 grid, dim3 block, size_type dynamic_shared_memory, sycl::queue* queue, + const ValueType* blocks, + preconditioner::block_interleaved_storage_scheme storage_scheme, + const precision_reduction* block_precisions, const IndexType* block_ptrs, + size_type num_blocks, const ValueType* alpha, const ValueType* b, + int32 b_stride, ValueType* x, int32 x_stride) +{ + queue->parallel_for(sycl_nd_range(grid, block), [=](sycl::nd_item<3> + item_ct1) { + advanced_adaptive_apply( + blocks, storage_scheme, block_precisions, block_ptrs, num_blocks, + alpha, b, b_stride, x, x_stride, item_ct1); + }); +} + + +} // namespace kernel + + +// clang-format off +//#cmakedefine GKO_JACOBI_BLOCK_SIZE @GKO_JACOBI_BLOCK_SIZE@ +// clang-format on +// make things easier for IDEs +#ifndef GKO_JACOBI_BLOCK_SIZE +#define GKO_JACOBI_BLOCK_SIZE 1 +#endif + + +template +void advanced_apply( + syn::value_list, size_type num_blocks, + const precision_reduction* block_precisions, + const IndexType* block_pointers, const ValueType* blocks, + const preconditioner::block_interleaved_storage_scheme& + storage_scheme, + const ValueType* alpha, const ValueType* b, size_type b_stride, + ValueType* x, size_type x_stride) +{ + constexpr int subwarp_size = get_larger_power(max_block_size); + constexpr int blocks_per_warp = config::warp_size / subwarp_size; + const dim3 grid_size(ceildiv(num_blocks, warps_per_block * blocks_per_warp), + 1, 1); + const dim3 block_size(subwarp_size, blocks_per_warp, warps_per_block); + + if (block_precisions) { + kernel::advanced_adaptive_apply( + grid_size, block_size, 0, exec->get_queue(), blocks, storage_scheme, + block_precisions, block_pointers, num_blocks, alpha, b, b_stride, x, + x_stride); + } else { + kernel::advanced_apply( + grid_size, block_size, 0, exec->get_queue(), blocks, storage_scheme, + block_pointers, num_blocks, alpha, b, b_stride, x, x_stride); + } +} + + +#define DECLARE_JACOBI_ADVANCED_APPLY_INSTANTIATION(ValueType, IndexType) \ + void advanced_apply( \ + syn::value_list, size_type, \ + const precision_reduction*, const IndexType* block_pointers, \ + const ValueType*, \ + const preconditioner::block_interleaved_storage_scheme&, \ + const ValueType*, const ValueType*, size_type, ValueType*, size_type) + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + DECLARE_JACOBI_ADVANCED_APPLY_INSTANTIATION); + + +} // namespace jacobi +} // namespace dpcpp +} // namespace kernels +} // namespace gko diff --git a/dpcpp/preconditioner/jacobi_advanced_apply_kernel.dp.cpp b/dpcpp/preconditioner/jacobi_advanced_apply_kernel.dp.cpp new file mode 100644 index 00000000000..1db62f49db3 --- /dev/null +++ b/dpcpp/preconditioner/jacobi_advanced_apply_kernel.dp.cpp @@ -0,0 +1,106 @@ +/************************************************************* +Copyright (c) 2017-2021, the Ginkgo authors +All rights reserved. 
+ +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include "core/preconditioner/jacobi_kernels.hpp" + + +#include + + +#include + + +#include "core/matrix/dense_kernels.hpp" +#include "core/synthesizer/implementation_selection.hpp" + + +namespace gko { +namespace kernels { +namespace dpcpp { +/** + * @brief The Jacobi preconditioner namespace. 
+ * @ref Jacobi + * @ingroup jacobi + */ +namespace jacobi { + + +template +void advanced_apply( + syn::value_list, size_type num_blocks, + const precision_reduction* block_precisions, + const IndexType* block_pointers, const ValueType* blocks, + const preconditioner::block_interleaved_storage_scheme& + storage_scheme, + const ValueType* alpha, const ValueType* b, size_type b_stride, + ValueType* x, size_type x_stride); + +GKO_ENABLE_IMPLEMENTATION_SELECTION(select_advanced_apply, advanced_apply); + + +template +void apply(std::shared_ptr exec, size_type num_blocks, + uint32 max_block_size, + const preconditioner::block_interleaved_storage_scheme& + storage_scheme, + const Array& block_precisions, + const Array& block_pointers, + const Array& blocks, + const matrix::Dense* alpha, + const matrix::Dense* b, + const matrix::Dense* beta, matrix::Dense* x) +{ + // TODO: write a special kernel for multiple RHS + dense::scale(exec, beta, x); + for (size_type col = 0; col < b->get_size()[1]; ++col) { + select_advanced_apply( + compiled_kernels(), + [&](int compiled_block_size) { + return max_block_size <= compiled_block_size; + }, + syn::value_list(), + syn::type_list<>(), num_blocks, block_precisions.get_const_data(), + block_pointers.get_const_data(), blocks.get_const_data(), + storage_scheme, alpha->get_const_values(), + b->get_const_values() + col, b->get_stride(), x->get_values() + col, + x->get_stride()); + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_JACOBI_APPLY_KERNEL); + + +} // namespace jacobi +} // namespace dpcpp +} // namespace kernels +} // namespace gko diff --git a/dpcpp/preconditioner/jacobi_common.hpp.in b/dpcpp/preconditioner/jacobi_common.hpp.in new file mode 100644 index 00000000000..93925a50342 --- /dev/null +++ b/dpcpp/preconditioner/jacobi_common.hpp.in @@ -0,0 +1,71 @@ +/************************************************************* +Copyright (c) 2017-2021, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#include +#include + + +#include "dpcpp/base/config.hpp" + +namespace gko { +namespace kernels { +namespace dpcpp { +namespace jacobi { + + +/** + * A compile-time list of block sizes for which dedicated generate and apply + * kernels should be compiled. + */ +// clang-format off +#cmakedefine GKO_DPCPP_JACOBI_BLOCK_SIZES_CODE @GKO_DPCPP_JACOBI_BLOCK_SIZES_CODE@ +// clang-format on +// make things easier for IDEs +#ifndef GKO_DPCPP_JACOBI_BLOCK_SIZES_CODE +#define GKO_DPCPP_JACOBI_BLOCK_SIZES_CODE 1 +#endif + + +using compiled_kernels = + syn::value_list; + + +constexpr int get_larger_power(int value, int guess = 1) +{ + return guess >= value ? guess : get_larger_power(value, guess << 1); +} + + +} // namespace jacobi +} // namespace dpcpp +} // namespace kernels +} // namespace gko diff --git a/dpcpp/preconditioner/jacobi_generate_instantiate.inc.dp.cpp b/dpcpp/preconditioner/jacobi_generate_instantiate.inc.dp.cpp new file mode 100644 index 00000000000..2a759b743d8 --- /dev/null +++ b/dpcpp/preconditioner/jacobi_generate_instantiate.inc.dp.cpp @@ -0,0 +1,364 @@ +/************************************************************* +Copyright (c) 2017-2021, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#include "core/preconditioner/jacobi_kernels.hpp" + + +#include +#include + + +#include + + +#include +#include + + +#include "core/base/extended_float.hpp" +#include "core/components/fill_array_kernels.hpp" +#include "core/preconditioner/jacobi_utils.hpp" +#include "core/synthesizer/implementation_selection.hpp" +#include "dpcpp/base/config.hpp" +#include "dpcpp/base/dim3.dp.hpp" +#include "dpcpp/base/dpct.hpp" +#include "dpcpp/base/math.hpp" +#include "dpcpp/components/cooperative_groups.dp.hpp" +#include "dpcpp/components/thread_ids.dp.hpp" +#include "dpcpp/components/uninitialized_array.hpp" +#include "dpcpp/components/warp_blas.dp.hpp" + + +namespace gko { +namespace kernels { +namespace dpcpp { +/** + * @brief The Jacobi preconditioner namespace. + * @ref Jacobi + * @ingroup jacobi + */ +namespace jacobi { + + +namespace kernel { + + +template +__dpct_inline__ bool validate_precision_reduction_feasibility( + Group& __restrict__ group, IndexType block_size, + ValueType* __restrict__ row, ValueType* __restrict__ work, size_type stride) +{ + using gko::detail::float_traits; + // save original data and reduce precision + if (group.thread_rank() < block_size) { +#pragma unroll + for (auto i = 0u; i < max_block_size; ++i) { + if (i < block_size) { + work[i * stride + group.thread_rank()] = row[i]; + row[i] = + static_cast(static_cast(row[i])); + } + } + } + + // compute the condition number + uint32 perm = group.thread_rank(); + uint32 trans_perm = perm; + auto block_cond = compute_infinity_norm(group, block_size, + block_size, row); + auto succeeded = invert_block( + group, static_cast(block_size), row, perm, trans_perm); + block_cond *= compute_infinity_norm(group, block_size, + block_size, row); + + // restore original data + if (group.thread_rank() < block_size) { +#pragma unroll + for (auto i = 0u; i < max_block_size; ++i) { + if (i < block_size) { + row[i] = work[i * stride + group.thread_rank()]; + } + } + } + + return succeeded && block_cond >= 1.0 && + block_cond * float_traits>::eps < 1e-3; +} + + +template +void generate( + size_type num_rows, const IndexType* __restrict__ row_ptrs, + const IndexType* __restrict__ col_idxs, + const ValueType* __restrict__ values, ValueType* __restrict__ block_data, + preconditioner::block_interleaved_storage_scheme storage_scheme, + const IndexType* __restrict__ block_ptrs, size_type num_blocks, + sycl::nd_item<3> item_ct1, + UninitializedArray* workspace) +{ + const auto block_id = + thread::get_subwarp_id(item_ct1); + const auto block = group::this_thread_block(item_ct1); + ValueType row[max_block_size]; + + csr::extract_transposed_diag_blocks( + block, config::warp_size / subwarp_size, row_ptrs, col_idxs, values, + block_ptrs, num_blocks, row, 1, + *workspace + item_ct1.get_local_id(0) * max_block_size); + const auto subwarp = group::tiled_partition(block); + if (block_id < num_blocks) { + const auto block_size = block_ptrs[block_id + 1] - block_ptrs[block_id]; + uint32 perm = subwarp.thread_rank(); + uint32 trans_perm = subwarp.thread_rank(); + invert_block(subwarp, static_cast(block_size), + row, perm, trans_perm); + copy_matrix( + subwarp, block_size, row, 1, perm, trans_perm, + block_data + storage_scheme.get_global_block_offset(block_id), + storage_scheme.get_stride()); + } +} + +template +void generate( + dim3 grid, dim3 block, size_type dynamic_shared_memory, sycl::queue* queue, + size_type num_rows, const IndexType* row_ptrs, const IndexType* col_idxs, + const 
ValueType* values, ValueType* block_data, + preconditioner::block_interleaved_storage_scheme storage_scheme, + const IndexType* block_ptrs, size_type num_blocks) +{ + queue->submit([&](sycl::handler& cgh) { + sycl::accessor< + UninitializedArray, 0, + sycl::access_mode::read_write, sycl::access::target::local> + workspace_acc_ct1(cgh); + + cgh.parallel_for( + sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + generate( + num_rows, row_ptrs, col_idxs, values, block_data, + storage_scheme, block_ptrs, num_blocks, item_ct1, + workspace_acc_ct1.get_pointer().get()); + }); + }); +} + + +template +void adaptive_generate( + size_type num_rows, const IndexType* __restrict__ row_ptrs, + const IndexType* __restrict__ col_idxs, + const ValueType* __restrict__ values, remove_complex accuracy, + ValueType* __restrict__ block_data, + preconditioner::block_interleaved_storage_scheme storage_scheme, + remove_complex* __restrict__ conditioning, + precision_reduction* __restrict__ block_precisions, + const IndexType* __restrict__ block_ptrs, size_type num_blocks, + sycl::nd_item<3> item_ct1, + UninitializedArray* workspace) +{ + // extract blocks + const auto block_id = + thread::get_subwarp_id(item_ct1); + const auto block = group::this_thread_block(item_ct1); + ValueType row[max_block_size]; + + csr::extract_transposed_diag_blocks( + block, config::warp_size / subwarp_size, row_ptrs, col_idxs, values, + block_ptrs, num_blocks, row, 1, + *workspace + item_ct1.get_local_id(0) * max_block_size); + + // compute inverse and figure out the correct precision + const auto subwarp = group::tiled_partition(block); + const uint32 block_size = + block_id < num_blocks ? block_ptrs[block_id + 1] - block_ptrs[block_id] + : 0; + uint32 perm = subwarp.thread_rank(); + uint32 trans_perm = subwarp.thread_rank(); + auto prec_descriptor = ~uint32{}; + if (block_id < num_blocks) { + auto block_cond = compute_infinity_norm( + subwarp, block_size, block_size, row); + invert_block(subwarp, block_size, row, perm, + trans_perm); + block_cond *= compute_infinity_norm(subwarp, block_size, + block_size, row); + conditioning[block_id] = block_cond; + const auto prec = block_precisions[block_id]; + prec_descriptor = + preconditioner::detail::precision_reduction_descriptor::singleton( + prec); + if (prec == precision_reduction::autodetect()) { + using preconditioner::detail::get_supported_storage_reductions; + prec_descriptor = get_supported_storage_reductions( + accuracy, block_cond, + [&subwarp, &block_size, &row, &block_data, &storage_scheme, + &block_id] { + using target = reduce_precision; + return validate_precision_reduction_feasibility< + max_block_size, target>( + subwarp, block_size, row, + block_data + + storage_scheme.get_global_block_offset(block_id), + storage_scheme.get_stride()); + }, + [&subwarp, &block_size, &row, &block_data, &storage_scheme, + &block_id] { + using target = + reduce_precision>; + return validate_precision_reduction_feasibility< + max_block_size, target>( + subwarp, block_size, row, + block_data + + storage_scheme.get_global_block_offset(block_id), + storage_scheme.get_stride()); + }); + } + } + + // make sure all blocks in the group have the same precision + const auto warp = group::tiled_partition(block); + const auto prec = + preconditioner::detail::get_optimal_storage_reduction(reduce( + warp, prec_descriptor, [](uint32 x, uint32 y) { return x & y; })); + + // store the block back into memory + if (block_id < num_blocks) { + block_precisions[block_id] = prec; + 
GKO_PRECONDITIONER_JACOBI_RESOLVE_PRECISION( + ValueType, prec, + copy_matrix( + subwarp, block_size, row, 1, perm, trans_perm, + reinterpret_cast( + block_data + storage_scheme.get_group_offset(block_id)) + + storage_scheme.get_block_offset(block_id), + storage_scheme.get_stride())); + } +} + +template +void adaptive_generate( + dim3 grid, dim3 block, size_type dynamic_shared_memory, sycl::queue* queue, + size_type num_rows, const IndexType* row_ptrs, const IndexType* col_idxs, + const ValueType* values, remove_complex accuracy, + ValueType* block_data, + preconditioner::block_interleaved_storage_scheme storage_scheme, + remove_complex* conditioning, + precision_reduction* block_precisions, const IndexType* block_ptrs, + size_type num_blocks) +{ + queue->submit([&](sycl::handler& cgh) { + sycl::accessor< + UninitializedArray, 0, + sycl::access_mode::read_write, sycl::access::target::local> + workspace_acc_ct1(cgh); + + cgh.parallel_for(sycl_nd_range(grid, block), [=](sycl::nd_item<3> + item_ct1) { + adaptive_generate( + num_rows, row_ptrs, col_idxs, values, accuracy, block_data, + storage_scheme, conditioning, block_precisions, block_ptrs, + num_blocks, item_ct1, workspace_acc_ct1.get_pointer().get()); + }); + }); +} + + +} // namespace kernel + + +// clang-format off +//#cmakedefine GKO_JACOBI_BLOCK_SIZE @GKO_JACOBI_BLOCK_SIZE@ +// clang-format on +// make things easier for IDEs +#ifndef GKO_JACOBI_BLOCK_SIZE +#define GKO_JACOBI_BLOCK_SIZE 1 +#endif + + +template +void generate(syn::value_list, + const matrix::Csr* mtx, + remove_complex accuracy, ValueType* block_data, + const preconditioner::block_interleaved_storage_scheme& + storage_scheme, + remove_complex* conditioning, + precision_reduction* block_precisions, + const IndexType* block_ptrs, size_type num_blocks) +{ + constexpr int subwarp_size = get_larger_power(max_block_size); + constexpr int blocks_per_warp = config::warp_size / subwarp_size; + const dim3 grid_size(ceildiv(num_blocks, warps_per_block * blocks_per_warp), + 1, 1); + const dim3 block_size(subwarp_size, blocks_per_warp, warps_per_block); + + if (block_precisions) { + kernel::adaptive_generate( + grid_size, block_size, 0, exec->get_queue(), mtx->get_size()[0], + mtx->get_const_row_ptrs(), mtx->get_const_col_idxs(), + mtx->get_const_values(), accuracy, block_data, storage_scheme, + conditioning, block_precisions, block_ptrs, num_blocks); + } else { + kernel::generate( + grid_size, block_size, 0, exec->get_queue(), mtx->get_size()[0], + mtx->get_const_row_ptrs(), mtx->get_const_col_idxs(), + mtx->get_const_values(), block_data, storage_scheme, block_ptrs, + num_blocks); + } +} + + +#define DECLARE_JACOBI_GENERATE_INSTANTIATION(ValueType, IndexType) \ + void generate( \ + syn::value_list, \ + const matrix::Csr*, remove_complex, \ + ValueType*, \ + const preconditioner::block_interleaved_storage_scheme&, \ + remove_complex*, precision_reduction*, const IndexType*, \ + size_type) + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + DECLARE_JACOBI_GENERATE_INSTANTIATION); + + +} // namespace jacobi +} // namespace dpcpp +} // namespace kernels +} // namespace gko diff --git a/dpcpp/preconditioner/jacobi_generate_kernel.dp.cpp b/dpcpp/preconditioner/jacobi_generate_kernel.dp.cpp new file mode 100644 index 00000000000..22e3f587748 --- /dev/null +++ b/dpcpp/preconditioner/jacobi_generate_kernel.dp.cpp @@ -0,0 +1,103 @@ +/************************************************************* +Copyright (c) 2017-2021, the Ginkgo authors +All rights reserved. 
+ +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include "core/preconditioner/jacobi_kernels.hpp" + + +#include + + +#include +#include + + +#include "core/components/fill_array_kernels.hpp" +#include "core/synthesizer/implementation_selection.hpp" + + +namespace gko { +namespace kernels { +namespace dpcpp { +/** + * @brief The Jacobi preconditioner namespace. 
+ * @ref Jacobi + * @ingroup jacobi + */ +namespace jacobi { + + +template +void generate(syn::value_list, + const matrix::Csr* mtx, + remove_complex accuracy, ValueType* block_data, + const preconditioner::block_interleaved_storage_scheme& + storage_scheme, + remove_complex* conditioning, + precision_reduction* block_precisions, + const IndexType* block_ptrs, size_type num_blocks); + +GKO_ENABLE_IMPLEMENTATION_SELECTION(select_generate, generate); + + +template +void generate(std::shared_ptr exec, + const matrix::Csr* system_matrix, + size_type num_blocks, uint32 max_block_size, + remove_complex accuracy, + const preconditioner::block_interleaved_storage_scheme& + storage_scheme, + Array>& conditioning, + Array& block_precisions, + const Array& block_pointers, Array& blocks) +{ + components::fill_array(exec, blocks.get_data(), blocks.get_num_elems(), + zero()); + select_generate( + compiled_kernels(), + [&](int compiled_block_size) { + return max_block_size <= compiled_block_size; + }, + syn::value_list(), syn::type_list<>(), + system_matrix, accuracy, blocks.get_data(), storage_scheme, + conditioning.get_data(), block_precisions.get_data(), + block_pointers.get_const_data(), num_blocks); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_JACOBI_GENERATE_KERNEL); + + +} // namespace jacobi +} // namespace dpcpp +} // namespace kernels +} // namespace gko diff --git a/dpcpp/preconditioner/jacobi_kernels.dp.cpp b/dpcpp/preconditioner/jacobi_kernels.dp.cpp index 5406eaaf6cf..71c8101a8ee 100644 --- a/dpcpp/preconditioner/jacobi_kernels.dp.cpp +++ b/dpcpp/preconditioner/jacobi_kernels.dp.cpp @@ -33,26 +33,23 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/preconditioner/jacobi_kernels.hpp" -#include -#include -#include -#include -#include +#include #include #include -#include -#include -#include -#include "core/base/allocator.hpp" #include "core/base/extended_float.hpp" #include "core/preconditioner/jacobi_utils.hpp" -#include "dpcpp/components/matrix_operations.dp.hpp" +#include "core/synthesizer/implementation_selection.hpp" +#include "dpcpp/base/config.hpp" +#include "dpcpp/base/dim3.dp.hpp" +#include "dpcpp/base/math.hpp" +#include "dpcpp/components/cooperative_groups.dp.hpp" +#include "dpcpp/components/thread_ids.dp.hpp" namespace gko { @@ -60,185 +57,416 @@ namespace kernels { namespace dpcpp { /** * @brief The Jacobi preconditioner namespace. 
- * + * @ref Jacobi * @ingroup jacobi */ namespace jacobi { - - -void initialize_precisions(std::shared_ptr exec, - const array& source, - array& precisions) - GKO_NOT_IMPLEMENTED; - - namespace { -template -inline bool has_same_nonzero_pattern( - const IndexType* prev_row_ptr, const IndexType* curr_row_ptr, - const IndexType* next_row_ptr) GKO_NOT_IMPLEMENTED; - - -template -size_type find_natural_blocks(const matrix::Csr* mtx, - uint32 max_block_size, - IndexType* block_ptrs) GKO_NOT_IMPLEMENTED; +// a total of 32 warps (1024 threads) +constexpr int default_num_warps = 32; +// with current architectures, at most 32 warps can be scheduled per SM (and +// current GPUs have at most 84 SMs) +constexpr int default_grid_size = 32 * 32 * 128; + + +template +void duplicate_array(const precision_reduction* __restrict__ source, + size_type source_size, + precision_reduction* __restrict__ dest, + size_type dest_size, sycl::nd_item<3> item_ct1) +{ + auto grid = group::this_grid(item_ct1); + if (grid.thread_rank() >= dest_size) { + return; + } + for (auto i = grid.thread_rank(); i < dest_size; i += grid.size()) { + dest[i] = source[i % source_size]; + } +} + +template +void duplicate_array(dim3 grid, dim3 block, size_type dynamic_shared_memory, + sycl::queue* queue, const precision_reduction* source, + size_type source_size, precision_reduction* dest, + size_type dest_size) +{ + queue->parallel_for(sycl_nd_range(grid, block), + [=](sycl::nd_item<3> item_ct1) { + duplicate_array( + source, source_size, dest, dest_size, item_ct1); + }); +} template -inline size_type agglomerate_supervariables( - uint32 max_block_size, size_type num_natural_blocks, - IndexType* block_ptrs) GKO_NOT_IMPLEMENTED; - - -} // namespace - - -template -void find_blocks(std::shared_ptr exec, - const matrix::Csr* system_matrix, - uint32 max_block_size, size_type& num_blocks, - array& block_pointers) GKO_NOT_IMPLEMENTED; +void compare_adjacent_rows(size_type num_rows, int32 max_block_size, + const IndexType* __restrict__ row_ptrs, + const IndexType* __restrict__ col_idx, + bool* __restrict__ matching_next_row, + sycl::nd_item<3> item_ct1) +{ + const auto warp = group::tiled_partition( + group::this_thread_block(item_ct1)); + const auto local_tid = warp.thread_rank(); + const auto warp_id = + thread::get_subwarp_id_flat(item_ct1); + + if (warp_id >= num_rows - 1) { + return; + } + + const auto curr_row_start = row_ptrs[warp_id]; + const auto next_row_start = row_ptrs[warp_id + 1]; + const auto next_row_end = row_ptrs[warp_id + 2]; + + const auto nz_this_row = next_row_end - next_row_start; + const auto nz_prev_row = next_row_start - curr_row_start; + + if (nz_this_row != nz_prev_row) { + matching_next_row[warp_id] = false; + return; + } + size_type steps = ceildiv(nz_this_row, config::warp_size); + for (size_type i = 0; i < steps; i++) { + auto j = local_tid + i * config::warp_size; + auto prev_col = (curr_row_start + j < next_row_start) + ? col_idx[curr_row_start + j] + : 0; + auto this_col = (curr_row_start + j < next_row_start) + ? 
col_idx[next_row_start + j] + : 0; + if (warp.any(prev_col != this_col)) { + matching_next_row[warp_id] = false; + return; + } + } + matching_next_row[warp_id] = true; +} -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_JACOBI_FIND_BLOCKS_KERNEL); +template +void compare_adjacent_rows(dim3 grid, dim3 block, + size_type dynamic_shared_memory, sycl::queue* queue, + size_type num_rows, int32 max_block_size, + const IndexType* row_ptrs, const IndexType* col_idx, + bool* matching_next_row) +{ + queue->parallel_for( + sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + compare_adjacent_rows(num_rows, max_block_size, row_ptrs, col_idx, + matching_next_row, item_ct1); + }); +} -namespace { +template +void generate_natural_block_pointer(size_type num_rows, int32 max_block_size, + const bool* __restrict__ matching_next_row, + IndexType* __restrict__ block_ptrs, + size_type* __restrict__ num_blocks_arr) +{ + block_ptrs[0] = 0; + if (num_rows == 0) { + return; + } + size_type num_blocks = 1; + int32 current_block_size = 1; + for (size_type i = 0; i < num_rows - 1; ++i) { + if ((matching_next_row[i]) && (current_block_size < max_block_size)) { + ++current_block_size; + } else { + block_ptrs[num_blocks] = + block_ptrs[num_blocks - 1] + current_block_size; + ++num_blocks; + current_block_size = 1; + } + } + block_ptrs[num_blocks] = block_ptrs[num_blocks - 1] + current_block_size; + num_blocks_arr[0] = num_blocks; +} +template +void generate_natural_block_pointer( + dim3 grid, dim3 block, size_type dynamic_shared_memory, sycl::queue* queue, + size_type num_rows, int32 max_block_size, const bool* matching_next_row, + IndexType* block_ptrs, size_type* num_blocks_arr) +{ + queue->parallel_for( + sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + generate_natural_block_pointer(num_rows, max_block_size, + matching_next_row, block_ptrs, + num_blocks_arr); + }); +} -template -inline void extract_block(const matrix::Csr* mtx, - IndexType block_size, IndexType block_start, - ValueType* block, - size_type stride) GKO_NOT_IMPLEMENTED; +template +void agglomerate_supervariables_kernel(int32 max_block_size, + size_type num_natural_blocks, + IndexType* __restrict__ block_ptrs, + size_type* __restrict__ num_blocks_arr) +{ + num_blocks_arr[0] = 0; + if (num_natural_blocks == 0) { + return; + } + size_type num_blocks = 1; + int32 current_block_size = block_ptrs[1] - block_ptrs[0]; + for (size_type i = 1; i < num_natural_blocks; ++i) { + const int32 block_size = block_ptrs[i + 1] - block_ptrs[i]; + if (current_block_size + block_size <= max_block_size) { + current_block_size += block_size; + } else { + block_ptrs[num_blocks] = block_ptrs[i]; + ++num_blocks; + current_block_size = block_size; + } + } + block_ptrs[num_blocks] = block_ptrs[num_natural_blocks]; + num_blocks_arr[0] = num_blocks; +} -template -inline IndexType choose_pivot(IndexType block_size, const ValueType* block, - size_type stride) GKO_NOT_IMPLEMENTED; +template +void agglomerate_supervariables_kernel(dim3 grid, dim3 block, + size_type dynamic_shared_memory, + sycl::queue* queue, int32 max_block_size, + size_type num_natural_blocks, + IndexType* block_ptrs, + size_type* num_blocks_arr) +{ + queue->parallel_for( + sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + agglomerate_supervariables_kernel( + max_block_size, num_natural_blocks, block_ptrs, num_blocks_arr); + }); +} + + +template +void transpose_jacobi( + const ValueType* __restrict__ blocks, + preconditioner::block_interleaved_storage_scheme 
storage_scheme, + const IndexType* __restrict__ block_ptrs, size_type num_blocks, + ValueType* __restrict__ out_blocks, sycl::nd_item<3> item_ct1) +{ + const auto block_id = + thread::get_subwarp_id(item_ct1); + const auto subwarp = group::tiled_partition( + group::this_thread_block(item_ct1)); + if (block_id >= num_blocks) { + return; + } + const auto block_size = block_ptrs[block_id + 1] - block_ptrs[block_id]; + + const auto block_ofs = storage_scheme.get_global_block_offset(block_id); + const auto block_stride = storage_scheme.get_stride(); + const auto rank = subwarp.thread_rank(); + if (rank < block_size) { + for (IndexType i = 0; i < block_size; ++i) { + auto val = blocks[block_ofs + i * block_stride + rank]; + /* + DPCT1007:4: Migration of this CUDA API is not supported by the + Intel(R) DPC++ Compatibility Tool. + */ + out_blocks[block_ofs + i + rank * block_stride] = + conjugate ? conj(val) : val; + } + } +} + +template +void transpose_jacobi( + dim3 grid, dim3 block, size_type dynamic_shared_memory, sycl::queue* queue, + const ValueType* blocks, + preconditioner::block_interleaved_storage_scheme storage_scheme, + const IndexType* block_ptrs, size_type num_blocks, ValueType* out_blocks) +{ + queue->parallel_for( + sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + transpose_jacobi(blocks, storage_scheme, + block_ptrs, num_blocks, + out_blocks, item_ct1); + }); +} + + +template +void adaptive_transpose_jacobi( + const ValueType* __restrict__ blocks, + preconditioner::block_interleaved_storage_scheme storage_scheme, + const precision_reduction* __restrict__ block_precisions, + const IndexType* __restrict__ block_ptrs, size_type num_blocks, + ValueType* __restrict__ out_blocks, sycl::nd_item<3> item_ct1) +{ + const auto block_id = + thread::get_subwarp_id(item_ct1); + const auto subwarp = group::tiled_partition( + group::this_thread_block(item_ct1)); + if (block_id >= num_blocks) { + return; + } + const auto block_size = block_ptrs[block_id + 1] - block_ptrs[block_id]; + + const auto block_stride = storage_scheme.get_stride(); + const auto rank = subwarp.thread_rank(); + if (rank < block_size) { + /* + DPCT1007:6: Migration of this CUDA API is not supported by the Intel(R) + DPC++ Compatibility Tool. + */ + GKO_PRECONDITIONER_JACOBI_RESOLVE_PRECISION( + ValueType, block_precisions[block_id], + auto local_block = + reinterpret_cast( + blocks + storage_scheme.get_group_offset(block_id)) + + storage_scheme.get_block_offset(block_id); + auto local_out_block = + reinterpret_cast( + out_blocks + storage_scheme.get_group_offset(block_id)) + + storage_scheme.get_block_offset(block_id); + for (IndexType i = 0; i < block_size; ++i) { + auto val = local_block[i * block_stride + rank]; + local_out_block[i + rank * block_stride] = + conjugate ? 
conj(val) : val; + }); + } +} + +template +void adaptive_transpose_jacobi( + dim3 grid, dim3 block, size_type dynamic_shared_memory, sycl::queue* queue, + const ValueType* blocks, + preconditioner::block_interleaved_storage_scheme storage_scheme, + const precision_reduction* block_precisions, const IndexType* block_ptrs, + size_type num_blocks, ValueType* out_blocks) +{ + queue->parallel_for( + sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + adaptive_transpose_jacobi( + blocks, storage_scheme, block_precisions, block_ptrs, + num_blocks, out_blocks, item_ct1); + }); +} template -inline void swap_rows(IndexType row1, IndexType row2, IndexType block_size, - ValueType* block, size_type stride) GKO_NOT_IMPLEMENTED; +size_type find_natural_blocks(std::shared_ptr exec, + const matrix::Csr* mtx, + int32 max_block_size, + IndexType* __restrict__ block_ptrs) +{ + Array nums(exec, 1); + + Array matching_next_row(exec, mtx->get_size()[0] - 1); + + const dim3 block_size(config::warp_size, 1, 1); + const dim3 grid_size( + ceildiv(mtx->get_size()[0] * config::warp_size, block_size.x), 1, 1); + compare_adjacent_rows(grid_size, block_size, 0, exec->get_queue(), + mtx->get_size()[0], max_block_size, + mtx->get_const_row_ptrs(), mtx->get_const_col_idxs(), + matching_next_row.get_data()); + generate_natural_block_pointer( + 1, 1, 0, exec->get_queue(), mtx->get_size()[0], max_block_size, + matching_next_row.get_const_data(), block_ptrs, nums.get_data()); + nums.set_executor(exec->get_master()); + return nums.get_const_data()[0]; +} -template -inline bool apply_gauss_jordan_transform(IndexType row, IndexType col, - IndexType block_size, ValueType* block, - size_type stride) GKO_NOT_IMPLEMENTED; - - -template > -inline void transpose_block( - IndexType block_size, const SourceValueType* from, size_type from_stride, - ResultValueType* to, size_type to_stride, - ValueConverter converter = {}) noexcept GKO_NOT_IMPLEMENTED; - - -template > -inline void conj_transpose_block( - IndexType block_size, const SourceValueType* from, size_type from_stride, - ResultValueType* to, size_type to_stride, - ValueConverter converter = {}) noexcept GKO_NOT_IMPLEMENTED; - - -template > -inline void permute_and_transpose_block( - IndexType block_size, const IndexType* col_perm, - const SourceValueType* source, size_type source_stride, - ResultValueType* result, size_type result_stride, - ValueConverter converter = {}) GKO_NOT_IMPLEMENTED; +template +inline size_type agglomerate_supervariables( + std::shared_ptr exec, int32 max_block_size, + size_type num_natural_blocks, IndexType* block_ptrs) +{ + Array nums(exec, 1); + agglomerate_supervariables_kernel(1, 1, 0, exec->get_queue(), + max_block_size, num_natural_blocks, + block_ptrs, nums.get_data()); -template -inline bool invert_block(IndexType block_size, IndexType* perm, - ValueType* block, - size_type stride) GKO_NOT_IMPLEMENTED; + nums.set_executor(exec->get_master()); + return nums.get_const_data()[0]; +} -template -inline bool validate_precision_reduction_feasibility( - std::shared_ptr exec, IndexType block_size, - const ValueType* block, size_type stride) GKO_NOT_IMPLEMENTED; +} // namespace -} // namespace +void initialize_precisions(std::shared_ptr exec, + const Array& source, + Array& precisions) +{ + const auto block_size = default_num_warps * config::warp_size; + const auto grid_size = min( + default_grid_size, + static_cast(ceildiv(precisions.get_num_elems(), block_size))); + duplicate_array( + grid_size, block_size, 0, exec->get_queue(), 
source.get_const_data(), + source.get_num_elems(), precisions.get_data(), + precisions.get_num_elems()); +} template -void generate(std::shared_ptr exec, - const matrix::Csr* system_matrix, - size_type num_blocks, uint32 max_block_size, - remove_complex accuracy, - const preconditioner::block_interleaved_storage_scheme& - storage_scheme, - array>& conditioning, - array& block_precisions, - const array& block_pointers, - array& blocks) GKO_NOT_IMPLEMENTED; +void find_blocks(std::shared_ptr exec, + const matrix::Csr* system_matrix, + uint32 max_block_size, size_type& num_blocks, + Array& block_pointers) +{ + auto num_natural_blocks = find_natural_blocks( + exec, system_matrix, max_block_size, block_pointers.get_data()); + num_blocks = agglomerate_supervariables( + exec, max_block_size, num_natural_blocks, block_pointers.get_data()); +} GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_JACOBI_GENERATE_KERNEL); + GKO_DECLARE_JACOBI_FIND_BLOCKS_KERNEL); namespace { -template < - typename ValueType, typename BlockValueType, - typename ValueConverter = default_converter> -inline void apply_block(size_type block_size, size_type num_rhs, - const BlockValueType* block, size_type stride, - ValueType alpha, const ValueType* b, size_type stride_b, - ValueType beta, ValueType* x, size_type stride_x, - ValueConverter converter = {}) GKO_NOT_IMPLEMENTED; - - -} // namespace - - -template -void apply(std::shared_ptr exec, size_type num_blocks, - uint32 max_block_size, - const preconditioner::block_interleaved_storage_scheme& - storage_scheme, - const array& block_precisions, - const array& block_pointers, - const array& blocks, - const matrix::Dense* alpha, - const matrix::Dense* b, - const matrix::Dense* beta, - matrix::Dense* x) GKO_NOT_IMPLEMENTED; - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_JACOBI_APPLY_KERNEL); - - -template -void simple_apply( - std::shared_ptr exec, size_type num_blocks, - uint32 max_block_size, +template +void transpose_jacobi( + syn::value_list, size_type num_blocks, + const precision_reduction* block_precisions, + const IndexType* block_pointers, const ValueType* blocks, const preconditioner::block_interleaved_storage_scheme& storage_scheme, - const array& block_precisions, - const array& block_pointers, const array& blocks, - const matrix::Dense* b, - matrix::Dense* x) GKO_NOT_IMPLEMENTED; + ValueType* out_blocks) +{ + constexpr int subwarp_size = get_larger_power(max_block_size); + constexpr int blocks_per_warp = config::warp_size / subwarp_size; + const dim3 grid_size(ceildiv(num_blocks, warps_per_block * blocks_per_warp), + 1, 1); + const dim3 block_size(subwarp_size, blocks_per_warp, warps_per_block); + + if (block_precisions) { + adaptive_transpose_jacobi( + grid_size, block_size, 0, exec->get_queue(), blocks, storage_scheme, + block_precisions, block_pointers, num_blocks, out_blocks); + } else { + transpose_jacobi( + grid_size, block_size, 0, exec->get_queue(), blocks, storage_scheme, + block_pointers, num_blocks, out_blocks); + } +} + +GKO_ENABLE_IMPLEMENTATION_SELECTION(select_transpose_jacobi, transpose_jacobi); -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_JACOBI_SIMPLE_APPLY_KERNEL); + +} // namespace template @@ -248,7 +476,18 @@ void transpose_jacobi( const array& block_pointers, const array& blocks, const preconditioner::block_interleaved_storage_scheme& storage_scheme, - array& out_blocks) GKO_NOT_IMPLEMENTED; + Array& out_blocks) +{ + select_transpose_jacobi( + compiled_kernels(), + [&](int compiled_block_size) { + 
return max_block_size <= compiled_block_size; + }, + syn::value_list(), + syn::type_list<>(), num_blocks, block_precisions.get_const_data(), + block_pointers.get_const_data(), blocks.get_const_data(), + storage_scheme, out_blocks.get_data()); +} GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_JACOBI_TRANSPOSE_KERNEL); @@ -261,7 +500,18 @@ void conj_transpose_jacobi( const array& block_pointers, const array& blocks, const preconditioner::block_interleaved_storage_scheme& storage_scheme, - array& out_blocks) GKO_NOT_IMPLEMENTED; + Array& out_blocks) +{ + select_transpose_jacobi( + compiled_kernels(), + [&](int compiled_block_size) { + return max_block_size <= compiled_block_size; + }, + syn::value_list(), + syn::type_list<>(), num_blocks, block_precisions.get_const_data(), + block_pointers.get_const_data(), blocks.get_const_data(), + storage_scheme, out_blocks.get_data()); +} GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_JACOBI_CONJ_TRANSPOSE_KERNEL); @@ -269,9 +519,9 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( template void convert_to_dense( - std::shared_ptr exec, size_type num_blocks, - const array& block_precisions, - const array& block_pointers, const array& blocks, + std::shared_ptr exec, size_type num_blocks, + const Array& block_precisions, + const Array& block_pointers, const Array& blocks, const preconditioner::block_interleaved_storage_scheme& storage_scheme, ValueType* result_values, size_type result_stride) GKO_NOT_IMPLEMENTED; diff --git a/dpcpp/preconditioner/jacobi_simple_apply_instantiate.inc.dp.cpp b/dpcpp/preconditioner/jacobi_simple_apply_instantiate.inc.dp.cpp new file mode 100644 index 00000000000..9edeb6f552c --- /dev/null +++ b/dpcpp/preconditioner/jacobi_simple_apply_instantiate.inc.dp.cpp @@ -0,0 +1,229 @@ +/************************************************************* +Copyright (c) 2017-2021, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#include "core/preconditioner/jacobi_kernels.hpp" + + +#include + + +#include + + +#include + + +#include "core/base/extended_float.hpp" +#include "core/matrix/dense_kernels.hpp" +#include "core/preconditioner/jacobi_utils.hpp" +#include "core/synthesizer/implementation_selection.hpp" +#include "dpcpp/base/config.hpp" +#include "dpcpp/base/dim3.dp.hpp" +#include "dpcpp/base/math.hpp" +#include "dpcpp/components/cooperative_groups.dp.hpp" +#include "dpcpp/components/thread_ids.dp.hpp" +#include "dpcpp/components/warp_blas.dp.hpp" + + +namespace gko { +namespace kernels { +namespace dpcpp { +/** + * @brief The Jacobi preconditioner namespace. + * @ref Jacobi + * @ingroup jacobi + */ +namespace jacobi { + + +namespace kernel { + + +template +void apply( + const ValueType* __restrict__ blocks, + preconditioner::block_interleaved_storage_scheme storage_scheme, + const IndexType* __restrict__ block_ptrs, size_type num_blocks, + const ValueType* __restrict__ b, int32 b_stride, ValueType* __restrict__ x, + int32 x_stride, sycl::nd_item<3> item_ct1) +{ + const auto block_id = + thread::get_subwarp_id(item_ct1); + const auto subwarp = group::tiled_partition( + group::this_thread_block(item_ct1)); + if (block_id >= num_blocks) { + return; + } + const auto block_size = block_ptrs[block_id + 1] - block_ptrs[block_id]; + ValueType v = zero(); + if (subwarp.thread_rank() < block_size) { + v = b[(block_ptrs[block_id] + subwarp.thread_rank()) * b_stride]; + } + multiply_vec( + subwarp, block_size, v, + blocks + storage_scheme.get_global_block_offset(block_id) + + subwarp.thread_rank(), + storage_scheme.get_stride(), x + block_ptrs[block_id] * x_stride, + x_stride, + [](ValueType& result, const ValueType& out) { result = out; }); +} + +template +void apply( + dim3 grid, dim3 block, size_type dynamic_shared_memory, sycl::queue* queue, + const ValueType* blocks, + preconditioner::block_interleaved_storage_scheme storage_scheme, + const IndexType* block_ptrs, size_type num_blocks, const ValueType* b, + int32 b_stride, ValueType* x, int32 x_stride) +{ + queue->parallel_for( + sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + apply( + blocks, storage_scheme, block_ptrs, num_blocks, b, b_stride, x, + x_stride, item_ct1); + }); +} + + +template +void adaptive_apply( + const ValueType* __restrict__ blocks, + preconditioner::block_interleaved_storage_scheme storage_scheme, + const precision_reduction* __restrict__ block_precisions, + const IndexType* __restrict__ block_ptrs, size_type num_blocks, + const ValueType* __restrict__ b, int32 b_stride, ValueType* __restrict__ x, + int32 x_stride, sycl::nd_item<3> item_ct1) +{ + const auto block_id = + thread::get_subwarp_id(item_ct1); + const auto subwarp = group::tiled_partition( + group::this_thread_block(item_ct1)); + if (block_id >= num_blocks) { + return; + } + const auto block_size = block_ptrs[block_id + 1] - block_ptrs[block_id]; + ValueType v = zero(); + if (subwarp.thread_rank() < block_size) { + v = b[(block_ptrs[block_id] + subwarp.thread_rank()) * b_stride]; + } + GKO_PRECONDITIONER_JACOBI_RESOLVE_PRECISION( + ValueType, block_precisions[block_id], + multiply_vec( + subwarp, block_size, v, + reinterpret_cast( + blocks + storage_scheme.get_group_offset(block_id)) + + storage_scheme.get_block_offset(block_id) + + subwarp.thread_rank(), + storage_scheme.get_stride(), x + block_ptrs[block_id] * x_stride, + x_stride, + [](ValueType& result, const ValueType& out) { result = out; })); +} 
+ +template +void adaptive_apply( + dim3 grid, dim3 block, size_type dynamic_shared_memory, sycl::queue* queue, + const ValueType* blocks, + preconditioner::block_interleaved_storage_scheme storage_scheme, + const precision_reduction* block_precisions, const IndexType* block_ptrs, + size_type num_blocks, const ValueType* b, int32 b_stride, ValueType* x, + int32 x_stride) +{ + queue->parallel_for( + sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + adaptive_apply( + blocks, storage_scheme, block_precisions, block_ptrs, + num_blocks, b, b_stride, x, x_stride, item_ct1); + }); +} + + +} // namespace kernel + + +// clang-format off +//#cmakedefine GKO_JACOBI_BLOCK_SIZE @GKO_JACOBI_BLOCK_SIZE@ +// clang-format on +// make things easier for IDEs +#ifndef GKO_JACOBI_BLOCK_SIZE +#define GKO_JACOBI_BLOCK_SIZE 1 +#endif + + +template +void apply(syn::value_list, size_type num_blocks, + const precision_reduction* block_precisions, + const IndexType* block_pointers, const ValueType* blocks, + const preconditioner::block_interleaved_storage_scheme& + storage_scheme, + const ValueType* b, size_type b_stride, ValueType* x, + size_type x_stride) +{ + constexpr int subwarp_size = get_larger_power(max_block_size); + constexpr int blocks_per_warp = config::warp_size / subwarp_size; + const dim3 grid_size(ceildiv(num_blocks, warps_per_block * blocks_per_warp), + 1, 1); + const dim3 block_size(subwarp_size, blocks_per_warp, warps_per_block); + + if (block_precisions) { + kernel::adaptive_apply( + grid_size, block_size, 0, exec->get_queue(), blocks, storage_scheme, + block_precisions, block_pointers, num_blocks, b, b_stride, x, + x_stride); + } else { + kernel::apply( + grid_size, block_size, 0, exec->get_queue(), blocks, storage_scheme, + block_pointers, num_blocks, b, b_stride, x, x_stride); + } +} + + +#define DECLARE_JACOBI_SIMPLE_APPLY_INSTANTIATION(ValueType, IndexType) \ + void apply( \ + syn::value_list, size_type, \ + const precision_reduction*, const IndexType*, const ValueType*, \ + const preconditioner::block_interleaved_storage_scheme&, \ + const ValueType*, size_type, ValueType*, size_type) + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + DECLARE_JACOBI_SIMPLE_APPLY_INSTANTIATION); + + +} // namespace jacobi +} // namespace dpcpp +} // namespace kernels +} // namespace gko diff --git a/dpcpp/preconditioner/jacobi_simple_apply_kernel.dp.cpp b/dpcpp/preconditioner/jacobi_simple_apply_kernel.dp.cpp new file mode 100644 index 00000000000..d5ee2fbf92a --- /dev/null +++ b/dpcpp/preconditioner/jacobi_simple_apply_kernel.dp.cpp @@ -0,0 +1,102 @@ +/************************************************************* +Copyright (c) 2017-2021, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include "core/preconditioner/jacobi_kernels.hpp" + + +#include + + +#include + + +#include "core/preconditioner/jacobi_utils.hpp" +#include "core/synthesizer/implementation_selection.hpp" + + +namespace gko { +namespace kernels { +namespace dpcpp { +/** + * @brief The Jacobi preconditioner namespace. + * @ref Jacobi + * @ingroup jacobi + */ +namespace jacobi { + + +template +void apply(syn::value_list, size_type num_blocks, + const precision_reduction* block_precisions, + const IndexType* block_pointers, const ValueType* blocks, + const preconditioner::block_interleaved_storage_scheme& + storage_scheme, + const ValueType* b, size_type b_stride, ValueType* x, + size_type x_stride); + +GKO_ENABLE_IMPLEMENTATION_SELECTION(select_apply, apply); + + +template +void simple_apply( + std::shared_ptr exec, size_type num_blocks, + uint32 max_block_size, + const preconditioner::block_interleaved_storage_scheme& + storage_scheme, + const Array& block_precisions, + const Array& block_pointers, const Array& blocks, + const matrix::Dense* b, matrix::Dense* x) +{ + // TODO: write a special kernel for multiple RHS + for (size_type col = 0; col < b->get_size()[1]; ++col) { + select_apply( + compiled_kernels(), + [&](int compiled_block_size) { + return max_block_size <= compiled_block_size; + }, + syn::value_list(), + syn::type_list<>(), num_blocks, block_precisions.get_const_data(), + block_pointers.get_const_data(), blocks.get_const_data(), + storage_scheme, b->get_const_values() + col, b->get_stride(), + x->get_values() + col, x->get_stride()); + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_JACOBI_SIMPLE_APPLY_KERNEL); + + +} // namespace jacobi +} // namespace dpcpp +} // namespace kernels +} // namespace gko From e3ea1faef5d51a52a434278ef683c1e78c1176a3 Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. Tsai" Date: Wed, 24 Nov 2021 18:02:36 +0800 Subject: [PATCH 02/11] script port --- .../diagonal_block_manipulation.dp.hpp | 128 ++++++++++++++++++ 1 file changed, 128 insertions(+) create mode 100644 dpcpp/components/diagonal_block_manipulation.dp.hpp diff --git a/dpcpp/components/diagonal_block_manipulation.dp.hpp b/dpcpp/components/diagonal_block_manipulation.dp.hpp new file mode 100644 index 00000000000..32e3e26739c --- /dev/null +++ b/dpcpp/components/diagonal_block_manipulation.dp.hpp @@ -0,0 +1,128 @@ +/************************************************************* +Copyright (c) 2017-2021, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GKO_DPCPP_COMPONENTS_DIAGONAL_BLOCK_MANIPULATION_DP_HPP_ +#define GKO_DPCPP_COMPONENTS_DIAGONAL_BLOCK_MANIPULATION_DP_HPP_ + + +#include + + +#include + + +#include "dpcpp/base/config.hpp" +#include "dpcpp/base/dpct.hpp" +#include "dpcpp/components/cooperative_groups.dp.hpp" + + +namespace gko { +namespace kernels { +namespace dpcpp { +namespace csr { + + +/** + * @internal + * + * @note assumes that block dimensions are in "standard format": + * (subwarp_size, config::warp_size / subwarp_size, z) + */ +template < + int max_block_size, int warps_per_block, typename Group, typename ValueType, + typename IndexType, + typename = std::enable_if_t::value>> +__dpct_inline__ void extract_transposed_diag_blocks( + const Group& group, int processed_blocks, + const IndexType* __restrict__ row_ptrs, + const IndexType* __restrict__ col_idxs, + const ValueType* __restrict__ values, + const IndexType* __restrict__ block_ptrs, size_type num_blocks, + ValueType* __restrict__ block_row, int increment, + ValueType* __restrict__ workspace, sycl::nd_item<3> item_ct1) +{ + const int tid = + item_ct1.get_local_id(1) * item_ct1.get_local_range().get(2) + + item_ct1.get_local_id(2); + const auto warp = group::tiled_partition(group); + auto bid = static_cast(item_ct1.get_group(2)) * warps_per_block * + processed_blocks + + item_ct1.get_local_id(0) * processed_blocks; + auto bstart = (bid < num_blocks) ? 
block_ptrs[bid] : zero(); + IndexType bsize = 0; +#pragma unroll + for (int b = 0; b < processed_blocks; ++b, ++bid) { + if (bid < num_blocks) { + bstart += bsize; + bsize = block_ptrs[bid + 1] - bstart; +#pragma unroll + for (int i = 0; i < max_block_size; ++i) { + if (i < bsize) { + if (item_ct1.get_local_id(1) == b && + item_ct1.get_local_id(2) < max_block_size) { + workspace[item_ct1.get_local_id(2)] = zero(); + } + warp.sync(); + const auto row = bstart + i; + const auto rstart = row_ptrs[row] + tid; + const auto rend = row_ptrs[row + 1]; + // use the entire warp to ensure coalesced memory access + for (auto j = rstart; j < rend; j += config::warp_size) { + const auto col = col_idxs[j] - bstart; + if (col >= bsize) { + break; + } + if (col >= 0) { + workspace[col] = values[j]; + } + } + warp.sync(); + if (item_ct1.get_local_id(1) == b && + item_ct1.get_local_id(2) < bsize) { + block_row[i * increment] = + workspace[item_ct1.get_local_id(2)]; + } + warp.sync(); + } + } + } + } +} + + +} // namespace csr +} // namespace dpcpp +} // namespace kernels +} // namespace gko + + +#endif // GKO_DPCPP_COMPONENTS_DIAGONAL_BLOCK_MANIPULATION_DP_HPP_ From abc1eeb38bf14c73849431da48a1592246d62506 Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. Tsai" Date: Wed, 24 Nov 2021 23:02:05 +0800 Subject: [PATCH 03/11] test script port --- dpcpp/test/CMakeLists.txt | 1 + dpcpp/test/preconditioner/CMakeLists.txt | 1 + dpcpp/test/preconditioner/jacobi_kernels.cpp | 900 +++++++++++++++++++ 3 files changed, 902 insertions(+) create mode 100644 dpcpp/test/preconditioner/CMakeLists.txt create mode 100644 dpcpp/test/preconditioner/jacobi_kernels.cpp diff --git a/dpcpp/test/CMakeLists.txt b/dpcpp/test/CMakeLists.txt index 00be1e71e35..b882f57715e 100644 --- a/dpcpp/test/CMakeLists.txt +++ b/dpcpp/test/CMakeLists.txt @@ -4,3 +4,4 @@ set(GINKGO_COMPILING_DPCPP_TEST ON) add_subdirectory(base) add_subdirectory(components) add_subdirectory(matrix) +add_subdirectory(preconditioner) diff --git a/dpcpp/test/preconditioner/CMakeLists.txt b/dpcpp/test/preconditioner/CMakeLists.txt new file mode 100644 index 00000000000..a0ca5a2e38a --- /dev/null +++ b/dpcpp/test/preconditioner/CMakeLists.txt @@ -0,0 +1 @@ +ginkgo_create_test(jacobi_kernels) diff --git a/dpcpp/test/preconditioner/jacobi_kernels.cpp b/dpcpp/test/preconditioner/jacobi_kernels.cpp new file mode 100644 index 00000000000..0302ae7c24e --- /dev/null +++ b/dpcpp/test/preconditioner/jacobi_kernels.cpp @@ -0,0 +1,900 @@ +/************************************************************* +Copyright (c) 2017-2022, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include + + +#include + + +#include + + +#include +#include + + +#include "core/test/utils.hpp" +#include "core/test/utils/unsort_matrix.hpp" + + +namespace { + + +class Jacobi : public ::testing::Test { +protected: + using Bj = gko::preconditioner::Jacobi<>; + using Mtx = gko::matrix::Csr<>; + using Vec = gko::matrix::Dense<>; + using mtx_data = gko::matrix_data<>; + + void SetUp() + { + ASSERT_GT(gko::DpcppExecutor::get_num_devices(), 0); + ref = gko::ReferenceExecutor::create(); + dpcpp = gko::DpcppExecutor::create(0, ref); + } + + void TearDown() + { + if (dpcpp != nullptr) { + ASSERT_NO_THROW(dpcpp->synchronize()); + } + } + + void initialize_data( + std::initializer_list block_pointers, + std::initializer_list block_precisions, + std::initializer_list condition_numbers, + gko::uint32 max_block_size, int min_nnz, int max_nnz, int num_rhs = 1, + double accuracy = 0.1, bool skip_sorting = true) + { + std::ranlux48 engine(42); + const auto dim = *(end(block_pointers) - 1); + if (condition_numbers.size() == 0) { + mtx = gko::test::generate_random_matrix( + dim, dim, std::uniform_int_distribution<>(min_nnz, max_nnz), + std::normal_distribution<>(0.0, 1.0), engine, ref); + } else { + std::vector blocks; + for (gko::size_type i = 0; i < block_pointers.size() - 1; ++i) { + const auto size = + begin(block_pointers)[i + 1] - begin(block_pointers)[i]; + const auto cond = begin(condition_numbers)[i]; + blocks.push_back(mtx_data::cond( + size, cond, std::normal_distribution<>(-1, 1), engine)); + } + mtx = Mtx::create(ref); + mtx->read(mtx_data::diag(begin(blocks), end(blocks))); + } + gko::Array block_ptrs(ref, block_pointers); + gko::Array block_prec(ref, block_precisions); + if (block_prec.get_num_elems() == 0) { + bj_factory = Bj::build() + .with_max_block_size(max_block_size) + .with_block_pointers(block_ptrs) + .with_skip_sorting(skip_sorting) + .on(ref); + d_bj_factory = Bj::build() + .with_max_block_size(max_block_size) + .with_block_pointers(block_ptrs) + .with_skip_sorting(skip_sorting) + .on(dpcpp); + } else { + bj_factory = Bj::build() + .with_max_block_size(max_block_size) + .with_block_pointers(block_ptrs) + .with_storage_optimization(block_prec) + .with_accuracy(accuracy) + .with_skip_sorting(skip_sorting) + .on(ref); + d_bj_factory = Bj::build() + .with_max_block_size(max_block_size) + .with_block_pointers(block_ptrs) + .with_storage_optimization(block_prec) + .with_accuracy(accuracy) + .with_skip_sorting(skip_sorting) + .on(dpcpp); + } + b = gko::test::generate_random_matrix( + dim, num_rhs, std::uniform_int_distribution<>(num_rhs, num_rhs), + std::normal_distribution<>(0.0, 1.0), engine, ref); + d_b = gko::clone(dpcpp, b); + x = gko::test::generate_random_matrix( + dim, num_rhs, std::uniform_int_distribution<>(num_rhs, num_rhs), + std::normal_distribution<>(0.0, 1.0), engine, ref); + d_x = gko::clone(dpcpp, x); + } + + const gko::precision_reduction dp{}; + const gko::precision_reduction sp{0, 1}; + 
const gko::precision_reduction hp{0, 2}; + const gko::precision_reduction tp{1, 0}; + const gko::precision_reduction qp{2, 0}; + const gko::precision_reduction up{1, 1}; + const gko::precision_reduction ap{gko::precision_reduction::autodetect()}; + + std::shared_ptr ref; + std::shared_ptr dpcpp; + std::shared_ptr mtx; + std::unique_ptr x; + std::unique_ptr b; + std::unique_ptr d_x; + std::unique_ptr d_b; + + std::unique_ptr bj_factory; + std::unique_ptr d_bj_factory; +}; + + +TEST_F(Jacobi, DpcppFindNaturalBlocksEquivalentToRef) +{ + /* example matrix: + 1 1 + 1 1 + 1 1 + 1 1 + */ + auto mtx = share(Mtx::create(ref)); + mtx->read({{4, 4}, + {{0, 0, 1.0}, + {0, 1, 1.0}, + {1, 0, 1.0}, + {1, 1, 1.0}, + {2, 0, 1.0}, + {2, 2, 1.0}, + {3, 0, 1.0}, + {3, 2, 1.0}}}); + + auto bj = Bj::build().with_max_block_size(3u).on(ref)->generate(mtx); + auto d_bj = Bj::build().with_max_block_size(3u).on(dpcpp)->generate(mtx); + + ASSERT_EQ(d_bj->get_num_blocks(), bj->get_num_blocks()); + // TODO: actually check if the results are the same +} + + +TEST_F(Jacobi, DpcppExecutesSupervariableAgglomerationEquivalentToRef) +{ + /* example matrix: + 1 1 + 1 1 + 1 1 + 1 1 + 1 + */ + auto mtx = share(Mtx::create(ref)); + mtx->read({{5, 5}, + {{0, 0, 1.0}, + {0, 1, 1.0}, + {1, 0, 1.0}, + {1, 1, 1.0}, + {2, 2, 1.0}, + {2, 3, 1.0}, + {3, 2, 1.0}, + {3, 3, 1.0}, + {4, 4, 1.0}}}); + + auto bj = Bj::build().with_max_block_size(3u).on(ref)->generate(mtx); + auto d_bj = Bj::build().with_max_block_size(3u).on(dpcpp)->generate(mtx); + + ASSERT_EQ(d_bj->get_num_blocks(), bj->get_num_blocks()); + // TODO: actually check if the results are the same +} + + +TEST_F(Jacobi, DpcppFindNaturalBlocksInLargeMatrixEquivalentToRef) +{ + /* example matrix: + 1 1 + 1 1 + 1 1 + 1 1 + 1 1 + 1 1 + */ + using data = gko::matrix_data; + auto mtx = share(Mtx::create(ref)); + mtx->read(data::diag({550, 550}, {{1.0, 1.0, 0.0, 0.0, 0.0, 0.0}, + {1.0, 1.0, 0.0, 0.0, 0.0, 0.0}, + {1.0, 0.0, 1.0, 0.0, 0.0, 0.0}, + {1.0, 0.0, 1.0, 0.0, 0.0, 0.0}, + {1.0, 0.0, 1.0, 0.0, 0.0, 0.0}, + {1.0, 0.0, 1.0, 0.0, 0.0, 0.0}})); + + auto bj = Bj::build().with_max_block_size(3u).on(ref)->generate(mtx); + auto d_bj = Bj::build().with_max_block_size(3u).on(dpcpp)->generate(mtx); + + ASSERT_EQ(d_bj->get_num_blocks(), bj->get_num_blocks()); + // TODO: actually check if the results are the same +} + + +TEST_F(Jacobi, + DpcppExecutesSupervariableAgglomerationInLargeMatrixEquivalentToRef) +{ + /* example matrix: + 1 1 + 1 1 + 1 1 + 1 1 + 1 + */ + using data = gko::matrix_data; + auto mtx = share(Mtx::create(ref)); + mtx->read(data::diag({550, 550}, {{1.0, 1.0, 0.0, 0.0, 0.0}, + {1.0, 1.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 1.0, 1.0, 0.0}, + {0.0, 0.0, 1.0, 1.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 1.0}})); + + auto bj = Bj::build().with_max_block_size(3u).on(ref)->generate(mtx); + auto d_bj = Bj::build().with_max_block_size(3u).on(dpcpp)->generate(mtx); + + ASSERT_EQ(d_bj->get_num_blocks(), bj->get_num_blocks()); + // TODO: actually check if the results are the same +} + + +TEST_F(Jacobi, + DpcppExecutesSupervarAgglomerationEquivalentToRefFor150NonzerowsPerRow) +{ + /* example matrix duplicated 50 times: + 1 1 1 + 1 1 1 + 1 1 1 + 1 1 1 + 1 1 + */ + using data = gko::matrix_data; + auto mtx = share(Mtx::create(ref)); + mtx->read({{50, 50}, + {{1.0, 1.0, 0.0, 1.0, 0.0}, + {1.0, 1.0, 0.0, 1.0, 0.0}, + {1.0, 0.0, 1.0, 1.0, 0.0}, + {1.0, 0.0, 1.0, 1.0, 0.0}, + {0.0, 0.0, 1.0, 0.0, 1.0}}}); + + + auto bj = Bj::build().with_max_block_size(3u).on(ref)->generate(mtx); + auto d_bj = 
Bj::build().with_max_block_size(3u).on(dpcpp)->generate(mtx); + + ASSERT_EQ(d_bj->get_num_blocks(), bj->get_num_blocks()); + // TODO: actually check if the results are the same +} + + +TEST_F(Jacobi, DpcppPreconditionerEquivalentToRefWithBlockSize32Sorted) +{ + initialize_data({0, 32, 64, 96, 128}, {}, {}, 32, 100, 110); + + auto bj = bj_factory->generate(mtx); + auto d_bj = d_bj_factory->generate(mtx); + + GKO_ASSERT_MTX_NEAR(gko::as(d_bj.get()), gko::as(bj.get()), 1e-13); +} + + +TEST_F(Jacobi, DpcppPreconditionerEquivalentToRefWithBlockSize32Unsorted) +{ + std::ranlux48 engine(42); + initialize_data({0, 32, 64, 96, 128}, {}, {}, 32, 100, 110, 1, 0.1, false); + gko::test::unsort_matrix(mtx.get(), engine); + + auto bj = bj_factory->generate(mtx); + auto d_bj = d_bj_factory->generate(mtx); + + GKO_ASSERT_MTX_NEAR(gko::as(d_bj.get()), gko::as(bj.get()), 1e-13); +} + + +TEST_F(Jacobi, DpcppPreconditionerEquivalentToRefWithDifferentBlockSize) +{ + initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, {}, {}, 32, + 97, 99); + + auto bj = bj_factory->generate(mtx); + auto d_bj = d_bj_factory->generate(mtx); + + GKO_ASSERT_MTX_NEAR(gko::as(d_bj.get()), gko::as(bj.get()), 1e-13); +} + + +TEST_F(Jacobi, DpcppPreconditionerEquivalentToRefWithMPW) +{ + initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, {}, {}, 13, + 97, 99); + + auto bj = bj_factory->generate(mtx); + auto d_bj = d_bj_factory->generate(mtx); + + GKO_ASSERT_MTX_NEAR(gko::as(d_bj.get()), gko::as(bj.get()), 1e-13); +} + + +TEST_F(Jacobi, DpcppTransposedPreconditionerEquivalentToRefWithMPW) +{ + initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, {}, {}, 13, + 97, 99); + + auto bj = bj_factory->generate(mtx); + auto d_bj = d_bj_factory->generate(mtx); + d_bj->copy_from(bj.get()); + + GKO_ASSERT_MTX_NEAR(gko::as(d_bj->transpose()), + gko::as(bj->transpose()), 1e-14); +} + + +TEST_F(Jacobi, DpcppConjTransposedPreconditionerEquivalentToRefWithMPW) +{ + initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, {}, {}, 13, + 97, 99); + + auto bj = bj_factory->generate(mtx); + auto d_bj = d_bj_factory->generate(mtx); + d_bj->copy_from(bj.get()); + + GKO_ASSERT_MTX_NEAR(gko::as(d_bj->conj_transpose()), + gko::as(bj->conj_transpose()), 1e-14); +} + + +TEST_F(Jacobi, DpcppApplyEquivalentToRefWithBlockSize32) +{ + initialize_data({0, 32, 64, 96, 128}, {}, {}, 32, 100, 111); + auto bj = bj_factory->generate(mtx); + auto d_bj = d_bj_factory->generate(mtx); + + bj->apply(b.get(), x.get()); + d_bj->apply(d_b.get(), d_x.get()); + + GKO_ASSERT_MTX_NEAR(d_x, x, 1e-12); +} + + +TEST_F(Jacobi, DpcppApplyEquivalentToRefWithDifferentBlockSize) +{ + initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, {}, {}, 32, + 97, 99); + auto bj = bj_factory->generate(mtx); + auto d_bj = d_bj_factory->generate(mtx); + + bj->apply(b.get(), x.get()); + d_bj->apply(d_b.get(), d_x.get()); + + GKO_ASSERT_MTX_NEAR(d_x, x, 1e-12); +} + + +TEST_F(Jacobi, DpcppApplyEquivalentToRef) +{ + initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, {}, {}, 13, + 97, 99); + auto bj = bj_factory->generate(mtx); + auto d_bj = d_bj_factory->generate(mtx); + + bj->apply(b.get(), x.get()); + d_bj->apply(d_b.get(), d_x.get()); + + GKO_ASSERT_MTX_NEAR(d_x, x, 1e-12); +} + + +TEST_F(Jacobi, DpcppScalarApplyEquivalentToRef) +{ + gko::size_type dim = 313; + std::ranlux48 engine(42); + auto dense_smtx = gko::share(gko::test::generate_random_matrix( + dim, dim, std::uniform_int_distribution<>(1, dim), + std::normal_distribution<>(1.0, 2.0), engine, 
ref)); + gko::test::make_diag_dominant(dense_smtx.get()); + auto smtx = gko::share(Mtx::create(ref)); + smtx->copy_from(dense_smtx.get()); + auto sb = gko::share(gko::test::generate_random_matrix( + dim, 3, std::uniform_int_distribution<>(1, 1), + std::normal_distribution<>(0.0, 1.0), engine, ref)); + auto sx = Vec::create(ref, sb->get_size()); + + auto d_smtx = gko::share(Mtx::create(dpcpp)); + auto d_sb = gko::share(Vec::create(dpcpp)); + auto d_sx = gko::share(Vec::create(dpcpp, sb->get_size())); + d_smtx->copy_from(smtx.get()); + d_sb->copy_from(sb.get()); + + auto sj = Bj::build().with_max_block_size(1u).on(ref)->generate(smtx); + auto d_sj = Bj::build().with_max_block_size(1u).on(dpcpp)->generate(d_smtx); + + sj->apply(sb.get(), sx.get()); + d_sj->apply(d_sb.get(), d_sx.get()); + + GKO_ASSERT_MTX_NEAR(sx.get(), d_sx.get(), 1e-12); +} + + +TEST_F(Jacobi, DpcppLinearCombinationApplyEquivalentToRef) +{ + initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, {}, {}, 13, + 97, 99); + auto alpha = gko::initialize({2.0}, ref); + auto d_alpha = gko::initialize({2.0}, dpcpp); + auto beta = gko::initialize({-1.0}, ref); + auto d_beta = gko::initialize({-1.0}, dpcpp); + auto bj = bj_factory->generate(mtx); + auto d_bj = d_bj_factory->generate(mtx); + + bj->apply(alpha.get(), b.get(), beta.get(), x.get()); + d_bj->apply(d_alpha.get(), d_b.get(), d_beta.get(), d_x.get()); + + GKO_ASSERT_MTX_NEAR(d_x, x, 1e-12); +} + + +TEST_F(Jacobi, DpcppScalarLinearCombinationApplyEquivalentToRef) +{ + gko::size_type dim = 313; + std::ranlux48 engine(42); + auto dense_smtx = gko::share(gko::test::generate_random_matrix( + dim, dim, std::uniform_int_distribution<>(1, dim), + std::normal_distribution<>(1.0, 2.0), engine, ref)); + gko::test::make_diag_dominant(dense_smtx.get()); + auto smtx = gko::share(Mtx::create(ref)); + smtx->copy_from(dense_smtx.get()); + auto sb = gko::share(gko::test::generate_random_matrix( + dim, 3, std::uniform_int_distribution<>(1, 1), + std::normal_distribution<>(0.0, 1.0), engine, ref, gko::dim<2>(dim, 3), + 4)); + auto sx = gko::share(gko::test::generate_random_matrix( + dim, 3, std::uniform_int_distribution<>(1, 1), + std::normal_distribution<>(0.0, 1.0), engine, ref, gko::dim<2>(dim, 3), + 4)); + + auto d_smtx = gko::share(gko::clone(dpcpp, smtx)); + auto d_sb = gko::share(gko::clone(dpcpp, sb)); + auto d_sx = gko::share(gko::clone(dpcpp, sx)); + auto alpha = gko::initialize({2.0}, ref); + auto d_alpha = gko::initialize({2.0}, dpcpp); + auto beta = gko::initialize({-1.0}, ref); + auto d_beta = gko::initialize({-1.0}, dpcpp); + + auto sj = Bj::build().with_max_block_size(1u).on(ref)->generate(smtx); + auto d_sj = Bj::build().with_max_block_size(1u).on(dpcpp)->generate(d_smtx); + + sj->apply(alpha.get(), sb.get(), beta.get(), sx.get()); + d_sj->apply(d_alpha.get(), d_sb.get(), d_beta.get(), d_sx.get()); + + GKO_ASSERT_MTX_NEAR(sx.get(), d_sx.get(), 1e-12); +} + + +TEST_F(Jacobi, DpcppApplyToMultipleVectorsEquivalentToRef) +{ + initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, {}, {}, 13, + 97, 99, 5); + auto bj = bj_factory->generate(mtx); + auto d_bj = d_bj_factory->generate(mtx); + + bj->apply(b.get(), x.get()); + d_bj->apply(d_b.get(), d_x.get()); + + GKO_ASSERT_MTX_NEAR(d_x, x, 1e-12); +} + + +TEST_F(Jacobi, DpcppLinearCombinationApplyToMultipleVectorsEquivalentToRef) +{ + initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, {}, {}, 13, + 97, 99, 5); + auto alpha = gko::initialize({2.0}, ref); + auto d_alpha = gko::initialize({2.0}, dpcpp); + auto 
beta = gko::initialize({-1.0}, ref); + auto d_beta = gko::initialize({-1.0}, dpcpp); + auto bj = bj_factory->generate(mtx); + auto d_bj = d_bj_factory->generate(mtx); + + bj->apply(alpha.get(), b.get(), beta.get(), x.get()); + d_bj->apply(d_alpha.get(), d_b.get(), d_beta.get(), d_x.get()); + + GKO_ASSERT_MTX_NEAR(d_x, x, 1e-12); +} + + +TEST_F(Jacobi, ComputesTheSameConditionNumberAsRef) +{ + initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, + {dp, dp, dp, dp, dp, dp, dp, dp, dp, dp}, {}, 13, 97, 99); + + auto bj = bj_factory->generate(mtx); + auto d_bj = clone(ref, d_bj_factory->generate(mtx)); + + for (int i = 0; i < gko::as(bj.get())->get_num_blocks(); ++i) { + EXPECT_NEAR(bj->get_conditioning()[i], d_bj->get_conditioning()[i], + 1e-9); + } +} + + +TEST_F(Jacobi, SelectsTheSamePrecisionsAsRef) +{ + initialize_data( + {0, 2, 14, 27, 40, 51, 61, 70, 80, 92, 100}, + {ap, ap, ap, ap, ap, ap, ap, ap, ap, ap}, + {1e+0, 1e+0, 1e+2, 1e+3, 1e+4, 1e+4, 1e+6, 1e+7, 1e+8, 1e+9}, 13, 97, + 99, 1, 0.2); + + auto bj = bj_factory->generate(mtx); + auto d_bj = gko::clone(ref, d_bj_factory->generate(mtx)); + + auto bj_prec = + bj->get_parameters().storage_optimization.block_wise.get_const_data(); + auto d_bj_prec = + d_bj->get_parameters().storage_optimization.block_wise.get_const_data(); + for (int i = 0; i < gko::as(bj.get())->get_num_blocks(); ++i) { + EXPECT_EQ(bj_prec[i], d_bj_prec[i]); + } +} + + +TEST_F(Jacobi, AvoidsPrecisionsThatOverflow) +{ + auto mtx = gko::matrix::Csr<>::create(dpcpp); + // clang-format off + mtx->read(mtx_data::diag({ + // perfectly conditioned block, small value difference, + // can use fp16 (5, 10) + {{2.0, 1.0}, + {1.0, 2.0}}, + // perfectly conditioned block (scaled orthogonal), + // with large value difference, need fp16 (7, 8) + {{1e-8, -1e-16}, + {1e-16, 1e-8}} + })); + // clang-format on + + auto bj = + Bj::build() + .with_max_block_size(13u) + .with_block_pointers(gko::Array(dpcpp, {0, 2, 4})) + .with_storage_optimization(gko::precision_reduction::autodetect()) + .with_accuracy(0.1) + .on(dpcpp) + ->generate(give(mtx)); + + // both blocks are in the same group, both need (7, 8) + auto h_bj = clone(ref, bj); + auto prec = + h_bj->get_parameters().storage_optimization.block_wise.get_const_data(); + EXPECT_EQ(prec[0], gko::precision_reduction(1, 1)); + ASSERT_EQ(prec[1], gko::precision_reduction(1, 1)); +} + + +TEST_F(Jacobi, DpcppPreconditionerEquivalentToRefWithFullPrecision) +{ + initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, + {dp, dp, dp, dp, dp, dp, dp, dp, dp, dp}, {}, 13, 97, 99); + + auto bj = bj_factory->generate(mtx); + auto d_bj = d_bj_factory->generate(mtx); + + GKO_ASSERT_MTX_NEAR(lend(d_bj), lend(bj), 1e-13); +} + + +TEST_F(Jacobi, DpcppPreconditionerEquivalentToRefWithReducedPrecision) +{ + initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, + {sp, sp, sp, sp, sp, sp, sp, sp, sp, sp, sp}, {}, 13, 97, + 99); + + auto bj = bj_factory->generate(mtx); + auto d_bj = d_bj_factory->generate(mtx); + + GKO_ASSERT_MTX_NEAR(lend(d_bj), lend(bj), 1e-7); +} + + +TEST_F(Jacobi, DpcppPreconditionerEquivalentToRefWithCustomReducedPrecision) +{ + initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, + {tp, tp, tp, tp, tp, tp, tp, tp, tp, tp, tp}, {}, 13, 97, + 99); + + auto bj = bj_factory->generate(mtx); + auto d_bj = d_bj_factory->generate(mtx); + + GKO_ASSERT_MTX_NEAR(lend(d_bj), lend(bj), 1e-6); +} + + +TEST_F(Jacobi, DpcppPreconditionerEquivalentToRefWithQuarteredPrecision) +{ + initialize_data({0, 11, 24, 33, 45, 55, 
67, 70, 80, 92, 100}, + {hp, hp, hp, hp, hp, hp, hp, hp, hp, hp, hp}, {}, 13, 97, + 99); + + auto bj = bj_factory->generate(mtx); + auto d_bj = d_bj_factory->generate(mtx); + + GKO_ASSERT_MTX_NEAR(lend(d_bj), lend(bj), 1e-3); +} + + +TEST_F(Jacobi, DpcppPreconditionerEquivalentToRefWithCustomQuarteredPrecision) +{ + initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, + {qp, qp, qp, qp, qp, qp, qp, qp, qp, qp, qp}, {}, 13, 97, + 99); + + auto bj = bj_factory->generate(mtx); + auto d_bj = d_bj_factory->generate(mtx); + + GKO_ASSERT_MTX_NEAR(lend(d_bj), lend(bj), 1e-1); +} + + +TEST_F(Jacobi, DpcppPreconditionerEquivalentToRefWithAdaptivePrecision) +{ + initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, + {sp, sp, dp, dp, tp, tp, qp, qp, hp, dp, up}, {}, 13, 97, + 99); + + auto bj = bj_factory->generate(mtx); + auto d_bj = d_bj_factory->generate(mtx); + + GKO_ASSERT_MTX_NEAR(lend(d_bj), lend(bj), 1e-1); +} + + +TEST_F(Jacobi, + DpcppTransposedPreconditionerEquivalentToRefWithAdaptivePrecision) +{ + initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, + {sp, sp, dp, dp, tp, tp, qp, qp, hp, dp, up}, {}, 13, 97, + 99); + + auto bj = bj_factory->generate(mtx); + auto d_bj = d_bj_factory->generate(mtx); + d_bj->copy_from(bj.get()); + + GKO_ASSERT_MTX_NEAR(gko::as(d_bj->transpose()), + gko::as(bj->transpose()), 1e-14); +} + + +TEST_F(Jacobi, + DpcppConjTransposedPreconditionerEquivalentToRefWithAdaptivePrecision) +{ + initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, + {sp, sp, dp, dp, tp, tp, qp, qp, hp, dp, up}, {}, 13, 97, + 99); + + auto bj = bj_factory->generate(mtx); + auto d_bj = d_bj_factory->generate(mtx); + d_bj->copy_from(bj.get()); + + GKO_ASSERT_MTX_NEAR(gko::as(d_bj->conj_transpose()), + gko::as(bj->conj_transpose()), 1e-14); +} + + +TEST_F(Jacobi, DpcppApplyEquivalentToRefWithFullPrecision) +{ + initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, + {dp, dp, dp, dp, dp, dp, dp, dp, dp, dp, dp}, {}, 13, 97, + 99); + auto bj = bj_factory->generate(mtx); + auto d_bj = d_bj_factory->generate(mtx); + + bj->apply(b.get(), x.get()); + d_bj->apply(d_b.get(), d_x.get()); + + GKO_ASSERT_MTX_NEAR(d_x, x, 1e-12); +} + + +TEST_F(Jacobi, DpcppApplyEquivalentToRefWithReducedPrecision) +{ + initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, + {sp, sp, sp, sp, sp, sp, sp, sp, sp, sp, sp}, {}, 13, 97, + 99); + auto bj = bj_factory->generate(mtx); + auto d_bj = d_bj_factory->generate(mtx); + + bj->apply(b.get(), x.get()); + d_bj->apply(d_b.get(), d_x.get()); + + GKO_ASSERT_MTX_NEAR(d_x, x, 1e-6); +} + + +TEST_F(Jacobi, DpcppApplyEquivalentToRefWithCustomReducedPrecision) +{ + initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, + {tp, tp, tp, tp, tp, tp, tp, tp, tp, tp, tp}, {}, 13, 97, + 99); + auto bj = bj_factory->generate(mtx); + auto d_bj = d_bj_factory->generate(mtx); + + bj->apply(b.get(), x.get()); + d_bj->apply(d_b.get(), d_x.get()); + + GKO_ASSERT_MTX_NEAR(d_x, x, 1e-5); +} + + +TEST_F(Jacobi, DpcppApplyEquivalentToRefWithQuarteredPrecision) +{ + initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, + {hp, hp, hp, hp, hp, hp, hp, hp, hp, hp, hp}, {}, 13, 97, + 99); + auto bj = bj_factory->generate(mtx); + auto d_bj = d_bj_factory->generate(mtx); + + bj->apply(b.get(), x.get()); + d_bj->apply(d_b.get(), d_x.get()); + + GKO_ASSERT_MTX_NEAR(d_x, x, 1e-2); +} + + +TEST_F(Jacobi, DpcppApplyEquivalentToRefWithCustomReducedAndReducedPrecision) +{ + initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, + {up, up, 
up, up, up, up, up, up, up, up, up}, {}, 13, 97, + 99); + auto bj = bj_factory->generate(mtx); + auto d_bj = d_bj_factory->generate(mtx); + + bj->apply(b.get(), x.get()); + d_bj->apply(d_b.get(), d_x.get()); + + GKO_ASSERT_MTX_NEAR(d_x, x, 1e-2); +} + + +TEST_F(Jacobi, DpcppApplyEquivalentToRefWithCustomQuarteredPrecision) +{ + initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, + {qp, qp, qp, qp, qp, qp, qp, qp, qp, qp, qp}, {}, 13, 97, + 99); + auto bj = bj_factory->generate(mtx); + auto d_bj = d_bj_factory->generate(mtx); + + bj->apply(b.get(), x.get()); + d_bj->apply(d_b.get(), d_x.get()); + + GKO_ASSERT_MTX_NEAR(d_x, x, 1e-6); +} + + +TEST_F(Jacobi, DpcppApplyEquivalentToRefWithAdaptivePrecision) +{ + initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, + {sp, sp, dp, dp, tp, tp, qp, qp, hp, dp, up}, {}, 13, 97, + 99); + auto bj = bj_factory->generate(mtx); + auto d_bj = d_bj_factory->generate(mtx); + + bj->apply(b.get(), x.get()); + d_bj->apply(d_b.get(), d_x.get()); + + GKO_ASSERT_MTX_NEAR(d_x, x, 1e-1); +} + + +TEST_F(Jacobi, DpcppLinearCombinationApplyEquivalentToRefWithAdaptivePrecision) +{ + initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, + {sp, dp, dp, sp, sp, sp, dp, dp, sp, dp, sp}, {}, 13, 97, + 99); + auto alpha = gko::initialize({2.0}, ref); + auto d_alpha = gko::initialize({2.0}, dpcpp); + auto beta = gko::initialize({-1.0}, ref); + auto d_beta = gko::initialize({-1.0}, dpcpp); + auto bj = bj_factory->generate(mtx); + auto d_bj = d_bj_factory->generate(mtx); + + bj->apply(b.get(), x.get()); + d_bj->apply(d_b.get(), d_x.get()); + + GKO_ASSERT_MTX_NEAR(d_x, x, 1e-6); +} + + +TEST_F(Jacobi, DpcppApplyToMultipleVectorsEquivalentToRefWithFullPrecision) +{ + initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, + {dp, dp, dp, dp, dp, dp, dp, dp, dp, dp, dp}, {}, 13, 97, + 99, 5); + auto bj = bj_factory->generate(mtx); + auto d_bj = d_bj_factory->generate(mtx); + + bj->apply(b.get(), x.get()); + d_bj->apply(d_b.get(), d_x.get()); + + GKO_ASSERT_MTX_NEAR(d_x, x, 1e-12); +} + + +TEST_F(Jacobi, DpcppApplyToMultipleVectorsEquivalentToRefWithReducedPrecision) +{ + initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, + {sp, sp, sp, sp, sp, sp, sp, sp, sp, sp, sp}, {}, 13, 97, + 99, 5); + auto bj = bj_factory->generate(mtx); + auto d_bj = d_bj_factory->generate(mtx); + + bj->apply(b.get(), x.get()); + d_bj->apply(d_b.get(), d_x.get()); + + GKO_ASSERT_MTX_NEAR(d_x, x, 1e-6); +} + + +TEST_F(Jacobi, DpcppApplyToMultipleVectorsEquivalentToRefWithAdaptivePrecision) +{ + initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, + {sp, sp, dp, dp, tp, tp, qp, qp, hp, dp, up}, {}, 13, 97, + 99, 5); + auto bj = bj_factory->generate(mtx); + auto d_bj = d_bj_factory->generate(mtx); + + bj->apply(b.get(), x.get()); + d_bj->apply(d_b.get(), d_x.get()); + + GKO_ASSERT_MTX_NEAR(d_x, x, 1e-1); +} + + +TEST_F( + Jacobi, + DpcppLinearCombinationApplyToMultipleVectorsEquivalentToRefWithAdaptivePrecision) +{ + initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, + {sp, dp, dp, sp, sp, sp, dp, dp, sp, dp, sp}, {}, 13, 97, + 99, 5); + auto alpha = gko::initialize({2.0}, ref); + auto d_alpha = gko::initialize({2.0}, dpcpp); + auto beta = gko::initialize({-1.0}, ref); + auto d_beta = gko::initialize({-1.0}, dpcpp); + auto bj = bj_factory->generate(mtx); + auto d_bj = d_bj_factory->generate(mtx); + + bj->apply(b.get(), x.get()); + d_bj->apply(d_b.get(), d_x.get()); + + GKO_ASSERT_MTX_NEAR(d_x, x, 1e-6); +} + + +} // namespace From 
8f25a9af2921667b97d45afd72b2adb3937f6743 Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. Tsai" Date: Thu, 25 Nov 2021 17:27:30 +0800 Subject: [PATCH 04/11] manual fix and adjust the error bound --- .../jacobi_generate_kernel.hpp.inc | 4 +- core/preconditioner/jacobi.cpp | 3 - core/preconditioner/jacobi_utils.hpp | 20 ++-- dpcpp/CMakeLists.txt | 34 ++++++ ...cobi_advanced_apply_instantiate.inc.dp.cpp | 10 +- .../jacobi_advanced_apply_kernel.dp.cpp | 9 +- .../jacobi_generate_instantiate.inc.dp.cpp | 18 +-- .../jacobi_generate_kernel.dp.cpp | 3 +- dpcpp/preconditioner/jacobi_kernels.dp.cpp | 13 +- ...jacobi_simple_apply_instantiate.inc.dp.cpp | 8 +- .../jacobi_simple_apply_kernel.dp.cpp | 12 +- dpcpp/test/preconditioner/jacobi_kernels.cpp | 112 +++++++++++------- omp/preconditioner/jacobi_kernels.cpp | 4 +- reference/preconditioner/jacobi_kernels.cpp | 4 +- 14 files changed, 160 insertions(+), 94 deletions(-) diff --git a/common/cuda_hip/preconditioner/jacobi_generate_kernel.hpp.inc b/common/cuda_hip/preconditioner/jacobi_generate_kernel.hpp.inc index d48fc65fa11..208bd33d7dc 100644 --- a/common/cuda_hip/preconditioner/jacobi_generate_kernel.hpp.inc +++ b/common/cuda_hip/preconditioner/jacobi_generate_kernel.hpp.inc @@ -73,7 +73,9 @@ __device__ __forceinline__ bool validate_precision_reduction_feasibility( } return succeeded && block_cond >= 1.0 && - block_cond * float_traits>::eps < 1e-3; + block_cond * static_cast>( + float_traits>::eps) < + remove_complex{1e-3}; } diff --git a/core/preconditioner/jacobi.cpp b/core/preconditioner/jacobi.cpp index 1125e71d22e..4309edede0c 100644 --- a/core/preconditioner/jacobi.cpp +++ b/core/preconditioner/jacobi.cpp @@ -333,11 +333,9 @@ void Jacobi::generate(const LinOp* system_matrix, } else { auto csr_mtx = convert_to_with_sorting(exec, system_matrix, skip_sorting); - if (parameters_.block_pointers.get_data() == nullptr) { this->detect_blocks(csr_mtx.get()); } - const auto all_block_opt = parameters_.storage_optimization.of_all_blocks; auto& precisions = parameters_.storage_optimization.block_wise; @@ -355,7 +353,6 @@ void Jacobi::generate(const LinOp* system_matrix, precisions = std::move(tmp); conditioning_.resize_and_reset(num_blocks_); } - exec->run(jacobi::make_generate( csr_mtx.get(), num_blocks_, parameters_.max_block_size, parameters_.accuracy, storage_scheme_, conditioning_, precisions, diff --git a/core/preconditioner/jacobi_utils.hpp b/core/preconditioner/jacobi_utils.hpp index 738defdf54c..302ec84f4ee 100644 --- a/core/preconditioner/jacobi_utils.hpp +++ b/core/preconditioner/jacobi_utils.hpp @@ -89,18 +89,12 @@ struct precision_reduction_descriptor { static constexpr GKO_ATTRIBUTES uint32 singleton(const precision_reduction& pr) { - return pr == precision_reduction(0, 0) - ? p0n0 - : pr == precision_reduction(0, 1) - ? p0n1 - : pr == precision_reduction(0, 2) - ? p0n2 - : pr == precision_reduction(1, 0) - ? p1n0 - : pr == precision_reduction(1, 1) - ? p1n1 - : pr == precision_reduction(2, 0) - ? p2n0 + return pr == precision_reduction(0, 0) ? p0n0 + : pr == precision_reduction(0, 1) ? p0n1 + : pr == precision_reduction(0, 2) ? p0n2 + : pr == precision_reduction(1, 0) ? p1n0 + : pr == precision_reduction(1, 1) ? p1n1 + : pr == precision_reduction(2, 0) ? 
p2n0 : p0n0; } }; @@ -141,7 +135,7 @@ GKO_ATTRIBUTES GKO_INLINE uint32 get_supported_storage_reductions( using gko::detail::float_traits; using type = remove_complex; using prd = precision_reduction_descriptor; - auto accurate = [&cond, &accuracy](double eps) { + auto accurate = [&cond, &accuracy](type eps) { return cond * eps < accuracy; }; uint8 is_verified1 = 2; diff --git a/dpcpp/CMakeLists.txt b/dpcpp/CMakeLists.txt index 55b8dd87022..dd28a79adeb 100644 --- a/dpcpp/CMakeLists.txt +++ b/dpcpp/CMakeLists.txt @@ -41,7 +41,10 @@ target_sources(ginkgo_dpcpp matrix/sparsity_csr_kernels.dp.cpp multigrid/pgm_kernels.dp.cpp preconditioner/isai_kernels.dp.cpp + preconditioner/jacobi_advanced_apply_kernel.dp.cpp + preconditioner/jacobi_generate_kernel.dp.cpp preconditioner/jacobi_kernels.dp.cpp + preconditioner/jacobi_simple_apply_kernel.dp.cpp reorder/rcm_kernels.dp.cpp solver/gmres_kernels.dp.cpp solver/cb_gmres_kernels.dp.cpp @@ -54,6 +57,34 @@ target_sources(ginkgo_dpcpp ${GKO_UNIFIED_COMMON_SOURCES} ) +# if(GINKGO_JACOBI_FULL_OPTIMIZATIONS) +# set(GKO_DPCPP_JACOBI_BLOCK_SIZES) +# foreach(blocksize RANGE 1 32) +# list(APPEND GKO_DPCPP_JACOBI_BLOCK_SIZES ${blocksize}) +# endforeach() +# else() + set(GKO_DPCPP_JACOBI_BLOCK_SIZES 32) +# endif() +set(GKO_DPCPP_JACOBI_SOURCES) +foreach(GKO_JACOBI_BLOCK_SIZE IN LISTS GKO_DPCPP_JACOBI_BLOCK_SIZES) + configure_file( + preconditioner/jacobi_generate_instantiate.inc.dp.cpp + preconditioner/jacobi_generate_instantiate.${GKO_JACOBI_BLOCK_SIZE}.dp.cpp) + configure_file( + preconditioner/jacobi_simple_apply_instantiate.inc.dp.cpp + preconditioner/jacobi_simple_apply_instantiate.${GKO_JACOBI_BLOCK_SIZE}.dp.cpp) + configure_file( + preconditioner/jacobi_advanced_apply_instantiate.inc.dp.cpp + preconditioner/jacobi_advanced_apply_instantiate.${GKO_JACOBI_BLOCK_SIZE}.dp.cpp) + list(APPEND GKO_DPCPP_JACOBI_SOURCES + ${CMAKE_CURRENT_BINARY_DIR}/preconditioner/jacobi_generate_instantiate.${GKO_JACOBI_BLOCK_SIZE}.dp.cpp + ${CMAKE_CURRENT_BINARY_DIR}/preconditioner/jacobi_simple_apply_instantiate.${GKO_JACOBI_BLOCK_SIZE}.dp.cpp + ${CMAKE_CURRENT_BINARY_DIR}/preconditioner/jacobi_advanced_apply_instantiate.${GKO_JACOBI_BLOCK_SIZE}.dp.cpp) +endforeach() +target_sources(ginkgo_dpcpp PRIVATE ${GKO_DPCPP_JACOBI_SOURCES}) +string(REPLACE ";" "," GKO_DPCPP_JACOBI_BLOCK_SIZES_CODE "${GKO_DPCPP_JACOBI_BLOCK_SIZES}") +configure_file(preconditioner/jacobi_common.hpp.in preconditioner/jacobi_common.hpp) + ginkgo_compile_features(ginkgo_dpcpp) target_compile_definitions(ginkgo_dpcpp PRIVATE GKO_COMPILING_DPCPP _ONEDPL_COMPILE_KERNEL=0) @@ -72,6 +103,9 @@ if (BUILD_SHARED_LIBS) else () target_link_options(ginkgo_dpcpp PUBLIC -fsycl-device-code-split=per_kernel) endif() +# include path for generated headers like jacobi_common.hpp +target_include_directories(ginkgo_dpcpp + PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/..) 
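# ---------------------------------------------------------------------------
# Editor's note (not part of the upstream patch): the block above follows the
# usual "one configured translation unit per compiled block size" pattern --
# each jacobi_*_instantiate.<N>.dp.cpp copy is compiled with its own
# GKO_JACOBI_BLOCK_SIZE, and the generated jacobi_common.hpp presumably exposes
# the full GKO_DPCPP_JACOBI_BLOCK_SIZES_CODE list (e.g. as the compiled_kernels
# value list consumed by select_apply/select_generate in the kernel files).
# With the GINKGO_JACOBI_FULL_OPTIMIZATIONS branch still commented out, only
# block size 32 is instantiated here, so kernel selection always falls back to
# the max_block_size <= 32 instantiation.
# ---------------------------------------------------------------------------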
target_link_libraries(ginkgo_dpcpp PUBLIC ginkgo_device) target_link_libraries(ginkgo_dpcpp PRIVATE MKL::MKL_DPCPP oneDPL) if (GINKGO_DPCPP_SINGLE_MODE) diff --git a/dpcpp/preconditioner/jacobi_advanced_apply_instantiate.inc.dp.cpp b/dpcpp/preconditioner/jacobi_advanced_apply_instantiate.inc.dp.cpp index 4425f8e277b..ffaef7e7650 100644 --- a/dpcpp/preconditioner/jacobi_advanced_apply_instantiate.inc.dp.cpp +++ b/dpcpp/preconditioner/jacobi_advanced_apply_instantiate.inc.dp.cpp @@ -63,8 +63,6 @@ namespace dpcpp { * @ingroup jacobi */ namespace jacobi { - - namespace kernel { @@ -179,7 +177,7 @@ void advanced_adaptive_apply( // clang-format off -//#cmakedefine GKO_JACOBI_BLOCK_SIZE @GKO_JACOBI_BLOCK_SIZE@ +#cmakedefine GKO_JACOBI_BLOCK_SIZE @GKO_JACOBI_BLOCK_SIZE@ // clang-format on // make things easier for IDEs #ifndef GKO_JACOBI_BLOCK_SIZE @@ -190,7 +188,8 @@ void advanced_adaptive_apply( template void advanced_apply( - syn::value_list, size_type num_blocks, + syn::value_list, + std::shared_ptr exec, size_type num_blocks, const precision_reduction* block_precisions, const IndexType* block_pointers, const ValueType* blocks, const preconditioner::block_interleaved_storage_scheme& @@ -221,7 +220,8 @@ void advanced_apply( #define DECLARE_JACOBI_ADVANCED_APPLY_INSTANTIATION(ValueType, IndexType) \ void advanced_apply( \ - syn::value_list, size_type, \ + syn::value_list, \ + std::shared_ptr, size_type, \ const precision_reduction*, const IndexType* block_pointers, \ const ValueType*, \ const preconditioner::block_interleaved_storage_scheme&, \ diff --git a/dpcpp/preconditioner/jacobi_advanced_apply_kernel.dp.cpp b/dpcpp/preconditioner/jacobi_advanced_apply_kernel.dp.cpp index 1db62f49db3..d90342f7575 100644 --- a/dpcpp/preconditioner/jacobi_advanced_apply_kernel.dp.cpp +++ b/dpcpp/preconditioner/jacobi_advanced_apply_kernel.dp.cpp @@ -57,7 +57,8 @@ namespace jacobi { template void advanced_apply( - syn::value_list, size_type num_blocks, + syn::value_list, + std::shared_ptr exec, size_type num_blocks, const precision_reduction* block_precisions, const IndexType* block_pointers, const ValueType* blocks, const preconditioner::block_interleaved_storage_scheme& @@ -89,9 +90,9 @@ void apply(std::shared_ptr exec, size_type num_blocks, return max_block_size <= compiled_block_size; }, syn::value_list(), - syn::type_list<>(), num_blocks, block_precisions.get_const_data(), - block_pointers.get_const_data(), blocks.get_const_data(), - storage_scheme, alpha->get_const_values(), + syn::type_list<>(), exec, num_blocks, + block_precisions.get_const_data(), block_pointers.get_const_data(), + blocks.get_const_data(), storage_scheme, alpha->get_const_values(), b->get_const_values() + col, b->get_stride(), x->get_values() + col, x->get_stride()); } diff --git a/dpcpp/preconditioner/jacobi_generate_instantiate.inc.dp.cpp b/dpcpp/preconditioner/jacobi_generate_instantiate.inc.dp.cpp index 2a759b743d8..5d27a02c65b 100644 --- a/dpcpp/preconditioner/jacobi_generate_instantiate.inc.dp.cpp +++ b/dpcpp/preconditioner/jacobi_generate_instantiate.inc.dp.cpp @@ -33,7 +33,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/preconditioner/jacobi_kernels.hpp" -#include #include @@ -53,6 +52,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "dpcpp/base/dpct.hpp" #include "dpcpp/base/math.hpp" #include "dpcpp/components/cooperative_groups.dp.hpp" +#include "dpcpp/components/diagonal_block_manipulation.dp.hpp" #include "dpcpp/components/thread_ids.dp.hpp" #include "dpcpp/components/uninitialized_array.hpp" #include "dpcpp/components/warp_blas.dp.hpp" @@ -112,7 +112,9 @@ __dpct_inline__ bool validate_precision_reduction_feasibility( } return succeeded && block_cond >= 1.0 && - block_cond * float_traits>::eps < 1e-3; + block_cond * static_cast>( + float_traits>::eps) < + remove_complex{1e-3}; } @@ -135,7 +137,7 @@ void generate( csr::extract_transposed_diag_blocks( block, config::warp_size / subwarp_size, row_ptrs, col_idxs, values, block_ptrs, num_blocks, row, 1, - *workspace + item_ct1.get_local_id(0) * max_block_size); + *workspace + item_ct1.get_local_id(0) * max_block_size, item_ct1); const auto subwarp = group::tiled_partition(block); if (block_id < num_blocks) { const auto block_size = block_ptrs[block_id + 1] - block_ptrs[block_id]; @@ -199,7 +201,7 @@ void adaptive_generate( csr::extract_transposed_diag_blocks( block, config::warp_size / subwarp_size, row_ptrs, col_idxs, values, block_ptrs, num_blocks, row, 1, - *workspace + item_ct1.get_local_id(0) * max_block_size); + *workspace + item_ct1.get_local_id(0) * max_block_size, item_ct1); // compute inverse and figure out the correct precision const auto subwarp = group::tiled_partition(block); @@ -251,8 +253,8 @@ void adaptive_generate( // make sure all blocks in the group have the same precision const auto warp = group::tiled_partition(block); - const auto prec = - preconditioner::detail::get_optimal_storage_reduction(reduce( + const auto prec = preconditioner::detail::get_optimal_storage_reduction( + ::gko::kernels::dpcpp::reduce( warp, prec_descriptor, [](uint32 x, uint32 y) { return x & y; })); // store the block back into memory @@ -302,7 +304,7 @@ void adaptive_generate( // clang-format off -//#cmakedefine GKO_JACOBI_BLOCK_SIZE @GKO_JACOBI_BLOCK_SIZE@ +#cmakedefine GKO_JACOBI_BLOCK_SIZE @GKO_JACOBI_BLOCK_SIZE@ // clang-format on // make things easier for IDEs #ifndef GKO_JACOBI_BLOCK_SIZE @@ -313,6 +315,7 @@ void adaptive_generate( template void generate(syn::value_list, + std::shared_ptr exec, const matrix::Csr* mtx, remove_complex accuracy, ValueType* block_data, const preconditioner::block_interleaved_storage_scheme& @@ -348,6 +351,7 @@ void generate(syn::value_list, void generate( \ syn::value_list, \ + std::shared_ptr, \ const matrix::Csr*, remove_complex, \ ValueType*, \ const preconditioner::block_interleaved_storage_scheme&, \ diff --git a/dpcpp/preconditioner/jacobi_generate_kernel.dp.cpp b/dpcpp/preconditioner/jacobi_generate_kernel.dp.cpp index 22e3f587748..91e7768fd54 100644 --- a/dpcpp/preconditioner/jacobi_generate_kernel.dp.cpp +++ b/dpcpp/preconditioner/jacobi_generate_kernel.dp.cpp @@ -58,6 +58,7 @@ namespace jacobi { template void generate(syn::value_list, + std::shared_ptr exec, const matrix::Csr* mtx, remove_complex accuracy, ValueType* block_data, const preconditioner::block_interleaved_storage_scheme& @@ -88,7 +89,7 @@ void generate(std::shared_ptr exec, return max_block_size <= compiled_block_size; }, syn::value_list(), syn::type_list<>(), - system_matrix, accuracy, blocks.get_data(), storage_scheme, + exec, system_matrix, accuracy, blocks.get_data(), storage_scheme, conditioning.get_data(), block_precisions.get_data(), block_pointers.get_const_data(), num_blocks); } diff --git a/dpcpp/preconditioner/jacobi_kernels.dp.cpp 
b/dpcpp/preconditioner/jacobi_kernels.dp.cpp index 71c8101a8ee..105f1c6f96a 100644 --- a/dpcpp/preconditioner/jacobi_kernels.dp.cpp +++ b/dpcpp/preconditioner/jacobi_kernels.dp.cpp @@ -65,7 +65,7 @@ namespace { // a total of 32 warps (1024 threads) -constexpr int default_num_warps = 32; +constexpr int default_num_warps = 16; // with current architectures, at most 32 warps can be scheduled per SM (and // current GPUs have at most 84 SMs) constexpr int default_grid_size = 32 * 32 * 128; @@ -266,10 +266,6 @@ void transpose_jacobi( if (rank < block_size) { for (IndexType i = 0; i < block_size; ++i) { auto val = blocks[block_ofs + i * block_stride + rank]; - /* - DPCT1007:4: Migration of this CUDA API is not supported by the - Intel(R) DPC++ Compatibility Tool. - */ out_blocks[block_ofs + i + rank * block_stride] = conjugate ? conj(val) : val; } @@ -437,7 +433,8 @@ namespace { template void transpose_jacobi( - syn::value_list, size_type num_blocks, + syn::value_list, + std::shared_ptr exec, size_type num_blocks, const precision_reduction* block_precisions, const IndexType* block_pointers, const ValueType* blocks, const preconditioner::block_interleaved_storage_scheme& @@ -484,7 +481,7 @@ void transpose_jacobi( return max_block_size <= compiled_block_size; }, syn::value_list(), - syn::type_list<>(), num_blocks, block_precisions.get_const_data(), + syn::type_list<>(), exec, num_blocks, block_precisions.get_const_data(), block_pointers.get_const_data(), blocks.get_const_data(), storage_scheme, out_blocks.get_data()); } @@ -508,7 +505,7 @@ void conj_transpose_jacobi( return max_block_size <= compiled_block_size; }, syn::value_list(), - syn::type_list<>(), num_blocks, block_precisions.get_const_data(), + syn::type_list<>(), exec, num_blocks, block_precisions.get_const_data(), block_pointers.get_const_data(), blocks.get_const_data(), storage_scheme, out_blocks.get_data()); } diff --git a/dpcpp/preconditioner/jacobi_simple_apply_instantiate.inc.dp.cpp b/dpcpp/preconditioner/jacobi_simple_apply_instantiate.inc.dp.cpp index 9edeb6f552c..8c3787cf721 100644 --- a/dpcpp/preconditioner/jacobi_simple_apply_instantiate.inc.dp.cpp +++ b/dpcpp/preconditioner/jacobi_simple_apply_instantiate.inc.dp.cpp @@ -174,7 +174,7 @@ void adaptive_apply( // clang-format off -//#cmakedefine GKO_JACOBI_BLOCK_SIZE @GKO_JACOBI_BLOCK_SIZE@ +#cmakedefine GKO_JACOBI_BLOCK_SIZE @GKO_JACOBI_BLOCK_SIZE@ // clang-format on // make things easier for IDEs #ifndef GKO_JACOBI_BLOCK_SIZE @@ -184,7 +184,8 @@ void adaptive_apply( template -void apply(syn::value_list, size_type num_blocks, +void apply(syn::value_list, + std::shared_ptr exec, size_type num_blocks, const precision_reduction* block_precisions, const IndexType* block_pointers, const ValueType* blocks, const preconditioner::block_interleaved_storage_scheme& @@ -214,7 +215,8 @@ void apply(syn::value_list, size_type num_blocks, #define DECLARE_JACOBI_SIMPLE_APPLY_INSTANTIATION(ValueType, IndexType) \ void apply( \ - syn::value_list, size_type, \ + syn::value_list, \ + std::shared_ptr, size_type, \ const precision_reduction*, const IndexType*, const ValueType*, \ const preconditioner::block_interleaved_storage_scheme&, \ const ValueType*, size_type, ValueType*, size_type) diff --git a/dpcpp/preconditioner/jacobi_simple_apply_kernel.dp.cpp b/dpcpp/preconditioner/jacobi_simple_apply_kernel.dp.cpp index d5ee2fbf92a..651a4ba194a 100644 --- a/dpcpp/preconditioner/jacobi_simple_apply_kernel.dp.cpp +++ b/dpcpp/preconditioner/jacobi_simple_apply_kernel.dp.cpp @@ -56,7 +56,8 @@ namespace 
jacobi { template -void apply(syn::value_list, size_type num_blocks, +void apply(syn::value_list, + std::shared_ptr exec, size_type num_blocks, const precision_reduction* block_precisions, const IndexType* block_pointers, const ValueType* blocks, const preconditioner::block_interleaved_storage_scheme& @@ -85,10 +86,11 @@ void simple_apply( return max_block_size <= compiled_block_size; }, syn::value_list(), - syn::type_list<>(), num_blocks, block_precisions.get_const_data(), - block_pointers.get_const_data(), blocks.get_const_data(), - storage_scheme, b->get_const_values() + col, b->get_stride(), - x->get_values() + col, x->get_stride()); + syn::type_list<>(), exec, num_blocks, + block_precisions.get_const_data(), block_pointers.get_const_data(), + blocks.get_const_data(), storage_scheme, + b->get_const_values() + col, b->get_stride(), x->get_values() + col, + x->get_stride()); } } diff --git a/dpcpp/test/preconditioner/jacobi_kernels.cpp b/dpcpp/test/preconditioner/jacobi_kernels.cpp index 0302ae7c24e..e5ec9bc604c 100644 --- a/dpcpp/test/preconditioner/jacobi_kernels.cpp +++ b/dpcpp/test/preconditioner/jacobi_kernels.cpp @@ -45,6 +45,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/test/utils.hpp" #include "core/test/utils/unsort_matrix.hpp" +#include "dpcpp/test/utils.hpp" namespace { @@ -52,14 +53,19 @@ namespace { class Jacobi : public ::testing::Test { protected: - using Bj = gko::preconditioner::Jacobi<>; - using Mtx = gko::matrix::Csr<>; - using Vec = gko::matrix::Dense<>; - using mtx_data = gko::matrix_data<>; +#if GINKGO_DPCPP_SINGLE_MODE + using value_type = float; +#else + using value_type = double; +#endif + using Bj = gko::preconditioner::Jacobi; + using Mtx = gko::matrix::Csr; + using Vec = gko::matrix::Dense; + using mtx_data = gko::matrix_data; void SetUp() { - ASSERT_GT(gko::DpcppExecutor::get_num_devices(), 0); + ASSERT_GT(gko::DpcppExecutor::get_num_devices("all"), 0); ref = gko::ReferenceExecutor::create(); dpcpp = gko::DpcppExecutor::create(0, ref); } @@ -74,16 +80,16 @@ class Jacobi : public ::testing::Test { void initialize_data( std::initializer_list block_pointers, std::initializer_list block_precisions, - std::initializer_list condition_numbers, + std::initializer_list condition_numbers, gko::uint32 max_block_size, int min_nnz, int max_nnz, int num_rhs = 1, - double accuracy = 0.1, bool skip_sorting = true) + value_type accuracy = 0.1, bool skip_sorting = true) { std::ranlux48 engine(42); const auto dim = *(end(block_pointers) - 1); if (condition_numbers.size() == 0) { mtx = gko::test::generate_random_matrix( dim, dim, std::uniform_int_distribution<>(min_nnz, max_nnz), - std::normal_distribution<>(0.0, 1.0), engine, ref); + std::normal_distribution(0.0, 1.0), engine, ref); } else { std::vector blocks; for (gko::size_type i = 0; i < block_pointers.size() - 1; ++i) { @@ -91,7 +97,8 @@ class Jacobi : public ::testing::Test { begin(block_pointers)[i + 1] - begin(block_pointers)[i]; const auto cond = begin(condition_numbers)[i]; blocks.push_back(mtx_data::cond( - size, cond, std::normal_distribution<>(-1, 1), engine)); + size, cond, std::normal_distribution(-1, 1), + engine)); } mtx = Mtx::create(ref); mtx->read(mtx_data::diag(begin(blocks), end(blocks))); @@ -127,11 +134,11 @@ class Jacobi : public ::testing::Test { } b = gko::test::generate_random_matrix( dim, num_rhs, std::uniform_int_distribution<>(num_rhs, num_rhs), - std::normal_distribution<>(0.0, 1.0), engine, ref); + std::normal_distribution(0.0, 1.0), engine, ref); d_b = 
gko::clone(dpcpp, b); x = gko::test::generate_random_matrix( dim, num_rhs, std::uniform_int_distribution<>(num_rhs, num_rhs), - std::normal_distribution<>(0.0, 1.0), engine, ref); + std::normal_distribution(0.0, 1.0), engine, ref); d_x = gko::clone(dpcpp, x); } @@ -222,7 +229,7 @@ TEST_F(Jacobi, DpcppFindNaturalBlocksInLargeMatrixEquivalentToRef) 1 1 1 1 */ - using data = gko::matrix_data; + using data = gko::matrix_data; auto mtx = share(Mtx::create(ref)); mtx->read(data::diag({550, 550}, {{1.0, 1.0, 0.0, 0.0, 0.0, 0.0}, {1.0, 1.0, 0.0, 0.0, 0.0, 0.0}, @@ -249,7 +256,7 @@ TEST_F(Jacobi, 1 1 1 */ - using data = gko::matrix_data; + using data = gko::matrix_data; auto mtx = share(Mtx::create(ref)); mtx->read(data::diag({550, 550}, {{1.0, 1.0, 0.0, 0.0, 0.0}, {1.0, 1.0, 0.0, 0.0, 0.0}, @@ -275,7 +282,7 @@ TEST_F(Jacobi, 1 1 1 1 1 */ - using data = gko::matrix_data; + using data = gko::matrix_data; auto mtx = share(Mtx::create(ref)); mtx->read({{50, 50}, {{1.0, 1.0, 0.0, 1.0, 0.0}, @@ -300,7 +307,8 @@ TEST_F(Jacobi, DpcppPreconditionerEquivalentToRefWithBlockSize32Sorted) auto bj = bj_factory->generate(mtx); auto d_bj = d_bj_factory->generate(mtx); - GKO_ASSERT_MTX_NEAR(gko::as(d_bj.get()), gko::as(bj.get()), 1e-13); + GKO_ASSERT_MTX_NEAR(gko::as(d_bj.get()), gko::as(bj.get()), + 50 * r::value); } @@ -313,7 +321,8 @@ TEST_F(Jacobi, DpcppPreconditionerEquivalentToRefWithBlockSize32Unsorted) auto bj = bj_factory->generate(mtx); auto d_bj = d_bj_factory->generate(mtx); - GKO_ASSERT_MTX_NEAR(gko::as(d_bj.get()), gko::as(bj.get()), 1e-13); + GKO_ASSERT_MTX_NEAR(gko::as(d_bj.get()), gko::as(bj.get()), + 50 * r::value); } @@ -325,7 +334,8 @@ TEST_F(Jacobi, DpcppPreconditionerEquivalentToRefWithDifferentBlockSize) auto bj = bj_factory->generate(mtx); auto d_bj = d_bj_factory->generate(mtx); - GKO_ASSERT_MTX_NEAR(gko::as(d_bj.get()), gko::as(bj.get()), 1e-13); + GKO_ASSERT_MTX_NEAR(gko::as(d_bj.get()), gko::as(bj.get()), + 100 * r::value); } @@ -337,7 +347,8 @@ TEST_F(Jacobi, DpcppPreconditionerEquivalentToRefWithMPW) auto bj = bj_factory->generate(mtx); auto d_bj = d_bj_factory->generate(mtx); - GKO_ASSERT_MTX_NEAR(gko::as(d_bj.get()), gko::as(bj.get()), 1e-13); + GKO_ASSERT_MTX_NEAR(gko::as(d_bj.get()), gko::as(bj.get()), + 100 * r::value); } @@ -378,7 +389,7 @@ TEST_F(Jacobi, DpcppApplyEquivalentToRefWithBlockSize32) bj->apply(b.get(), x.get()); d_bj->apply(d_b.get(), d_x.get()); - GKO_ASSERT_MTX_NEAR(d_x, x, 1e-12); + GKO_ASSERT_MTX_NEAR(d_x, x, 100 * r::value); } @@ -392,7 +403,7 @@ TEST_F(Jacobi, DpcppApplyEquivalentToRefWithDifferentBlockSize) bj->apply(b.get(), x.get()); d_bj->apply(d_b.get(), d_x.get()); - GKO_ASSERT_MTX_NEAR(d_x, x, 1e-12); + GKO_ASSERT_MTX_NEAR(d_x, x, 100 * r::value); } @@ -406,7 +417,7 @@ TEST_F(Jacobi, DpcppApplyEquivalentToRef) bj->apply(b.get(), x.get()); d_bj->apply(d_b.get(), d_x.get()); - GKO_ASSERT_MTX_NEAR(d_x, x, 1e-12); + GKO_ASSERT_MTX_NEAR(d_x, x, 100 * r::value); } @@ -416,13 +427,13 @@ TEST_F(Jacobi, DpcppScalarApplyEquivalentToRef) std::ranlux48 engine(42); auto dense_smtx = gko::share(gko::test::generate_random_matrix( dim, dim, std::uniform_int_distribution<>(1, dim), - std::normal_distribution<>(1.0, 2.0), engine, ref)); + std::normal_distribution(1.0, 2.0), engine, ref)); gko::test::make_diag_dominant(dense_smtx.get()); auto smtx = gko::share(Mtx::create(ref)); smtx->copy_from(dense_smtx.get()); auto sb = gko::share(gko::test::generate_random_matrix( dim, 3, std::uniform_int_distribution<>(1, 1), - std::normal_distribution<>(0.0, 1.0), engine, ref)); + 
std::normal_distribution(0.0, 1.0), engine, ref)); auto sx = Vec::create(ref, sb->get_size()); auto d_smtx = gko::share(Mtx::create(dpcpp)); @@ -437,7 +448,7 @@ TEST_F(Jacobi, DpcppScalarApplyEquivalentToRef) sj->apply(sb.get(), sx.get()); d_sj->apply(d_sb.get(), d_sx.get()); - GKO_ASSERT_MTX_NEAR(sx.get(), d_sx.get(), 1e-12); + GKO_ASSERT_MTX_NEAR(sx.get(), d_sx.get(), 100 * r::value); } @@ -455,7 +466,7 @@ TEST_F(Jacobi, DpcppLinearCombinationApplyEquivalentToRef) bj->apply(alpha.get(), b.get(), beta.get(), x.get()); d_bj->apply(d_alpha.get(), d_b.get(), d_beta.get(), d_x.get()); - GKO_ASSERT_MTX_NEAR(d_x, x, 1e-12); + GKO_ASSERT_MTX_NEAR(d_x, x, 100 * r::value); } @@ -465,18 +476,18 @@ TEST_F(Jacobi, DpcppScalarLinearCombinationApplyEquivalentToRef) std::ranlux48 engine(42); auto dense_smtx = gko::share(gko::test::generate_random_matrix( dim, dim, std::uniform_int_distribution<>(1, dim), - std::normal_distribution<>(1.0, 2.0), engine, ref)); + std::normal_distribution(1.0, 2.0), engine, ref)); gko::test::make_diag_dominant(dense_smtx.get()); auto smtx = gko::share(Mtx::create(ref)); smtx->copy_from(dense_smtx.get()); auto sb = gko::share(gko::test::generate_random_matrix( dim, 3, std::uniform_int_distribution<>(1, 1), - std::normal_distribution<>(0.0, 1.0), engine, ref, gko::dim<2>(dim, 3), - 4)); + std::normal_distribution(0.0, 1.0), engine, ref, + gko::dim<2>(dim, 3), 4)); auto sx = gko::share(gko::test::generate_random_matrix( dim, 3, std::uniform_int_distribution<>(1, 1), - std::normal_distribution<>(0.0, 1.0), engine, ref, gko::dim<2>(dim, 3), - 4)); + std::normal_distribution(0.0, 1.0), engine, ref, + gko::dim<2>(dim, 3), 4)); auto d_smtx = gko::share(gko::clone(dpcpp, smtx)); auto d_sb = gko::share(gko::clone(dpcpp, sb)); @@ -492,7 +503,7 @@ TEST_F(Jacobi, DpcppScalarLinearCombinationApplyEquivalentToRef) sj->apply(alpha.get(), sb.get(), beta.get(), sx.get()); d_sj->apply(d_alpha.get(), d_sb.get(), d_beta.get(), d_sx.get()); - GKO_ASSERT_MTX_NEAR(sx.get(), d_sx.get(), 1e-12); + GKO_ASSERT_MTX_NEAR(sx.get(), d_sx.get(), 100 * r::value); } @@ -506,7 +517,7 @@ TEST_F(Jacobi, DpcppApplyToMultipleVectorsEquivalentToRef) bj->apply(b.get(), x.get()); d_bj->apply(d_b.get(), d_x.get()); - GKO_ASSERT_MTX_NEAR(d_x, x, 1e-12); + GKO_ASSERT_MTX_NEAR(d_x, x, 100 * r::value); } @@ -524,12 +535,13 @@ TEST_F(Jacobi, DpcppLinearCombinationApplyToMultipleVectorsEquivalentToRef) bj->apply(alpha.get(), b.get(), beta.get(), x.get()); d_bj->apply(d_alpha.get(), d_b.get(), d_beta.get(), d_x.get()); - GKO_ASSERT_MTX_NEAR(d_x, x, 1e-12); + GKO_ASSERT_MTX_NEAR(d_x, x, 100 * r::value); } TEST_F(Jacobi, ComputesTheSameConditionNumberAsRef) { + SKIP_IF_SINGLE_MODE; initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, {dp, dp, dp, dp, dp, dp, dp, dp, dp, dp}, {}, 13, 97, 99); @@ -548,7 +560,7 @@ TEST_F(Jacobi, SelectsTheSamePrecisionsAsRef) initialize_data( {0, 2, 14, 27, 40, 51, 61, 70, 80, 92, 100}, {ap, ap, ap, ap, ap, ap, ap, ap, ap, ap}, - {1e+0, 1e+0, 1e+2, 1e+3, 1e+4, 1e+4, 1e+6, 1e+7, 1e+8, 1e+9}, 13, 97, + {1e+0, 1e+0, 1e+2, 1e+3, 1e+4, 1e+4, 1e+6, 1e+7, 1e+8, 1e+9}, 32, 97, 99, 1, 0.2); auto bj = bj_factory->generate(mtx); @@ -566,7 +578,7 @@ TEST_F(Jacobi, SelectsTheSamePrecisionsAsRef) TEST_F(Jacobi, AvoidsPrecisionsThatOverflow) { - auto mtx = gko::matrix::Csr<>::create(dpcpp); + auto mtx = gko::matrix::Csr::create(dpcpp); // clang-format off mtx->read(mtx_data::diag({ // perfectly conditioned block, small value difference, @@ -582,26 +594,31 @@ TEST_F(Jacobi, 
AvoidsPrecisionsThatOverflow) auto bj = Bj::build() - .with_max_block_size(13u) + .with_max_block_size(32u) .with_block_pointers(gko::Array(dpcpp, {0, 2, 4})) .with_storage_optimization(gko::precision_reduction::autodetect()) - .with_accuracy(0.1) + .with_accuracy(value_type{0.1}) .on(dpcpp) ->generate(give(mtx)); - // both blocks are in the same group, both need (7, 8) auto h_bj = clone(ref, bj); auto prec = h_bj->get_parameters().storage_optimization.block_wise.get_const_data(); - EXPECT_EQ(prec[0], gko::precision_reduction(1, 1)); - ASSERT_EQ(prec[1], gko::precision_reduction(1, 1)); + EXPECT_EQ(prec[0], gko::precision_reduction(0, 2)); +// 2 - 0 is same as 1 - 1 when it is float +#if GINKGO_DPCPP_SINGLE_MODE + ASSERT_EQ(prec[1], gko::precision_reduction(2, 0)); +#else + ASSERT_EQ(prec[1], gko::precision_reduction(2, 0)); +#endif } TEST_F(Jacobi, DpcppPreconditionerEquivalentToRefWithFullPrecision) { + SKIP_IF_SINGLE_MODE; initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, - {dp, dp, dp, dp, dp, dp, dp, dp, dp, dp}, {}, 13, 97, 99); + {dp, dp, dp, dp, dp, dp, dp, dp, dp, dp}, {}, 32, 97, 99); auto bj = bj_factory->generate(mtx); auto d_bj = d_bj_factory->generate(mtx); @@ -619,7 +636,7 @@ TEST_F(Jacobi, DpcppPreconditionerEquivalentToRefWithReducedPrecision) auto bj = bj_factory->generate(mtx); auto d_bj = d_bj_factory->generate(mtx); - GKO_ASSERT_MTX_NEAR(lend(d_bj), lend(bj), 1e-7); + GKO_ASSERT_MTX_NEAR(lend(d_bj), lend(bj), 2e-7); } @@ -664,6 +681,7 @@ TEST_F(Jacobi, DpcppPreconditionerEquivalentToRefWithCustomQuarteredPrecision) TEST_F(Jacobi, DpcppPreconditionerEquivalentToRefWithAdaptivePrecision) { + SKIP_IF_SINGLE_MODE; initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, {sp, sp, dp, dp, tp, tp, qp, qp, hp, dp, up}, {}, 13, 97, 99); @@ -678,6 +696,7 @@ TEST_F(Jacobi, DpcppPreconditionerEquivalentToRefWithAdaptivePrecision) TEST_F(Jacobi, DpcppTransposedPreconditionerEquivalentToRefWithAdaptivePrecision) { + SKIP_IF_SINGLE_MODE; initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, {sp, sp, dp, dp, tp, tp, qp, qp, hp, dp, up}, {}, 13, 97, 99); @@ -694,6 +713,7 @@ TEST_F(Jacobi, TEST_F(Jacobi, DpcppConjTransposedPreconditionerEquivalentToRefWithAdaptivePrecision) { + SKIP_IF_SINGLE_MODE; initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, {sp, sp, dp, dp, tp, tp, qp, qp, hp, dp, up}, {}, 13, 97, 99); @@ -709,6 +729,7 @@ TEST_F(Jacobi, TEST_F(Jacobi, DpcppApplyEquivalentToRefWithFullPrecision) { + SKIP_IF_SINGLE_MODE; initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, {dp, dp, dp, dp, dp, dp, dp, dp, dp, dp, dp}, {}, 13, 97, 99); @@ -724,6 +745,7 @@ TEST_F(Jacobi, DpcppApplyEquivalentToRefWithFullPrecision) TEST_F(Jacobi, DpcppApplyEquivalentToRefWithReducedPrecision) { + SKIP_IF_SINGLE_MODE; initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, {sp, sp, sp, sp, sp, sp, sp, sp, sp, sp, sp}, {}, 13, 97, 99); @@ -739,6 +761,7 @@ TEST_F(Jacobi, DpcppApplyEquivalentToRefWithReducedPrecision) TEST_F(Jacobi, DpcppApplyEquivalentToRefWithCustomReducedPrecision) { + SKIP_IF_SINGLE_MODE; initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, {tp, tp, tp, tp, tp, tp, tp, tp, tp, tp, tp}, {}, 13, 97, 99); @@ -799,6 +822,7 @@ TEST_F(Jacobi, DpcppApplyEquivalentToRefWithCustomQuarteredPrecision) TEST_F(Jacobi, DpcppApplyEquivalentToRefWithAdaptivePrecision) { + SKIP_IF_SINGLE_MODE; initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, {sp, sp, dp, dp, tp, tp, qp, qp, hp, dp, up}, {}, 13, 97, 99); @@ -814,6 +838,7 @@ 
TEST_F(Jacobi, DpcppApplyEquivalentToRefWithAdaptivePrecision) TEST_F(Jacobi, DpcppLinearCombinationApplyEquivalentToRefWithAdaptivePrecision) { + SKIP_IF_SINGLE_MODE; initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, {sp, dp, dp, sp, sp, sp, dp, dp, sp, dp, sp}, {}, 13, 97, 99); @@ -833,6 +858,7 @@ TEST_F(Jacobi, DpcppLinearCombinationApplyEquivalentToRefWithAdaptivePrecision) TEST_F(Jacobi, DpcppApplyToMultipleVectorsEquivalentToRefWithFullPrecision) { + SKIP_IF_SINGLE_MODE; initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, {dp, dp, dp, dp, dp, dp, dp, dp, dp, dp, dp}, {}, 13, 97, 99, 5); @@ -863,6 +889,7 @@ TEST_F(Jacobi, DpcppApplyToMultipleVectorsEquivalentToRefWithReducedPrecision) TEST_F(Jacobi, DpcppApplyToMultipleVectorsEquivalentToRefWithAdaptivePrecision) { + SKIP_IF_SINGLE_MODE; initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, {sp, sp, dp, dp, tp, tp, qp, qp, hp, dp, up}, {}, 13, 97, 99, 5); @@ -880,6 +907,7 @@ TEST_F( Jacobi, DpcppLinearCombinationApplyToMultipleVectorsEquivalentToRefWithAdaptivePrecision) { + SKIP_IF_SINGLE_MODE; initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, {sp, dp, dp, sp, sp, sp, dp, dp, sp, dp, sp}, {}, 13, 97, 99, 5); diff --git a/omp/preconditioner/jacobi_kernels.cpp b/omp/preconditioner/jacobi_kernels.cpp index 074168047d3..2966a70eb78 100644 --- a/omp/preconditioner/jacobi_kernels.cpp +++ b/omp/preconditioner/jacobi_kernels.cpp @@ -348,7 +348,9 @@ inline bool validate_precision_reduction_feasibility(IndexType block_size, } cond *= compute_inf_norm(block_size, block_size, tmp_buffer, block_size); return cond >= 1.0 && - cond * float_traits>::eps < 1e-3; + cond * static_cast>( + float_traits>::eps) < + remove_complex{1e-3}; } diff --git a/reference/preconditioner/jacobi_kernels.cpp b/reference/preconditioner/jacobi_kernels.cpp index a2a63a0e896..ccbe1653990 100644 --- a/reference/preconditioner/jacobi_kernels.cpp +++ b/reference/preconditioner/jacobi_kernels.cpp @@ -332,7 +332,9 @@ inline bool validate_precision_reduction_feasibility( } cond *= compute_inf_norm(block_size, block_size, tmp.data(), block_size); return cond >= 1.0 && - cond * float_traits>::eps < 1e-3; + cond * static_cast>( + float_traits>::eps) < + remove_complex{1e-3}; } From 3e15d98411741bb61f0b6ba3e45cb5d49545eb16 Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. Tsai" Date: Mon, 4 Jul 2022 09:47:35 +0200 Subject: [PATCH 05/11] remove dpcpp math --- dpcpp/components/warp_blas.dp.hpp | 1 - .../preconditioner/jacobi_advanced_apply_instantiate.inc.dp.cpp | 1 - dpcpp/preconditioner/jacobi_generate_instantiate.inc.dp.cpp | 1 - dpcpp/preconditioner/jacobi_kernels.dp.cpp | 1 - dpcpp/preconditioner/jacobi_simple_apply_instantiate.inc.dp.cpp | 1 - 5 files changed, 5 deletions(-) diff --git a/dpcpp/components/warp_blas.dp.hpp b/dpcpp/components/warp_blas.dp.hpp index b31fd0ef4fd..756421c54f8 100644 --- a/dpcpp/components/warp_blas.dp.hpp +++ b/dpcpp/components/warp_blas.dp.hpp @@ -35,7 +35,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include -// #include #include diff --git a/dpcpp/preconditioner/jacobi_advanced_apply_instantiate.inc.dp.cpp b/dpcpp/preconditioner/jacobi_advanced_apply_instantiate.inc.dp.cpp index ffaef7e7650..9df2ca3741a 100644 --- a/dpcpp/preconditioner/jacobi_advanced_apply_instantiate.inc.dp.cpp +++ b/dpcpp/preconditioner/jacobi_advanced_apply_instantiate.inc.dp.cpp @@ -48,7 +48,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "core/synthesizer/implementation_selection.hpp" #include "dpcpp/base/config.hpp" #include "dpcpp/base/dim3.dp.hpp" -#include "dpcpp/base/math.hpp" #include "dpcpp/components/cooperative_groups.dp.hpp" #include "dpcpp/components/thread_ids.dp.hpp" #include "dpcpp/components/warp_blas.dp.hpp" diff --git a/dpcpp/preconditioner/jacobi_generate_instantiate.inc.dp.cpp b/dpcpp/preconditioner/jacobi_generate_instantiate.inc.dp.cpp index 5d27a02c65b..a8dcba503f1 100644 --- a/dpcpp/preconditioner/jacobi_generate_instantiate.inc.dp.cpp +++ b/dpcpp/preconditioner/jacobi_generate_instantiate.inc.dp.cpp @@ -50,7 +50,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "dpcpp/base/config.hpp" #include "dpcpp/base/dim3.dp.hpp" #include "dpcpp/base/dpct.hpp" -#include "dpcpp/base/math.hpp" #include "dpcpp/components/cooperative_groups.dp.hpp" #include "dpcpp/components/diagonal_block_manipulation.dp.hpp" #include "dpcpp/components/thread_ids.dp.hpp" diff --git a/dpcpp/preconditioner/jacobi_kernels.dp.cpp b/dpcpp/preconditioner/jacobi_kernels.dp.cpp index 105f1c6f96a..15b0ab51085 100644 --- a/dpcpp/preconditioner/jacobi_kernels.dp.cpp +++ b/dpcpp/preconditioner/jacobi_kernels.dp.cpp @@ -47,7 +47,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/synthesizer/implementation_selection.hpp" #include "dpcpp/base/config.hpp" #include "dpcpp/base/dim3.dp.hpp" -#include "dpcpp/base/math.hpp" #include "dpcpp/components/cooperative_groups.dp.hpp" #include "dpcpp/components/thread_ids.dp.hpp" diff --git a/dpcpp/preconditioner/jacobi_simple_apply_instantiate.inc.dp.cpp b/dpcpp/preconditioner/jacobi_simple_apply_instantiate.inc.dp.cpp index 8c3787cf721..92fd39c2f7e 100644 --- a/dpcpp/preconditioner/jacobi_simple_apply_instantiate.inc.dp.cpp +++ b/dpcpp/preconditioner/jacobi_simple_apply_instantiate.inc.dp.cpp @@ -48,7 +48,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/synthesizer/implementation_selection.hpp" #include "dpcpp/base/config.hpp" #include "dpcpp/base/dim3.dp.hpp" -#include "dpcpp/base/math.hpp" #include "dpcpp/components/cooperative_groups.dp.hpp" #include "dpcpp/components/thread_ids.dp.hpp" #include "dpcpp/components/warp_blas.dp.hpp" From 56956d4e7cc879b91ebb00f40b8e887b1a7e9db5 Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. 
Tsai" Date: Mon, 4 Jul 2022 17:58:59 +0200 Subject: [PATCH 06/11] fix Array and UninitializedArray naming --- .../jacobi_advanced_apply_kernel.dp.cpp | 6 +++--- .../jacobi_generate_instantiate.inc.dp.cpp | 8 ++++---- .../jacobi_generate_kernel.dp.cpp | 6 +++--- dpcpp/preconditioner/jacobi_kernels.dp.cpp | 20 +++++++++---------- .../jacobi_simple_apply_kernel.dp.cpp | 4 ++-- dpcpp/test/preconditioner/jacobi_kernels.cpp | 6 +++--- 6 files changed, 25 insertions(+), 25 deletions(-) diff --git a/dpcpp/preconditioner/jacobi_advanced_apply_kernel.dp.cpp b/dpcpp/preconditioner/jacobi_advanced_apply_kernel.dp.cpp index d90342f7575..ece82241ac8 100644 --- a/dpcpp/preconditioner/jacobi_advanced_apply_kernel.dp.cpp +++ b/dpcpp/preconditioner/jacobi_advanced_apply_kernel.dp.cpp @@ -74,9 +74,9 @@ void apply(std::shared_ptr exec, size_type num_blocks, uint32 max_block_size, const preconditioner::block_interleaved_storage_scheme& storage_scheme, - const Array& block_precisions, - const Array& block_pointers, - const Array& blocks, + const array& block_precisions, + const array& block_pointers, + const array& blocks, const matrix::Dense* alpha, const matrix::Dense* b, const matrix::Dense* beta, matrix::Dense* x) diff --git a/dpcpp/preconditioner/jacobi_generate_instantiate.inc.dp.cpp b/dpcpp/preconditioner/jacobi_generate_instantiate.inc.dp.cpp index a8dcba503f1..4a0c03432aa 100644 --- a/dpcpp/preconditioner/jacobi_generate_instantiate.inc.dp.cpp +++ b/dpcpp/preconditioner/jacobi_generate_instantiate.inc.dp.cpp @@ -126,7 +126,7 @@ void generate( preconditioner::block_interleaved_storage_scheme storage_scheme, const IndexType* __restrict__ block_ptrs, size_type num_blocks, sycl::nd_item<3> item_ct1, - UninitializedArray* workspace) + uninitialized_array* workspace) { const auto block_id = thread::get_subwarp_id(item_ct1); @@ -162,7 +162,7 @@ void generate( { queue->submit([&](sycl::handler& cgh) { sycl::accessor< - UninitializedArray, 0, + uninitialized_array, 0, sycl::access_mode::read_write, sycl::access::target::local> workspace_acc_ct1(cgh); @@ -189,7 +189,7 @@ void adaptive_generate( precision_reduction* __restrict__ block_precisions, const IndexType* __restrict__ block_ptrs, size_type num_blocks, sycl::nd_item<3> item_ct1, - UninitializedArray* workspace) + uninitialized_array* workspace) { // extract blocks const auto block_id = @@ -284,7 +284,7 @@ void adaptive_generate( { queue->submit([&](sycl::handler& cgh) { sycl::accessor< - UninitializedArray, 0, + uninitialized_array, 0, sycl::access_mode::read_write, sycl::access::target::local> workspace_acc_ct1(cgh); diff --git a/dpcpp/preconditioner/jacobi_generate_kernel.dp.cpp b/dpcpp/preconditioner/jacobi_generate_kernel.dp.cpp index 91e7768fd54..fdd97a50749 100644 --- a/dpcpp/preconditioner/jacobi_generate_kernel.dp.cpp +++ b/dpcpp/preconditioner/jacobi_generate_kernel.dp.cpp @@ -77,9 +77,9 @@ void generate(std::shared_ptr exec, remove_complex accuracy, const preconditioner::block_interleaved_storage_scheme& storage_scheme, - Array>& conditioning, - Array& block_precisions, - const Array& block_pointers, Array& blocks) + array>& conditioning, + array& block_precisions, + const array& block_pointers, array& blocks) { components::fill_array(exec, blocks.get_data(), blocks.get_num_elems(), zero()); diff --git a/dpcpp/preconditioner/jacobi_kernels.dp.cpp b/dpcpp/preconditioner/jacobi_kernels.dp.cpp index 15b0ab51085..a6e35da67db 100644 --- a/dpcpp/preconditioner/jacobi_kernels.dp.cpp +++ b/dpcpp/preconditioner/jacobi_kernels.dp.cpp @@ -357,9 +357,9 @@ 
size_type find_natural_blocks(std::shared_ptr exec, int32 max_block_size, IndexType* __restrict__ block_ptrs) { - Array nums(exec, 1); + array nums(exec, 1); - Array matching_next_row(exec, mtx->get_size()[0] - 1); + array matching_next_row(exec, mtx->get_size()[0] - 1); const dim3 block_size(config::warp_size, 1, 1); const dim3 grid_size( @@ -381,7 +381,7 @@ inline size_type agglomerate_supervariables( std::shared_ptr exec, int32 max_block_size, size_type num_natural_blocks, IndexType* block_ptrs) { - Array nums(exec, 1); + array nums(exec, 1); agglomerate_supervariables_kernel(1, 1, 0, exec->get_queue(), max_block_size, num_natural_blocks, @@ -396,8 +396,8 @@ inline size_type agglomerate_supervariables( void initialize_precisions(std::shared_ptr exec, - const Array& source, - Array& precisions) + const array& source, + array& precisions) { const auto block_size = default_num_warps * config::warp_size; const auto grid_size = min( @@ -414,7 +414,7 @@ template void find_blocks(std::shared_ptr exec, const matrix::Csr* system_matrix, uint32 max_block_size, size_type& num_blocks, - Array& block_pointers) + array& block_pointers) { auto num_natural_blocks = find_natural_blocks( exec, system_matrix, max_block_size, block_pointers.get_data()); @@ -472,7 +472,7 @@ void transpose_jacobi( const array& block_pointers, const array& blocks, const preconditioner::block_interleaved_storage_scheme& storage_scheme, - Array& out_blocks) + array& out_blocks) { select_transpose_jacobi( compiled_kernels(), @@ -496,7 +496,7 @@ void conj_transpose_jacobi( const array& block_pointers, const array& blocks, const preconditioner::block_interleaved_storage_scheme& storage_scheme, - Array& out_blocks) + array& out_blocks) { select_transpose_jacobi( compiled_kernels(), @@ -516,8 +516,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( template void convert_to_dense( std::shared_ptr exec, size_type num_blocks, - const Array& block_precisions, - const Array& block_pointers, const Array& blocks, + const array& block_precisions, + const array& block_pointers, const array& blocks, const preconditioner::block_interleaved_storage_scheme& storage_scheme, ValueType* result_values, size_type result_stride) GKO_NOT_IMPLEMENTED; diff --git a/dpcpp/preconditioner/jacobi_simple_apply_kernel.dp.cpp b/dpcpp/preconditioner/jacobi_simple_apply_kernel.dp.cpp index 651a4ba194a..ff3be086a91 100644 --- a/dpcpp/preconditioner/jacobi_simple_apply_kernel.dp.cpp +++ b/dpcpp/preconditioner/jacobi_simple_apply_kernel.dp.cpp @@ -74,8 +74,8 @@ void simple_apply( uint32 max_block_size, const preconditioner::block_interleaved_storage_scheme& storage_scheme, - const Array& block_precisions, - const Array& block_pointers, const Array& blocks, + const array& block_precisions, + const array& block_pointers, const array& blocks, const matrix::Dense* b, matrix::Dense* x) { // TODO: write a special kernel for multiple RHS diff --git a/dpcpp/test/preconditioner/jacobi_kernels.cpp b/dpcpp/test/preconditioner/jacobi_kernels.cpp index e5ec9bc604c..8894b26a346 100644 --- a/dpcpp/test/preconditioner/jacobi_kernels.cpp +++ b/dpcpp/test/preconditioner/jacobi_kernels.cpp @@ -103,8 +103,8 @@ class Jacobi : public ::testing::Test { mtx = Mtx::create(ref); mtx->read(mtx_data::diag(begin(blocks), end(blocks))); } - gko::Array block_ptrs(ref, block_pointers); - gko::Array block_prec(ref, block_precisions); + gko::array block_ptrs(ref, block_pointers); + gko::array block_prec(ref, block_precisions); if (block_prec.get_num_elems() == 0) { bj_factory = Bj::build() 
.with_max_block_size(max_block_size) @@ -595,7 +595,7 @@ TEST_F(Jacobi, AvoidsPrecisionsThatOverflow) auto bj = Bj::build() .with_max_block_size(32u) - .with_block_pointers(gko::Array(dpcpp, {0, 2, 4})) + .with_block_pointers(gko::array(dpcpp, {0, 2, 4})) .with_storage_optimization(gko::precision_reduction::autodetect()) .with_accuracy(value_type{0.1}) .on(dpcpp) From f029c517b403e4b72c601db97953b9f6529905f5 Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. Tsai" Date: Mon, 11 Jul 2022 16:23:26 +0200 Subject: [PATCH 07/11] update subgroup setting --- .../diagonal_block_manipulation.dp.hpp | 2 +- ...cobi_advanced_apply_instantiate.inc.dp.cpp | 19 ++++---- .../jacobi_advanced_apply_kernel.dp.cpp | 2 +- .../jacobi_generate_instantiate.inc.dp.cpp | 24 ++++++---- .../jacobi_generate_kernel.dp.cpp | 2 +- dpcpp/preconditioner/jacobi_kernels.dp.cpp | 14 +++--- ...jacobi_simple_apply_instantiate.inc.dp.cpp | 8 ++-- .../jacobi_simple_apply_kernel.dp.cpp | 2 +- dpcpp/test/preconditioner/jacobi_kernels.cpp | 46 +++++++++++-------- 9 files changed, 70 insertions(+), 49 deletions(-) diff --git a/dpcpp/components/diagonal_block_manipulation.dp.hpp b/dpcpp/components/diagonal_block_manipulation.dp.hpp index 32e3e26739c..4d684f2c357 100644 --- a/dpcpp/components/diagonal_block_manipulation.dp.hpp +++ b/dpcpp/components/diagonal_block_manipulation.dp.hpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2021, the Ginkgo authors +Copyright (c) 2017-2022, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/dpcpp/preconditioner/jacobi_advanced_apply_instantiate.inc.dp.cpp b/dpcpp/preconditioner/jacobi_advanced_apply_instantiate.inc.dp.cpp index 9df2ca3741a..2235d886234 100644 --- a/dpcpp/preconditioner/jacobi_advanced_apply_instantiate.inc.dp.cpp +++ b/dpcpp/preconditioner/jacobi_advanced_apply_instantiate.inc.dp.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2021, the Ginkgo authors +Copyright (c) 2017-2022, the Ginkgo authors All rights reserved. 
Redistribution and use in source and binary forms, with or without @@ -107,7 +107,8 @@ void advanced_apply( const ValueType* b, int32 b_stride, ValueType* x, int32 x_stride) { queue->parallel_for( - sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + sycl_nd_range(grid, block), [= + ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(subwarp_size)]] { advanced_apply( blocks, storage_scheme, block_ptrs, num_blocks, alpha, b, b_stride, x, x_stride, item_ct1); @@ -163,12 +164,14 @@ void advanced_adaptive_apply( size_type num_blocks, const ValueType* alpha, const ValueType* b, int32 b_stride, ValueType* x, int32 x_stride) { - queue->parallel_for(sycl_nd_range(grid, block), [=](sycl::nd_item<3> - item_ct1) { - advanced_adaptive_apply( - blocks, storage_scheme, block_precisions, block_ptrs, num_blocks, - alpha, b, b_stride, x, x_stride, item_ct1); - }); + queue->parallel_for( + sycl_nd_range(grid, block), [= + ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(subwarp_size)]] { + advanced_adaptive_apply( + blocks, storage_scheme, block_precisions, block_ptrs, + num_blocks, alpha, b, b_stride, x, x_stride, item_ct1); + }); } diff --git a/dpcpp/preconditioner/jacobi_advanced_apply_kernel.dp.cpp b/dpcpp/preconditioner/jacobi_advanced_apply_kernel.dp.cpp index ece82241ac8..93565a3f16f 100644 --- a/dpcpp/preconditioner/jacobi_advanced_apply_kernel.dp.cpp +++ b/dpcpp/preconditioner/jacobi_advanced_apply_kernel.dp.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2021, the Ginkgo authors +Copyright (c) 2017-2022, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/dpcpp/preconditioner/jacobi_generate_instantiate.inc.dp.cpp b/dpcpp/preconditioner/jacobi_generate_instantiate.inc.dp.cpp index 4a0c03432aa..eab5ec118e4 100644 --- a/dpcpp/preconditioner/jacobi_generate_instantiate.inc.dp.cpp +++ b/dpcpp/preconditioner/jacobi_generate_instantiate.inc.dp.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2021, the Ginkgo authors +Copyright (c) 2017-2022, the Ginkgo authors All rights reserved. 
Redistribution and use in source and binary forms, with or without @@ -167,7 +167,9 @@ void generate( workspace_acc_ct1(cgh); cgh.parallel_for( - sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + sycl_nd_range(grid, block), [= + ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size( + subwarp_size)]] { generate( num_rows, row_ptrs, col_idxs, values, block_data, storage_scheme, block_ptrs, num_blocks, item_ct1, @@ -288,13 +290,17 @@ void adaptive_generate( sycl::access_mode::read_write, sycl::access::target::local> workspace_acc_ct1(cgh); - cgh.parallel_for(sycl_nd_range(grid, block), [=](sycl::nd_item<3> - item_ct1) { - adaptive_generate( - num_rows, row_ptrs, col_idxs, values, accuracy, block_data, - storage_scheme, conditioning, block_precisions, block_ptrs, - num_blocks, item_ct1, workspace_acc_ct1.get_pointer().get()); - }); + cgh.parallel_for( + sycl_nd_range(grid, block), [= + ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size( + subwarp_size)]] { + adaptive_generate( + num_rows, row_ptrs, col_idxs, values, accuracy, block_data, + storage_scheme, conditioning, block_precisions, block_ptrs, + num_blocks, item_ct1, + workspace_acc_ct1.get_pointer().get()); + }); }); } diff --git a/dpcpp/preconditioner/jacobi_generate_kernel.dp.cpp b/dpcpp/preconditioner/jacobi_generate_kernel.dp.cpp index fdd97a50749..e580719a4b8 100644 --- a/dpcpp/preconditioner/jacobi_generate_kernel.dp.cpp +++ b/dpcpp/preconditioner/jacobi_generate_kernel.dp.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2021, the Ginkgo authors +Copyright (c) 2017-2022, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/dpcpp/preconditioner/jacobi_kernels.dp.cpp b/dpcpp/preconditioner/jacobi_kernels.dp.cpp index a6e35da67db..8095fe9fe24 100644 --- a/dpcpp/preconditioner/jacobi_kernels.dp.cpp +++ b/dpcpp/preconditioner/jacobi_kernels.dp.cpp @@ -152,7 +152,9 @@ void compare_adjacent_rows(dim3 grid, dim3 block, bool* matching_next_row) { queue->parallel_for( - sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + sycl_nd_range(grid, block), [= + ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size( + config::warp_size)]] { compare_adjacent_rows(num_rows, max_block_size, row_ptrs, col_idx, matching_next_row, item_ct1); }); @@ -280,7 +282,8 @@ void transpose_jacobi( const IndexType* block_ptrs, size_type num_blocks, ValueType* out_blocks) { queue->parallel_for( - sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + sycl_nd_range(grid, block), [= + ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(subwarp_size)]] { transpose_jacobi(blocks, storage_scheme, block_ptrs, num_blocks, @@ -310,10 +313,6 @@ void adaptive_transpose_jacobi( const auto block_stride = storage_scheme.get_stride(); const auto rank = subwarp.thread_rank(); if (rank < block_size) { - /* - DPCT1007:6: Migration of this CUDA API is not supported by the Intel(R) - DPC++ Compatibility Tool. 
- */ GKO_PRECONDITIONER_JACOBI_RESOLVE_PRECISION( ValueType, block_precisions[block_id], auto local_block = @@ -342,7 +341,8 @@ void adaptive_transpose_jacobi( size_type num_blocks, ValueType* out_blocks) { queue->parallel_for( - sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + sycl_nd_range(grid, block), [= + ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(subwarp_size)]] { adaptive_transpose_jacobi( blocks, storage_scheme, block_precisions, block_ptrs, diff --git a/dpcpp/preconditioner/jacobi_simple_apply_instantiate.inc.dp.cpp b/dpcpp/preconditioner/jacobi_simple_apply_instantiate.inc.dp.cpp index 92fd39c2f7e..f7c7aa2573b 100644 --- a/dpcpp/preconditioner/jacobi_simple_apply_instantiate.inc.dp.cpp +++ b/dpcpp/preconditioner/jacobi_simple_apply_instantiate.inc.dp.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2021, the Ginkgo authors +Copyright (c) 2017-2022, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without @@ -107,7 +107,8 @@ void apply( int32 b_stride, ValueType* x, int32 x_stride) { queue->parallel_for( - sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + sycl_nd_range(grid, block), [= + ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(subwarp_size)]] { apply( blocks, storage_scheme, block_ptrs, num_blocks, b, b_stride, x, x_stride, item_ct1); @@ -161,7 +162,8 @@ void adaptive_apply( int32 x_stride) { queue->parallel_for( - sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + sycl_nd_range(grid, block), [= + ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(subwarp_size)]] { adaptive_apply( blocks, storage_scheme, block_precisions, block_ptrs, num_blocks, b, b_stride, x, x_stride, item_ct1); diff --git a/dpcpp/preconditioner/jacobi_simple_apply_kernel.dp.cpp b/dpcpp/preconditioner/jacobi_simple_apply_kernel.dp.cpp index ff3be086a91..37519d96779 100644 --- a/dpcpp/preconditioner/jacobi_simple_apply_kernel.dp.cpp +++ b/dpcpp/preconditioner/jacobi_simple_apply_kernel.dp.cpp @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2021, the Ginkgo authors +Copyright (c) 2017-2022, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/dpcpp/test/preconditioner/jacobi_kernels.cpp b/dpcpp/test/preconditioner/jacobi_kernels.cpp index 8894b26a346..617a7b8b31c 100644 --- a/dpcpp/test/preconditioner/jacobi_kernels.cpp +++ b/dpcpp/test/preconditioner/jacobi_kernels.cpp @@ -45,6 +45,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "core/test/utils.hpp" #include "core/test/utils/unsort_matrix.hpp" +#include "core/utils/matrix_utils.hpp" #include "dpcpp/test/utils.hpp" @@ -53,6 +54,7 @@ namespace { class Jacobi : public ::testing::Test { protected: + using index_type = int32_t; #if GINKGO_DPCPP_SINGLE_MODE using value_type = float; #else @@ -308,13 +310,13 @@ TEST_F(Jacobi, DpcppPreconditionerEquivalentToRefWithBlockSize32Sorted) auto d_bj = d_bj_factory->generate(mtx); GKO_ASSERT_MTX_NEAR(gko::as(d_bj.get()), gko::as(bj.get()), - 50 * r::value); + 100 * r::value); } TEST_F(Jacobi, DpcppPreconditionerEquivalentToRefWithBlockSize32Unsorted) { - std::ranlux48 engine(42); + std::default_random_engine engine(42); initialize_data({0, 32, 64, 96, 128}, {}, {}, 32, 100, 110, 1, 0.1, false); gko::test::unsort_matrix(mtx.get(), engine); @@ -322,7 +324,7 @@ TEST_F(Jacobi, DpcppPreconditionerEquivalentToRefWithBlockSize32Unsorted) auto d_bj = d_bj_factory->generate(mtx); GKO_ASSERT_MTX_NEAR(gko::as(d_bj.get()), gko::as(bj.get()), - 50 * r::value); + 100 * r::value); } @@ -424,11 +426,14 @@ TEST_F(Jacobi, DpcppApplyEquivalentToRef) TEST_F(Jacobi, DpcppScalarApplyEquivalentToRef) { gko::size_type dim = 313; - std::ranlux48 engine(42); - auto dense_smtx = gko::share(gko::test::generate_random_matrix( - dim, dim, std::uniform_int_distribution<>(1, dim), - std::normal_distribution(1.0, 2.0), engine, ref)); - gko::test::make_diag_dominant(dense_smtx.get()); + std::default_random_engine engine(42); + auto dense_data = + gko::test::generate_random_matrix_data( + dim, dim, std::uniform_int_distribution<>(1, dim), + std::normal_distribution<>(1.0, 2.0), engine); + gko::utils::make_diag_dominant(dense_data); + auto dense_smtx = gko::share(Vec::create(ref)); + dense_smtx->read(dense_data); auto smtx = gko::share(Mtx::create(ref)); smtx->copy_from(dense_smtx.get()); auto sb = gko::share(gko::test::generate_random_matrix( @@ -473,11 +478,14 @@ TEST_F(Jacobi, DpcppLinearCombinationApplyEquivalentToRef) TEST_F(Jacobi, DpcppScalarLinearCombinationApplyEquivalentToRef) { gko::size_type dim = 313; - std::ranlux48 engine(42); - auto dense_smtx = gko::share(gko::test::generate_random_matrix( - dim, dim, std::uniform_int_distribution<>(1, dim), - std::normal_distribution(1.0, 2.0), engine, ref)); - gko::test::make_diag_dominant(dense_smtx.get()); + std::default_random_engine engine(42); + auto dense_data = + gko::test::generate_random_matrix_data( + dim, dim, std::uniform_int_distribution<>(1, dim), + std::normal_distribution(1.0, 2.0), engine); + gko::utils::make_diag_dominant(dense_data); + auto dense_smtx = gko::share(Vec::create(ref)); + dense_smtx->read(dense_data); auto smtx = gko::share(Mtx::create(ref)); smtx->copy_from(dense_smtx.get()); auto sb = gko::share(gko::test::generate_random_matrix( @@ -601,16 +609,18 @@ TEST_F(Jacobi, AvoidsPrecisionsThatOverflow) .on(dpcpp) ->generate(give(mtx)); + // both blocks are in the same group, both need (7, 8) auto h_bj = clone(ref, bj); auto prec = h_bj->get_parameters().storage_optimization.block_wise.get_const_data(); - EXPECT_EQ(prec[0], gko::precision_reduction(0, 2)); -// 2 - 0 is same as 1 - 1 when it is float + // 2 - 0 is same as 1 - 1 when it is float #if GINKGO_DPCPP_SINGLE_MODE + EXPECT_EQ(prec[0], gko::precision_reduction(2, 0)); ASSERT_EQ(prec[1], gko::precision_reduction(2, 0)); #else - ASSERT_EQ(prec[1], gko::precision_reduction(2, 0)); -#endif + EXPECT_EQ(prec[0], gko::precision_reduction(1, 1)); + ASSERT_EQ(prec[1], gko::precision_reduction(1, 1)); +#endif // 
GINKGO_DPCPP_SINGLE_MODE } @@ -636,7 +646,7 @@ TEST_F(Jacobi, DpcppPreconditionerEquivalentToRefWithReducedPrecision) auto bj = bj_factory->generate(mtx); auto d_bj = d_bj_factory->generate(mtx); - GKO_ASSERT_MTX_NEAR(lend(d_bj), lend(bj), 2e-7); + GKO_ASSERT_MTX_NEAR(lend(d_bj), lend(bj), 1e-6); } From 1b0476c433c443d46ac27c78d1a1f2deb7ff6d28 Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. Tsai" Date: Thu, 14 Jul 2022 16:24:51 +0200 Subject: [PATCH 08/11] adapt the answer with single precision --- dpcpp/test/preconditioner/jacobi_kernels.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/dpcpp/test/preconditioner/jacobi_kernels.cpp b/dpcpp/test/preconditioner/jacobi_kernels.cpp index 617a7b8b31c..a4e31d11d71 100644 --- a/dpcpp/test/preconditioner/jacobi_kernels.cpp +++ b/dpcpp/test/preconditioner/jacobi_kernels.cpp @@ -609,16 +609,16 @@ TEST_F(Jacobi, AvoidsPrecisionsThatOverflow) .on(dpcpp) ->generate(give(mtx)); - // both blocks are in the same group, both need (7, 8) + // dpcpp considers all block separately auto h_bj = clone(ref, bj); auto prec = h_bj->get_parameters().storage_optimization.block_wise.get_const_data(); - // 2 - 0 is same as 1 - 1 when it is float #if GINKGO_DPCPP_SINGLE_MODE - EXPECT_EQ(prec[0], gko::precision_reduction(2, 0)); + ASSERT_EQ(prec[0], gko::precision_reduction(0, 2)); + // In single value, precision_reduction(1, 1) == precision_reduction(2, 0) ASSERT_EQ(prec[1], gko::precision_reduction(2, 0)); #else - EXPECT_EQ(prec[0], gko::precision_reduction(1, 1)); + ASSERT_EQ(prec[0], gko::precision_reduction(0, 2)); ASSERT_EQ(prec[1], gko::precision_reduction(1, 1)); #endif // GINKGO_DPCPP_SINGLE_MODE } @@ -646,7 +646,7 @@ TEST_F(Jacobi, DpcppPreconditionerEquivalentToRefWithReducedPrecision) auto bj = bj_factory->generate(mtx); auto d_bj = d_bj_factory->generate(mtx); - GKO_ASSERT_MTX_NEAR(lend(d_bj), lend(bj), 1e-6); + GKO_ASSERT_MTX_NEAR(lend(d_bj), lend(bj), 1e-5); } @@ -893,7 +893,7 @@ TEST_F(Jacobi, DpcppApplyToMultipleVectorsEquivalentToRefWithReducedPrecision) bj->apply(b.get(), x.get()); d_bj->apply(d_b.get(), d_x.get()); - GKO_ASSERT_MTX_NEAR(d_x, x, 1e-6); + GKO_ASSERT_MTX_NEAR(d_x, x, 1e-5); } From b8837fa5d5017bc63b06dfefc81d8a79194f5209 Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. Tsai" Date: Thu, 21 Jul 2022 11:51:52 +0200 Subject: [PATCH 09/11] workaround for cpu issue --- core/preconditioner/jacobi_utils.hpp | 4 +- .../jacobi_generate_instantiate.inc.dp.cpp | 65 ++++++++++++++++++- dpcpp/preconditioner/jacobi_kernels.dp.cpp | 5 +- dpcpp/test/preconditioner/jacobi_kernels.cpp | 57 ++++++++-------- 4 files changed, 97 insertions(+), 34 deletions(-) diff --git a/core/preconditioner/jacobi_utils.hpp b/core/preconditioner/jacobi_utils.hpp index 302ec84f4ee..4534eca3660 100644 --- a/core/preconditioner/jacobi_utils.hpp +++ b/core/preconditioner/jacobi_utils.hpp @@ -89,13 +89,15 @@ struct precision_reduction_descriptor { static constexpr GKO_ATTRIBUTES uint32 singleton(const precision_reduction& pr) { + // clang-format off return pr == precision_reduction(0, 0) ? p0n0 : pr == precision_reduction(0, 1) ? p0n1 : pr == precision_reduction(0, 2) ? p0n2 : pr == precision_reduction(1, 0) ? p1n0 : pr == precision_reduction(1, 1) ? p1n1 : pr == precision_reduction(2, 0) ? 
p2n0 - : p0n0; + : p0n0; + // clang-format on } }; diff --git a/dpcpp/preconditioner/jacobi_generate_instantiate.inc.dp.cpp b/dpcpp/preconditioner/jacobi_generate_instantiate.inc.dp.cpp index eab5ec118e4..2f49d924f7b 100644 --- a/dpcpp/preconditioner/jacobi_generate_instantiate.inc.dp.cpp +++ b/dpcpp/preconditioner/jacobi_generate_instantiate.inc.dp.cpp @@ -178,6 +178,57 @@ void generate( }); } +namespace detail { + + +/** + * TODO: Fewer threads are involved with verificator1 when the function tries + * to call verificator1 twice on some CPUs. This needs further investigation + * and a reproducer. As the current workaround, we pass the same lambda + * function as verificator1 to verificator3. The rest is copied from + * core/preconditioner/jacobi_utils.hpp. + */ +template +GKO_ATTRIBUTES GKO_INLINE uint32 get_supported_storage_reductions_dpcpp( AccuracyType accuracy, CondType cond, Predicate1 verificator1, + Predicate2 verificator2, Predicate3 verificator3) +{ + using gko::detail::float_traits; + using type = remove_complex; + using prd = preconditioner::detail::precision_reduction_descriptor; + auto accurate = [&cond, &accuracy](type eps) { + return cond * eps < accuracy; + }; + uint8 is_verified1 = 2; + auto supported = static_cast(prd::p0n0); + // the following code uses short-circuiting to avoid calling possibly + // expensive verificators multiple times + if (accurate(float_traits>>::eps)) { + supported |= prd::p2n0; + } + if (accurate(float_traits>>::eps) && + (is_verified1 = verificator1())) { + supported |= prd::p1n1; + } + if (accurate(float_traits>>::eps) && + is_verified1 != 0 && verificator2()) { + supported |= prd::p0n2; + } + if (accurate(float_traits>::eps)) { + supported |= prd::p1n0; + } + if (accurate(float_traits>::eps) && + (is_verified1 == 1 || + (is_verified1 == 2 && (is_verified1 = verificator3())))) { + supported |= prd::p0n1; + } + return supported; +} + + +} // namespace detail + template @@ -225,8 +276,8 @@ void adaptive_generate( preconditioner::detail::precision_reduction_descriptor::singleton( prec); if (prec == precision_reduction::autodetect()) { - using preconditioner::detail::get_supported_storage_reductions; - prec_descriptor = get_supported_storage_reductions( + using detail::get_supported_storage_reductions_dpcpp; + prec_descriptor = get_supported_storage_reductions_dpcpp( accuracy, block_cond, [&subwarp, &block_size, &row, &block_data, &storage_scheme, &block_id] { @@ -248,6 +299,16 @@ void adaptive_generate( block_data + storage_scheme.get_global_block_offset(block_id), storage_scheme.get_stride()); + }, + [&subwarp, &block_size, &row, &block_data, &storage_scheme, + &block_id] { + using target = reduce_precision; + return validate_precision_reduction_feasibility< + max_block_size, target>( + subwarp, block_size, row, + block_data + + storage_scheme.get_global_block_offset(block_id), + storage_scheme.get_stride()); }); } } diff --git a/dpcpp/preconditioner/jacobi_kernels.dp.cpp b/dpcpp/preconditioner/jacobi_kernels.dp.cpp index 8095fe9fe24..c90a3452969 100644 --- a/dpcpp/preconditioner/jacobi_kernels.dp.cpp +++ b/dpcpp/preconditioner/jacobi_kernels.dp.cpp @@ -63,8 +63,9 @@ namespace jacobi { namespace { -// a total of 32 warps (1024 threads) -constexpr int default_num_warps = 16; +// a total of 8 subgroups of size 32 (256 threads) +constexpr int default_num_warps = 8; +// TODO: get a default_grid_size for dpcpp // with current architectures, at most 32 warps can be scheduled per SM (and // current GPUs have at most 84 SMs) constexpr int default_grid_size = 32 * 
32 * 128; diff --git a/dpcpp/test/preconditioner/jacobi_kernels.cpp b/dpcpp/test/preconditioner/jacobi_kernels.cpp index a4e31d11d71..e4d63617eb7 100644 --- a/dpcpp/test/preconditioner/jacobi_kernels.cpp +++ b/dpcpp/test/preconditioner/jacobi_kernels.cpp @@ -343,7 +343,7 @@ TEST_F(Jacobi, DpcppPreconditionerEquivalentToRefWithDifferentBlockSize) TEST_F(Jacobi, DpcppPreconditionerEquivalentToRefWithMPW) { - initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, {}, {}, 13, + initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, {}, {}, 32, 97, 99); auto bj = bj_factory->generate(mtx); @@ -356,7 +356,7 @@ TEST_F(Jacobi, DpcppPreconditionerEquivalentToRefWithMPW) TEST_F(Jacobi, DpcppTransposedPreconditionerEquivalentToRefWithMPW) { - initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, {}, {}, 13, + initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, {}, {}, 32, 97, 99); auto bj = bj_factory->generate(mtx); @@ -370,7 +370,7 @@ TEST_F(Jacobi, DpcppTransposedPreconditionerEquivalentToRefWithMPW) TEST_F(Jacobi, DpcppConjTransposedPreconditionerEquivalentToRefWithMPW) { - initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, {}, {}, 13, + initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, {}, {}, 32, 97, 99); auto bj = bj_factory->generate(mtx); @@ -411,7 +411,7 @@ TEST_F(Jacobi, DpcppApplyEquivalentToRefWithDifferentBlockSize) TEST_F(Jacobi, DpcppApplyEquivalentToRef) { - initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, {}, {}, 13, + initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, {}, {}, 32, 97, 99); auto bj = bj_factory->generate(mtx); auto d_bj = d_bj_factory->generate(mtx); @@ -459,7 +459,7 @@ TEST_F(Jacobi, DpcppScalarApplyEquivalentToRef) TEST_F(Jacobi, DpcppLinearCombinationApplyEquivalentToRef) { - initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, {}, {}, 13, + initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, {}, {}, 32, 97, 99); auto alpha = gko::initialize({2.0}, ref); auto d_alpha = gko::initialize({2.0}, dpcpp); @@ -517,7 +517,7 @@ TEST_F(Jacobi, DpcppScalarLinearCombinationApplyEquivalentToRef) TEST_F(Jacobi, DpcppApplyToMultipleVectorsEquivalentToRef) { - initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, {}, {}, 13, + initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, {}, {}, 32, 97, 99, 5); auto bj = bj_factory->generate(mtx); auto d_bj = d_bj_factory->generate(mtx); @@ -531,7 +531,7 @@ TEST_F(Jacobi, DpcppApplyToMultipleVectorsEquivalentToRef) TEST_F(Jacobi, DpcppLinearCombinationApplyToMultipleVectorsEquivalentToRef) { - initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, {}, {}, 13, + initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, {}, {}, 32, 97, 99, 5); auto alpha = gko::initialize({2.0}, ref); auto d_alpha = gko::initialize({2.0}, dpcpp); @@ -551,7 +551,7 @@ TEST_F(Jacobi, ComputesTheSameConditionNumberAsRef) { SKIP_IF_SINGLE_MODE; initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, - {dp, dp, dp, dp, dp, dp, dp, dp, dp, dp}, {}, 13, 97, 99); + {dp, dp, dp, dp, dp, dp, dp, dp, dp, dp}, {}, 32, 97, 99); auto bj = bj_factory->generate(mtx); auto d_bj = clone(ref, d_bj_factory->generate(mtx)); @@ -613,12 +613,11 @@ TEST_F(Jacobi, AvoidsPrecisionsThatOverflow) auto h_bj = clone(ref, bj); auto prec = h_bj->get_parameters().storage_optimization.block_wise.get_const_data(); -#if GINKGO_DPCPP_SINGLE_MODE ASSERT_EQ(prec[0], gko::precision_reduction(0, 2)); +#if GINKGO_DPCPP_SINGLE_MODE // In single value, 
precision_reduction(1, 1) == precision_reduction(2, 0) ASSERT_EQ(prec[1], gko::precision_reduction(2, 0)); #else - ASSERT_EQ(prec[0], gko::precision_reduction(0, 2)); ASSERT_EQ(prec[1], gko::precision_reduction(1, 1)); #endif // GINKGO_DPCPP_SINGLE_MODE } @@ -640,7 +639,7 @@ TEST_F(Jacobi, DpcppPreconditionerEquivalentToRefWithFullPrecision) TEST_F(Jacobi, DpcppPreconditionerEquivalentToRefWithReducedPrecision) { initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, - {sp, sp, sp, sp, sp, sp, sp, sp, sp, sp, sp}, {}, 13, 97, + {sp, sp, sp, sp, sp, sp, sp, sp, sp, sp, sp}, {}, 32, 97, 99); auto bj = bj_factory->generate(mtx); @@ -653,7 +652,7 @@ TEST_F(Jacobi, DpcppPreconditionerEquivalentToRefWithReducedPrecision) TEST_F(Jacobi, DpcppPreconditionerEquivalentToRefWithCustomReducedPrecision) { initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, - {tp, tp, tp, tp, tp, tp, tp, tp, tp, tp, tp}, {}, 13, 97, + {tp, tp, tp, tp, tp, tp, tp, tp, tp, tp, tp}, {}, 32, 97, 99); auto bj = bj_factory->generate(mtx); @@ -666,7 +665,7 @@ TEST_F(Jacobi, DpcppPreconditionerEquivalentToRefWithCustomReducedPrecision) TEST_F(Jacobi, DpcppPreconditionerEquivalentToRefWithQuarteredPrecision) { initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, - {hp, hp, hp, hp, hp, hp, hp, hp, hp, hp, hp}, {}, 13, 97, + {hp, hp, hp, hp, hp, hp, hp, hp, hp, hp, hp}, {}, 32, 97, 99); auto bj = bj_factory->generate(mtx); @@ -679,7 +678,7 @@ TEST_F(Jacobi, DpcppPreconditionerEquivalentToRefWithQuarteredPrecision) TEST_F(Jacobi, DpcppPreconditionerEquivalentToRefWithCustomQuarteredPrecision) { initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, - {qp, qp, qp, qp, qp, qp, qp, qp, qp, qp, qp}, {}, 13, 97, + {qp, qp, qp, qp, qp, qp, qp, qp, qp, qp, qp}, {}, 32, 97, 99); auto bj = bj_factory->generate(mtx); @@ -693,7 +692,7 @@ TEST_F(Jacobi, DpcppPreconditionerEquivalentToRefWithAdaptivePrecision) { SKIP_IF_SINGLE_MODE; initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, - {sp, sp, dp, dp, tp, tp, qp, qp, hp, dp, up}, {}, 13, 97, + {sp, sp, dp, dp, tp, tp, qp, qp, hp, dp, up}, {}, 32, 97, 99); auto bj = bj_factory->generate(mtx); @@ -708,7 +707,7 @@ TEST_F(Jacobi, { SKIP_IF_SINGLE_MODE; initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, - {sp, sp, dp, dp, tp, tp, qp, qp, hp, dp, up}, {}, 13, 97, + {sp, sp, dp, dp, tp, tp, qp, qp, hp, dp, up}, {}, 32, 97, 99); auto bj = bj_factory->generate(mtx); @@ -725,7 +724,7 @@ TEST_F(Jacobi, { SKIP_IF_SINGLE_MODE; initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, - {sp, sp, dp, dp, tp, tp, qp, qp, hp, dp, up}, {}, 13, 97, + {sp, sp, dp, dp, tp, tp, qp, qp, hp, dp, up}, {}, 32, 97, 99); auto bj = bj_factory->generate(mtx); @@ -741,7 +740,7 @@ TEST_F(Jacobi, DpcppApplyEquivalentToRefWithFullPrecision) { SKIP_IF_SINGLE_MODE; initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, - {dp, dp, dp, dp, dp, dp, dp, dp, dp, dp, dp}, {}, 13, 97, + {dp, dp, dp, dp, dp, dp, dp, dp, dp, dp, dp}, {}, 32, 97, 99); auto bj = bj_factory->generate(mtx); auto d_bj = d_bj_factory->generate(mtx); @@ -757,7 +756,7 @@ TEST_F(Jacobi, DpcppApplyEquivalentToRefWithReducedPrecision) { SKIP_IF_SINGLE_MODE; initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, - {sp, sp, sp, sp, sp, sp, sp, sp, sp, sp, sp}, {}, 13, 97, + {sp, sp, sp, sp, sp, sp, sp, sp, sp, sp, sp}, {}, 32, 97, 99); auto bj = bj_factory->generate(mtx); auto d_bj = d_bj_factory->generate(mtx); @@ -773,7 +772,7 @@ TEST_F(Jacobi, 
DpcppApplyEquivalentToRefWithCustomReducedPrecision) { SKIP_IF_SINGLE_MODE; initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, - {tp, tp, tp, tp, tp, tp, tp, tp, tp, tp, tp}, {}, 13, 97, + {tp, tp, tp, tp, tp, tp, tp, tp, tp, tp, tp}, {}, 32, 97, 99); auto bj = bj_factory->generate(mtx); auto d_bj = d_bj_factory->generate(mtx); @@ -788,7 +787,7 @@ TEST_F(Jacobi, DpcppApplyEquivalentToRefWithCustomReducedPrecision) TEST_F(Jacobi, DpcppApplyEquivalentToRefWithQuarteredPrecision) { initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, - {hp, hp, hp, hp, hp, hp, hp, hp, hp, hp, hp}, {}, 13, 97, + {hp, hp, hp, hp, hp, hp, hp, hp, hp, hp, hp}, {}, 32, 97, 99); auto bj = bj_factory->generate(mtx); auto d_bj = d_bj_factory->generate(mtx); @@ -803,7 +802,7 @@ TEST_F(Jacobi, DpcppApplyEquivalentToRefWithQuarteredPrecision) TEST_F(Jacobi, DpcppApplyEquivalentToRefWithCustomReducedAndReducedPrecision) { initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, - {up, up, up, up, up, up, up, up, up, up, up}, {}, 13, 97, + {up, up, up, up, up, up, up, up, up, up, up}, {}, 32, 97, 99); auto bj = bj_factory->generate(mtx); auto d_bj = d_bj_factory->generate(mtx); @@ -818,7 +817,7 @@ TEST_F(Jacobi, DpcppApplyEquivalentToRefWithCustomReducedAndReducedPrecision) TEST_F(Jacobi, DpcppApplyEquivalentToRefWithCustomQuarteredPrecision) { initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, - {qp, qp, qp, qp, qp, qp, qp, qp, qp, qp, qp}, {}, 13, 97, + {qp, qp, qp, qp, qp, qp, qp, qp, qp, qp, qp}, {}, 32, 97, 99); auto bj = bj_factory->generate(mtx); auto d_bj = d_bj_factory->generate(mtx); @@ -834,7 +833,7 @@ TEST_F(Jacobi, DpcppApplyEquivalentToRefWithAdaptivePrecision) { SKIP_IF_SINGLE_MODE; initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, - {sp, sp, dp, dp, tp, tp, qp, qp, hp, dp, up}, {}, 13, 97, + {sp, sp, dp, dp, tp, tp, qp, qp, hp, dp, up}, {}, 32, 97, 99); auto bj = bj_factory->generate(mtx); auto d_bj = d_bj_factory->generate(mtx); @@ -850,7 +849,7 @@ TEST_F(Jacobi, DpcppLinearCombinationApplyEquivalentToRefWithAdaptivePrecision) { SKIP_IF_SINGLE_MODE; initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, - {sp, dp, dp, sp, sp, sp, dp, dp, sp, dp, sp}, {}, 13, 97, + {sp, dp, dp, sp, sp, sp, dp, dp, sp, dp, sp}, {}, 32, 97, 99); auto alpha = gko::initialize({2.0}, ref); auto d_alpha = gko::initialize({2.0}, dpcpp); @@ -870,7 +869,7 @@ TEST_F(Jacobi, DpcppApplyToMultipleVectorsEquivalentToRefWithFullPrecision) { SKIP_IF_SINGLE_MODE; initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, - {dp, dp, dp, dp, dp, dp, dp, dp, dp, dp, dp}, {}, 13, 97, + {dp, dp, dp, dp, dp, dp, dp, dp, dp, dp, dp}, {}, 32, 97, 99, 5); auto bj = bj_factory->generate(mtx); auto d_bj = d_bj_factory->generate(mtx); @@ -885,7 +884,7 @@ TEST_F(Jacobi, DpcppApplyToMultipleVectorsEquivalentToRefWithFullPrecision) TEST_F(Jacobi, DpcppApplyToMultipleVectorsEquivalentToRefWithReducedPrecision) { initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, - {sp, sp, sp, sp, sp, sp, sp, sp, sp, sp, sp}, {}, 13, 97, + {sp, sp, sp, sp, sp, sp, sp, sp, sp, sp, sp}, {}, 32, 97, 99, 5); auto bj = bj_factory->generate(mtx); auto d_bj = d_bj_factory->generate(mtx); @@ -901,7 +900,7 @@ TEST_F(Jacobi, DpcppApplyToMultipleVectorsEquivalentToRefWithAdaptivePrecision) { SKIP_IF_SINGLE_MODE; initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, - {sp, sp, dp, dp, tp, tp, qp, qp, hp, dp, up}, {}, 13, 97, + {sp, sp, dp, dp, tp, tp, qp, qp, hp, dp, up}, {}, 32, 97, 99, 5); auto bj = 
bj_factory->generate(mtx); auto d_bj = d_bj_factory->generate(mtx); @@ -919,7 +918,7 @@ TEST_F( { SKIP_IF_SINGLE_MODE; initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, - {sp, dp, dp, sp, sp, sp, dp, dp, sp, dp, sp}, {}, 13, 97, + {sp, dp, dp, sp, sp, sp, dp, dp, sp, dp, sp}, {}, 32, 97, 99, 5); auto alpha = gko::initialize({2.0}, ref); auto d_alpha = gko::initialize({2.0}, dpcpp); From 4ed16c63f32ce81fcaa8eca92c7e1484ee9988a3 Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. Tsai" Date: Thu, 1 Sep 2022 09:51:38 -0400 Subject: [PATCH 10/11] remove unused parameter, update doc Co-authored-by: Terry Cojean --- .../preconditioner/jacobi_kernels.hpp.inc | 3 +-- cuda/preconditioner/jacobi_kernels.cu | 2 +- dpcpp/CMakeLists.txt | 10 ++-------- dpcpp/preconditioner/jacobi_kernels.dp.cpp | 20 +++++++++---------- hip/preconditioner/jacobi_kernels.hip.cpp | 7 +++---- 5 files changed, 16 insertions(+), 26 deletions(-) diff --git a/common/cuda_hip/preconditioner/jacobi_kernels.hpp.inc b/common/cuda_hip/preconditioner/jacobi_kernels.hpp.inc index bb1ac5f62bb..9f2d7f043d0 100644 --- a/common/cuda_hip/preconditioner/jacobi_kernels.hpp.inc +++ b/common/cuda_hip/preconditioner/jacobi_kernels.hpp.inc @@ -30,9 +30,8 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ -template __global__ -__launch_bounds__(warps_per_block* config::warp_size) void duplicate_array( +__launch_bounds__(default_num_warps* config::warp_size) void duplicate_array( const precision_reduction* __restrict__ source, size_type source_size, precision_reduction* __restrict__ dest, size_type dest_size) { diff --git a/cuda/preconditioner/jacobi_kernels.cu b/cuda/preconditioner/jacobi_kernels.cu index b9eddae340f..2b5a887043c 100644 --- a/cuda/preconditioner/jacobi_kernels.cu +++ b/cuda/preconditioner/jacobi_kernels.cu @@ -124,7 +124,7 @@ void initialize_precisions(std::shared_ptr exec, default_grid_size, static_cast(ceildiv(precisions.get_num_elems(), block_size))); if (grid_size > 0) { - duplicate_array<<>>( + duplicate_array<<>>( source.get_const_data(), source.get_num_elems(), precisions.get_data(), precisions.get_num_elems()); } diff --git a/dpcpp/CMakeLists.txt b/dpcpp/CMakeLists.txt index dd28a79adeb..e0f9806cf96 100644 --- a/dpcpp/CMakeLists.txt +++ b/dpcpp/CMakeLists.txt @@ -57,14 +57,8 @@ target_sources(ginkgo_dpcpp ${GKO_UNIFIED_COMMON_SOURCES} ) -# if(GINKGO_JACOBI_FULL_OPTIMIZATIONS) -# set(GKO_DPCPP_JACOBI_BLOCK_SIZES) -# foreach(blocksize RANGE 1 32) -# list(APPEND GKO_DPCPP_JACOBI_BLOCK_SIZES ${blocksize}) -# endforeach() -# else() - set(GKO_DPCPP_JACOBI_BLOCK_SIZES 32) -# endif() +# TODO: adjust this once the dpcpp jacobi supports more block sizes +set(GKO_DPCPP_JACOBI_BLOCK_SIZES 32) set(GKO_DPCPP_JACOBI_SOURCES) foreach(GKO_JACOBI_BLOCK_SIZE IN LISTS GKO_DPCPP_JACOBI_BLOCK_SIZES) configure_file( diff --git a/dpcpp/preconditioner/jacobi_kernels.dp.cpp b/dpcpp/preconditioner/jacobi_kernels.dp.cpp index c90a3452969..2b9d64da510 100644 --- a/dpcpp/preconditioner/jacobi_kernels.dp.cpp +++ b/dpcpp/preconditioner/jacobi_kernels.dp.cpp @@ -71,7 +71,6 @@ constexpr int default_grid_size = 32 * 32 * 128; -template void duplicate_array(const precision_reduction* __restrict__ source, size_type source_size, precision_reduction* __restrict__ dest, @@ -86,17 +85,15 @@ void duplicate_array(const precision_reduction* __restrict__ source, } } -template void 
duplicate_array(dim3 grid, dim3 block, size_type dynamic_shared_memory, sycl::queue* queue, const precision_reduction* source, size_type source_size, precision_reduction* dest, size_type dest_size) { - queue->parallel_for(sycl_nd_range(grid, block), - [=](sycl::nd_item<3> item_ct1) { - duplicate_array( - source, source_size, dest, dest_size, item_ct1); - }); + queue->parallel_for( + sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + duplicate_array(source, source_size, dest, dest_size, item_ct1); + }); } @@ -404,10 +401,11 @@ void initialize_precisions(std::shared_ptr exec, const auto grid_size = min( default_grid_size, static_cast(ceildiv(precisions.get_num_elems(), block_size))); - duplicate_array( - grid_size, block_size, 0, exec->get_queue(), source.get_const_data(), - source.get_num_elems(), precisions.get_data(), - precisions.get_num_elems()); + if (grid_size > 0) { + duplicate_array(grid_size, block_size, 0, exec->get_queue(), + source.get_const_data(), source.get_num_elems(), + precisions.get_data(), precisions.get_num_elems()); + } } diff --git a/hip/preconditioner/jacobi_kernels.hip.cpp b/hip/preconditioner/jacobi_kernels.hip.cpp index d8fc7677e8e..595a3e564b8 100644 --- a/hip/preconditioner/jacobi_kernels.hip.cpp +++ b/hip/preconditioner/jacobi_kernels.hip.cpp @@ -133,10 +133,9 @@ void initialize_precisions(std::shared_ptr exec, default_grid_size, static_cast(ceildiv(precisions.get_num_elems(), block_size))); if (grid_size > 0) { - hipLaunchKernelGGL(HIP_KERNEL_NAME(duplicate_array), - grid_size, block_size, 0, 0, source.get_const_data(), - source.get_num_elems(), precisions.get_data(), - precisions.get_num_elems()); + hipLaunchKernelGGL(duplicate_array, grid_size, block_size, 0, 0, + source.get_const_data(), source.get_num_elems(), + precisions.get_data(), precisions.get_num_elems()); } } From ce70658f7c87492988094880a049973ab1f10e6e Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. 
Tsai" Date: Mon, 31 Oct 2022 00:22:26 +0800 Subject: [PATCH 11/11] fix header and type issue, and format_header MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Thomas Grützmacher --- .../preconditioner/jacobi_generate_kernel.hpp.inc | 4 ++-- dev_tools/scripts/format_header.sh | 4 ++-- .../jacobi_advanced_apply_instantiate.inc.dp.cpp | 4 +--- dpcpp/preconditioner/jacobi_advanced_apply_kernel.dp.cpp | 4 +--- dpcpp/preconditioner/jacobi_common.hpp.in | 2 +- .../preconditioner/jacobi_generate_instantiate.inc.dp.cpp | 8 +++----- dpcpp/preconditioner/jacobi_generate_kernel.dp.cpp | 4 +--- dpcpp/preconditioner/jacobi_kernels.dp.cpp | 4 +--- .../jacobi_simple_apply_instantiate.inc.dp.cpp | 4 +--- dpcpp/preconditioner/jacobi_simple_apply_kernel.dp.cpp | 4 +--- dpcpp/test/preconditioner/jacobi_kernels.cpp | 4 ++++ 11 files changed, 18 insertions(+), 28 deletions(-) diff --git a/common/cuda_hip/preconditioner/jacobi_generate_kernel.hpp.inc b/common/cuda_hip/preconditioner/jacobi_generate_kernel.hpp.inc index 208bd33d7dc..e863a9631a1 100644 --- a/common/cuda_hip/preconditioner/jacobi_generate_kernel.hpp.inc +++ b/common/cuda_hip/preconditioner/jacobi_generate_kernel.hpp.inc @@ -43,7 +43,7 @@ __device__ __forceinline__ bool validate_precision_reduction_feasibility( // save original data and reduce precision if (group.thread_rank() < block_size) { #pragma unroll - for (auto i = 0u; i < max_block_size; ++i) { + for (int i = 0; i < max_block_size; ++i) { if (i < block_size) { work[i * stride + group.thread_rank()] = row[i]; row[i] = @@ -65,7 +65,7 @@ __device__ __forceinline__ bool validate_precision_reduction_feasibility( // restore original data if (group.thread_rank() < block_size) { #pragma unroll - for (auto i = 0u; i < max_block_size; ++i) { + for (int i = 0; i < max_block_size; ++i) { if (i < block_size) { row[i] = work[i * stride + group.thread_rank()]; } diff --git a/dev_tools/scripts/format_header.sh b/dev_tools/scripts/format_header.sh index 05860d301e3..a501b6f97d2 100755 --- a/dev_tools/scripts/format_header.sh +++ b/dev_tools/scripts/format_header.sh @@ -4,7 +4,7 @@ CLANG_FORMAT=${CLANG_FORMAT:="clang-format"} convert_header () { local regex="^(#include )(<|\")(.*)(\"|>)$" - local jacobi_regex="^(cuda|hip)\/preconditioner\/jacobi_common(\.hip)?\.hpp" + local jacobi_regex="^(cuda|hip|dpcpp)\/preconditioner\/jacobi_common(\.hip)?\.hpp" if [[ $@ =~ ${regex} ]]; then header_file="${BASH_REMATCH[3]}" if [ -f "${header_file}" ]; then @@ -15,7 +15,7 @@ convert_header () { fi elif [ "${header_file}" = "matrices/config.hpp" ]; then echo "#include \"${header_file}\"" - elif [[ "${header_file}" =~ ${jacobi_regex} ]]; then + elif [[ "${header_file}" =~ ${jacobi_regex} ]]; then echo "#include \"${header_file}\"" else echo "#include <${header_file}>" diff --git a/dpcpp/preconditioner/jacobi_advanced_apply_instantiate.inc.dp.cpp b/dpcpp/preconditioner/jacobi_advanced_apply_instantiate.inc.dp.cpp index 2235d886234..29a7cb76f10 100644 --- a/dpcpp/preconditioner/jacobi_advanced_apply_instantiate.inc.dp.cpp +++ b/dpcpp/preconditioner/jacobi_advanced_apply_instantiate.inc.dp.cpp @@ -33,9 +33,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/preconditioner/jacobi_kernels.hpp" -#include - - #include @@ -51,6 +48,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "dpcpp/components/cooperative_groups.dp.hpp" #include "dpcpp/components/thread_ids.dp.hpp" #include "dpcpp/components/warp_blas.dp.hpp" +#include "dpcpp/preconditioner/jacobi_common.hpp" namespace gko { diff --git a/dpcpp/preconditioner/jacobi_advanced_apply_kernel.dp.cpp b/dpcpp/preconditioner/jacobi_advanced_apply_kernel.dp.cpp index 93565a3f16f..57ee2e36263 100644 --- a/dpcpp/preconditioner/jacobi_advanced_apply_kernel.dp.cpp +++ b/dpcpp/preconditioner/jacobi_advanced_apply_kernel.dp.cpp @@ -33,14 +33,12 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/preconditioner/jacobi_kernels.hpp" -#include - - #include #include "core/matrix/dense_kernels.hpp" #include "core/synthesizer/implementation_selection.hpp" +#include "dpcpp/preconditioner/jacobi_common.hpp" namespace gko { diff --git a/dpcpp/preconditioner/jacobi_common.hpp.in b/dpcpp/preconditioner/jacobi_common.hpp.in index 93925a50342..fa5170100c3 100644 --- a/dpcpp/preconditioner/jacobi_common.hpp.in +++ b/dpcpp/preconditioner/jacobi_common.hpp.in @@ -1,5 +1,5 @@ /************************************************************* -Copyright (c) 2017-2021, the Ginkgo authors +Copyright (c) 2017-2022, the Ginkgo authors All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/dpcpp/preconditioner/jacobi_generate_instantiate.inc.dp.cpp b/dpcpp/preconditioner/jacobi_generate_instantiate.inc.dp.cpp index 2f49d924f7b..da6d2bf2825 100644 --- a/dpcpp/preconditioner/jacobi_generate_instantiate.inc.dp.cpp +++ b/dpcpp/preconditioner/jacobi_generate_instantiate.inc.dp.cpp @@ -33,9 +33,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/preconditioner/jacobi_kernels.hpp" -#include - - #include @@ -55,6 +52,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "dpcpp/components/thread_ids.dp.hpp" #include "dpcpp/components/uninitialized_array.hpp" #include "dpcpp/components/warp_blas.dp.hpp" +#include "dpcpp/preconditioner/jacobi_common.hpp" namespace gko { @@ -81,7 +79,7 @@ __dpct_inline__ bool validate_precision_reduction_feasibility( // save original data and reduce precision if (group.thread_rank() < block_size) { #pragma unroll - for (auto i = 0u; i < max_block_size; ++i) { + for (int i = 0; i < max_block_size; ++i) { if (i < block_size) { work[i * stride + group.thread_rank()] = row[i]; row[i] = @@ -103,7 +101,7 @@ __dpct_inline__ bool validate_precision_reduction_feasibility( // restore original data if (group.thread_rank() < block_size) { #pragma unroll - for (auto i = 0u; i < max_block_size; ++i) { + for (int i = 0; i < max_block_size; ++i) { if (i < block_size) { row[i] = work[i * stride + group.thread_rank()]; } diff --git a/dpcpp/preconditioner/jacobi_generate_kernel.dp.cpp b/dpcpp/preconditioner/jacobi_generate_kernel.dp.cpp index e580719a4b8..f3ef6902e1a 100644 --- a/dpcpp/preconditioner/jacobi_generate_kernel.dp.cpp +++ b/dpcpp/preconditioner/jacobi_generate_kernel.dp.cpp @@ -33,15 +33,13 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "core/preconditioner/jacobi_kernels.hpp" -#include - - #include #include #include "core/components/fill_array_kernels.hpp" #include "core/synthesizer/implementation_selection.hpp" +#include "dpcpp/preconditioner/jacobi_common.hpp" namespace gko { diff --git a/dpcpp/preconditioner/jacobi_kernels.dp.cpp b/dpcpp/preconditioner/jacobi_kernels.dp.cpp index 2b9d64da510..54bb19728e6 100644 --- a/dpcpp/preconditioner/jacobi_kernels.dp.cpp +++ b/dpcpp/preconditioner/jacobi_kernels.dp.cpp @@ -33,9 +33,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/preconditioner/jacobi_kernels.hpp" -#include - - #include @@ -49,6 +46,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "dpcpp/base/dim3.dp.hpp" #include "dpcpp/components/cooperative_groups.dp.hpp" #include "dpcpp/components/thread_ids.dp.hpp" +#include "dpcpp/preconditioner/jacobi_common.hpp" namespace gko { diff --git a/dpcpp/preconditioner/jacobi_simple_apply_instantiate.inc.dp.cpp b/dpcpp/preconditioner/jacobi_simple_apply_instantiate.inc.dp.cpp index f7c7aa2573b..aaffb1b6583 100644 --- a/dpcpp/preconditioner/jacobi_simple_apply_instantiate.inc.dp.cpp +++ b/dpcpp/preconditioner/jacobi_simple_apply_instantiate.inc.dp.cpp @@ -33,9 +33,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/preconditioner/jacobi_kernels.hpp" -#include - - #include @@ -51,6 +48,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "dpcpp/components/cooperative_groups.dp.hpp" #include "dpcpp/components/thread_ids.dp.hpp" #include "dpcpp/components/warp_blas.dp.hpp" +#include "dpcpp/preconditioner/jacobi_common.hpp" namespace gko { diff --git a/dpcpp/preconditioner/jacobi_simple_apply_kernel.dp.cpp b/dpcpp/preconditioner/jacobi_simple_apply_kernel.dp.cpp index 37519d96779..d260298783c 100644 --- a/dpcpp/preconditioner/jacobi_simple_apply_kernel.dp.cpp +++ b/dpcpp/preconditioner/jacobi_simple_apply_kernel.dp.cpp @@ -33,14 +33,12 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/preconditioner/jacobi_kernels.hpp" -#include - - #include #include "core/preconditioner/jacobi_utils.hpp" #include "core/synthesizer/implementation_selection.hpp" +#include "dpcpp/preconditioner/jacobi_common.hpp" namespace gko { diff --git a/dpcpp/test/preconditioner/jacobi_kernels.cpp b/dpcpp/test/preconditioner/jacobi_kernels.cpp index e4d63617eb7..889548fd2e1 100644 --- a/dpcpp/test/preconditioner/jacobi_kernels.cpp +++ b/dpcpp/test/preconditioner/jacobi_kernels.cpp @@ -33,12 +33,16 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include +#include #include +#include +#include #include #include