Skip to content

Commit

Permalink
Merge Port Jacobi to Dpcpp
Browse files Browse the repository at this point in the history
This PR ports Jacobi to dpc++, but it only supports the 32 for block_size.

Related PR: #929
  • Loading branch information
yhmtsai authored Oct 31, 2022
2 parents a40e4cd + ce70658 commit cff9742
Show file tree
Hide file tree
Showing 23 changed files with 2,803 additions and 186 deletions.
8 changes: 5 additions & 3 deletions common/cuda_hip/preconditioner/jacobi_generate_kernel.hpp.inc
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ __device__ __forceinline__ bool validate_precision_reduction_feasibility(
// save original data and reduce precision
if (group.thread_rank() < block_size) {
#pragma unroll
for (auto i = 0u; i < max_block_size; ++i) {
for (int i = 0; i < max_block_size; ++i) {
if (i < block_size) {
work[i * stride + group.thread_rank()] = row[i];
row[i] =
Expand All @@ -65,15 +65,17 @@ __device__ __forceinline__ bool validate_precision_reduction_feasibility(
// restore original data
if (group.thread_rank() < block_size) {
#pragma unroll
for (auto i = 0u; i < max_block_size; ++i) {
for (int i = 0; i < max_block_size; ++i) {
if (i < block_size) {
row[i] = work[i * stride + group.thread_rank()];
}
}
}

return succeeded && block_cond >= 1.0 &&
block_cond * float_traits<remove_complex<ValueType>>::eps < 1e-3;
block_cond * static_cast<remove_complex<ValueType>>(
float_traits<remove_complex<ValueType>>::eps) <
remove_complex<ValueType>{1e-3};
}


Expand Down
3 changes: 1 addition & 2 deletions common/cuda_hip/preconditioner/jacobi_kernels.hpp.inc
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,8 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
******************************<GINKGO LICENSE>*******************************/

template <int warps_per_block>
__global__
__launch_bounds__(warps_per_block* config::warp_size) void duplicate_array(
__launch_bounds__(default_num_warps* config::warp_size) void duplicate_array(
const precision_reduction* __restrict__ source, size_type source_size,
precision_reduction* __restrict__ dest, size_type dest_size)
{
Expand Down
3 changes: 0 additions & 3 deletions core/preconditioner/jacobi.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -333,11 +333,9 @@ void Jacobi<ValueType, IndexType>::generate(const LinOp* system_matrix,
} else {
auto csr_mtx = convert_to_with_sorting<csr_type>(exec, system_matrix,
skip_sorting);

if (parameters_.block_pointers.get_data() == nullptr) {
this->detect_blocks(csr_mtx.get());
}

const auto all_block_opt =
parameters_.storage_optimization.of_all_blocks;
auto& precisions = parameters_.storage_optimization.block_wise;
Expand All @@ -355,7 +353,6 @@ void Jacobi<ValueType, IndexType>::generate(const LinOp* system_matrix,
precisions = std::move(tmp);
conditioning_.resize_and_reset(num_blocks_);
}

exec->run(jacobi::make_generate(
csr_mtx.get(), num_blocks_, parameters_.max_block_size,
parameters_.accuracy, storage_scheme_, conditioning_, precisions,
Expand Down
24 changes: 10 additions & 14 deletions core/preconditioner/jacobi_utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -89,19 +89,15 @@ struct precision_reduction_descriptor {
static constexpr GKO_ATTRIBUTES uint32
singleton(const precision_reduction& pr)
{
return pr == precision_reduction(0, 0)
? p0n0
: pr == precision_reduction(0, 1)
? p0n1
: pr == precision_reduction(0, 2)
? p0n2
: pr == precision_reduction(1, 0)
? p1n0
: pr == precision_reduction(1, 1)
? p1n1
: pr == precision_reduction(2, 0)
? p2n0
: p0n0;
// clang-format off
return pr == precision_reduction(0, 0) ? p0n0
: pr == precision_reduction(0, 1) ? p0n1
: pr == precision_reduction(0, 2) ? p0n2
: pr == precision_reduction(1, 0) ? p1n0
: pr == precision_reduction(1, 1) ? p1n1
: pr == precision_reduction(2, 0) ? p2n0
: p0n0;
// clang-format on
}
};

Expand Down Expand Up @@ -141,7 +137,7 @@ GKO_ATTRIBUTES GKO_INLINE uint32 get_supported_storage_reductions(
using gko::detail::float_traits;
using type = remove_complex<ValueType>;
using prd = precision_reduction_descriptor;
auto accurate = [&cond, &accuracy](double eps) {
auto accurate = [&cond, &accuracy](type eps) {
return cond * eps < accuracy;
};
uint8 is_verified1 = 2;
Expand Down
2 changes: 1 addition & 1 deletion cuda/preconditioner/jacobi_kernels.cu
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,7 @@ void initialize_precisions(std::shared_ptr<const DefaultExecutor> exec,
default_grid_size,
static_cast<int32>(ceildiv(precisions.get_num_elems(), block_size)));
if (grid_size > 0) {
duplicate_array<default_num_warps><<<grid_size, block_size>>>(
duplicate_array<<<grid_size, block_size>>>(
source.get_const_data(), source.get_num_elems(),
precisions.get_data(), precisions.get_num_elems());
}
Expand Down
4 changes: 2 additions & 2 deletions dev_tools/scripts/format_header.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ CLANG_FORMAT=${CLANG_FORMAT:="clang-format"}

convert_header () {
local regex="^(#include )(<|\")(.*)(\"|>)$"
local jacobi_regex="^(cuda|hip)\/preconditioner\/jacobi_common(\.hip)?\.hpp"
local jacobi_regex="^(cuda|hip|dpcpp)\/preconditioner\/jacobi_common(\.hip)?\.hpp"
if [[ $@ =~ ${regex} ]]; then
header_file="${BASH_REMATCH[3]}"
if [ -f "${header_file}" ]; then
Expand All @@ -15,7 +15,7 @@ convert_header () {
fi
elif [ "${header_file}" = "matrices/config.hpp" ]; then
echo "#include \"${header_file}\""
elif [[ "${header_file}" =~ ${jacobi_regex} ]]; then
elif [[ "${header_file}" =~ ${jacobi_regex} ]]; then
echo "#include \"${header_file}\""
else
echo "#include <${header_file}>"
Expand Down
28 changes: 28 additions & 0 deletions dpcpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,10 @@ target_sources(ginkgo_dpcpp
matrix/sparsity_csr_kernels.dp.cpp
multigrid/pgm_kernels.dp.cpp
preconditioner/isai_kernels.dp.cpp
preconditioner/jacobi_advanced_apply_kernel.dp.cpp
preconditioner/jacobi_generate_kernel.dp.cpp
preconditioner/jacobi_kernels.dp.cpp
preconditioner/jacobi_simple_apply_kernel.dp.cpp
reorder/rcm_kernels.dp.cpp
solver/gmres_kernels.dp.cpp
solver/cb_gmres_kernels.dp.cpp
Expand All @@ -54,6 +57,28 @@ target_sources(ginkgo_dpcpp
${GKO_UNIFIED_COMMON_SOURCES}
)

# TODO: adjust it when dpcpp jacobi supports more block size
set(GKO_DPCPP_JACOBI_BLOCK_SIZES 32)
set(GKO_DPCPP_JACOBI_SOURCES)
foreach(GKO_JACOBI_BLOCK_SIZE IN LISTS GKO_DPCPP_JACOBI_BLOCK_SIZES)
configure_file(
preconditioner/jacobi_generate_instantiate.inc.dp.cpp
preconditioner/jacobi_generate_instantiate.${GKO_JACOBI_BLOCK_SIZE}.dp.cpp)
configure_file(
preconditioner/jacobi_simple_apply_instantiate.inc.dp.cpp
preconditioner/jacobi_simple_apply_instantiate.${GKO_JACOBI_BLOCK_SIZE}.dp.cpp)
configure_file(
preconditioner/jacobi_advanced_apply_instantiate.inc.dp.cpp
preconditioner/jacobi_advanced_apply_instantiate.${GKO_JACOBI_BLOCK_SIZE}.dp.cpp)
list(APPEND GKO_DPCPP_JACOBI_SOURCES
${CMAKE_CURRENT_BINARY_DIR}/preconditioner/jacobi_generate_instantiate.${GKO_JACOBI_BLOCK_SIZE}.dp.cpp
${CMAKE_CURRENT_BINARY_DIR}/preconditioner/jacobi_simple_apply_instantiate.${GKO_JACOBI_BLOCK_SIZE}.dp.cpp
${CMAKE_CURRENT_BINARY_DIR}/preconditioner/jacobi_advanced_apply_instantiate.${GKO_JACOBI_BLOCK_SIZE}.dp.cpp)
endforeach()
target_sources(ginkgo_dpcpp PRIVATE ${GKO_DPCPP_JACOBI_SOURCES})
string(REPLACE ";" "," GKO_DPCPP_JACOBI_BLOCK_SIZES_CODE "${GKO_DPCPP_JACOBI_BLOCK_SIZES}")
configure_file(preconditioner/jacobi_common.hpp.in preconditioner/jacobi_common.hpp)

ginkgo_compile_features(ginkgo_dpcpp)
target_compile_definitions(ginkgo_dpcpp PRIVATE GKO_COMPILING_DPCPP _ONEDPL_COMPILE_KERNEL=0)

Expand All @@ -72,6 +97,9 @@ if (BUILD_SHARED_LIBS)
else ()
target_link_options(ginkgo_dpcpp PUBLIC -fsycl-device-code-split=per_kernel)
endif()
# include path for generated headers like jacobi_common.hpp
target_include_directories(ginkgo_dpcpp
PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/..)
target_link_libraries(ginkgo_dpcpp PUBLIC ginkgo_device)
target_link_libraries(ginkgo_dpcpp PRIVATE MKL::MKL_DPCPP oneDPL)
if (GINKGO_DPCPP_SINGLE_MODE)
Expand Down
128 changes: 128 additions & 0 deletions dpcpp/components/diagonal_block_manipulation.dp.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
/*******************************<GINKGO LICENSE>******************************
Copyright (c) 2017-2022, the Ginkgo authors
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
3. Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
******************************<GINKGO LICENSE>*******************************/

#ifndef GKO_DPCPP_COMPONENTS_DIAGONAL_BLOCK_MANIPULATION_DP_HPP_
#define GKO_DPCPP_COMPONENTS_DIAGONAL_BLOCK_MANIPULATION_DP_HPP_


#include <type_traits>


#include <CL/sycl.hpp>


#include "dpcpp/base/config.hpp"
#include "dpcpp/base/dpct.hpp"
#include "dpcpp/components/cooperative_groups.dp.hpp"


namespace gko {
namespace kernels {
namespace dpcpp {
namespace csr {


/**
* @internal
*
* @note assumes that block dimensions are in "standard format":
* (subwarp_size, config::warp_size / subwarp_size, z)
*/
template <
int max_block_size, int warps_per_block, typename Group, typename ValueType,
typename IndexType,
typename = std::enable_if_t<group::is_synchronizable_group<Group>::value>>
__dpct_inline__ void extract_transposed_diag_blocks(
const Group& group, int processed_blocks,
const IndexType* __restrict__ row_ptrs,
const IndexType* __restrict__ col_idxs,
const ValueType* __restrict__ values,
const IndexType* __restrict__ block_ptrs, size_type num_blocks,
ValueType* __restrict__ block_row, int increment,
ValueType* __restrict__ workspace, sycl::nd_item<3> item_ct1)
{
const int tid =
item_ct1.get_local_id(1) * item_ct1.get_local_range().get(2) +
item_ct1.get_local_id(2);
const auto warp = group::tiled_partition<config::warp_size>(group);
auto bid = static_cast<size_type>(item_ct1.get_group(2)) * warps_per_block *
processed_blocks +
item_ct1.get_local_id(0) * processed_blocks;
auto bstart = (bid < num_blocks) ? block_ptrs[bid] : zero<IndexType>();
IndexType bsize = 0;
#pragma unroll
for (int b = 0; b < processed_blocks; ++b, ++bid) {
if (bid < num_blocks) {
bstart += bsize;
bsize = block_ptrs[bid + 1] - bstart;
#pragma unroll
for (int i = 0; i < max_block_size; ++i) {
if (i < bsize) {
if (item_ct1.get_local_id(1) == b &&
item_ct1.get_local_id(2) < max_block_size) {
workspace[item_ct1.get_local_id(2)] = zero<ValueType>();
}
warp.sync();
const auto row = bstart + i;
const auto rstart = row_ptrs[row] + tid;
const auto rend = row_ptrs[row + 1];
// use the entire warp to ensure coalesced memory access
for (auto j = rstart; j < rend; j += config::warp_size) {
const auto col = col_idxs[j] - bstart;
if (col >= bsize) {
break;
}
if (col >= 0) {
workspace[col] = values[j];
}
}
warp.sync();
if (item_ct1.get_local_id(1) == b &&
item_ct1.get_local_id(2) < bsize) {
block_row[i * increment] =
workspace[item_ct1.get_local_id(2)];
}
warp.sync();
}
}
}
}
}


} // namespace csr
} // namespace dpcpp
} // namespace kernels
} // namespace gko


#endif // GKO_DPCPP_COMPONENTS_DIAGONAL_BLOCK_MANIPULATION_DP_HPP_
1 change: 0 additions & 1 deletion dpcpp/components/warp_blas.dp.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


#include <cassert>
// #include <dpcpp/base/math.hpp>
#include <type_traits>


Expand Down
Loading

0 comments on commit cff9742

Please sign in to comment.