Skip to content

Commit

Permalink
Improve crs/bsr sorting performance (kokkos#2293)
Browse files Browse the repository at this point in the history
* CRS sorting improvements

- Wrote bulk sort/permutation based sorting for CRS graph, matrix, and
  BSR matrix (bulk = one large sort of all the entries, using row-major
  dense index as keys)
  - This is more performant for imbalanced entries per row
- If matrix dimensions are too large to do bulk sort, fall back to
  sorting within each row with a thread.

* Add perf test for sort_crs_matrix
* sort_crs: improve parallel labels
* Work around kokkos issue 7036
* sort_crs: replace radix sort lambda with functor
(Lambda segfaults with nvcc+openmp)
---------
Signed-off-by: Brian Kelley <bmkelle@sandia.gov>
  • Loading branch information
brian-kelley authored and ndellingwood committed Sep 4, 2024
1 parent 23ea213 commit e912091
Show file tree
Hide file tree
Showing 7 changed files with 767 additions and 477 deletions.
20 changes: 15 additions & 5 deletions common/src/KokkosKernels_SimpleUtils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -358,13 +358,19 @@ struct ReduceMaxFunctor {
};

// Reduces the index range [0, num_elements) of view_to_reduce with
// ReduceMaxFunctor and writes the result into max_reduction (presumably the
// maximum entry -- confirm against ReduceMaxFunctor). The reduction is launched
// through a Kokkos::RangePolicy bound to the execution space instance `exec`.
// NOTE(review): this span is a rendered diff hunk, so the pre-change signature,
// typedef and parallel_reduce lines appear next to the lines that replace them.
template <typename view_type, typename MyExecSpace>
void kk_view_reduce_max(size_t num_elements, view_type view_to_reduce,
void kk_view_reduce_max(const MyExecSpace &exec, size_t num_elements, view_type view_to_reduce,
typename view_type::non_const_value_type &max_reduction) {
typedef Kokkos::RangePolicy<MyExecSpace> my_exec_space;
Kokkos::parallel_reduce("KokkosKernels::Common::ReduceMax", my_exec_space(0, num_elements),
typedef Kokkos::RangePolicy<MyExecSpace> policy_t;
Kokkos::parallel_reduce("KokkosKernels::Common::ReduceMax", policy_t(exec, 0, num_elements),
ReduceMaxFunctor<view_type>(view_to_reduce), max_reduction);
}

// Convenience overload without an explicit execution space instance: builds a
// default-constructed MyExecSpace and forwards all arguments to the overload
// that takes the instance as its first parameter.
template <typename view_type, typename MyExecSpace>
void kk_view_reduce_max(size_t num_elements, view_type view_to_reduce,
                        typename view_type::non_const_value_type &max_reduction) {
  const MyExecSpace defaultInstance = MyExecSpace();
  kk_view_reduce_max(defaultInstance, num_elements, view_to_reduce, max_reduction);
}

// xorshift hash/pseudorandom function (supported for 32- and 64-bit integer
// types only)
template <typename Value>
Expand Down Expand Up @@ -429,10 +435,14 @@ struct SequentialFillFunctor {
val_type start;
};

template <typename ExecSpace, typename V>
void sequential_fill(const ExecSpace &exec, const V &v, typename V::non_const_value_type start = 0) {
Kokkos::parallel_for(Kokkos::RangePolicy<ExecSpace>(exec, 0, v.extent(0)), SequentialFillFunctor<V>(v, start));
}

// Overload of sequential_fill that takes no execution space instance: it
// default-constructs V::execution_space and forwards to the overload that
// accepts an instance.
// NOTE(review): this span is a rendered diff hunk, so the pre-change
// Kokkos::parallel_for call appears next to the forwarding call that replaces it.
template <typename V>
void sequential_fill(const V &v, typename V::non_const_value_type start = 0) {
Kokkos::parallel_for(Kokkos::RangePolicy<typename V::execution_space>(0, v.extent(0)),
SequentialFillFunctor<V>(v, start));
sequential_fill(typename V::execution_space(), v, start);
}

} // namespace Impl
Expand Down
6 changes: 6 additions & 0 deletions common/src/KokkosKernels_Utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1076,6 +1076,12 @@ void view_reduce_max(size_t num_elements, view_type view_to_reduce,
kk_view_reduce_max<view_type, MyExecSpace>(num_elements, view_to_reduce, max_reduction);
}

// Overload of view_reduce_max that takes an execution space instance: forwards
// `exec`, the element count, the view and the output reference unchanged to
// kk_view_reduce_max<view_type, MyExecSpace>.
template <typename view_type, typename MyExecSpace>
void view_reduce_max(const MyExecSpace &exec, size_t num_elements, view_type view_to_reduce,
                     typename view_type::non_const_value_type &max_reduction) {
  using value_t   = typename view_type::non_const_value_type;
  value_t &result = max_reduction;
  kk_view_reduce_max<view_type, MyExecSpace>(exec, num_elements, view_to_reduce, result);
}

template <typename size_type>
struct ReduceRowSizeFunctor {
const size_type *rowmap_view_begins;
Expand Down
9 changes: 9 additions & 0 deletions perf_test/sparse/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,15 @@ KOKKOSKERNELS_ADD_EXECUTABLE(
SOURCES KokkosSparse_mdf.cpp
)

# For the sake of build times, don't build this CRS sorting perf test by default.
# It can be enabled if needed by setting -DKokkosKernels_ENABLE_SORT_CRS_PERFTEST=ON.
# When enabled, this adds the sparse_sort_crs executable, built from
# KokkosSparse_sort_crs.cpp.
if (KokkosKernels_ENABLE_SORT_CRS_PERFTEST)
KOKKOSKERNELS_ADD_EXECUTABLE(
sparse_sort_crs
SOURCES KokkosSparse_sort_crs.cpp
)
endif ()

if (KokkosKernels_ENABLE_BENCHMARK)
KOKKOSKERNELS_ADD_BENCHMARK(
sparse_par_ilut
Expand Down
103 changes: 103 additions & 0 deletions perf_test/sparse/KokkosSparse_sort_crs.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
//@HEADER
// ************************************************************************
//
// Kokkos v. 4.0
// Copyright (2022) National Technology & Engineering
// Solutions of Sandia, LLC (NTESS).
//
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
//
// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
// See https://kokkos.org/LICENSE for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//@HEADER

#include <algorithm>
#include <iostream>
#include <random>
#include "KokkosKernels_config.h"
#include "KokkosSparse_IOUtils.hpp"
#include "KokkosKernels_perf_test_utilities.hpp"

#include "KokkosSparse_CrsMatrix.hpp"
#include "KokkosSparse_SortCrs.hpp"

using perf_test::CommonInputParams;

// Command-line options specific to this perf test.
struct LocalParams {
  // Matrix file path filled in from the --mtx option; empty until parsed.
  std::string mtxFile{};
};

// Writes the usage text to stderr: a heading, the option list returned by
// perf_test::list_common_options(), then the options specific to this test.
void print_options() {
  std::ostream& err = std::cerr;

  err << "Options\n" << std::endl;
  err << perf_test::list_common_options();

  err << "\t[Required] --mtx <path> :: matrix to sort\n";
  err << "\t[Optional] --repeat :: how many times to repeat sorting\n";
}

// Parses the command-line options specific to this perf test into `params`.
//
// Recognized options:
//   --mtx <path>   path of the matrix to sort (required)
//
// Returns 0 on success. Returns 1 (after printing a diagnostic and the usage
// text to stderr) if an unrecognized argument is found or if the required
// --mtx option was not supplied.
int parse_inputs(LocalParams& params, int argc, char** argv) {
  for (int i = 1; i < argc; ++i) {
    if (perf_test::check_arg_str(i, argc, argv, "--mtx", params.mtxFile)) {
      // Step over the option's value.
      // NOTE(review): assumes check_arg_str reads argv[i + 1] without advancing i -- confirm.
      ++i;
    } else {
      std::cerr << "Unrecognized command line argument #" << i << ": " << argv[i] << std::endl;
      print_options();
      return 1;
    }
  }
  // --mtx is documented as required: fail here with a clear message instead of
  // handing an empty path to the matrix reader.
  if (params.mtxFile.empty()) {
    std::cerr << "Missing required argument: --mtx <path>" << std::endl;
    print_options();
    return 1;
  }
  return 0;
}

template <typename exec_space>
void run_experiment(int argc, char** argv, const CommonInputParams& common_params) {
using namespace KokkosSparse;

using mem_space = typename exec_space::memory_space;
using device_t = typename Kokkos::Device<exec_space, mem_space>;
using size_type = default_size_type;
using lno_t = default_lno_t;
using scalar_t = default_scalar;
using crsMat_t = KokkosSparse::CrsMatrix<scalar_t, lno_t, device_t, void, size_type>;

using graph_t = typename crsMat_t::StaticCrsGraphType;

LocalParams params;
if (parse_inputs(params, argc, argv)) return;

crsMat_t A = KokkosSparse::Impl::read_kokkos_crst_matrix<crsMat_t>(params.mtxFile.c_str());
std::cout << "Loaded matrix: " << A.numRows() << "x" << A.numCols() << " with " << A.nnz() << " entries.\n";
// This first sort call serves as a warm-up
KokkosSparse::sort_crs_matrix(A);
lno_t m = A.numRows();
lno_t n = A.numCols();
auto rowmapHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A.graph.row_map);
auto entriesHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A.graph.entries);
typename crsMat_t::index_type shuffledEntries("shuffled entries", A.nnz());
// Randomly shuffle the entries within each row, so that the rows aren't
// already sorted. Leave the values alone; this changes the matrix numerically
// but this doesn't affect sorting.
for (lno_t i = 0; i < m; i++) {
std::random_shuffle(entriesHost.data() + i, entriesHost.data() + i + 1);
}
Kokkos::deep_copy(shuffledEntries, entriesHost);
exec_space exec;
Kokkos::Timer timer;
double totalTime = 0;
for (int rep = 0; rep < common_params.repeat; rep++) {
Kokkos::deep_copy(exec, A.graph.entries, shuffledEntries);
exec.fence();
timer.reset();
KokkosSparse::sort_crs_matrix(exec, A);
exec.fence();
totalTime += timer.seconds();
}
std::cout << "Mean sort_crs_matrix time over " << common_params.repeat << " trials: ";
std::cout << totalTime / common_params.repeat << "\n";
}

// Names the function template that the instantiation header included below
// should drive. The macro must be defined before the header is included.
// NOTE(review): presumably KokkosKernels_perf_test_instantiation.hpp reads this
// macro and defines main_instantiation() -- confirm against that header.
#define KOKKOSKERNELS_PERF_TEST_NAME run_experiment
#include "KokkosKernels_perf_test_instantiation.hpp"
// Program entry point: forwards the command line to main_instantiation and
// returns its result as the exit code.
int main(int argc, char** argv) { return main_instantiation(argc, argv); } // main
Loading

0 comments on commit e912091

Please sign in to comment.