Refactor histogram reduction using cuco::static_set::insert_and_find #16485

Draft
wants to merge 16 commits into base: branch-24.12
Changes from 2 commits
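The refactor in this PR's title builds the histogram with cuco::static_set::insert_and_find, which inserts a key and, in the same call, returns the slot that holds it (whether it was just inserted or was already present), so a per-key counter can be bumped in a single pass over the input. Below is a minimal sketch of that insert-and-find counting pattern using a hand-rolled open-addressing table and CUDA atomics, not the actual cuco or libcudf APIs; the kernel name, EMPTY sentinel, hash constant, and capacity choice are assumptions for illustration only.

// Sketch of the insert-and-find histogram pattern (NOT the cuco::static_set API used
// by this PR): each thread either claims an empty slot for its key or finds the slot
// already holding it, then bumps that slot's counter atomically.
#include <cuda_runtime.h>
#include <cstdint>
#include <cstdio>
#include <vector>

constexpr int32_t EMPTY = INT32_MIN;  // assumed sentinel key that never occurs in the input

__global__ void histogram_insert_and_find(int32_t const* input,
                                           int num_rows,
                                           int32_t* table_keys,    // pre-filled with EMPTY
                                           int32_t* table_counts,  // pre-filled with 0
                                           uint32_t capacity)      // power of two
{
  int const tid = blockIdx.x * blockDim.x + threadIdx.x;
  if (tid >= num_rows) { return; }

  int32_t const key = input[tid];
  uint32_t slot     = (static_cast<uint32_t>(key) * 2654435769u) & (capacity - 1);

  while (true) {
    // Try to claim the slot; atomicCAS returns whatever key was there before.
    int32_t const prev = atomicCAS(&table_keys[slot], EMPTY, key);
    if (prev == EMPTY || prev == key) {  // inserted here, or key already present here
      atomicAdd(&table_counts[slot], 1);
      return;
    }
    slot = (slot + 1) & (capacity - 1);  // collision with a different key: probe next slot
  }
}

int main()
{
  std::vector<int32_t> const h_input{1, 2, 2, 3, 3, 3, 7, 7, 7, 7};
  uint32_t const capacity = 32;  // assumed: comfortably larger than the number of distinct keys

  int32_t *d_input, *d_keys, *d_counts;
  cudaMalloc(&d_input, h_input.size() * sizeof(int32_t));
  cudaMalloc(&d_keys, capacity * sizeof(int32_t));
  cudaMalloc(&d_counts, capacity * sizeof(int32_t));
  cudaMemcpy(d_input, h_input.data(), h_input.size() * sizeof(int32_t), cudaMemcpyHostToDevice);
  cudaMemset(d_counts, 0, capacity * sizeof(int32_t));

  // Fill the key table with the EMPTY sentinel before counting.
  std::vector<int32_t> const h_empty(capacity, EMPTY);
  cudaMemcpy(d_keys, h_empty.data(), capacity * sizeof(int32_t), cudaMemcpyHostToDevice);

  histogram_insert_and_find<<<1, 256>>>(
    d_input, static_cast<int>(h_input.size()), d_keys, d_counts, capacity);
  cudaDeviceSynchronize();

  std::vector<int32_t> h_keys(capacity), h_counts(capacity);
  cudaMemcpy(h_keys.data(), d_keys, capacity * sizeof(int32_t), cudaMemcpyDeviceToHost);
  cudaMemcpy(h_counts.data(), d_counts, capacity * sizeof(int32_t), cudaMemcpyDeviceToHost);
  for (uint32_t i = 0; i < capacity; ++i) {
    if (h_keys[i] != EMPTY) { printf("key %d -> count %d\n", h_keys[i], h_counts[i]); }
  }

  cudaFree(d_input);
  cudaFree(d_keys);
  cudaFree(d_counts);
  return 0;
}

Note that, unlike this sketch, the actual PR counts distinct rows via row indices and hashers provided by libcudf; the sketch only shows why insert_and_find lets insertion and counting share one pass.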
5 changes: 3 additions & 2 deletions cpp/benchmarks/CMakeLists.txt
@@ -211,7 +211,8 @@ ConfigureBench(
reduction/reduce.cpp reduction/scan.cpp
)
ConfigureNVBench(
REDUCTION_NVBENCH reduction/rank.cpp reduction/scan_structs.cpp reduction/segmented_reduce.cpp
REDUCTION_NVBENCH reduction/histogram.cpp reduction/rank.cpp reduction/scan_structs.cpp
reduction/segmented_reduce.cpp
)

# ##################################################################################################
@@ -231,7 +232,7 @@ ConfigureBench(
)

ConfigureNVBench(
GROUPBY_NVBENCH groupby/group_max.cpp groupby/group_max_multithreaded.cpp
GROUPBY_NVBENCH groupby/group_histogram.cpp groupby/group_max.cpp groupby/group_max_multithreaded.cpp
groupby/group_nunique.cpp groupby/group_rank.cpp groupby/group_struct_keys.cpp
)

86 changes: 86 additions & 0 deletions cpp/benchmarks/groupby/group_histogram.cpp
@@ -0,0 +1,86 @@
/*
* Copyright (c) 2022-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <benchmarks/common/generate_input.hpp>
#include <benchmarks/fixture/benchmark_fixture.hpp>

#include <cudf/groupby.hpp>

#include <nvbench/nvbench.cuh>

template <typename Type>
void groupby_histogram_helper(nvbench::state& state,
cudf::size_type num_rows,
cudf::size_type cardinality,
double null_probability)
{
auto const keys = [&] {
data_profile const profile =
data_profile_builder()
.cardinality(cardinality)
.no_validity()
.distribution(cudf::type_to_id<int32_t>(), distribution_id::UNIFORM, 0, num_rows);
return create_random_column(cudf::type_to_id<int32_t>(), row_count{num_rows}, profile);
}();

auto const vals = [&] {
auto builder = data_profile_builder().cardinality(0).distribution(
cudf::type_to_id<Type>(), distribution_id::UNIFORM, 0, num_rows);
if (null_probability > 0) {
builder.null_probability(null_probability);
} else {
builder.no_validity();
}
return create_random_column(
cudf::type_to_id<Type>(), row_count{num_rows}, data_profile{builder});
}();

auto keys_view = keys->view();
auto gb_obj = cudf::groupby::groupby(cudf::table_view({keys_view}));

std::vector<cudf::groupby::aggregation_request> requests;
requests.emplace_back(cudf::groupby::aggregation_request());
requests[0].values = vals->view();
requests[0].aggregations.push_back(cudf::make_histogram_aggregation<cudf::groupby_aggregation>());

auto const mem_stats_logger = cudf::memory_stats_logger();
state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
state.exec(nvbench::exec_tag::sync,
[&](nvbench::launch& launch) {
auto const result = gb_obj.aggregate(requests);
});
auto const elapsed_time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value");
state.add_element_count(static_cast<double>(num_rows) / elapsed_time / 1'000'000., "Mrows/s");
state.add_buffer_size(mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage");
}

template <typename Type>
void bench_groupby_histogram(nvbench::state& state, nvbench::type_list<Type>)
{
auto const cardinality = static_cast<cudf::size_type>(state.get_int64("cardinality"));
auto const num_rows = static_cast<cudf::size_type>(state.get_int64("num_rows"));
auto const null_probability = state.get_float64("null_probability");

groupby_histogram_helper<Type>(state, num_rows, cardinality, null_probability);
}

NVBENCH_BENCH_TYPES(bench_groupby_histogram,
NVBENCH_TYPE_AXES(nvbench::type_list<int32_t, int64_t, float, double>))
.set_name("groupby_histogram")
.add_float64_axis("null_probability", {0, 0.1, 0.9})
.add_int64_axis("cardinality", {10, 20, 50, 100, 1'000, 10'000, 100'000, 1'000'000, 10'000'000})
.add_int64_power_of_two_axis("num_rows", {12, 18, 24});

74 changes: 74 additions & 0 deletions cpp/benchmarks/reduction/histogram.cpp
@@ -0,0 +1,74 @@
/*
* Copyright (c) 2022-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "cudf/aggregation.hpp"
#include "cudf/detail/aggregation/aggregation.hpp"

#include <benchmarks/common/generate_input.hpp>
#include <benchmarks/common/nvbench_utilities.hpp>
#include <benchmarks/common/table_utilities.hpp>

#include <cudf/column/column_view.hpp>
#include <cudf/detail/aggregation/aggregation.hpp>
#include <cudf/reduction.hpp>
#include <cudf/reduction/detail/histogram.hpp>
#include <cudf/types.hpp>

#include <nvbench/nvbench.cuh>

template <typename type>
static void nvbench_reduction_histogram(nvbench::state& state, nvbench::type_list<type>)
{
auto const dtype = cudf::type_to_id<type>();

auto const cardinality = static_cast<cudf::size_type>(state.get_int64("cardinality"));
auto const num_rows = static_cast<cudf::size_type>(state.get_int64("num_rows"));
auto const null_probability = state.get_float64("null_probability");

if (cardinality > num_rows) {
state.skip("cardinality > num_rows");
return;
}

data_profile const profile = data_profile_builder()
.null_probability(null_probability)
.cardinality(cardinality)
.distribution(dtype, distribution_id::UNIFORM, 0, num_rows);

auto const input = create_random_column(dtype, row_count{num_rows}, profile);
auto agg = cudf::make_histogram_aggregation<cudf::reduce_aggregation>();
state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
rmm::cuda_stream_view stream_view{launch.get_stream()};
auto result = cudf::reduce(*input, *agg, input->type(), stream_view);
});

state.add_element_count(input->size());
}

using data_type = nvbench::type_list<int32_t, int64_t>;

NVBENCH_BENCH_TYPES(nvbench_reduction_histogram, NVBENCH_TYPE_AXES(data_type))
.set_name("histogram")
.add_float64_axis("null_probability", {0.1})
.add_int64_axis("cardinality", {1})
.add_int64_axis("num_rows",
{
10000, // 10k
100000, // 100k
1000000, // 1M
10000000, // 10M
100000000, // 100M
});
2 changes: 1 addition & 1 deletion cpp/include/cudf/reduction/detail/histogram.hpp
@@ -36,7 +36,7 @@ namespace cudf::reduction::detail {
* @param partial_counts An optional column containing count for each row
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate memory of the returned objects
* @return A pair of array contains the (stable-order) indices of the distinct rows in the input
* @return A pair of array contains the indices of the distinct rows in the input
* table, and their corresponding distinct counts
*/
[[nodiscard]] std::pair<std::unique_ptr<rmm::device_uvector<size_type>>, std::unique_ptr<column>>
7 changes: 7 additions & 0 deletions cpp/src/groupby/sort/group_histogram.cu
@@ -29,6 +29,7 @@
#include <rmm/resource_ref.hpp>

#include <thrust/gather.h>
#include <thrust/sort.h>

namespace cudf::groupby::detail {

@@ -57,6 +58,12 @@ std::unique_ptr<column> build_histogram(column_view const& values,
auto [distinct_indices, distinct_counts] =
cudf::reduction::detail::compute_row_frequencies(labeled_values, partial_counts, stream, mr);

// compute_row_frequencies does not provide stable ordering
thrust::sort_by_key(rmm::exec_policy(stream),
distinct_indices->begin(),
distinct_indices->end(),
distinct_counts->mutable_view().begin<int64_t>());
Review comment (Member):
Will this sort impact the groupby histogram performance?

@srinivasyadav18 (Contributor, Author), Aug 6, 2024:
There is a slight regression of around 5% on small inputs (with 100'000), but the overall improvement hides this extra sort_by_key. The speedup is up to 30% as the input size increases, even with this extra sorting step. (A standalone sketch of this sort step follows the diff below.)

// Gather the distinct rows for the output histogram.
auto out_table = cudf::detail::gather(labeled_values,
*distinct_indices,
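As the review thread above notes, compute_row_frequencies no longer guarantees a stable output order (hence the removal of "(stable-order)" from the doc comment in histogram.hpp), so build_histogram re-sorts the distinct indices and their counts. Here is a minimal standalone sketch of that sort step, assuming plain thrust::device_vector containers and made-up values in place of the rmm::device_uvector and cudf column used in the diff:

#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/sort.h>
#include <cstdint>
#include <cstdio>
#include <vector>

int main()
{
  // Distinct-row indices and their counts, in the arbitrary order a hash-based
  // pass might produce (hypothetical values for illustration).
  std::vector<int> const h_indices{5, 0, 3};
  std::vector<int64_t> const h_counts{2, 4, 1};
  thrust::device_vector<int> distinct_indices(h_indices.begin(), h_indices.end());
  thrust::device_vector<int64_t> distinct_counts(h_counts.begin(), h_counts.end());

  // Reorder the counts by their row index so the output is deterministic,
  // mirroring the thrust::sort_by_key call added in group_histogram.cu.
  thrust::sort_by_key(thrust::device,
                      distinct_indices.begin(),
                      distinct_indices.end(),
                      distinct_counts.begin());

  for (int i = 0; i < static_cast<int>(distinct_indices.size()); ++i) {
    printf("index %d -> count %lld\n",
           static_cast<int>(distinct_indices[i]),
           static_cast<long long>(distinct_counts[i]));
  }
  return 0;
}

Sorting a handful of distinct (index, count) pairs is cheap relative to the full pass over the input, which is consistent with the small regression the author reports on low-cardinality cases.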