Support segmented reductions and null mask reductions #9621

Merged 67 commits on Mar 10, 2022.

Commits
- bbfaf7f — add wrapper for cub::device_segmented_reduce (isVoid, Oct 27, 2021)
- 1368645 — Merge branch 'branch-21.12' of https://github.com/rapidsai/cudf into … (isVoid, Nov 4, 2021)
- 2153108 — passes compiles and actually produces correct result for non-null num… (isVoid, Nov 5, 2021)
- 329eb39 — add todo (isVoid, Nov 5, 2021)
- a8a6d90 — Revert "add todo" (isVoid, Nov 9, 2021)
- a9a7b2c — Revert "passes compiles and actually produces correct result for non-… (isVoid, Nov 9, 2021)
- 7ccdeb8 — Initial pass for reduce sums (isVoid, Nov 10, 2021)
- 5072c67 — prod, max, min, any, all scaffolding (isVoid, Nov 10, 2021)
- 5c4ddb7 — Merge branch 'branch-22.02' of github.com:rapidsai/cudf into 9135 (isVoid, Dec 13, 2021)
- 39db304 — Added null_policy handling to segmented reductions (isVoid, Dec 14, 2021)
- 51538aa — extend test coverage to NULL_POLICY::INCLUDE and all ops (isVoid, Dec 15, 2021)
- a017223 — Moved null mask compute logic to null_mask.cuh with a new helper. (isVoid, Jan 6, 2022)
- ea652c9 — Initial pass for benchmark segment reduce (isVoid, Jan 8, 2022)
- 5dae89e — Minor doc fixes. (isVoid, Jan 11, 2022)
- 7db8c23 — Cast return value from std::distance to size_type (isVoid, Jan 11, 2022)
- 189c156 — Merge branch 'branch-22.04' of github.com:rapidsai/cudf into 9135 (isVoid, Jan 21, 2022)
- edd8838 — rev: order by alphebets for new entries in CMakeList.txt (isVoid, Jan 22, 2022)
- c02a5b9 — Update cpp/benchmarks/reduction/segment_reduce_benchmark.cpp (isVoid, Jan 22, 2022)
- 95a5873 — Merge branch '9135' of github.com:isVoid/cudf into 9135 (isVoid, Jan 22, 2022)
- a1d3be0 — rev: reorder benchmark headers (isVoid, Jan 22, 2022)
- 7b76179 — rev: computes output null count (isVoid, Jan 22, 2022)
- b3767f8 — rev: reduction.cuh docstring (isVoid, Jan 22, 2022)
- a951d60 — rev: use negate of SFINAE (isVoid, Jan 22, 2022)
- d019de9 — rev: Add null_handling param docstring (isVoid, Jan 22, 2022)
- 544ee36 — rev: fix bad wording in docstring (isVoid, Jan 22, 2022)
- 109e3db — Update cpp/include/cudf/reduction.hpp (isVoid, Jan 22, 2022)
- f3d7ae9 — Merge branch '9135' of github.com:isVoid/cudf into 9135 (isVoid, Jan 22, 2022)
- 183098b — rev: docstring fix (isVoid, Jan 22, 2022)
- ee32feb — Docstring updates (isVoid, Jan 22, 2022)
- 329120b — Merge branch '9135' of github.com:isVoid/cudf into 9135 (isVoid, Jan 22, 2022)
- 007c093 — rev: docstrings, license fix (isVoid, Jan 22, 2022)
- 93c92a9 — Update cpp/src/reductions/simple_segmented.cuh (isVoid, Jan 22, 2022)
- 8da60f8 — Update cpp/src/reductions/simple_segmented.cuh (isVoid, Jan 22, 2022)
- d85ed1a — rev: cleanup includes, docstring fixes (isVoid, Jan 22, 2022)
- c981323 — Merge branch '9135' of github.com:isVoid/cudf into 9135 (isVoid, Jan 22, 2022)
- 61d7942 — . (isVoid, Jan 22, 2022)
- 61fbdc8 — rev: add compute precision todo (isVoid, Jan 22, 2022)
- fc36453 — rev: fix includes (isVoid, Jan 22, 2022)
- 38221d3 — Merge branch 'branch-22.04' of github.com:rapidsai/cudf into 9135 (isVoid, Jan 28, 2022)
- 71b7211 — Rename benchmark file (isVoid, Jan 28, 2022)
- 5552c12 — fix broken trait usage. (isVoid, Jan 28, 2022)
- 0fc27a0 — doc fixes (isVoid, Jan 28, 2022)
- 75cab7a — Change to lists_column_view interface (isVoid, Jan 29, 2022)
- af08473 — add todo (isVoid, Jan 29, 2022)
- 26e97e1 — Partially Revert "Change to lists_column_view interface", but use `de… (isVoid, Feb 3, 2022)
- 11d5b19 — Reverting list_column_view interface, expose device_span at public le… (isVoid, Feb 3, 2022)
- 6e42663 — Merge branch '9135' of github.com:isVoid/cudf into 9135 (isVoid, Feb 3, 2022)
- 7f2f796 — Use aggregation reference (isVoid, Feb 3, 2022)
- 1a787a3 — Check offset size (isVoid, Feb 3, 2022)
- 53e203d — rewrites benchmark with nvbench (isVoid, Feb 9, 2022)
- 69cf3a1 — Add todo; merge SFINAE checks into `is_supported`. (isVoid, Feb 9, 2022)
- 24bb406 — A bit more cleanups for benchmark code. (isVoid, Feb 14, 2022)
- 32efb1b — Merge branch 'branch-22.04' of github.com:rapidsai/cudf into 9135 (isVoid, Feb 14, 2022)
- 0846d82 — Fix license years (isVoid, Feb 14, 2022)
- 71e6753 — Header cleanups. (isVoid, Feb 14, 2022)
- f0d9eac — Fixing up wrong license dates. Minor doc improvements. (isVoid, Feb 14, 2022)
- 7b49675 — Fix doc bug. (isVoid, Feb 14, 2022)
- 1911103 — Add partial reduction test case; Improve docs (isVoid, Feb 17, 2022)
- 5699047 — Address clang-tidy (isVoid, Feb 17, 2022)
- 70cd617 — Remove 1 todo, add 1 todo. (isVoid, Feb 17, 2022)
- 4147f48 — out of bound index description in docstring. (isVoid, Feb 17, 2022)
- a4317a9 — Multiple docstring, error message, todo comment updates. (isVoid, Feb 17, 2022)
- 262bb66 — Introduce `segmented_reduce_aggregation`. (isVoid, Feb 23, 2022)
- c99a05d — Update cpp/include/cudf/detail/null_mask.cuh (isVoid, Feb 23, 2022)
- 282e5d7 — Refactors `segmented_null_reduction` based on reviews. (isVoid, Feb 23, 2022)
- 8408b48 — Update cpp/include/cudf/detail/null_mask.cuh (isVoid, Mar 10, 2022)
- 2570f4a — Merge branch 'branch-22.04' of github.com:rapidsai/cudf into 9135 (isVoid, Mar 10, 2022)
7 changes: 7 additions & 0 deletions cpp/CMakeLists.txt
@@ -372,6 +372,13 @@ add_library(
src/reductions/scan/scan.cpp
src/reductions/scan/scan_exclusive.cu
src/reductions/scan/scan_inclusive.cu
src/reductions/segmented_all.cu
src/reductions/segmented_any.cu
src/reductions/segmented_max.cu
src/reductions/segmented_min.cu
src/reductions/segmented_product.cu
src/reductions/segmented_reductions.cpp
src/reductions/segmented_sum.cu
src/reductions/std.cu
src/reductions/sum.cu
src/reductions/sum_of_squares.cu
7 changes: 4 additions & 3 deletions cpp/benchmarks/CMakeLists.txt
@@ -24,7 +24,7 @@ target_compile_options(

target_link_libraries(
cudf_datagen PUBLIC GTest::gmock GTest::gtest GTest::gmock_main GTest::gtest_main
- benchmark::benchmark nvbench::nvbench Threads::Threads cudf
+ benchmark::benchmark nvbench::nvbench Threads::Threads cudf cudftestutil
)

target_include_directories(
@@ -169,9 +169,10 @@ ConfigureBench(TYPE_DISPATCHER_BENCH type_dispatcher/type_dispatcher.cu)
# ##################################################################################################
# * reduction benchmark ---------------------------------------------------------------------------
ConfigureBench(
- REDUCTION_BENCH reduction/anyall.cpp reduction/dictionary.cpp reduction/reduce.cpp
- reduction/scan.cpp reduction/minmax.cpp
+ REDUCTION_BENCH reduction/anyall.cpp reduction/dictionary.cpp reduction/minmax.cpp
+ reduction/reduce.cpp reduction/scan.cpp
)
+ ConfigureNVBench(REDUCTION_NVBENCH reduction/segment_reduce.cu)

# ##################################################################################################
# * reduction benchmark ---------------------------------------------------------------------------
128 changes: 128 additions & 0 deletions cpp/benchmarks/reduction/segment_reduce.cu
@@ -0,0 +1,128 @@
/*
* Copyright (c) 2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <benchmarks/fixture/rmm_pool_raii.hpp>
#include <nvbench/nvbench.cuh>

#include <cudf_test/base_fixture.hpp>
#include <cudf_test/column_wrapper.hpp>

#include <cudf/aggregation.hpp>
#include <cudf/column/column.hpp>
#include <cudf/detail/iterator.cuh>
#include <cudf/reduction.hpp>
#include <cudf/types.hpp>
#include <cudf/utilities/span.hpp>

#include <rmm/cuda_stream_view.hpp>

#include <thrust/device_vector.h>

#include <memory>
#include <type_traits>
#include <vector>

namespace cudf {

bool constexpr is_boolean_output_agg(aggregation::Kind kind)
{
return kind == aggregation::ALL || kind == aggregation::ANY;
}

template <aggregation::Kind kind>
std::unique_ptr<aggregation> make_simple_aggregation()
{
switch (kind) {
case aggregation::SUM: return make_sum_aggregation();
case aggregation::PRODUCT: return make_product_aggregation();
case aggregation::MIN: return make_min_aggregation();
case aggregation::MAX: return make_max_aggregation();
case aggregation::ALL: return make_all_aggregation();
case aggregation::ANY: return make_any_aggregation();
default: CUDF_FAIL("Unsupported simple segmented aggregation");
}
}

template <typename InputType>
std::pair<std::unique_ptr<column>, thrust::device_vector<size_type>> make_test_data(
nvbench::state& state)
{
auto const column_size{size_type(state.get_int64("column_size"))};
auto const num_segments{size_type(state.get_int64("num_segments"))};

auto segment_length = column_size / num_segments;

test::UniformRandomGenerator<InputType> rand_gen(0, 100);
auto data_it = detail::make_counting_transform_iterator(
0, [&rand_gen](auto i) { return rand_gen.generate(); });

auto offset_it =
detail::make_counting_transform_iterator(0, [&column_size, &segment_length](auto i) {
return column_size < i * segment_length ? column_size : i * segment_length;
});

test::fixed_width_column_wrapper<InputType> input(data_it, data_it + column_size);
std::vector<size_type> h_offsets(offset_it, offset_it + num_segments + 1);
thrust::device_vector<size_type> d_offsets(h_offsets);

return std::make_pair(input.release(), d_offsets);
}

template <typename InputType, typename OutputType, aggregation::Kind kind>
std::enable_if_t<!is_boolean_output_agg(kind) || std::is_same_v<OutputType, bool>, void>
BM_Simple_Segmented_Reduction(nvbench::state& state,
nvbench::type_list<InputType, OutputType, nvbench::enum_type<kind>>)
{
// TODO: to be replaced by nvbench fixture once it's ready
cudf::rmm_pool_raii rmm_pool;

auto const column_size{size_type(state.get_int64("column_size"))};
auto [input, offsets] = make_test_data<InputType>(state);
auto agg = make_simple_aggregation<kind>();

state.add_element_count(column_size);
state.add_global_memory_reads<InputType>(column_size);
state.add_global_memory_writes<OutputType>(column_size);

state.exec(
nvbench::exec_tag::sync,
[input_view = input->view(), offset_span = device_span<size_type>{offsets}, &agg](
nvbench::launch& launch) {
segmented_reduce(
input_view, offset_span, *agg, data_type{type_to_id<OutputType>()}, null_policy::INCLUDE);
});
}

template <typename InputType, typename OutputType, aggregation::Kind kind>
std::enable_if_t<is_boolean_output_agg(kind) && !std::is_same_v<OutputType, bool>, void>
BM_Simple_Segmented_Reduction(nvbench::state& state,
nvbench::type_list<InputType, OutputType, nvbench::enum_type<kind>>)
{
state.skip("Invalid combination of dtype and aggregation type.");
}

using Types = nvbench::type_list<bool, int32_t, float, double>;
// Skip benchmarking MAX/ANY since they are covered by MIN/ALL respectively.
using AggKinds = nvbench::
enum_type_list<aggregation::SUM, aggregation::PRODUCT, aggregation::MIN, aggregation::ALL>;

NVBENCH_BENCH_TYPES(BM_Simple_Segmented_Reduction, NVBENCH_TYPE_AXES(Types, Types, AggKinds))
.set_name("segmented_reduction_simple")
.set_type_axes_names({"InputType", "OutputType", "AggregationKinds"})
.add_int64_axis("column_size", {100'000, 1'000'000, 10'000'000, 100'000'000})
.add_int64_axis("num_segments", {1'000, 10'000, 100'000});

} // namespace cudf
128 changes: 118 additions & 10 deletions cpp/include/cudf/detail/null_mask.cuh
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -18,6 +18,7 @@
#include <cudf/column/column_device_view.cuh>
#include <cudf/detail/utilities/cuda.cuh>
#include <cudf/detail/utilities/vector_factories.hpp>
#include <cudf/detail/valid_if.cuh>
#include <cudf/null_mask.hpp>
#include <cudf/types.hpp>
#include <cudf/utilities/span.hpp>
@@ -279,7 +280,8 @@ rmm::device_uvector<size_type> segmented_count_bits(bitmask_type const* bitmask,
OffsetIterator first_bit_indices_end,
OffsetIterator last_bit_indices_begin,
count_bits_policy count_bits,
- rmm::cuda_stream_view stream)
+ rmm::cuda_stream_view stream,
+ rmm::mr::device_memory_resource* mr)
{
auto const num_ranges =
static_cast<size_type>(std::distance(first_bit_indices_begin, first_bit_indices_end));
@@ -329,14 +331,15 @@
// set bits from the length of the segment.
auto segments_begin =
thrust::make_zip_iterator(first_bit_indices_begin, last_bit_indices_begin);
- auto segments_size = thrust::transform_iterator(segments_begin, [] __device__(auto segment) {
-   auto const begin = thrust::get<0>(segment);
-   auto const end = thrust::get<1>(segment);
-   return end - begin;
- });
+ auto segment_length_iterator =
+   thrust::transform_iterator(segments_begin, [] __device__(auto const& segment) {
+     auto const begin = thrust::get<0>(segment);
+     auto const end = thrust::get<1>(segment);
+     return end - begin;
+   });
  thrust::transform(rmm::exec_policy(stream),
-                   segments_size,
-                   segments_size + num_ranges,
+                   segment_length_iterator,
+                   segment_length_iterator + num_ranges,
d_bit_counts.data(),
d_bit_counts.data(),
[] __device__(auto segment_size, auto segment_bit_count) {
@@ -438,7 +441,8 @@ std::vector<size_type> segmented_count_bits(bitmask_type const* bitmask,
first_bit_indices_end,
last_bit_indices_begin,
count_bits,
- stream);
+ stream,
+ rmm::mr::get_current_device_resource());

// Copy the results back to the host.
return make_std_vector_sync(d_bit_counts, stream);
@@ -501,6 +505,110 @@ std::vector<size_type> segmented_null_count(bitmask_type const* bitmask,
return detail::segmented_count_unset_bits(bitmask, indices_begin, indices_end, stream);
}

/**
* @brief Reduce an input null mask using segments defined by offset indices
* into an output null mask.
*
* @tparam OffsetIterator Random-access input iterator type.
* @param bitmask Null mask residing in device memory whose segments will be
* reduced into a new mask.
* @param first_bit_indices_begin Random-access input iterator to the beginning
* of a sequence of indices of the first bit in each segment (inclusive).
* @param first_bit_indices_end Random-access input iterator to the end of a
* sequence of indices of the first bit in each segment (inclusive).
* @param last_bit_indices_begin Random-access input iterator to the beginning
* of a sequence of indices of the last bit in each segment (exclusive).
* @param null_handling If `null_policy::INCLUDE`, all elements in a segment
* must be valid for the reduced value to be valid. If `null_policy::EXCLUDE`,
* the reduction is valid if any element in the segment is valid.
* @param stream CUDA stream used for device memory operations and kernel launches.
* @param mr Device memory resource used to allocate the returned buffer's device memory.
* @return A pair containing the reduced null mask and number of nulls.
*/
template <typename OffsetIterator>
std::pair<rmm::device_buffer, size_type> segmented_null_mask_reduction(
bitmask_type const* bitmask,
OffsetIterator first_bit_indices_begin,
OffsetIterator first_bit_indices_end,
OffsetIterator last_bit_indices_begin,
null_policy null_handling,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
auto const segments_begin =
thrust::make_zip_iterator(first_bit_indices_begin, last_bit_indices_begin);
auto const segment_length_iterator =
thrust::make_transform_iterator(segments_begin, [] __device__(auto const& segment) {
auto const begin = thrust::get<0>(segment);
auto const end = thrust::get<1>(segment);
return end - begin;
});

// Empty segments are always null in the output mask
auto const num_segments =
static_cast<size_type>(std::distance(first_bit_indices_begin, first_bit_indices_end));
auto [output_null_mask, output_null_count] = cudf::detail::valid_if(
segment_length_iterator,
segment_length_iterator + num_segments,
[] __device__(auto const& len) { return len > 0; },
stream,
mr);

if (bitmask != nullptr) {
[[maybe_unused]] auto const [null_policy_bitmask, _] = [&]() {
if (null_handling == null_policy::EXCLUDE) {
// Output null mask should be valid if any element in the segment is
// valid and the segment is non-empty.
auto const valid_counts =
cudf::detail::segmented_count_bits(bitmask,
first_bit_indices_begin,
first_bit_indices_end,
last_bit_indices_begin,
cudf::detail::count_bits_policy::SET_BITS,
stream,
rmm::mr::get_current_device_resource());
return cudf::detail::valid_if(
valid_counts.begin(),
valid_counts.end(),
[] __device__(auto const valid_count) { return valid_count > 0; },
stream);
} else {
// Output null mask should be valid if all elements in the segment are
// valid and the segment is non-empty.
auto const null_counts =
cudf::detail::segmented_count_bits(bitmask,
first_bit_indices_begin,
first_bit_indices_end,
last_bit_indices_begin,
cudf::detail::count_bits_policy::UNSET_BITS,
stream,
rmm::mr::get_current_device_resource());
return cudf::detail::valid_if(
null_counts.begin(),
null_counts.end(),
[] __device__(auto const null_count) { return null_count == 0; },
stream);
}
}();

std::vector<bitmask_type const*> masks{
reinterpret_cast<bitmask_type const*>(output_null_mask.data()),
reinterpret_cast<bitmask_type const*>(null_policy_bitmask.data())};
std::vector<size_type> begin_bits{0, 0};
size_type valid_count = cudf::detail::inplace_bitmask_and(
device_span<bitmask_type>(reinterpret_cast<bitmask_type*>(output_null_mask.data()),
num_bitmask_words(num_segments)),
masks,
begin_bits,
num_segments,
stream,
mr);

output_null_count = num_segments - valid_count;
}
return std::make_pair(std::move(output_null_mask), output_null_count);
}

} // namespace detail

} // namespace cudf