From f20491d3366808e5c62dcee2160fc8a9d5e50fa7 Mon Sep 17 00:00:00 2001
From: Shruti Shivakumar <shruti.shivakumar@gmail.com>
Date: Mon, 30 Sep 2024 12:14:32 -0400
Subject: [PATCH 1/9] Parse newline as whitespace character while tokenizing
 JSONL inputs with non-newline delimiter (#16950)

Backporting PR #16923: : Parse newline as whitespace character while
tokenizing JSONL inputs

Addresses #16915
---
 cpp/src/io/json/nested_json_gpu.cu     |   8 +-
 cpp/tests/io/json/json_test.cpp        |  24 ++++
 cpp/tests/io/json/nested_json_test.cpp | 178 +++++++++++++++++++++++++
 3 files changed, 207 insertions(+), 3 deletions(-)

diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu
index 1c15e147b13..76816071d8c 100644
--- a/cpp/src/io/json/nested_json_gpu.cu
+++ b/cpp/src/io/json/nested_json_gpu.cu
@@ -618,12 +618,14 @@ struct PdaSymbolToSymbolGroupId {
     constexpr auto pda_sgid_lookup_size =
       static_cast<int32_t>(sizeof(tos_sg_to_pda_sgid) / sizeof(tos_sg_to_pda_sgid[0]));
     // We map the delimiter character to LINE_BREAK symbol group id, and the newline character
-    // to OTHER. Note that delimiter cannot be any of opening(closing) brace, bracket, quote,
+    // to WHITE_SPACE. Note that delimiter cannot be any of opening(closing) brace, bracket, quote,
     // escape, comma, colon or whitespace characters.
+    auto constexpr newline    = '\n';
+    auto constexpr whitespace = ' ';
     auto const symbol_position =
       symbol == delimiter
-        ? static_cast<int32_t>('\n')
-        : (symbol == '\n' ? static_cast<int32_t>(delimiter) : static_cast<int32_t>(symbol));
+        ? static_cast<int32_t>(newline)
+        : (symbol == newline ? static_cast<int32_t>(whitespace) : static_cast<int32_t>(symbol));
     PdaSymbolGroupIdT symbol_gid =
       tos_sg_to_pda_sgid[min(symbol_position, pda_sgid_lookup_size - 1)];
     return stack_idx * static_cast<PdaSymbolGroupIdT>(symbol_group_id::NUM_PDA_INPUT_SGS) +
diff --git a/cpp/tests/io/json/json_test.cpp b/cpp/tests/io/json/json_test.cpp
index 68ec255b39d..a094ac7d772 100644
--- a/cpp/tests/io/json/json_test.cpp
+++ b/cpp/tests/io/json/json_test.cpp
@@ -2575,6 +2575,30 @@ TEST_F(JsonReaderTest, ViableDelimiter)
   EXPECT_THROW(json_parser_options.set_delimiter('\t'), std::invalid_argument);
 }
 
+TEST_F(JsonReaderTest, ViableDelimiterNewlineWS)
+{
+  // Test input
+  std::string input = R"({"a":
+  100})";
+
+  cudf::io::json_reader_options json_parser_options =
+    cudf::io::json_reader_options::builder(cudf::io::source_info{input.c_str(), input.size()})
+      .lines(true)
+      .delimiter('\0');
+
+  auto result = cudf::io::read_json(json_parser_options);
+  EXPECT_EQ(result.tbl->num_columns(), 1);
+  EXPECT_EQ(result.tbl->num_rows(), 1);
+
+  EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::INT64);
+
+  EXPECT_EQ(result.metadata.schema_info[0].name, "a");
+
+  auto col1_iterator = thrust::constant_iterator<int64_t>(100);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(0),
+                                 int64_wrapper(col1_iterator, col1_iterator + 1));
+}
+
 // Test case for dtype prune:
 // all paths, only one.
 // one present, another not present, nothing present
diff --git a/cpp/tests/io/json/nested_json_test.cpp b/cpp/tests/io/json/nested_json_test.cpp
index 327169ae563..f32aba0e632 100644
--- a/cpp/tests/io/json/nested_json_test.cpp
+++ b/cpp/tests/io/json/nested_json_test.cpp
@@ -29,6 +29,7 @@
 #include <cudf/io/datasource.hpp>
 #include <cudf/io/json.hpp>
 #include <cudf/io/parquet.hpp>
+#include <cudf/io/types.hpp>
 #include <cudf/lists/lists_column_view.hpp>
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/utilities/default_stream.hpp>
@@ -1196,4 +1197,181 @@ TEST_P(JsonDelimiterParamTest, RecoveringTokenStreamNewlineAndDelimiter)
   }
 }
 
+TEST_P(JsonDelimiterParamTest, RecoveringTokenStreamNewlineAsWSAndDelimiter)
+{
+  // Test input. Inline comments used to indicate character indexes
+  //                           012345678 <= line 0
+  char const delimiter = GetParam();
+
+  /* Input: (Note that \n is considered whitespace according to the JSON spec when it is not used as
+   * a delimiter for JSONL)
+   * {"a":2}
+   * {"a":<delimiter>{"a":{"a":[321<delimiter>{"a":[1]}
+   *
+   * <delimiter>{"b":123}
+   * {"b":123}<delimiter>
+   * {"b"\n:\n\n\n123\n}
+   */
+  std::string input = R"({"a":2})"
+                      "\n";
+  // starting position 8 (zero indexed)
+  input += R"({"a":)" + std::string(1, delimiter);
+  // starting position 14 (zero indexed)
+  input += R"({"a":{"a":[321)" + std::string(1, delimiter);
+  // starting position 29 (zero indexed)
+  input += R"({"a":[1]})" + std::string("\n\n") + std::string(1, delimiter);
+  // starting position 41 (zero indexed)
+  input += R"({"b":123})"
+           "\n";
+  // starting position 51 (zero indexed)
+  input += R"({"b":123})" + std::string(1, delimiter);
+  // starting position 61 (zero indexed)
+  input += R"({"b")" + std::string("\n:\n\n\n123\n}");
+
+  // Golden token stream sample
+  using token_t = cuio_json::token_t;
+  std::vector<std::pair<std::size_t, cuio_json::PdaTokenT>> golden_token_stream;
+  if (delimiter != '\n') {
+    golden_token_stream = {// Line 0 (valid)
+                           {0, token_t::StructBegin},
+                           {1, token_t::StructMemberBegin},
+                           {1, token_t::FieldNameBegin},
+                           {3, token_t::FieldNameEnd},
+                           {5, token_t::ValueBegin},
+                           {6, token_t::ValueEnd},
+                           {6, token_t::StructMemberEnd},
+                           {6, token_t::StructEnd},
+                           // Line 1 (invalid)
+                           {0, token_t::StructBegin},
+                           {0, token_t::StructEnd},
+                           // Line 2 (valid)
+                           {29, token_t::StructBegin},
+                           {30, token_t::StructMemberBegin},
+                           {30, token_t::FieldNameBegin},
+                           {32, token_t::FieldNameEnd},
+                           {34, token_t::ListBegin},
+                           {35, token_t::ValueBegin},
+                           {36, token_t::ValueEnd},
+                           {36, token_t::ListEnd},
+                           {37, token_t::StructMemberEnd},
+                           {37, token_t::StructEnd},
+                           // Line 3 (valid)
+                           {41, token_t::StructBegin},
+                           {42, token_t::StructMemberBegin},
+                           {42, token_t::FieldNameBegin},
+                           {44, token_t::FieldNameEnd},
+                           {46, token_t::ValueBegin},
+                           {49, token_t::ValueEnd},
+                           {49, token_t::StructMemberEnd},
+                           {49, token_t::StructEnd},
+                           // Line 4 (valid)
+                           {61, token_t::StructBegin},
+                           {62, token_t::StructMemberBegin},
+                           {62, token_t::FieldNameBegin},
+                           {64, token_t::FieldNameEnd},
+                           {70, token_t::ValueBegin},
+                           {73, token_t::ValueEnd},
+                           {74, token_t::StructMemberEnd},
+                           {74, token_t::StructEnd}};
+  } else {
+    /* Input:
+     * {"a":2}
+     * {"a":
+     * {"a":{"a":[321
+     * {"a":[1]}
+     *
+     *
+     * {"b":123}
+     * {"b":123}
+     * {"b"\n:\n\n\n123\n}
+     */
+    golden_token_stream = {// Line 0 (valid)
+                           {0, token_t::StructBegin},
+                           {1, token_t::StructMemberBegin},
+                           {1, token_t::FieldNameBegin},
+                           {3, token_t::FieldNameEnd},
+                           {5, token_t::ValueBegin},
+                           {6, token_t::ValueEnd},
+                           {6, token_t::StructMemberEnd},
+                           {6, token_t::StructEnd},
+                           // Line 1 (invalid)
+                           {0, token_t::StructBegin},
+                           {0, token_t::StructEnd},
+                           // Line 2 (invalid)
+                           {0, token_t::StructBegin},
+                           {0, token_t::StructEnd},
+                           // Line 3 (valid)
+                           {29, token_t::StructBegin},
+                           {30, token_t::StructMemberBegin},
+                           {30, token_t::FieldNameBegin},
+                           {32, token_t::FieldNameEnd},
+                           {34, token_t::ListBegin},
+                           {35, token_t::ValueBegin},
+                           {36, token_t::ValueEnd},
+                           {36, token_t::ListEnd},
+                           {37, token_t::StructMemberEnd},
+                           {37, token_t::StructEnd},
+                           // Line 4 (valid)
+                           {41, token_t::StructBegin},
+                           {42, token_t::StructMemberBegin},
+                           {42, token_t::FieldNameBegin},
+                           {44, token_t::FieldNameEnd},
+                           {46, token_t::ValueBegin},
+                           {49, token_t::ValueEnd},
+                           {49, token_t::StructMemberEnd},
+                           {49, token_t::StructEnd},
+                           // Line 5 (valid)
+                           {51, token_t::StructBegin},
+                           {52, token_t::StructMemberBegin},
+                           {52, token_t::FieldNameBegin},
+                           {54, token_t::FieldNameEnd},
+                           {56, token_t::ValueBegin},
+                           {59, token_t::ValueEnd},
+                           {59, token_t::StructMemberEnd},
+                           {59, token_t::StructEnd},
+                           // Line 6 (invalid)
+                           {0, token_t::StructBegin},
+                           {0, token_t::StructEnd},
+                           {0, token_t::StructBegin},
+                           {0, token_t::StructEnd},
+                           {0, token_t::StructBegin},
+                           {0, token_t::StructEnd},
+                           {0, token_t::StructBegin},
+                           {0, token_t::StructEnd}};
+  }
+
+  auto const stream = cudf::get_default_stream();
+
+  // Prepare input & output buffers
+  cudf::string_scalar const d_scalar(input, true, stream);
+  auto const d_input = cudf::device_span<cuio_json::SymbolT const>{
+    d_scalar.data(), static_cast<size_t>(d_scalar.size())};
+
+  // Default parsing options
+  cudf::io::json_reader_options const in_opts =
+    cudf::io::json_reader_options::builder(cudf::io::source_info{})
+      .recovery_mode(cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL)
+      .delimiter(delimiter)
+      .lines(true);
+
+  // Parse the JSON and get the token stream
+  auto [d_tokens_gpu, d_token_indices_gpu] = cuio_json::detail::get_token_stream(
+    d_input, in_opts, stream, cudf::get_current_device_resource_ref());
+  // Copy back the number of tokens that were written
+  auto const tokens_gpu        = cudf::detail::make_std_vector_async(d_tokens_gpu, stream);
+  auto const token_indices_gpu = cudf::detail::make_std_vector_async(d_token_indices_gpu, stream);
+
+  stream.synchronize();
+  // Verify the number of tokens matches
+  ASSERT_EQ(golden_token_stream.size(), tokens_gpu.size());
+  ASSERT_EQ(golden_token_stream.size(), token_indices_gpu.size());
+
+  for (std::size_t i = 0; i < tokens_gpu.size(); i++) {
+    // Ensure the index the tokens are pointing to do match
+    EXPECT_EQ(golden_token_stream[i].first, token_indices_gpu[i]) << "Mismatch at #" << i;
+    // Ensure the token category is correct
+    EXPECT_EQ(golden_token_stream[i].second, tokens_gpu[i]) << "Mismatch at #" << i;
+  }
+}
+
 CUDF_TEST_PROGRAM_MAIN()

From 194d5f47dc2174fb4aa1e3d3faf092c9022d765c Mon Sep 17 00:00:00 2001
From: Jihoon Son <ghoonson@gmail.com>
Date: Tue, 1 Oct 2024 10:07:31 -0700
Subject: [PATCH 2/9] Add a shortcut for when the input clusters are all empty
 for the tdigest merge (#16897)

Fixes https://github.com/rapidsai/cudf/issues/16881. This is a new attempt to fix it.

Previously in https://github.com/rapidsai/cudf/pull/16675, I flipped the `has_nulls` flag to true as I thought that empty clusters should be explicitly stored in the offsets and handled properly. It turns out that it was not a good idea. After a long debugging process, I am convinced now that the existing logic is valid and should work well except for one case, where all input tdigests to the tdigest merge are empty. So, I have decided to add a [shortcut to handle that particular edge case](https://github.com/rapidsai/cudf/pull/16897/files#diff-c03df2b421f7a51b28007d575fd32ba2530970351ba7e7e0f7fad8057350870cR1349-R1354) in `group_merge_tdigest()` in this PR. This shortcut is executed only when all clusters are empty in all groups. This PR does not change any other logic. Other changes in this PR are:

- New unit tests to cover the edge case.
- `make_empty_tdigest_column` has been renamed to `make_tdigest_column_of_empty_clusters` and expanded to take `num_rows`.
- Some new documentation based on my understanding for the `merge_tdigests()` function.

Before making this PR, I have run the integration tests of the spark-rapids that were previously reported in https://github.com/NVIDIA/spark-rapids/issues/11463 that my first attempt had caused them failing. They have all passed with this PR change.

Authors:
  - Jihoon Son (https://github.com/jihoonson)
  - Yunsong Wang (https://github.com/PointKernel)

Approvers:
  - https://github.com/nvdbaranec

URL: https://github.com/rapidsai/cudf/pull/16897
---
 cpp/include/cudf/detail/tdigest/tdigest.hpp   |  18 +-
 cpp/include/cudf_test/tdigest_utilities.cuh   |  20 +-
 cpp/src/quantiles/tdigest/tdigest.cu          |  23 +--
 .../quantiles/tdigest/tdigest_aggregation.cu  | 186 ++++++++++++------
 cpp/tests/groupby/tdigest_tests.cu            | 135 ++++++++++++-
 .../quantiles/percentile_approx_test.cpp      |   4 +-
 6 files changed, 288 insertions(+), 98 deletions(-)

diff --git a/cpp/include/cudf/detail/tdigest/tdigest.hpp b/cpp/include/cudf/detail/tdigest/tdigest.hpp
index 80a4460023f..4295f5e6ddd 100644
--- a/cpp/include/cudf/detail/tdigest/tdigest.hpp
+++ b/cpp/include/cudf/detail/tdigest/tdigest.hpp
@@ -143,28 +143,30 @@ std::unique_ptr<column> make_tdigest_column(size_type num_rows,
                                             rmm::device_async_resource_ref mr);
 
 /**
- * @brief Create an empty tdigest column.
+ * @brief Create a tdigest column of empty tdigests.
  *
- * An empty tdigest column contains a single row of length 0
+ * The column created contains the specified number of rows of empty tdigests.
  *
+ * @param num_rows The number of rows in the output column.
  * @param stream CUDA stream used for device memory operations and kernel launches.
  * @param mr Device memory resource used to allocate the returned column's device memory.
  *
- * @returns An empty tdigest column.
+ * @returns A tdigest column of empty clusters.
  */
 CUDF_EXPORT
-std::unique_ptr<column> make_empty_tdigest_column(rmm::cuda_stream_view stream,
-                                                  rmm::device_async_resource_ref mr);
+std::unique_ptr<column> make_empty_tdigests_column(size_type num_rows,
+                                                   rmm::cuda_stream_view stream,
+                                                   rmm::device_async_resource_ref mr);
 
 /**
- * @brief Create an empty tdigest scalar.
+ * @brief Create a scalar of an empty tdigest cluster.
  *
- * An empty tdigest scalar is a struct_scalar that contains a single row of length 0
+ * The returned scalar is a struct_scalar that contains a single row of an empty cluster.
  *
  * @param stream CUDA stream used for device memory operations and kernel launches.
  * @param mr Device memory resource used to allocate the returned column's device memory.
  *
- * @returns An empty tdigest scalar.
+ * @returns A scalar of an empty tdigest cluster.
  */
 std::unique_ptr<scalar> make_empty_tdigest_scalar(rmm::cuda_stream_view stream,
                                                   rmm::device_async_resource_ref mr);
diff --git a/cpp/include/cudf_test/tdigest_utilities.cuh b/cpp/include/cudf_test/tdigest_utilities.cuh
index 1758790cd64..c259d61060b 100644
--- a/cpp/include/cudf_test/tdigest_utilities.cuh
+++ b/cpp/include/cudf_test/tdigest_utilities.cuh
@@ -270,8 +270,8 @@ void tdigest_simple_all_nulls_aggregation(Func op)
     static_cast<column_view>(values).type(), tdigest_gen{}, op, values, delta);
 
   // NOTE: an empty tdigest column still has 1 row.
-  auto expected = cudf::tdigest::detail::make_empty_tdigest_column(
-    cudf::get_default_stream(), cudf::get_current_device_resource_ref());
+  auto expected = cudf::tdigest::detail::make_empty_tdigests_column(
+    1, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*result, *expected);
 }
@@ -562,12 +562,12 @@ template <typename MergeFunc>
 void tdigest_merge_empty(MergeFunc merge_op)
 {
   // 3 empty tdigests all in the same group
-  auto a = cudf::tdigest::detail::make_empty_tdigest_column(
-    cudf::get_default_stream(), cudf::get_current_device_resource_ref());
-  auto b = cudf::tdigest::detail::make_empty_tdigest_column(
-    cudf::get_default_stream(), cudf::get_current_device_resource_ref());
-  auto c = cudf::tdigest::detail::make_empty_tdigest_column(
-    cudf::get_default_stream(), cudf::get_current_device_resource_ref());
+  auto a = cudf::tdigest::detail::make_empty_tdigests_column(
+    1, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
+  auto b = cudf::tdigest::detail::make_empty_tdigests_column(
+    1, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
+  auto c = cudf::tdigest::detail::make_empty_tdigests_column(
+    1, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
   std::vector<column_view> cols;
   cols.push_back(*a);
   cols.push_back(*b);
@@ -577,8 +577,8 @@ void tdigest_merge_empty(MergeFunc merge_op)
   auto const delta = 1000;
   auto result      = merge_op(*values, delta);
 
-  auto expected = cudf::tdigest::detail::make_empty_tdigest_column(
-    cudf::get_default_stream(), cudf::get_current_device_resource_ref());
+  auto expected = cudf::tdigest::detail::make_empty_tdigests_column(
+    1, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*expected, *result);
 }
diff --git a/cpp/src/quantiles/tdigest/tdigest.cu b/cpp/src/quantiles/tdigest/tdigest.cu
index 0d017cf1f13..43c3b0a291b 100644
--- a/cpp/src/quantiles/tdigest/tdigest.cu
+++ b/cpp/src/quantiles/tdigest/tdigest.cu
@@ -292,32 +292,33 @@ std::unique_ptr<column> make_tdigest_column(size_type num_rows,
   return make_structs_column(num_rows, std::move(children), 0, {}, stream, mr);
 }
 
-std::unique_ptr<column> make_empty_tdigest_column(rmm::cuda_stream_view stream,
-                                                  rmm::device_async_resource_ref mr)
+std::unique_ptr<column> make_empty_tdigests_column(size_type num_rows,
+                                                   rmm::cuda_stream_view stream,
+                                                   rmm::device_async_resource_ref mr)
 {
   auto offsets = cudf::make_fixed_width_column(
-    data_type(type_id::INT32), 2, mask_state::UNALLOCATED, stream, mr);
+    data_type(type_id::INT32), num_rows + 1, mask_state::UNALLOCATED, stream, mr);
   thrust::fill(rmm::exec_policy(stream),
                offsets->mutable_view().begin<size_type>(),
                offsets->mutable_view().end<size_type>(),
                0);
 
-  auto min_col =
-    cudf::make_numeric_column(data_type(type_id::FLOAT64), 1, mask_state::UNALLOCATED, stream, mr);
+  auto min_col = cudf::make_numeric_column(
+    data_type(type_id::FLOAT64), num_rows, mask_state::UNALLOCATED, stream, mr);
   thrust::fill(rmm::exec_policy(stream),
                min_col->mutable_view().begin<double>(),
                min_col->mutable_view().end<double>(),
                0);
-  auto max_col =
-    cudf::make_numeric_column(data_type(type_id::FLOAT64), 1, mask_state::UNALLOCATED, stream, mr);
+  auto max_col = cudf::make_numeric_column(
+    data_type(type_id::FLOAT64), num_rows, mask_state::UNALLOCATED, stream, mr);
   thrust::fill(rmm::exec_policy(stream),
                max_col->mutable_view().begin<double>(),
                max_col->mutable_view().end<double>(),
                0);
 
-  return make_tdigest_column(1,
-                             make_empty_column(type_id::FLOAT64),
-                             make_empty_column(type_id::FLOAT64),
+  return make_tdigest_column(num_rows,
+                             cudf::make_empty_column(type_id::FLOAT64),
+                             cudf::make_empty_column(type_id::FLOAT64),
                              std::move(offsets),
                              std::move(min_col),
                              std::move(max_col),
@@ -338,7 +339,7 @@ std::unique_ptr<column> make_empty_tdigest_column(rmm::cuda_stream_view stream,
 std::unique_ptr<scalar> make_empty_tdigest_scalar(rmm::cuda_stream_view stream,
                                                   rmm::device_async_resource_ref mr)
 {
-  auto contents = make_empty_tdigest_column(stream, mr)->release();
+  auto contents = make_empty_tdigests_column(1, stream, mr)->release();
   return std::make_unique<struct_scalar>(
     std::move(*std::make_unique<table>(std::move(contents.children))), true, stream, mr);
 }
diff --git a/cpp/src/quantiles/tdigest/tdigest_aggregation.cu b/cpp/src/quantiles/tdigest/tdigest_aggregation.cu
index e1c1d2e3002..b0a84a6d50c 100644
--- a/cpp/src/quantiles/tdigest/tdigest_aggregation.cu
+++ b/cpp/src/quantiles/tdigest/tdigest_aggregation.cu
@@ -169,19 +169,19 @@ struct nearest_value_scalar_weights {
  */
 template <typename GroupOffsetsIter>
 struct nearest_value_centroid_weights {
-  double const* cumulative_weights;
-  GroupOffsetsIter outer_offsets;  // groups
-  size_type const* inner_offsets;  // tdigests within a group
+  double const* cumulative_weights;  // cumulative weights of non-empty clusters
+  GroupOffsetsIter group_offsets;    // groups
+  size_type const* tdigest_offsets;  // tdigests within a group
 
   thrust::pair<double, int> operator() __device__(double next_limit, size_type group_index) const
   {
-    auto const tdigest_begin = outer_offsets[group_index];
-    auto const tdigest_end   = outer_offsets[group_index + 1];
-    auto const num_weights   = inner_offsets[tdigest_end] - inner_offsets[tdigest_begin];
+    auto const tdigest_begin = group_offsets[group_index];
+    auto const tdigest_end   = group_offsets[group_index + 1];
+    auto const num_weights   = tdigest_offsets[tdigest_end] - tdigest_offsets[tdigest_begin];
     // NOTE: as it is today, this functor will never be called for any digests that are empty, but
     // I'll leave this check here for safety.
     if (num_weights == 0) { return thrust::pair<double, int>{0, 0}; }
-    double const* group_cumulative_weights = cumulative_weights + inner_offsets[tdigest_begin];
+    double const* group_cumulative_weights = cumulative_weights + tdigest_offsets[tdigest_begin];
 
     auto const index = ((thrust::lower_bound(thrust::seq,
                                              group_cumulative_weights,
@@ -235,21 +235,26 @@ struct cumulative_scalar_weight {
  */
 template <typename GroupLabelsIter, typename GroupOffsetsIter>
 struct cumulative_centroid_weight {
-  double const* cumulative_weights;
-  GroupLabelsIter group_labels;
-  GroupOffsetsIter outer_offsets;                    // groups
-  cudf::device_span<size_type const> inner_offsets;  // tdigests with a group
-
+  double const* cumulative_weights;  // cumulative weights of non-empty clusters
+  GroupLabelsIter group_labels;      // group labels for each tdigest including empty ones
+  GroupOffsetsIter group_offsets;    // groups
+  cudf::device_span<size_type const> tdigest_offsets;  // tdigests with a group
+
+  /**
+   * @brief Returns the cumulative weight for a given value index. The index `n` is the index of
+   * `n`-th non-empty cluster.
+   */
   std::tuple<size_type, size_type, double> operator() __device__(size_type value_index) const
   {
     auto const tdigest_index =
       static_cast<size_type>(
-        thrust::upper_bound(thrust::seq, inner_offsets.begin(), inner_offsets.end(), value_index) -
-        inner_offsets.begin()) -
+        thrust::upper_bound(
+          thrust::seq, tdigest_offsets.begin(), tdigest_offsets.end(), value_index) -
+        tdigest_offsets.begin()) -
       1;
     auto const group_index                 = group_labels[tdigest_index];
-    auto const first_tdigest_index         = outer_offsets[group_index];
-    auto const first_weight_index          = inner_offsets[first_tdigest_index];
+    auto const first_tdigest_index         = group_offsets[group_index];
+    auto const first_weight_index          = tdigest_offsets[first_tdigest_index];
     auto const relative_value_index        = value_index - first_weight_index;
     double const* group_cumulative_weights = cumulative_weights + first_weight_index;
 
@@ -284,15 +289,15 @@ struct scalar_group_info {
 // retrieve group info of centroid inputs by group index
 template <typename GroupOffsetsIter>
 struct centroid_group_info {
-  double const* cumulative_weights;
-  GroupOffsetsIter outer_offsets;
-  size_type const* inner_offsets;
+  double const* cumulative_weights;  // cumulative weights of non-empty clusters
+  GroupOffsetsIter group_offsets;
+  size_type const* tdigest_offsets;
 
   __device__ thrust::tuple<double, size_type, size_type> operator()(size_type group_index) const
   {
     // if there's no weights in this group of digests at all, return 0.
-    auto const group_start       = inner_offsets[outer_offsets[group_index]];
-    auto const group_end         = inner_offsets[outer_offsets[group_index + 1]];
+    auto const group_start       = tdigest_offsets[group_offsets[group_index]];
+    auto const group_end         = tdigest_offsets[group_offsets[group_index + 1]];
     auto const num_weights       = group_end - group_start;
     auto const last_weight_index = group_end - 1;
     return num_weights == 0
@@ -367,7 +372,6 @@ std::unique_ptr<scalar> to_tdigest_scalar(std::unique_ptr<column>&& tdigest,
  * @param group_num_clusters  Output.  The number of output clusters for each input group.
  * @param group_cluster_offsets  Offsets per-group to the start of it's clusters
  * @param has_nulls Whether or not the input contains nulls
- *
  */
 
 template <typename GroupInfo, typename NearestWeightFunc, typename CumulativeWeight>
@@ -661,6 +665,10 @@ std::unique_ptr<column> build_output_column(size_type num_rows,
                                                     mr);
 }
 
+/**
+ * @brief A functor which returns the cluster index within a group that the value at
+ * the given value index falls into.
+ */
 template <typename CumulativeWeight>
 struct compute_tdigests_keys_fn {
   int const delta;
@@ -706,8 +714,8 @@ struct compute_tdigests_keys_fn {
  * boundaries.
  *
  * @param delta              tdigest compression level
- * @param values_begin       Beginning of the range of input values.
- * @param values_end         End of the range of input values.
+ * @param centroids_begin    Beginning of the range of centroids.
+ * @param centroids_end      End of the range of centroids.
  * @param cumulative_weight  Functor which returns cumulative weight and group information for
  * an absolute input value index.
  * @param min_col            Column containing the minimum value per group.
@@ -750,7 +758,9 @@ std::unique_ptr<column> compute_tdigests(int delta,
   //   double       // max
   // }
   //
-  if (total_clusters == 0) { return cudf::tdigest::detail::make_empty_tdigest_column(stream, mr); }
+  if (total_clusters == 0) {
+    return cudf::tdigest::detail::make_empty_tdigests_column(1, stream, mr);
+  }
 
   // each input group represents an individual tdigest.  within each tdigest, we want the keys
   // to represent cluster indices (for example, if a tdigest had 100 clusters, the keys should fall
@@ -983,38 +993,54 @@ struct typed_reduce_tdigest {
   }
 };
 
-// utility for merge_tdigests.
+/**
+ * @brief Functor to compute the number of clusters in each group.
+ *
+ * Used in `merge_tdigests`.
+ */
 template <typename GroupOffsetsIter>
-struct group_num_weights_func {
-  GroupOffsetsIter outer_offsets;
-  size_type const* inner_offsets;
+struct group_num_clusters_func {
+  GroupOffsetsIter group_offsets;
+  size_type const* tdigest_offsets;
 
   __device__ size_type operator()(size_type group_index)
   {
-    auto const tdigest_begin = outer_offsets[group_index];
-    auto const tdigest_end   = outer_offsets[group_index + 1];
-    return inner_offsets[tdigest_end] - inner_offsets[tdigest_begin];
+    auto const tdigest_begin = group_offsets[group_index];
+    auto const tdigest_end   = group_offsets[group_index + 1];
+    return tdigest_offsets[tdigest_end] - tdigest_offsets[tdigest_begin];
   }
 };
 
-// utility for merge_tdigests.
+/**
+ * @brief Function to determine if a group is empty.
+ *
+ * Used in `merge_tdigests`.
+ */
 struct group_is_empty {
   __device__ bool operator()(size_type group_size) { return group_size == 0; }
 };
 
-// utility for merge_tdigests.
+/**
+ * @brief Functor that returns the grouping key for each tdigest cluster.
+ *
+ * Used in `merge_tdigests`.
+ */
 template <typename GroupLabelsIter>
 struct group_key_func {
   GroupLabelsIter group_labels;
-  size_type const* inner_offsets;
-  size_type num_inner_offsets;
+  size_type const* tdigest_offsets;
+  size_type num_tdigest_offsets;
 
+  /**
+   * @brief Returns the group index for an absolute cluster index. The index `n` is the index of the
+   * `n`-th non-empty cluster.
+   */
   __device__ size_type operator()(size_type index)
   {
     // what -original- tdigest index this absolute index corresponds to
-    auto const iter = thrust::prev(
-      thrust::upper_bound(thrust::seq, inner_offsets, inner_offsets + num_inner_offsets, index));
-    auto const tdigest_index = thrust::distance(inner_offsets, iter);
+    auto const iter          = thrust::prev(thrust::upper_bound(
+      thrust::seq, tdigest_offsets, tdigest_offsets + num_tdigest_offsets, index));
+    auto const tdigest_index = thrust::distance(tdigest_offsets, iter);
 
     // what group index the original tdigest belongs to
     return group_labels[tdigest_index];
@@ -1040,8 +1066,8 @@ std::pair<rmm::device_uvector<double>, rmm::device_uvector<double>> generate_mer
 
   // each group represents a collection of tdigest columns. each row is 1 tdigest.
   // within each group, we want to sort all the centroids within all the tdigests
-  // in that group, using the means as the key. the "outer offsets" represent the indices of the
-  // tdigests, and the "inner offsets" represents the list of centroids for a particular tdigest.
+  // in that group, using the means as the key. the "group offsets" represent the indices of the
+  // tdigests, and the "tdigest offsets" represents the list of centroids for a particular tdigest.
   //
   //  rows
   //  ----        centroid 0 ---------
@@ -1054,12 +1080,12 @@ std::pair<rmm::device_uvector<double>, rmm::device_uvector<double>> generate_mer
   //  tdigest 3   centroid 7
   //              centroid 8
   //  ----        centroid 9 --------
-  auto inner_offsets    = tdv.centroids().offsets();
+  auto tdigest_offsets  = tdv.centroids().offsets();
   auto centroid_offsets = cudf::detail::make_counting_transform_iterator(
     0,
     cuda::proclaim_return_type<size_type>(
-      [group_offsets, inner_offsets = tdv.centroids().offsets().begin<size_type>()] __device__(
-        size_type i) { return inner_offsets[group_offsets[i]]; }));
+      [group_offsets, tdigest_offsets = tdv.centroids().offsets().begin<size_type>()] __device__(
+        size_type i) { return tdigest_offsets[group_offsets[i]]; }));
 
   // perform the sort using the means as the key
   size_t temp_size;
@@ -1091,9 +1117,34 @@ std::pair<rmm::device_uvector<double>, rmm::device_uvector<double>> generate_mer
   return {std::move(output_means), std::move(output_weights)};
 }
 
+/**
+ * @brief Perform a merge aggregation of tdigests. This function usually takes the input as the
+ * outputs of multiple `typed_group_tdigest` calls, and merges them.
+ *
+ * A tdigest can be empty in the input, which means that there was no valid input data to generate
+ * it. These empty tdigests will have no centroids (means or weights) and will have a `min` and
+ * `max` of 0.
+ *
+ * @param tdv input tdigests. The tdigests within this column are grouped by key.
+ * @param h_group_offsets a host iterator of the offsets to the start of each group. A group is
+ * counted as one even when the cluster is empty in it. The offsets should have the same values as
+ * the ones in `group_offsets`.
+ * @param group_offsets a device iterator of the offsets to the start of each group. A group is
+ * counted as one even when the cluster is empty in it. The offsets should have the same values as
+ * the ones in `h_group_offsets`.
+ * @param group_labels a device iterator of the the group label for each tdigest cluster including
+ * empty clusters.
+ * @param num_group_labels the number of unique group labels.
+ * @param num_groups the number of groups.
+ * @param max_centroids the maximum number of centroids (clusters) in the output (merged) tdigest.
+ * @param stream CUDA stream
+ * @param mr device memory resource
+ *
+ * @return A column containing the merged tdigests.
+ */
 template <typename HGroupOffsetIter, typename GroupOffsetIter, typename GroupLabelIter>
 std::unique_ptr<column> merge_tdigests(tdigest_column_view const& tdv,
-                                       HGroupOffsetIter h_outer_offsets,
+                                       HGroupOffsetIter h_group_offsets,
                                        GroupOffsetIter group_offsets,
                                        GroupLabelIter group_labels,
                                        size_t num_group_labels,
@@ -1133,22 +1184,24 @@ std::unique_ptr<column> merge_tdigests(tdigest_column_view const& tdv,
                         thrust::equal_to{},  // key equality check
                         thrust::maximum{});
 
+  auto tdigest_offsets = tdv.centroids().offsets();
+
   // for any empty groups, set the min and max to be 0. not technically necessary but it makes
   // testing simpler.
-  auto group_num_weights = cudf::detail::make_counting_transform_iterator(
+  auto group_num_clusters = cudf::detail::make_counting_transform_iterator(
     0,
-    group_num_weights_func<decltype(group_offsets)>{group_offsets,
-                                                    tdv.centroids().offsets().begin<size_type>()});
+    group_num_clusters_func<decltype(group_offsets)>{group_offsets,
+                                                     tdigest_offsets.begin<size_type>()});
   thrust::replace_if(rmm::exec_policy(stream),
                      merged_min_col->mutable_view().begin<double>(),
                      merged_min_col->mutable_view().end<double>(),
-                     group_num_weights,
+                     group_num_clusters,
                      group_is_empty{},
                      0);
   thrust::replace_if(rmm::exec_policy(stream),
                      merged_max_col->mutable_view().begin<double>(),
                      merged_max_col->mutable_view().end<double>(),
-                     group_num_weights,
+                     group_num_clusters,
                      group_is_empty{},
                      0);
 
@@ -1166,14 +1219,13 @@ std::unique_ptr<column> merge_tdigests(tdigest_column_view const& tdv,
 
   // generate group keys for all centroids in the entire column
   rmm::device_uvector<size_type> group_keys(num_centroids, stream, temp_mr);
-  auto iter          = thrust::make_counting_iterator(0);
-  auto inner_offsets = tdv.centroids().offsets();
+  auto iter = thrust::make_counting_iterator(0);
   thrust::transform(rmm::exec_policy(stream),
                     iter,
                     iter + num_centroids,
                     group_keys.begin(),
                     group_key_func<decltype(group_labels)>{
-                      group_labels, inner_offsets.begin<size_type>(), inner_offsets.size()});
+                      group_labels, tdigest_offsets.begin<size_type>(), tdigest_offsets.size()});
   thrust::inclusive_scan_by_key(rmm::exec_policy(stream),
                                 group_keys.begin(),
                                 group_keys.begin() + num_centroids,
@@ -1182,20 +1234,24 @@ std::unique_ptr<column> merge_tdigests(tdigest_column_view const& tdv,
 
   auto const delta = max_centroids;
 
+  // TDigest merge takes the output of typed_group_tdigest as its input, which must not have
+  // any nulls.
+  auto const has_nulls = false;
+
   // generate cluster info
   auto [group_cluster_wl, group_cluster_offsets, total_clusters] = generate_group_cluster_info(
     delta,
     num_groups,
     nearest_value_centroid_weights<decltype(group_offsets)>{
-      cumulative_weights.begin(), group_offsets, inner_offsets.begin<size_type>()},
+      cumulative_weights.begin(), group_offsets, tdigest_offsets.begin<size_type>()},
     centroid_group_info<decltype(group_offsets)>{
-      cumulative_weights.begin(), group_offsets, inner_offsets.begin<size_type>()},
+      cumulative_weights.begin(), group_offsets, tdigest_offsets.begin<size_type>()},
     cumulative_centroid_weight<decltype(group_labels), decltype(group_offsets)>{
       cumulative_weights.begin(),
       group_labels,
       group_offsets,
-      {inner_offsets.begin<size_type>(), static_cast<size_t>(inner_offsets.size())}},
-    false,
+      {tdigest_offsets.begin<size_type>(), static_cast<size_t>(tdigest_offsets.size())}},
+    has_nulls,
     stream,
     mr);
 
@@ -1212,13 +1268,13 @@ std::unique_ptr<column> merge_tdigests(tdigest_column_view const& tdv,
       cumulative_weights.begin(),
       group_labels,
       group_offsets,
-      {inner_offsets.begin<size_type>(), static_cast<size_t>(inner_offsets.size())}},
+      {tdigest_offsets.begin<size_type>(), static_cast<size_t>(tdigest_offsets.size())}},
     std::move(merged_min_col),
     std::move(merged_max_col),
     group_cluster_wl,
     std::move(group_cluster_offsets),
     total_clusters,
-    false,
+    has_nulls,
     stream,
     mr);
 }
@@ -1283,7 +1339,7 @@ std::unique_ptr<column> group_tdigest(column_view const& col,
                                       rmm::cuda_stream_view stream,
                                       rmm::device_async_resource_ref mr)
 {
-  if (col.size() == 0) { return cudf::tdigest::detail::make_empty_tdigest_column(stream, mr); }
+  if (col.size() == 0) { return cudf::tdigest::detail::make_empty_tdigests_column(1, stream, mr); }
 
   auto const delta = max_centroids;
   return cudf::type_dispatcher(col.type(),
@@ -1309,7 +1365,15 @@ std::unique_ptr<column> group_merge_tdigest(column_view const& input,
   tdigest_column_view tdv(input);
 
   if (num_groups == 0 || input.size() == 0) {
-    return cudf::tdigest::detail::make_empty_tdigest_column(stream, mr);
+    return cudf::tdigest::detail::make_empty_tdigests_column(1, stream, mr);
+  }
+
+  if (tdv.means().size() == 0) {
+    // `group_merge_tdigest` takes the output of `typed_group_tdigest` as its input, which wipes
+    // out the means and weights for empty clusters. Thus, no mean here indicates that all clusters
+    // are empty in the input. Let's skip all complex computation in the below, but just return
+    // an empty tdigest per group.
+    return cudf::tdigest::detail::make_empty_tdigests_column(num_groups, stream, mr);
   }
 
   // bring group offsets back to the host
diff --git a/cpp/tests/groupby/tdigest_tests.cu b/cpp/tests/groupby/tdigest_tests.cu
index baa59026b07..4ae5d06b214 100644
--- a/cpp/tests/groupby/tdigest_tests.cu
+++ b/cpp/tests/groupby/tdigest_tests.cu
@@ -469,16 +469,16 @@ TEST_F(TDigestMergeTest, EmptyGroups)
   cudf::test::fixed_width_column_wrapper<int> keys{0, 0, 0, 0, 0, 0, 0};
   int const delta = 1000;
 
-  auto a = cudf::tdigest::detail::make_empty_tdigest_column(
-    cudf::get_default_stream(), cudf::get_current_device_resource_ref());
+  auto a = cudf::tdigest::detail::make_empty_tdigests_column(
+    1, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
   auto b = cudf::type_dispatcher(
     static_cast<cudf::column_view>(values_b).type(), tdigest_gen_grouped{}, keys, values_b, delta);
-  auto c = cudf::tdigest::detail::make_empty_tdigest_column(
-    cudf::get_default_stream(), cudf::get_current_device_resource_ref());
+  auto c = cudf::tdigest::detail::make_empty_tdigests_column(
+    1, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
   auto d = cudf::type_dispatcher(
     static_cast<cudf::column_view>(values_d).type(), tdigest_gen_grouped{}, keys, values_d, delta);
-  auto e = cudf::tdigest::detail::make_empty_tdigest_column(
-    cudf::get_default_stream(), cudf::get_current_device_resource_ref());
+  auto e = cudf::tdigest::detail::make_empty_tdigests_column(
+    1, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
 
   std::vector<cudf::column_view> cols;
   cols.push_back(*a);
@@ -507,3 +507,126 @@ TEST_F(TDigestMergeTest, EmptyGroups)
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*expected, *result.second[0].results[0]);
 }
+
+std::unique_ptr<cudf::table> do_agg(
+  cudf::column_view key,
+  cudf::column_view val,
+  std::function<std::unique_ptr<cudf::groupby_aggregation>()> make_agg)
+{
+  std::vector<cudf::column_view> keys;
+  keys.push_back(key);
+  cudf::table_view const key_table(keys);
+
+  cudf::groupby::groupby gb(key_table);
+  std::vector<cudf::groupby::aggregation_request> requests;
+  cudf::groupby::aggregation_request req;
+  req.values = val;
+  req.aggregations.push_back(make_agg());
+  requests.push_back(std::move(req));
+
+  auto result = gb.aggregate(std::move(requests));
+
+  std::vector<std::unique_ptr<cudf::column>> result_columns;
+  for (auto&& c : result.first->release()) {
+    result_columns.push_back(std::move(c));
+  }
+
+  EXPECT_EQ(result.second.size(), 1);
+  EXPECT_EQ(result.second[0].results.size(), 1);
+  result_columns.push_back(std::move(result.second[0].results[0]));
+
+  return std::make_unique<cudf::table>(std::move(result_columns));
+}
+
+TEST_F(TDigestMergeTest, AllValuesAreNull)
+{
+  // The input must be sorted by the key.
+  // See `aggregate_result_functor::operator()<aggregation::TDIGEST>` for details.
+  auto const keys      = cudf::test::fixed_width_column_wrapper<int32_t>{{0, 0, 1, 1, 2}};
+  auto const keys_view = cudf::column_view(keys);
+  auto val_elems  = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i; });
+  auto val_valids = cudf::detail::make_counting_transform_iterator(0, [](auto i) {
+    // All values are null
+    return false;
+  });
+  auto const vals = cudf::test::fixed_width_column_wrapper<int32_t>{
+    val_elems, val_elems + keys_view.size(), val_valids};
+
+  auto const delta = 1000;
+
+  // Compute tdigest. The result should have 3 empty clusters, one per group.
+  auto const compute_result = do_agg(keys_view, cudf::column_view(vals), [&delta]() {
+    return cudf::make_tdigest_aggregation<cudf::groupby_aggregation>(delta);
+  });
+
+  auto const expected_computed_keys = cudf::test::fixed_width_column_wrapper<int32_t>{{0, 1, 2}};
+  cudf::column_view const expected_computed_keys_view{expected_computed_keys};
+  auto const expected_computed_vals =
+    cudf::tdigest::detail::make_empty_tdigests_column(expected_computed_keys_view.size(),
+                                                      cudf::get_default_stream(),
+                                                      rmm::mr::get_current_device_resource());
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_computed_keys_view, compute_result->get_column(0).view());
+  // The computed values are nullable even though the input values are not.
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_computed_vals->view(),
+                                 compute_result->get_column(1).view());
+
+  // Merge tdigest. The result should have 3 empty clusters, one per group.
+  auto const merge_result =
+    do_agg(compute_result->get_column(0).view(), compute_result->get_column(1).view(), [&delta]() {
+      return cudf::make_merge_tdigest_aggregation<cudf::groupby_aggregation>(delta);
+    });
+
+  auto const expected_merged_keys = cudf::test::fixed_width_column_wrapper<int32_t>{{0, 1, 2}};
+  cudf::column_view const expected_merged_keys_view{expected_merged_keys};
+  auto const expected_merged_vals =
+    cudf::tdigest::detail::make_empty_tdigests_column(expected_merged_keys_view.size(),
+                                                      cudf::get_default_stream(),
+                                                      rmm::mr::get_current_device_resource());
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_merged_keys_view, merge_result->get_column(0).view());
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_merged_vals->view(), merge_result->get_column(1).view());
+}
+
+TEST_F(TDigestMergeTest, AllValuesInOneGroupIsNull)
+{
+  cudf::test::fixed_width_column_wrapper<int> keys{0, 1, 2, 2, 3};
+  cudf::test::fixed_width_column_wrapper<double> vals{{10.0, 20.0, {}, {}, 30.0},
+                                                      {true, true, false, false, true}};
+
+  auto const delta = 1000;
+
+  // Compute tdigest. The result should have 3 empty clusters, one per group.
+  auto const compute_result = do_agg(cudf::column_view(keys), cudf::column_view(vals), [&delta]() {
+    return cudf::make_tdigest_aggregation<cudf::groupby_aggregation>(delta);
+  });
+
+  auto const expected_keys = cudf::test::fixed_width_column_wrapper<int32_t>{{0, 1, 2, 3}};
+
+  cudf::test::fixed_width_column_wrapper<double> expected_means{10, 20, 30};
+  cudf::test::fixed_width_column_wrapper<double> expected_weights{1, 1, 1};
+  cudf::test::fixed_width_column_wrapper<cudf::size_type> expected_offsets{0, 1, 2, 2, 3};
+  cudf::test::fixed_width_column_wrapper<double> expected_mins{10.0, 20.0, 0.0, 30.0};
+  cudf::test::fixed_width_column_wrapper<double> expected_maxes{10.0, 20.0, 0.0, 30.0};
+  auto const expected_values =
+    cudf::tdigest::detail::make_tdigest_column(4,
+                                               std::make_unique<cudf::column>(expected_means),
+                                               std::make_unique<cudf::column>(expected_weights),
+                                               std::make_unique<cudf::column>(expected_offsets),
+                                               std::make_unique<cudf::column>(expected_mins),
+                                               std::make_unique<cudf::column>(expected_maxes),
+                                               cudf::get_default_stream(),
+                                               rmm::mr::get_current_device_resource());
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(cudf::column_view{expected_keys},
+                                 compute_result->get_column(0).view());
+  // The computed values are nullable even though the input values are not.
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_values->view(), compute_result->get_column(1).view());
+
+  // Merge tdigest. The result should have 3 empty clusters, one per group.
+  auto const merge_result =
+    do_agg(compute_result->get_column(0).view(), compute_result->get_column(1).view(), [&delta]() {
+      return cudf::make_merge_tdigest_aggregation<cudf::groupby_aggregation>(delta);
+    });
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(cudf::column_view{expected_keys},
+                                 merge_result->get_column(0).view());
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_values->view(), merge_result->get_column(1).view());
+}
diff --git a/cpp/tests/quantiles/percentile_approx_test.cpp b/cpp/tests/quantiles/percentile_approx_test.cpp
index 915717713df..37414eb3fba 100644
--- a/cpp/tests/quantiles/percentile_approx_test.cpp
+++ b/cpp/tests/quantiles/percentile_approx_test.cpp
@@ -371,8 +371,8 @@ struct PercentileApproxTest : public cudf::test::BaseFixture {};
 
 TEST_F(PercentileApproxTest, EmptyInput)
 {
-  auto empty_ = cudf::tdigest::detail::make_empty_tdigest_column(
-    cudf::get_default_stream(), cudf::get_current_device_resource_ref());
+  auto empty_ = cudf::tdigest::detail::make_empty_tdigests_column(
+    1, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
   cudf::test::fixed_width_column_wrapper<double> percentiles{0.0, 0.25, 0.3};
 
   std::vector<cudf::column_view> input;

From f9567a5c41af859b1674de837db41443879ea25c Mon Sep 17 00:00:00 2001
From: Yunsong Wang <yunsongw@nvidia.com>
Date: Tue, 1 Oct 2024 10:29:22 -0700
Subject: [PATCH 3/9] Improve aggregation device functors (#16884)

While working on #16619, I noticed that `aggregate_row` is always instantiated with the same template values, making the template parameters unnecessary. This PR simplifies the function by removing the template parameters and moving the device aggregators to their own header. This is a preparatory step for #16619, where additional overloads of the device aggregators will be introduced.

Authors:
  - Yunsong Wang (https://github.com/PointKernel)

Approvers:
  - Muhammad Haseeb (https://github.com/mhaseeb123)
  - David Wendt (https://github.com/davidwendt)

URL: https://github.com/rapidsai/cudf/pull/16884
---
 .../cudf/detail/aggregation/aggregation.cuh   | 472 +-----------------
 .../detail/aggregation/device_aggregators.cuh | 443 ++++++++++++++++
 cpp/src/aggregation/aggregation.cu            |   6 +-
 cpp/src/groupby/hash/groupby_kernels.cuh      |   4 +-
 .../sort/group_single_pass_reduction_util.cuh |   1 +
 5 files changed, 462 insertions(+), 464 deletions(-)
 create mode 100644 cpp/include/cudf/detail/aggregation/device_aggregators.cuh

diff --git a/cpp/include/cudf/detail/aggregation/aggregation.cuh b/cpp/include/cudf/detail/aggregation/aggregation.cuh
index ecf2f610697..de53e7586cd 100644
--- a/cpp/include/cudf/detail/aggregation/aggregation.cuh
+++ b/cpp/include/cudf/detail/aggregation/aggregation.cuh
@@ -18,11 +18,11 @@
 
 #include <cudf/aggregation.hpp>
 #include <cudf/column/column_device_view.cuh>
+#include <cudf/column/column_view.hpp>
 #include <cudf/detail/aggregation/aggregation.hpp>
 #include <cudf/detail/utilities/assert.cuh>
 #include <cudf/detail/utilities/device_atomics.cuh>
-#include <cudf/dictionary/dictionary_column_view.hpp>
-#include <cudf/table/table_device_view.cuh>
+#include <cudf/table/table_view.hpp>
 #include <cudf/utilities/traits.cuh>
 
 #include <rmm/cuda_stream_view.hpp>
@@ -30,8 +30,17 @@
 
 #include <thrust/fill.h>
 
+#include <type_traits>
+#include <vector>
+
 namespace cudf {
 namespace detail {
+template <typename T>
+constexpr bool is_product_supported()
+{
+  return is_numeric<T>();
+}
+
 /**
  * @brief Maps an `aggregation::Kind` value to it's corresponding binary
  * operator.
@@ -113,465 +122,6 @@ constexpr bool has_corresponding_operator()
   return !std::is_same_v<typename corresponding_operator<k>::type, void>;
 }
 
-template <typename Source,
-          aggregation::Kind k,
-          bool target_has_nulls,
-          bool source_has_nulls,
-          typename Enable = void>
-struct update_target_element {
-  __device__ void operator()(mutable_column_device_view target,
-                             size_type target_index,
-                             column_device_view source,
-                             size_type source_index) const noexcept
-  {
-    CUDF_UNREACHABLE("Invalid source type and aggregation combination.");
-  }
-};
-
-template <typename Source, bool target_has_nulls, bool source_has_nulls>
-struct update_target_element<
-  Source,
-  aggregation::MIN,
-  target_has_nulls,
-  source_has_nulls,
-  std::enable_if_t<is_fixed_width<Source>() && cudf::has_atomic_support<Source>() &&
-                   !is_fixed_point<Source>()>> {
-  __device__ void operator()(mutable_column_device_view target,
-                             size_type target_index,
-                             column_device_view source,
-                             size_type source_index) const noexcept
-  {
-    if (source_has_nulls and source.is_null(source_index)) { return; }
-
-    using Target = target_type_t<Source, aggregation::MIN>;
-    cudf::detail::atomic_min(&target.element<Target>(target_index),
-                             static_cast<Target>(source.element<Source>(source_index)));
-
-    if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); }
-  }
-};
-
-template <typename Source, bool target_has_nulls, bool source_has_nulls>
-struct update_target_element<
-  Source,
-  aggregation::MIN,
-  target_has_nulls,
-  source_has_nulls,
-  std::enable_if_t<is_fixed_point<Source>() &&
-                   cudf::has_atomic_support<device_storage_type_t<Source>>()>> {
-  __device__ void operator()(mutable_column_device_view target,
-                             size_type target_index,
-                             column_device_view source,
-                             size_type source_index) const noexcept
-  {
-    if (source_has_nulls and source.is_null(source_index)) { return; }
-
-    using Target       = target_type_t<Source, aggregation::MIN>;
-    using DeviceTarget = device_storage_type_t<Target>;
-    using DeviceSource = device_storage_type_t<Source>;
-
-    cudf::detail::atomic_min(&target.element<DeviceTarget>(target_index),
-                             static_cast<DeviceTarget>(source.element<DeviceSource>(source_index)));
-
-    if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); }
-  }
-};
-
-template <typename Source, bool target_has_nulls, bool source_has_nulls>
-struct update_target_element<
-  Source,
-  aggregation::MAX,
-  target_has_nulls,
-  source_has_nulls,
-  std::enable_if_t<is_fixed_width<Source>() && cudf::has_atomic_support<Source>() &&
-                   !is_fixed_point<Source>()>> {
-  __device__ void operator()(mutable_column_device_view target,
-                             size_type target_index,
-                             column_device_view source,
-                             size_type source_index) const noexcept
-  {
-    if (source_has_nulls and source.is_null(source_index)) { return; }
-
-    using Target = target_type_t<Source, aggregation::MAX>;
-    cudf::detail::atomic_max(&target.element<Target>(target_index),
-                             static_cast<Target>(source.element<Source>(source_index)));
-
-    if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); }
-  }
-};
-
-template <typename Source, bool target_has_nulls, bool source_has_nulls>
-struct update_target_element<
-  Source,
-  aggregation::MAX,
-  target_has_nulls,
-  source_has_nulls,
-  std::enable_if_t<is_fixed_point<Source>() &&
-                   cudf::has_atomic_support<device_storage_type_t<Source>>()>> {
-  __device__ void operator()(mutable_column_device_view target,
-                             size_type target_index,
-                             column_device_view source,
-                             size_type source_index) const noexcept
-  {
-    if (source_has_nulls and source.is_null(source_index)) { return; }
-
-    using Target       = target_type_t<Source, aggregation::MAX>;
-    using DeviceTarget = device_storage_type_t<Target>;
-    using DeviceSource = device_storage_type_t<Source>;
-
-    cudf::detail::atomic_max(&target.element<DeviceTarget>(target_index),
-                             static_cast<DeviceTarget>(source.element<DeviceSource>(source_index)));
-
-    if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); }
-  }
-};
-
-template <typename Source, bool target_has_nulls, bool source_has_nulls>
-struct update_target_element<
-  Source,
-  aggregation::SUM,
-  target_has_nulls,
-  source_has_nulls,
-  std::enable_if_t<cudf::is_fixed_width<Source>() && cudf::has_atomic_support<Source>() &&
-                   !cudf::is_fixed_point<Source>() && !cudf::is_timestamp<Source>()>> {
-  __device__ void operator()(mutable_column_device_view target,
-                             size_type target_index,
-                             column_device_view source,
-                             size_type source_index) const noexcept
-  {
-    if (source_has_nulls and source.is_null(source_index)) { return; }
-
-    using Target = target_type_t<Source, aggregation::SUM>;
-    cudf::detail::atomic_add(&target.element<Target>(target_index),
-                             static_cast<Target>(source.element<Source>(source_index)));
-
-    if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); }
-  }
-};
-
-template <typename Source, bool target_has_nulls, bool source_has_nulls>
-struct update_target_element<
-  Source,
-  aggregation::SUM,
-  target_has_nulls,
-  source_has_nulls,
-  std::enable_if_t<is_fixed_point<Source>() &&
-                   cudf::has_atomic_support<device_storage_type_t<Source>>()>> {
-  __device__ void operator()(mutable_column_device_view target,
-                             size_type target_index,
-                             column_device_view source,
-                             size_type source_index) const noexcept
-  {
-    if (source_has_nulls and source.is_null(source_index)) { return; }
-
-    using Target       = target_type_t<Source, aggregation::SUM>;
-    using DeviceTarget = device_storage_type_t<Target>;
-    using DeviceSource = device_storage_type_t<Source>;
-
-    cudf::detail::atomic_add(&target.element<DeviceTarget>(target_index),
-                             static_cast<DeviceTarget>(source.element<DeviceSource>(source_index)));
-
-    if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); }
-  }
-};
-
-/**
- * @brief Function object to update a single element in a target column using
- * the dictionary key addressed by the specific index.
- *
- * SFINAE is used to prevent recursion for dictionary type. Dictionary keys cannot be a
- * dictionary.
- *
- */
-template <bool target_has_nulls = true>
-struct update_target_from_dictionary {
-  template <typename Source,
-            aggregation::Kind k,
-            std::enable_if_t<!is_dictionary<Source>()>* = nullptr>
-  __device__ void operator()(mutable_column_device_view target,
-                             size_type target_index,
-                             column_device_view source,
-                             size_type source_index) const noexcept
-  {
-    update_target_element<Source, k, target_has_nulls, false>{}(
-      target, target_index, source, source_index);
-  }
-  template <typename Source,
-            aggregation::Kind k,
-            std::enable_if_t<is_dictionary<Source>()>* = nullptr>
-  __device__ void operator()(mutable_column_device_view target,
-                             size_type target_index,
-                             column_device_view source,
-                             size_type source_index) const noexcept
-  {
-  }
-};
-
-/**
- * @brief Specialization function for dictionary type and aggregations.
- *
- * The `source` column is a dictionary type. This functor de-references the
- * dictionary's keys child column and maps the input source index through
- * the dictionary's indices child column to pass to the `update_target_element`
- * in the above `update_target_from_dictionary` using the type-dispatcher to
- * resolve the keys column type.
- *
- * `update_target_element( target, target_index, source.keys(), source.indices()[source_index] )`
- *
- * @tparam target_has_nulls Indicates presence of null elements in `target`
- * @tparam source_has_nulls Indicates presence of null elements in `source`.
- */
-template <aggregation::Kind k, bool target_has_nulls, bool source_has_nulls>
-struct update_target_element<
-  dictionary32,
-  k,
-  target_has_nulls,
-  source_has_nulls,
-  std::enable_if_t<not(k == aggregation::ARGMIN or k == aggregation::ARGMAX or
-                       k == aggregation::COUNT_VALID or k == aggregation::COUNT_ALL)>> {
-  __device__ void operator()(mutable_column_device_view target,
-                             size_type target_index,
-                             column_device_view source,
-                             size_type source_index) const noexcept
-  {
-    if (source_has_nulls and source.is_null(source_index)) { return; }
-
-    dispatch_type_and_aggregation(
-      source.child(cudf::dictionary_column_view::keys_column_index).type(),
-      k,
-      update_target_from_dictionary<target_has_nulls>{},
-      target,
-      target_index,
-      source.child(cudf::dictionary_column_view::keys_column_index),
-      static_cast<cudf::size_type>(source.element<dictionary32>(source_index)));
-  }
-};
-
-template <typename T>
-constexpr bool is_product_supported()
-{
-  return is_numeric<T>();
-}
-
-template <typename Source, bool target_has_nulls, bool source_has_nulls>
-struct update_target_element<Source,
-                             aggregation::SUM_OF_SQUARES,
-                             target_has_nulls,
-                             source_has_nulls,
-                             std::enable_if_t<is_product_supported<Source>()>> {
-  __device__ void operator()(mutable_column_device_view target,
-                             size_type target_index,
-                             column_device_view source,
-                             size_type source_index) const noexcept
-  {
-    if (source_has_nulls and source.is_null(source_index)) { return; }
-
-    using Target = target_type_t<Source, aggregation::SUM_OF_SQUARES>;
-    auto value   = static_cast<Target>(source.element<Source>(source_index));
-    cudf::detail::atomic_add(&target.element<Target>(target_index), value * value);
-    if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); }
-  }
-};
-
-template <typename Source, bool target_has_nulls, bool source_has_nulls>
-struct update_target_element<Source,
-                             aggregation::PRODUCT,
-                             target_has_nulls,
-                             source_has_nulls,
-                             std::enable_if_t<is_product_supported<Source>()>> {
-  __device__ void operator()(mutable_column_device_view target,
-                             size_type target_index,
-                             column_device_view source,
-                             size_type source_index) const noexcept
-  {
-    if (source_has_nulls and source.is_null(source_index)) { return; }
-
-    using Target = target_type_t<Source, aggregation::PRODUCT>;
-    cudf::detail::atomic_mul(&target.element<Target>(target_index),
-                             static_cast<Target>(source.element<Source>(source_index)));
-    if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); }
-  }
-};
-
-template <typename Source, bool target_has_nulls, bool source_has_nulls>
-struct update_target_element<
-  Source,
-  aggregation::COUNT_VALID,
-  target_has_nulls,
-  source_has_nulls,
-  std::enable_if_t<is_valid_aggregation<Source, aggregation::COUNT_VALID>()>> {
-  __device__ void operator()(mutable_column_device_view target,
-                             size_type target_index,
-                             column_device_view source,
-                             size_type source_index) const noexcept
-  {
-    if (source_has_nulls and source.is_null(source_index)) { return; }
-
-    using Target = target_type_t<Source, aggregation::COUNT_VALID>;
-    cudf::detail::atomic_add(&target.element<Target>(target_index), Target{1});
-
-    // It is assumed the output for COUNT_VALID is initialized to be all valid
-  }
-};
-
-template <typename Source, bool target_has_nulls, bool source_has_nulls>
-struct update_target_element<
-  Source,
-  aggregation::COUNT_ALL,
-  target_has_nulls,
-  source_has_nulls,
-  std::enable_if_t<is_valid_aggregation<Source, aggregation::COUNT_ALL>()>> {
-  __device__ void operator()(mutable_column_device_view target,
-                             size_type target_index,
-                             column_device_view source,
-                             size_type source_index) const noexcept
-  {
-    using Target = target_type_t<Source, aggregation::COUNT_ALL>;
-    cudf::detail::atomic_add(&target.element<Target>(target_index), Target{1});
-
-    // It is assumed the output for COUNT_ALL is initialized to be all valid
-  }
-};
-
-template <typename Source, bool target_has_nulls, bool source_has_nulls>
-struct update_target_element<
-  Source,
-  aggregation::ARGMAX,
-  target_has_nulls,
-  source_has_nulls,
-  std::enable_if_t<is_valid_aggregation<Source, aggregation::ARGMAX>() and
-                   cudf::is_relationally_comparable<Source, Source>()>> {
-  __device__ void operator()(mutable_column_device_view target,
-                             size_type target_index,
-                             column_device_view source,
-                             size_type source_index) const noexcept
-  {
-    if (source_has_nulls and source.is_null(source_index)) { return; }
-
-    using Target = target_type_t<Source, aggregation::ARGMAX>;
-    auto old     = cudf::detail::atomic_cas(
-      &target.element<Target>(target_index), ARGMAX_SENTINEL, source_index);
-    if (old != ARGMAX_SENTINEL) {
-      while (source.element<Source>(source_index) > source.element<Source>(old)) {
-        old = cudf::detail::atomic_cas(&target.element<Target>(target_index), old, source_index);
-      }
-    }
-
-    if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); }
-  }
-};
-
-template <typename Source, bool target_has_nulls, bool source_has_nulls>
-struct update_target_element<
-  Source,
-  aggregation::ARGMIN,
-  target_has_nulls,
-  source_has_nulls,
-  std::enable_if_t<is_valid_aggregation<Source, aggregation::ARGMIN>() and
-                   cudf::is_relationally_comparable<Source, Source>()>> {
-  __device__ void operator()(mutable_column_device_view target,
-                             size_type target_index,
-                             column_device_view source,
-                             size_type source_index) const noexcept
-  {
-    if (source_has_nulls and source.is_null(source_index)) { return; }
-
-    using Target = target_type_t<Source, aggregation::ARGMIN>;
-    auto old     = cudf::detail::atomic_cas(
-      &target.element<Target>(target_index), ARGMIN_SENTINEL, source_index);
-    if (old != ARGMIN_SENTINEL) {
-      while (source.element<Source>(source_index) < source.element<Source>(old)) {
-        old = cudf::detail::atomic_cas(&target.element<Target>(target_index), old, source_index);
-      }
-    }
-
-    if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); }
-  }
-};
-
-/**
- * @brief Function object to update a single element in a target column by
- * performing an aggregation operation with a single element from a source
- * column.
- *
- * @tparam target_has_nulls Indicates presence of null elements in `target`
- * @tparam source_has_nulls Indicates presence of null elements in `source`.
- */
-template <bool target_has_nulls = true, bool source_has_nulls = true>
-struct elementwise_aggregator {
-  template <typename Source, aggregation::Kind k>
-  __device__ void operator()(mutable_column_device_view target,
-                             size_type target_index,
-                             column_device_view source,
-                             size_type source_index) const noexcept
-  {
-    update_target_element<Source, k, target_has_nulls, source_has_nulls>{}(
-      target, target_index, source, source_index);
-  }
-};
-
-/**
- * @brief Updates a row in `target` by performing elementwise aggregation
- * operations with a row in `source`.
- *
- * For the row in `target` specified by `target_index`, each element at `i` is
- * updated by:
- * ```c++
- * target_row[i] = aggs[i](target_row[i], source_row[i])
- * ```
- *
- * This function only supports aggregations that can be done in a "single pass",
- * i.e., given an initial value `R`, the aggregation `op` can be computed on a series
- * of elements `e[i] for i in [0,n)` by computing `R = op(e[i],R)` for any order
- * of the values of `i`.
- *
- * The initial value and validity of `R` depends on the aggregation:
- * SUM: 0 and NULL
- * MIN: Max value of type and NULL
- * MAX: Min value of type and NULL
- * COUNT_VALID: 0 and VALID
- * COUNT_ALL:   0 and VALID
- * ARGMAX: `ARGMAX_SENTINEL` and NULL
- * ARGMIN: `ARGMIN_SENTINEL` and NULL
- *
- * It is required that the elements of `target` be initialized with the corresponding
- * initial values and validity specified above.
- *
- * Handling of null elements in both `source` and `target` depends on the aggregation:
- * SUM, MIN, MAX, ARGMIN, ARGMAX:
- *  - `source`: Skipped
- *  - `target`: Updated from null to valid upon first successful aggregation
- * COUNT_VALID, COUNT_ALL:
- *  - `source`: Skipped
- *  - `target`: Cannot be null
- *
- * @param target Table containing the row to update
- * @param target_index Index of the row to update in `target`
- * @param source Table containing the row used to update the row in `target`.
- * The invariant `source.num_columns() >= target.num_columns()` must hold.
- * @param source_index Index of the row to use in `source`
- * @param aggs Array of aggregations to perform between elements of the `target`
- * and `source` rows. Must contain at least `target.num_columns()` valid
- * `aggregation::Kind` values.
- */
-template <bool target_has_nulls = true, bool source_has_nulls = true>
-__device__ inline void aggregate_row(mutable_table_device_view target,
-                                     size_type target_index,
-                                     table_device_view source,
-                                     size_type source_index,
-                                     aggregation::Kind const* aggs)
-{
-  for (auto i = 0; i < target.num_columns(); ++i) {
-    dispatch_type_and_aggregation(source.column(i).type(),
-                                  aggs[i],
-                                  elementwise_aggregator<target_has_nulls, source_has_nulls>{},
-                                  target.column(i),
-                                  target_index,
-                                  source.column(i),
-                                  source_index);
-  }
-}
-
 /**
  * @brief Dispatched functor to initialize a column with the identity of an
  * aggregation operation.
diff --git a/cpp/include/cudf/detail/aggregation/device_aggregators.cuh b/cpp/include/cudf/detail/aggregation/device_aggregators.cuh
new file mode 100644
index 00000000000..10be5e1d36f
--- /dev/null
+++ b/cpp/include/cudf/detail/aggregation/device_aggregators.cuh
@@ -0,0 +1,443 @@
+/*
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cudf/aggregation.hpp>
+#include <cudf/column/column_device_view.cuh>
+#include <cudf/detail/aggregation/aggregation.cuh>
+#include <cudf/detail/aggregation/aggregation.hpp>
+#include <cudf/detail/utilities/assert.cuh>
+#include <cudf/detail/utilities/device_atomics.cuh>
+#include <cudf/dictionary/dictionary_column_view.hpp>
+#include <cudf/table/table_device_view.cuh>
+#include <cudf/utilities/traits.cuh>
+
+#include <cuda/std/type_traits>
+
+namespace cudf::detail {
+template <typename Source, aggregation::Kind k, typename Enable = void>
+struct update_target_element {
+  __device__ void operator()(mutable_column_device_view target,
+                             size_type target_index,
+                             column_device_view source,
+                             size_type source_index) const noexcept
+  {
+    CUDF_UNREACHABLE("Invalid source type and aggregation combination.");
+  }
+};
+
+template <typename Source>
+struct update_target_element<
+  Source,
+  aggregation::MIN,
+  cuda::std::enable_if_t<is_fixed_width<Source>() && cudf::has_atomic_support<Source>() &&
+                         !is_fixed_point<Source>()>> {
+  __device__ void operator()(mutable_column_device_view target,
+                             size_type target_index,
+                             column_device_view source,
+                             size_type source_index) const noexcept
+  {
+    if (source.is_null(source_index)) { return; }
+
+    using Target = target_type_t<Source, aggregation::MIN>;
+    cudf::detail::atomic_min(&target.element<Target>(target_index),
+                             static_cast<Target>(source.element<Source>(source_index)));
+
+    if (target.is_null(target_index)) { target.set_valid(target_index); }
+  }
+};
+
+template <typename Source>
+struct update_target_element<
+  Source,
+  aggregation::MIN,
+  cuda::std::enable_if_t<is_fixed_point<Source>() &&
+                         cudf::has_atomic_support<device_storage_type_t<Source>>()>> {
+  __device__ void operator()(mutable_column_device_view target,
+                             size_type target_index,
+                             column_device_view source,
+                             size_type source_index) const noexcept
+  {
+    if (source.is_null(source_index)) { return; }
+
+    using Target       = target_type_t<Source, aggregation::MIN>;
+    using DeviceTarget = device_storage_type_t<Target>;
+    using DeviceSource = device_storage_type_t<Source>;
+
+    cudf::detail::atomic_min(&target.element<DeviceTarget>(target_index),
+                             static_cast<DeviceTarget>(source.element<DeviceSource>(source_index)));
+
+    if (target.is_null(target_index)) { target.set_valid(target_index); }
+  }
+};
+
+template <typename Source>
+struct update_target_element<
+  Source,
+  aggregation::MAX,
+  cuda::std::enable_if_t<is_fixed_width<Source>() && cudf::has_atomic_support<Source>() &&
+                         !is_fixed_point<Source>()>> {
+  __device__ void operator()(mutable_column_device_view target,
+                             size_type target_index,
+                             column_device_view source,
+                             size_type source_index) const noexcept
+  {
+    if (source.is_null(source_index)) { return; }
+
+    using Target = target_type_t<Source, aggregation::MAX>;
+    cudf::detail::atomic_max(&target.element<Target>(target_index),
+                             static_cast<Target>(source.element<Source>(source_index)));
+
+    if (target.is_null(target_index)) { target.set_valid(target_index); }
+  }
+};
+
+template <typename Source>
+struct update_target_element<
+  Source,
+  aggregation::MAX,
+  cuda::std::enable_if_t<is_fixed_point<Source>() &&
+                         cudf::has_atomic_support<device_storage_type_t<Source>>()>> {
+  __device__ void operator()(mutable_column_device_view target,
+                             size_type target_index,
+                             column_device_view source,
+                             size_type source_index) const noexcept
+  {
+    if (source.is_null(source_index)) { return; }
+
+    using Target       = target_type_t<Source, aggregation::MAX>;
+    using DeviceTarget = device_storage_type_t<Target>;
+    using DeviceSource = device_storage_type_t<Source>;
+
+    cudf::detail::atomic_max(&target.element<DeviceTarget>(target_index),
+                             static_cast<DeviceTarget>(source.element<DeviceSource>(source_index)));
+
+    if (target.is_null(target_index)) { target.set_valid(target_index); }
+  }
+};
+
+template <typename Source>
+struct update_target_element<
+  Source,
+  aggregation::SUM,
+  cuda::std::enable_if_t<cudf::is_fixed_width<Source>() && cudf::has_atomic_support<Source>() &&
+                         !cudf::is_fixed_point<Source>() && !cudf::is_timestamp<Source>()>> {
+  __device__ void operator()(mutable_column_device_view target,
+                             size_type target_index,
+                             column_device_view source,
+                             size_type source_index) const noexcept
+  {
+    if (source.is_null(source_index)) { return; }
+
+    using Target = target_type_t<Source, aggregation::SUM>;
+    cudf::detail::atomic_add(&target.element<Target>(target_index),
+                             static_cast<Target>(source.element<Source>(source_index)));
+
+    if (target.is_null(target_index)) { target.set_valid(target_index); }
+  }
+};
+
+template <typename Source>
+struct update_target_element<
+  Source,
+  aggregation::SUM,
+  cuda::std::enable_if_t<is_fixed_point<Source>() &&
+                         cudf::has_atomic_support<device_storage_type_t<Source>>()>> {
+  __device__ void operator()(mutable_column_device_view target,
+                             size_type target_index,
+                             column_device_view source,
+                             size_type source_index) const noexcept
+  {
+    if (source.is_null(source_index)) { return; }
+
+    using Target       = target_type_t<Source, aggregation::SUM>;
+    using DeviceTarget = device_storage_type_t<Target>;
+    using DeviceSource = device_storage_type_t<Source>;
+
+    cudf::detail::atomic_add(&target.element<DeviceTarget>(target_index),
+                             static_cast<DeviceTarget>(source.element<DeviceSource>(source_index)));
+
+    if (target.is_null(target_index)) { target.set_valid(target_index); }
+  }
+};
+
+/**
+ * @brief Function object to update a single element in a target column using
+ * the dictionary key addressed by the specific index.
+ *
+ * SFINAE is used to prevent recursion for dictionary type. Dictionary keys cannot be a
+ * dictionary.
+ *
+ */
+struct update_target_from_dictionary {
+  template <typename Source,
+            aggregation::Kind k,
+            cuda::std::enable_if_t<!is_dictionary<Source>()>* = nullptr>
+  __device__ void operator()(mutable_column_device_view target,
+                             size_type target_index,
+                             column_device_view source,
+                             size_type source_index) const noexcept
+  {
+    update_target_element<Source, k>{}(target, target_index, source, source_index);
+  }
+  template <typename Source,
+            aggregation::Kind k,
+            cuda::std::enable_if_t<is_dictionary<Source>()>* = nullptr>
+  __device__ void operator()(mutable_column_device_view target,
+                             size_type target_index,
+                             column_device_view source,
+                             size_type source_index) const noexcept
+  {
+  }
+};
+
+/**
+ * @brief Specialization function for dictionary type and aggregations.
+ *
+ * The `source` column is a dictionary type. This functor de-references the
+ * dictionary's keys child column and maps the input source index through
+ * the dictionary's indices child column to pass to the `update_target_element`
+ * in the above `update_target_from_dictionary` using the type-dispatcher to
+ * resolve the keys column type.
+ *
+ * `update_target_element( target, target_index, source.keys(), source.indices()[source_index] )`
+ */
+template <aggregation::Kind k>
+struct update_target_element<
+  dictionary32,
+  k,
+  cuda::std::enable_if_t<not(k == aggregation::ARGMIN or k == aggregation::ARGMAX or
+                             k == aggregation::COUNT_VALID or k == aggregation::COUNT_ALL)>> {
+  __device__ void operator()(mutable_column_device_view target,
+                             size_type target_index,
+                             column_device_view source,
+                             size_type source_index) const noexcept
+  {
+    if (source.is_null(source_index)) { return; }
+
+    dispatch_type_and_aggregation(
+      source.child(cudf::dictionary_column_view::keys_column_index).type(),
+      k,
+      update_target_from_dictionary{},
+      target,
+      target_index,
+      source.child(cudf::dictionary_column_view::keys_column_index),
+      static_cast<cudf::size_type>(source.element<dictionary32>(source_index)));
+  }
+};
+
+template <typename Source>
+struct update_target_element<Source,
+                             aggregation::SUM_OF_SQUARES,
+                             cuda::std::enable_if_t<is_product_supported<Source>()>> {
+  __device__ void operator()(mutable_column_device_view target,
+                             size_type target_index,
+                             column_device_view source,
+                             size_type source_index) const noexcept
+  {
+    if (source.is_null(source_index)) { return; }
+
+    using Target = target_type_t<Source, aggregation::SUM_OF_SQUARES>;
+    auto value   = static_cast<Target>(source.element<Source>(source_index));
+    cudf::detail::atomic_add(&target.element<Target>(target_index), value * value);
+    if (target.is_null(target_index)) { target.set_valid(target_index); }
+  }
+};
+
+template <typename Source>
+struct update_target_element<Source,
+                             aggregation::PRODUCT,
+                             cuda::std::enable_if_t<is_product_supported<Source>()>> {
+  __device__ void operator()(mutable_column_device_view target,
+                             size_type target_index,
+                             column_device_view source,
+                             size_type source_index) const noexcept
+  {
+    if (source.is_null(source_index)) { return; }
+
+    using Target = target_type_t<Source, aggregation::PRODUCT>;
+    cudf::detail::atomic_mul(&target.element<Target>(target_index),
+                             static_cast<Target>(source.element<Source>(source_index)));
+    if (target.is_null(target_index)) { target.set_valid(target_index); }
+  }
+};
+
+template <typename Source>
+struct update_target_element<
+  Source,
+  aggregation::COUNT_VALID,
+  cuda::std::enable_if_t<is_valid_aggregation<Source, aggregation::COUNT_VALID>()>> {
+  __device__ void operator()(mutable_column_device_view target,
+                             size_type target_index,
+                             column_device_view source,
+                             size_type source_index) const noexcept
+  {
+    if (source.is_null(source_index)) { return; }
+
+    using Target = target_type_t<Source, aggregation::COUNT_VALID>;
+    cudf::detail::atomic_add(&target.element<Target>(target_index), Target{1});
+
+    // It is assumed the output for COUNT_VALID is initialized to be all valid
+  }
+};
+
+template <typename Source>
+struct update_target_element<
+  Source,
+  aggregation::COUNT_ALL,
+  cuda::std::enable_if_t<is_valid_aggregation<Source, aggregation::COUNT_ALL>()>> {
+  __device__ void operator()(mutable_column_device_view target,
+                             size_type target_index,
+                             column_device_view source,
+                             size_type source_index) const noexcept
+  {
+    using Target = target_type_t<Source, aggregation::COUNT_ALL>;
+    cudf::detail::atomic_add(&target.element<Target>(target_index), Target{1});
+
+    // It is assumed the output for COUNT_ALL is initialized to be all valid
+  }
+};
+
+template <typename Source>
+struct update_target_element<
+  Source,
+  aggregation::ARGMAX,
+  cuda::std::enable_if_t<is_valid_aggregation<Source, aggregation::ARGMAX>() and
+                         cudf::is_relationally_comparable<Source, Source>()>> {
+  __device__ void operator()(mutable_column_device_view target,
+                             size_type target_index,
+                             column_device_view source,
+                             size_type source_index) const noexcept
+  {
+    if (source.is_null(source_index)) { return; }
+
+    using Target = target_type_t<Source, aggregation::ARGMAX>;
+    auto old     = cudf::detail::atomic_cas(
+      &target.element<Target>(target_index), ARGMAX_SENTINEL, source_index);
+    if (old != ARGMAX_SENTINEL) {
+      while (source.element<Source>(source_index) > source.element<Source>(old)) {
+        old = cudf::detail::atomic_cas(&target.element<Target>(target_index), old, source_index);
+      }
+    }
+
+    if (target.is_null(target_index)) { target.set_valid(target_index); }
+  }
+};
+
+template <typename Source>
+struct update_target_element<
+  Source,
+  aggregation::ARGMIN,
+  cuda::std::enable_if_t<is_valid_aggregation<Source, aggregation::ARGMIN>() and
+                         cudf::is_relationally_comparable<Source, Source>()>> {
+  __device__ void operator()(mutable_column_device_view target,
+                             size_type target_index,
+                             column_device_view source,
+                             size_type source_index) const noexcept
+  {
+    if (source.is_null(source_index)) { return; }
+
+    using Target = target_type_t<Source, aggregation::ARGMIN>;
+    auto old     = cudf::detail::atomic_cas(
+      &target.element<Target>(target_index), ARGMIN_SENTINEL, source_index);
+    if (old != ARGMIN_SENTINEL) {
+      while (source.element<Source>(source_index) < source.element<Source>(old)) {
+        old = cudf::detail::atomic_cas(&target.element<Target>(target_index), old, source_index);
+      }
+    }
+
+    if (target.is_null(target_index)) { target.set_valid(target_index); }
+  }
+};
+
+/**
+ * @brief Function object to update a single element in a target column by
+ * performing an aggregation operation with a single element from a source
+ * column.
+ */
+struct elementwise_aggregator {
+  template <typename Source, aggregation::Kind k>
+  __device__ void operator()(mutable_column_device_view target,
+                             size_type target_index,
+                             column_device_view source,
+                             size_type source_index) const noexcept
+  {
+    update_target_element<Source, k>{}(target, target_index, source, source_index);
+  }
+};
+
+/**
+ * @brief Updates a row in `target` by performing elementwise aggregation
+ * operations with a row in `source`.
+ *
+ * For the row in `target` specified by `target_index`, each element at `i` is
+ * updated by:
+ * ```c++
+ * target_row[i] = aggs[i](target_row[i], source_row[i])
+ * ```
+ *
+ * This function only supports aggregations that can be done in a "single pass",
+ * i.e., given an initial value `R`, the aggregation `op` can be computed on a series
+ * of elements `e[i] for i in [0,n)` by computing `R = op(e[i],R)` for any order
+ * of the values of `i`.
+ *
+ * The initial value and validity of `R` depends on the aggregation:
+ * SUM: 0 and NULL
+ * MIN: Max value of type and NULL
+ * MAX: Min value of type and NULL
+ * COUNT_VALID: 0 and VALID
+ * COUNT_ALL:   0 and VALID
+ * ARGMAX: `ARGMAX_SENTINEL` and NULL
+ * ARGMIN: `ARGMIN_SENTINEL` and NULL
+ *
+ * It is required that the elements of `target` be initialized with the corresponding
+ * initial values and validity specified above.
+ *
+ * Handling of null elements in both `source` and `target` depends on the aggregation:
+ * SUM, MIN, MAX, ARGMIN, ARGMAX:
+ *  - `source`: Skipped
+ *  - `target`: Updated from null to valid upon first successful aggregation
+ * COUNT_VALID, COUNT_ALL:
+ *  - `source`: Skipped
+ *  - `target`: Cannot be null
+ *
+ * @param target Table containing the row to update
+ * @param target_index Index of the row to update in `target`
+ * @param source Table containing the row used to update the row in `target`.
+ * The invariant `source.num_columns() >= target.num_columns()` must hold.
+ * @param source_index Index of the row to use in `source`
+ * @param aggs Array of aggregations to perform between elements of the `target`
+ * and `source` rows. Must contain at least `target.num_columns()` valid
+ * `aggregation::Kind` values.
+ */
+__device__ inline void aggregate_row(mutable_table_device_view target,
+                                     size_type target_index,
+                                     table_device_view source,
+                                     size_type source_index,
+                                     aggregation::Kind const* aggs)
+{
+  for (auto i = 0; i < target.num_columns(); ++i) {
+    dispatch_type_and_aggregation(source.column(i).type(),
+                                  aggs[i],
+                                  elementwise_aggregator{},
+                                  target.column(i),
+                                  target_index,
+                                  source.column(i),
+                                  source_index);
+  }
+}
+}  // namespace cudf::detail
diff --git a/cpp/src/aggregation/aggregation.cu b/cpp/src/aggregation/aggregation.cu
index 02998b84ffd..d915c85bf85 100644
--- a/cpp/src/aggregation/aggregation.cu
+++ b/cpp/src/aggregation/aggregation.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,9 +15,13 @@
  */
 
 #include <cudf/detail/aggregation/aggregation.cuh>
+#include <cudf/detail/aggregation/aggregation.hpp>
+#include <cudf/table/table_view.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 
+#include <vector>
+
 namespace cudf {
 namespace detail {
 void initialize_with_identity(mutable_table_view& table,
diff --git a/cpp/src/groupby/hash/groupby_kernels.cuh b/cpp/src/groupby/hash/groupby_kernels.cuh
index 9abfe22950a..188d0cff3f1 100644
--- a/cpp/src/groupby/hash/groupby_kernels.cuh
+++ b/cpp/src/groupby/hash/groupby_kernels.cuh
@@ -18,8 +18,8 @@
 
 #include "multi_pass_kernels.cuh"
 
-#include <cudf/detail/aggregation/aggregation.cuh>
 #include <cudf/detail/aggregation/aggregation.hpp>
+#include <cudf/detail/aggregation/device_aggregators.cuh>
 #include <cudf/groupby.hpp>
 #include <cudf/utilities/bit.hpp>
 
@@ -100,7 +100,7 @@ struct compute_single_pass_aggs_fn {
     if (not skip_rows_with_nulls or cudf::bit_is_set(row_bitmask, i)) {
       auto const result = set.insert_and_find(i);
 
-      cudf::detail::aggregate_row<true, true>(output_values, *result.first, input_values, i, aggs);
+      cudf::detail::aggregate_row(output_values, *result.first, input_values, i, aggs);
     }
   }
 };
diff --git a/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh b/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh
index 2358f47bbbb..f9adfc6060e 100644
--- a/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh
+++ b/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh
@@ -25,6 +25,7 @@
 #include <cudf/detail/iterator.cuh>
 #include <cudf/detail/utilities/element_argminmax.cuh>
 #include <cudf/detail/valid_if.cuh>
+#include <cudf/dictionary/dictionary_column_view.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/span.hpp>

From e46437c39e53a1f952e060d9159477617347b130 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Tue, 1 Oct 2024 12:13:47 -1000
Subject: [PATCH 4/9] Add remaining string.char_types APIs to pylibcudf
 (#16788)

Contributes to https://github.com/rapidsai/cudf/issues/15162

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)
  - Matthew Murray (https://github.com/Matt711)

Approvers:
  - Lawrence Mitchell (https://github.com/wence-)
  - Matthew Murray (https://github.com/Matt711)
  - Nghia Truong (https://github.com/ttnghia)

URL: https://github.com/rapidsai/cudf/pull/16788
---
 .../cudf/strings/char_types/char_types.hpp    |   5 +-
 python/cudf/cudf/_lib/strings/char_types.pyx  | 178 ++++++------------
 .../pylibcudf/libcudf/strings/char_types.pxd  |   3 -
 .../pylibcudf/strings/char_types.pxd          |  16 ++
 .../pylibcudf/strings/char_types.pyx          |  89 +++++++++
 .../pylibcudf/tests/test_string_char_types.py |  29 +++
 6 files changed, 195 insertions(+), 125 deletions(-)
 create mode 100644 python/pylibcudf/pylibcudf/tests/test_string_char_types.py

diff --git a/cpp/include/cudf/strings/char_types/char_types.hpp b/cpp/include/cudf/strings/char_types/char_types.hpp
index 3ebe5cb53e9..f229facca08 100644
--- a/cpp/include/cudf/strings/char_types/char_types.hpp
+++ b/cpp/include/cudf/strings/char_types/char_types.hpp
@@ -30,7 +30,7 @@ namespace strings {
  */
 
 /**
- * @brief Returns a boolean column identifying strings entries in which all
+ * @brief Returns a boolean column identifying string entries where all
  * characters are of the type specified.
  *
  * The output row entry will be set to false if the corresponding string element
@@ -105,7 +105,8 @@ std::unique_ptr<column> all_characters_of_type(
  *        `types_to_remove` will be filtered.
  * @param mr Device memory resource used to allocate the returned column's device memory
  * @param stream CUDA stream used for device memory operations and kernel launches
- * @return New column of boolean results for each string
+ * @return New strings column with the characters of specified types filtered out and replaced by
+ * the specified replacement string
  */
 std::unique_ptr<column> filter_characters_of_type(
   strings_column_view const& input,
diff --git a/python/cudf/cudf/_lib/strings/char_types.pyx b/python/cudf/cudf/_lib/strings/char_types.pyx
index 376a6f8af97..a57ce29eb45 100644
--- a/python/cudf/cudf/_lib/strings/char_types.pyx
+++ b/python/cudf/cudf/_lib/strings/char_types.pyx
@@ -1,23 +1,12 @@
 # Copyright (c) 2021-2024, NVIDIA CORPORATION.
 
-
 from libcpp cimport bool
-from libcpp.memory cimport unique_ptr
-from libcpp.utility cimport move
 
 from cudf.core.buffer import acquire_spill_lock
 
-from pylibcudf.libcudf.column.column cimport column
-from pylibcudf.libcudf.column.column_view cimport column_view
-from pylibcudf.libcudf.scalar.scalar cimport string_scalar
-from pylibcudf.libcudf.strings.char_types cimport (
-    all_characters_of_type as cpp_all_characters_of_type,
-    filter_characters_of_type as cpp_filter_characters_of_type,
-    string_character_types,
-)
-
 from cudf._lib.column cimport Column
-from cudf._lib.scalar cimport DeviceScalar
+
+from pylibcudf.strings import char_types
 
 
 @acquire_spill_lock()
@@ -25,26 +14,15 @@ def filter_alphanum(Column source_strings, object py_repl, bool keep=True):
     """
     Returns a Column of strings keeping only alphanumeric character types.
     """
-
-    cdef DeviceScalar repl = py_repl.device_value
-
-    cdef unique_ptr[column] c_result
-    cdef column_view source_view = source_strings.view()
-    cdef const string_scalar* scalar_repl = <const string_scalar*>(
-        repl.get_raw_ptr()
+    plc_column = char_types.filter_characters_of_type(
+        source_strings.to_pylibcudf(mode="read"),
+        char_types.StringCharacterTypes.ALL_TYPES if keep
+        else char_types.StringCharacterTypes.ALPHANUM,
+        py_repl.device_value.c_value,
+        char_types.StringCharacterTypes.ALPHANUM if keep
+        else char_types.StringCharacterTypes.ALL_TYPES
     )
-
-    with nogil:
-        c_result = move(cpp_filter_characters_of_type(
-            source_view,
-            string_character_types.ALL_TYPES if keep
-            else string_character_types.ALPHANUM,
-            scalar_repl[0],
-            string_character_types.ALPHANUM if keep
-            else string_character_types.ALL_TYPES
-        ))
-
-    return Column.from_unique_ptr(move(c_result))
+    return Column.from_pylibcudf(plc_column)
 
 
 @acquire_spill_lock()
@@ -54,17 +32,12 @@ def is_decimal(Column source_strings):
     that contain only decimal characters -- those that can be used
     to extract base10 numbers.
     """
-    cdef unique_ptr[column] c_result
-    cdef column_view source_view = source_strings.view()
-
-    with nogil:
-        c_result = move(cpp_all_characters_of_type(
-            source_view,
-            string_character_types.DECIMAL,
-            string_character_types.ALL_TYPES
-        ))
-
-    return Column.from_unique_ptr(move(c_result))
+    plc_column = char_types.all_characters_of_type(
+        source_strings.to_pylibcudf(mode="read"),
+        char_types.StringCharacterTypes.DECIMAL,
+        char_types.StringCharacterTypes.ALL_TYPES
+    )
+    return Column.from_pylibcudf(plc_column)
 
 
 @acquire_spill_lock()
@@ -75,17 +48,12 @@ def is_alnum(Column source_strings):
 
     Equivalent to: is_alpha() or is_digit() or is_numeric() or is_decimal()
     """
-    cdef unique_ptr[column] c_result
-    cdef column_view source_view = source_strings.view()
-
-    with nogil:
-        c_result = move(cpp_all_characters_of_type(
-            source_view,
-            string_character_types.ALPHANUM,
-            string_character_types.ALL_TYPES
-        ))
-
-    return Column.from_unique_ptr(move(c_result))
+    plc_column = char_types.all_characters_of_type(
+        source_strings.to_pylibcudf(mode="read"),
+        char_types.StringCharacterTypes.ALPHANUM,
+        char_types.StringCharacterTypes.ALL_TYPES
+    )
+    return Column.from_pylibcudf(plc_column)
 
 
 @acquire_spill_lock()
@@ -94,17 +62,12 @@ def is_alpha(Column source_strings):
     Returns a Column of boolean values with True for `source_strings`
     that contain only alphabetic characters.
     """
-    cdef unique_ptr[column] c_result
-    cdef column_view source_view = source_strings.view()
-
-    with nogil:
-        c_result = move(cpp_all_characters_of_type(
-            source_view,
-            string_character_types.ALPHA,
-            string_character_types.ALL_TYPES
-        ))
-
-    return Column.from_unique_ptr(move(c_result))
+    plc_column = char_types.all_characters_of_type(
+        source_strings.to_pylibcudf(mode="read"),
+        char_types.StringCharacterTypes.ALPHA,
+        char_types.StringCharacterTypes.ALL_TYPES
+    )
+    return Column.from_pylibcudf(plc_column)
 
 
 @acquire_spill_lock()
@@ -113,17 +76,12 @@ def is_digit(Column source_strings):
     Returns a Column of boolean values with True for `source_strings`
     that contain only decimal and digit characters.
     """
-    cdef unique_ptr[column] c_result
-    cdef column_view source_view = source_strings.view()
-
-    with nogil:
-        c_result = move(cpp_all_characters_of_type(
-            source_view,
-            string_character_types.DIGIT,
-            string_character_types.ALL_TYPES
-        ))
-
-    return Column.from_unique_ptr(move(c_result))
+    plc_column = char_types.all_characters_of_type(
+        source_strings.to_pylibcudf(mode="read"),
+        char_types.StringCharacterTypes.DIGIT,
+        char_types.StringCharacterTypes.ALL_TYPES
+    )
+    return Column.from_pylibcudf(plc_column)
 
 
 @acquire_spill_lock()
@@ -133,17 +91,12 @@ def is_numeric(Column source_strings):
     that contain only numeric characters. These include digit and
     numeric characters.
     """
-    cdef unique_ptr[column] c_result
-    cdef column_view source_view = source_strings.view()
-
-    with nogil:
-        c_result = move(cpp_all_characters_of_type(
-            source_view,
-            string_character_types.NUMERIC,
-            string_character_types.ALL_TYPES
-        ))
-
-    return Column.from_unique_ptr(move(c_result))
+    plc_column = char_types.all_characters_of_type(
+        source_strings.to_pylibcudf(mode="read"),
+        char_types.StringCharacterTypes.NUMERIC,
+        char_types.StringCharacterTypes.ALL_TYPES
+    )
+    return Column.from_pylibcudf(plc_column)
 
 
 @acquire_spill_lock()
@@ -152,17 +105,12 @@ def is_upper(Column source_strings):
     Returns a Column of boolean values with True for `source_strings`
     that contain only upper-case characters.
     """
-    cdef unique_ptr[column] c_result
-    cdef column_view source_view = source_strings.view()
-
-    with nogil:
-        c_result = move(cpp_all_characters_of_type(
-            source_view,
-            string_character_types.UPPER,
-            string_character_types.CASE_TYPES
-        ))
-
-    return Column.from_unique_ptr(move(c_result))
+    plc_column = char_types.all_characters_of_type(
+        source_strings.to_pylibcudf(mode="read"),
+        char_types.StringCharacterTypes.UPPER,
+        char_types.StringCharacterTypes.CASE_TYPES
+    )
+    return Column.from_pylibcudf(plc_column)
 
 
 @acquire_spill_lock()
@@ -171,17 +119,12 @@ def is_lower(Column source_strings):
     Returns a Column of boolean values with True for `source_strings`
     that contain only lower-case characters.
     """
-    cdef unique_ptr[column] c_result
-    cdef column_view source_view = source_strings.view()
-
-    with nogil:
-        c_result = move(cpp_all_characters_of_type(
-            source_view,
-            string_character_types.LOWER,
-            string_character_types.CASE_TYPES
-        ))
-
-    return Column.from_unique_ptr(move(c_result))
+    plc_column = char_types.all_characters_of_type(
+        source_strings.to_pylibcudf(mode="read"),
+        char_types.StringCharacterTypes.LOWER,
+        char_types.StringCharacterTypes.CASE_TYPES
+    )
+    return Column.from_pylibcudf(plc_column)
 
 
 @acquire_spill_lock()
@@ -190,14 +133,9 @@ def is_space(Column source_strings):
     Returns a Column of boolean values with True for `source_strings`
     that contains all characters which are spaces only.
     """
-    cdef unique_ptr[column] c_result
-    cdef column_view source_view = source_strings.view()
-
-    with nogil:
-        c_result = move(cpp_all_characters_of_type(
-            source_view,
-            string_character_types.SPACE,
-            string_character_types.ALL_TYPES
-        ))
-
-    return Column.from_unique_ptr(move(c_result))
+    plc_column = char_types.all_characters_of_type(
+        source_strings.to_pylibcudf(mode="read"),
+        char_types.StringCharacterTypes.SPACE,
+        char_types.StringCharacterTypes.ALL_TYPES
+    )
+    return Column.from_pylibcudf(plc_column)
diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/char_types.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/char_types.pxd
index 5d54c1c3593..76afe047e8c 100644
--- a/python/pylibcudf/pylibcudf/libcudf/strings/char_types.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/strings/char_types.pxd
@@ -22,9 +22,6 @@ cdef extern from "cudf/strings/char_types/char_types.hpp" \
         CASE_TYPES
         ALL_TYPES
 
-cdef extern from "cudf/strings/char_types/char_types.hpp" \
-        namespace "cudf::strings" nogil:
-
     cdef unique_ptr[column] all_characters_of_type(
         column_view source_strings,
         string_character_types types,
diff --git a/python/pylibcudf/pylibcudf/strings/char_types.pxd b/python/pylibcudf/pylibcudf/strings/char_types.pxd
index ad4e4cf61d8..f9f7d244212 100644
--- a/python/pylibcudf/pylibcudf/strings/char_types.pxd
+++ b/python/pylibcudf/pylibcudf/strings/char_types.pxd
@@ -1,3 +1,19 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
+from pylibcudf.column cimport Column
 from pylibcudf.libcudf.strings.char_types cimport string_character_types
+from pylibcudf.scalar cimport Scalar
+
+
+cpdef Column all_characters_of_type(
+    Column source_strings,
+    string_character_types types,
+    string_character_types verify_types
+)
+
+cpdef Column filter_characters_of_type(
+    Column source_strings,
+    string_character_types types_to_remove,
+    Scalar replacement,
+    string_character_types types_to_keep
+)
diff --git a/python/pylibcudf/pylibcudf/strings/char_types.pyx b/python/pylibcudf/pylibcudf/strings/char_types.pyx
index e7621fb4d84..6a24d79bc4b 100644
--- a/python/pylibcudf/pylibcudf/strings/char_types.pyx
+++ b/python/pylibcudf/pylibcudf/strings/char_types.pyx
@@ -1,4 +1,93 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
+from libcpp.memory cimport unique_ptr
+from libcpp.utility cimport move
+from pylibcudf.column cimport Column
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.scalar.scalar cimport string_scalar
+from pylibcudf.libcudf.strings cimport char_types as cpp_char_types
+from pylibcudf.scalar cimport Scalar
+
+from cython.operator import dereference
 from pylibcudf.libcudf.strings.char_types import \
     string_character_types as StringCharacterTypes  # no-cython-lint
+
+
+cpdef Column all_characters_of_type(
+    Column source_strings,
+    string_character_types types,
+    string_character_types verify_types
+):
+    """
+    Identifies strings where all characters match the specified type.
+
+    Parameters
+    ----------
+    source_strings : Column
+        Strings instance for this operation
+    types : StringCharacterTypes
+        The character types to check in each string
+    verify_types : StringCharacterTypes
+        Only verify against these character types.
+
+    Returns
+    -------
+    Column
+        New column of boolean results for each string
+    """
+    cdef unique_ptr[column] c_result
+
+    with nogil:
+        c_result = move(
+            cpp_char_types.all_characters_of_type(
+                source_strings.view(),
+                types,
+                verify_types,
+            )
+        )
+
+    return Column.from_libcudf(move(c_result))
+
+cpdef Column filter_characters_of_type(
+    Column source_strings,
+    string_character_types types_to_remove,
+    Scalar replacement,
+    string_character_types types_to_keep
+):
+    """
+    Filter specific character types from a column of strings.
+
+    Parameters
+    ----------
+    source_strings : Column
+        Strings instance for this operation
+    types_to_remove : StringCharacterTypes
+        The character types to check in each string.
+    replacement : Scalar
+        The replacement character to use when removing characters
+    types_to_keep : StringCharacterTypes
+        Default `ALL_TYPES` means all characters of `types_to_remove`
+        will be filtered.
+
+    Returns
+    -------
+    Column
+        New column with the specified characters filtered out and
+        replaced with the specified replacement string.
+    """
+    cdef const string_scalar* c_replacement = <const string_scalar*>(
+        replacement.c_obj.get()
+    )
+    cdef unique_ptr[column] c_result
+
+    with nogil:
+        c_result = move(
+            cpp_char_types.filter_characters_of_type(
+                source_strings.view(),
+                types_to_remove,
+                dereference(c_replacement),
+                types_to_keep,
+            )
+        )
+
+    return Column.from_libcudf(move(c_result))
diff --git a/python/pylibcudf/pylibcudf/tests/test_string_char_types.py b/python/pylibcudf/pylibcudf/tests/test_string_char_types.py
new file mode 100644
index 00000000000..bcd030c019e
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/tests/test_string_char_types.py
@@ -0,0 +1,29 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+import pyarrow as pa
+import pyarrow.compute as pc
+import pylibcudf as plc
+from utils import assert_column_eq
+
+
+def test_all_characters_of_type():
+    pa_array = pa.array(["1", "A"])
+    result = plc.strings.char_types.all_characters_of_type(
+        plc.interop.from_arrow(pa_array),
+        plc.strings.char_types.StringCharacterTypes.ALPHA,
+        plc.strings.char_types.StringCharacterTypes.ALL_TYPES,
+    )
+    expected = pc.utf8_is_alpha(pa_array)
+    assert_column_eq(result, expected)
+
+
+def test_filter_characters_of_type():
+    pa_array = pa.array(["=A="])
+    result = plc.strings.char_types.filter_characters_of_type(
+        plc.interop.from_arrow(pa_array),
+        plc.strings.char_types.StringCharacterTypes.ALPHANUM,
+        plc.interop.from_arrow(pa.scalar(" ")),
+        plc.strings.char_types.StringCharacterTypes.ALL_TYPES,
+    )
+    expected = pc.replace_substring(pa_array, "A", " ")
+    assert_column_eq(result, expected)

From dae9d6899dd722c52bd42dd0fee51f4a6b336c93 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Tue, 1 Oct 2024 12:50:27 -1000
Subject: [PATCH 5/9] Add string.translate APIs to pylibcudf (#16934)

Contributes to https://github.com/rapidsai/cudf/issues/15162

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/16934
---
 python/cudf/cudf/_lib/strings/translate.pyx   |  93 ++-----------
 .../pylibcudf/libcudf/strings/CMakeLists.txt  |   2 +-
 .../pylibcudf/libcudf/strings/translate.pxd   |  14 +-
 .../pylibcudf/libcudf/strings/translate.pyx   |   0
 .../pylibcudf/strings/CMakeLists.txt          |   1 +
 .../pylibcudf/pylibcudf/strings/__init__.pxd  |   2 +
 .../pylibcudf/pylibcudf/strings/__init__.py   |   2 +
 .../pylibcudf/pylibcudf/strings/translate.pxd |  14 ++
 .../pylibcudf/pylibcudf/strings/translate.pyx | 122 ++++++++++++++++++
 .../pylibcudf/tests/test_string_translate.py  |  69 ++++++++++
 10 files changed, 232 insertions(+), 87 deletions(-)
 create mode 100644 python/pylibcudf/pylibcudf/libcudf/strings/translate.pyx
 create mode 100644 python/pylibcudf/pylibcudf/strings/translate.pxd
 create mode 100644 python/pylibcudf/pylibcudf/strings/translate.pyx
 create mode 100644 python/pylibcudf/pylibcudf/tests/test_string_translate.py

diff --git a/python/cudf/cudf/_lib/strings/translate.pyx b/python/cudf/cudf/_lib/strings/translate.pyx
index 3fad91bbfc0..3ef478532c2 100644
--- a/python/cudf/cudf/_lib/strings/translate.pyx
+++ b/python/cudf/cudf/_lib/strings/translate.pyx
@@ -1,25 +1,12 @@
 # Copyright (c) 2018-2024, NVIDIA CORPORATION.
 
 from libcpp cimport bool
-from libcpp.memory cimport unique_ptr
-from libcpp.pair cimport pair
-from libcpp.utility cimport move
-from libcpp.vector cimport vector
 
 from cudf.core.buffer import acquire_spill_lock
 
-from pylibcudf.libcudf.column.column cimport column
-from pylibcudf.libcudf.column.column_view cimport column_view
-from pylibcudf.libcudf.scalar.scalar cimport string_scalar
-from pylibcudf.libcudf.strings.translate cimport (
-    filter_characters as cpp_filter_characters,
-    filter_type,
-    translate as cpp_translate,
-)
-from pylibcudf.libcudf.types cimport char_utf8
-
 from cudf._lib.column cimport Column
-from cudf._lib.scalar cimport DeviceScalar
+
+import pylibcudf as plc
 
 
 @acquire_spill_lock()
@@ -29,30 +16,11 @@ def translate(Column source_strings,
     Translates individual characters within each string
     if present in the mapping_table.
     """
-    cdef unique_ptr[column] c_result
-    cdef column_view source_view = source_strings.view()
-    cdef int table_size
-    table_size = len(mapping_table)
-
-    cdef vector[pair[char_utf8, char_utf8]] c_mapping_table
-    c_mapping_table.reserve(table_size)
-
-    for key in mapping_table:
-        value = mapping_table[key]
-        if type(value) is int:
-            value = chr(value)
-        if type(value) is str:
-            value = int.from_bytes(value.encode(), byteorder='big')
-        if type(key) is int:
-            key = chr(key)
-        if type(key) is str:
-            key = int.from_bytes(key.encode(), byteorder='big')
-        c_mapping_table.push_back((key, value))
-
-    with nogil:
-        c_result = move(cpp_translate(source_view, c_mapping_table))
-
-    return Column.from_unique_ptr(move(c_result))
+    plc_result = plc.strings.translate.translate(
+        source_strings.to_pylibcudf(mode="read"),
+        mapping_table,
+    )
+    return Column.from_pylibcudf(plc_result)
 
 
 @acquire_spill_lock()
@@ -64,44 +32,11 @@ def filter_characters(Column source_strings,
     Removes or keeps individual characters within each string
     using the provided mapping_table.
     """
-
-    cdef DeviceScalar repl = py_repl.device_value
-
-    cdef unique_ptr[column] c_result
-    cdef column_view source_view = source_strings.view()
-    cdef const string_scalar* scalar_repl = <const string_scalar*>(
-        repl.get_raw_ptr()
+    plc_result = plc.strings.translate.filter_characters(
+        source_strings.to_pylibcudf(mode="read"),
+        mapping_table,
+        plc.strings.translate.FilterType.KEEP
+        if keep else plc.strings.translate.FilterType.REMOVE,
+        py_repl.device_value.c_value
     )
-    cdef int table_size
-    table_size = len(mapping_table)
-
-    cdef vector[pair[char_utf8, char_utf8]] c_mapping_table
-    c_mapping_table.reserve(table_size)
-
-    for key in mapping_table:
-        value = mapping_table[key]
-        if type(value) is int:
-            value = chr(value)
-        if type(value) is str:
-            value = int.from_bytes(value.encode(), byteorder='big')
-        if type(key) is int:
-            key = chr(key)
-        if type(key) is str:
-            key = int.from_bytes(key.encode(), byteorder='big')
-        c_mapping_table.push_back((key, value))
-
-    cdef filter_type c_keep
-    if keep is True:
-        c_keep = filter_type.KEEP
-    else:
-        c_keep = filter_type.REMOVE
-
-    with nogil:
-        c_result = move(cpp_filter_characters(
-            source_view,
-            c_mapping_table,
-            c_keep,
-            scalar_repl[0]
-        ))
-
-    return Column.from_unique_ptr(move(c_result))
+    return Column.from_pylibcudf(plc_result)
diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/CMakeLists.txt b/python/pylibcudf/pylibcudf/libcudf/strings/CMakeLists.txt
index abf4357f862..b8b4343173e 100644
--- a/python/pylibcudf/pylibcudf/libcudf/strings/CMakeLists.txt
+++ b/python/pylibcudf/pylibcudf/libcudf/strings/CMakeLists.txt
@@ -12,7 +12,7 @@
 # the License.
 # =============================================================================
 
-set(cython_sources char_types.pyx regex_flags.pyx side_type.pyx)
+set(cython_sources char_types.pyx regex_flags.pyx side_type.pyx translate.pyx)
 
 set(linked_libraries cudf::cudf)
 
diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/translate.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/translate.pxd
index 85fa719128a..9fd24f2987b 100644
--- a/python/pylibcudf/pylibcudf/libcudf/strings/translate.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/strings/translate.pxd
@@ -13,15 +13,15 @@ from pylibcudf.libcudf.types cimport char_utf8
 cdef extern from "cudf/strings/translate.hpp" namespace "cudf::strings" nogil:
 
     cdef unique_ptr[column] translate(
-        column_view source_strings,
+        column_view input,
         vector[pair[char_utf8, char_utf8]] chars_table) except +
 
-    ctypedef enum filter_type:
-        KEEP 'cudf::strings::filter_type::KEEP',
-        REMOVE 'cudf::strings::filter_type::REMOVE'
+    cpdef enum class filter_type(bool):
+        KEEP
+        REMOVE
 
     cdef unique_ptr[column] filter_characters(
-        column_view source_strings,
-        vector[pair[char_utf8, char_utf8]] chars_table,
-        filter_type keep,
+        column_view input,
+        vector[pair[char_utf8, char_utf8]] characters_to_filter,
+        filter_type keep_characters,
         string_scalar replacement) except +
diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/translate.pyx b/python/pylibcudf/pylibcudf/libcudf/strings/translate.pyx
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/python/pylibcudf/pylibcudf/strings/CMakeLists.txt b/python/pylibcudf/pylibcudf/strings/CMakeLists.txt
index 142bc124ca2..052a0cf3c56 100644
--- a/python/pylibcudf/pylibcudf/strings/CMakeLists.txt
+++ b/python/pylibcudf/pylibcudf/strings/CMakeLists.txt
@@ -28,6 +28,7 @@ set(cython_sources
     side_type.pyx
     slice.pyx
     strip.pyx
+    translate.pyx
 )
 
 set(linked_libraries cudf::cudf)
diff --git a/python/pylibcudf/pylibcudf/strings/__init__.pxd b/python/pylibcudf/pylibcudf/strings/__init__.pxd
index d8afccc7336..142637ff577 100644
--- a/python/pylibcudf/pylibcudf/strings/__init__.pxd
+++ b/python/pylibcudf/pylibcudf/strings/__init__.pxd
@@ -15,6 +15,7 @@ from . cimport (
     replace,
     slice,
     strip,
+    translate,
 )
 from .side_type cimport side_type
 
@@ -34,4 +35,5 @@ __all__ = [
     "slice",
     "strip",
     "side_type",
+    "translate",
 ]
diff --git a/python/pylibcudf/pylibcudf/strings/__init__.py b/python/pylibcudf/pylibcudf/strings/__init__.py
index 22452812e42..decfadd63a4 100644
--- a/python/pylibcudf/pylibcudf/strings/__init__.py
+++ b/python/pylibcudf/pylibcudf/strings/__init__.py
@@ -16,6 +16,7 @@
     replace,
     slice,
     strip,
+    translate,
 )
 from .side_type import SideType
 
@@ -35,4 +36,5 @@
     "slice",
     "strip",
     "SideType",
+    "translate",
 ]
diff --git a/python/pylibcudf/pylibcudf/strings/translate.pxd b/python/pylibcudf/pylibcudf/strings/translate.pxd
new file mode 100644
index 00000000000..0ca746801d7
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/strings/translate.pxd
@@ -0,0 +1,14 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+from pylibcudf.column cimport Column
+from pylibcudf.libcudf.strings.translate cimport filter_type
+from pylibcudf.scalar cimport Scalar
+
+
+cpdef Column translate(Column input, dict chars_table)
+
+cpdef Column filter_characters(
+    Column input,
+    dict characters_to_filter,
+    filter_type keep_characters,
+    Scalar replacement
+)
diff --git a/python/pylibcudf/pylibcudf/strings/translate.pyx b/python/pylibcudf/pylibcudf/strings/translate.pyx
new file mode 100644
index 00000000000..a62c7ec4528
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/strings/translate.pyx
@@ -0,0 +1,122 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+from libcpp.memory cimport unique_ptr
+from libcpp.pair cimport pair
+from libcpp.utility cimport move
+from libcpp.vector cimport vector
+from pylibcudf.column cimport Column
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.scalar.scalar cimport string_scalar
+from pylibcudf.libcudf.strings cimport translate as cpp_translate
+from pylibcudf.libcudf.types cimport char_utf8
+from pylibcudf.scalar cimport Scalar
+
+from cython.operator import dereference
+from pylibcudf.libcudf.strings.translate import \
+    filter_type as FilterType  # no-cython-lint
+
+
+cdef vector[pair[char_utf8, char_utf8]] _table_to_c_table(dict table):
+    """
+    Convert str.maketrans table to cudf compatible table.
+    """
+    cdef int table_size = len(table)
+    cdef vector[pair[char_utf8, char_utf8]] c_table
+
+    c_table.reserve(table_size)
+    for key, value in table.items():
+        if isinstance(value, int):
+            value = chr(value)
+        if isinstance(value, str):
+            value = int.from_bytes(value.encode(), byteorder='big')
+        if isinstance(key, int):
+            key = chr(key)
+        if isinstance(key, str):
+            key = int.from_bytes(key.encode(), byteorder='big')
+        c_table.push_back((key, value))
+
+    return c_table
+
+
+cpdef Column translate(Column input, dict chars_table):
+    """
+    Translates individual characters within each string.
+
+    For details, see :cpp:func:`cudf::strings::translate`.
+
+    Parameters
+    ----------
+    input : Column
+        Strings instance for this operation
+
+    chars_table : dict
+        Table of UTF-8 character mappings
+
+    Returns
+    -------
+    Column
+        New column with padded strings.
+    """
+    cdef unique_ptr[column] c_result
+    cdef vector[pair[char_utf8, char_utf8]] c_chars_table = _table_to_c_table(
+        chars_table
+    )
+
+    with nogil:
+        c_result = move(
+            cpp_translate.translate(
+                input.view(),
+                c_chars_table
+            )
+        )
+    return Column.from_libcudf(move(c_result))
+
+
+cpdef Column filter_characters(
+    Column input,
+    dict characters_to_filter,
+    filter_type keep_characters,
+    Scalar replacement
+):
+    """
+    Removes ranges of characters from each string in a strings column.
+
+    For details, see :cpp:func:`cudf::strings::filter_characters`.
+
+    Parameters
+    ----------
+    input : Column
+        Strings instance for this operation
+
+    characters_to_filter : dict
+        Table of character ranges to filter on
+
+    keep_characters : FilterType
+        If true, the `characters_to_filter` are retained
+        and all other characters are removed.
+
+    replacement : Scalar
+        Replacement string for each character removed.
+
+    Returns
+    -------
+    Column
+        New column with filtered strings.
+    """
+    cdef unique_ptr[column] c_result
+    cdef vector[pair[char_utf8, char_utf8]] c_characters_to_filter = _table_to_c_table(
+        characters_to_filter
+    )
+    cdef const string_scalar* c_replacement = <const string_scalar*>(
+        replacement.c_obj.get()
+    )
+
+    with nogil:
+        c_result = move(
+            cpp_translate.filter_characters(
+                input.view(),
+                c_characters_to_filter,
+                keep_characters,
+                dereference(c_replacement),
+            )
+        )
+    return Column.from_libcudf(move(c_result))
diff --git a/python/pylibcudf/pylibcudf/tests/test_string_translate.py b/python/pylibcudf/pylibcudf/tests/test_string_translate.py
new file mode 100644
index 00000000000..2ae893e69fb
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/tests/test_string_translate.py
@@ -0,0 +1,69 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+import pyarrow as pa
+import pylibcudf as plc
+import pytest
+from utils import assert_column_eq
+
+
+@pytest.fixture
+def data_col():
+    pa_data_col = pa.array(
+        ["aa", "bbb", "cccc", "abcd", None],
+        type=pa.string(),
+    )
+    return pa_data_col, plc.interop.from_arrow(pa_data_col)
+
+
+@pytest.fixture
+def trans_table():
+    return str.maketrans("abd", "A Q")
+
+
+def test_translate(data_col, trans_table):
+    pa_array, plc_col = data_col
+    result = plc.strings.translate.translate(plc_col, trans_table)
+    expected = pa.array(
+        [
+            val.translate(trans_table) if isinstance(val, str) else None
+            for val in pa_array.to_pylist()
+        ]
+    )
+    assert_column_eq(expected, result)
+
+
+@pytest.mark.parametrize(
+    "keep",
+    [
+        plc.strings.translate.FilterType.KEEP,
+        plc.strings.translate.FilterType.REMOVE,
+    ],
+)
+def test_filter_characters(data_col, trans_table, keep):
+    pa_array, plc_col = data_col
+    result = plc.strings.translate.filter_characters(
+        plc_col, trans_table, keep, plc.interop.from_arrow(pa.scalar("*"))
+    )
+    exp_data = []
+    flat_trans = set(trans_table.keys()).union(trans_table.values())
+    for val in pa_array.to_pylist():
+        if not isinstance(val, str):
+            exp_data.append(val)
+        else:
+            new_val = ""
+            for ch in val:
+                if (
+                    ch in flat_trans
+                    and keep == plc.strings.translate.FilterType.KEEP
+                ):
+                    new_val += ch
+                elif (
+                    ch not in flat_trans
+                    and keep == plc.strings.translate.FilterType.REMOVE
+                ):
+                    new_val += ch
+                else:
+                    new_val += "*"
+            exp_data.append(new_val)
+    expected = pa.array(exp_data)
+    assert_column_eq(expected, result)

From 76cae874a6f75c741055e50ebb839620ea98c8a0 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Tue, 1 Oct 2024 15:35:06 -1000
Subject: [PATCH 6/9] Add string.find_multiple APIs to pylibcudf (#16920)

Redo at https://github.com/rapidsai/cudf/pull/16824

Contributes to https://github.com/rapidsai/cudf/issues/15162

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)
  - Matthew Murray (https://github.com/Matt711)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Matthew Murray (https://github.com/Matt711)

URL: https://github.com/rapidsai/cudf/pull/16920
---
 .../pylibcudf/strings/find_multiple.rst       |  6 +++
 .../api_docs/pylibcudf/strings/index.rst      |  1 +
 .../cudf/cudf/_lib/strings/find_multiple.pyx  | 27 ++++---------
 .../libcudf/strings/find_multiple.pxd         |  2 +-
 .../pylibcudf/strings/CMakeLists.txt          |  1 +
 .../pylibcudf/pylibcudf/strings/__init__.pxd  |  1 +
 .../pylibcudf/pylibcudf/strings/__init__.py   |  1 +
 .../pylibcudf/strings/find_multiple.pxd       |  6 +++
 .../pylibcudf/strings/find_multiple.pyx       | 39 +++++++++++++++++++
 .../tests/test_string_find_multiple.py        | 22 +++++++++++
 10 files changed, 85 insertions(+), 21 deletions(-)
 create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/strings/find_multiple.rst
 create mode 100644 python/pylibcudf/pylibcudf/strings/find_multiple.pxd
 create mode 100644 python/pylibcudf/pylibcudf/strings/find_multiple.pyx
 create mode 100644 python/pylibcudf/pylibcudf/tests/test_string_find_multiple.py

diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/find_multiple.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/find_multiple.rst
new file mode 100644
index 00000000000..8e86b33b1a0
--- /dev/null
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/find_multiple.rst
@@ -0,0 +1,6 @@
+=============
+find_multiple
+=============
+
+.. automodule:: pylibcudf.strings.find_multiple
+   :members:
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst
index 9b1a6b72a88..7e0d128cfb2 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst
@@ -9,6 +9,7 @@ strings
     contains
     extract
     find
+    find_multiple
     findall
     regex_flags
     regex_program
diff --git a/python/cudf/cudf/_lib/strings/find_multiple.pyx b/python/cudf/cudf/_lib/strings/find_multiple.pyx
index 1358f8e3c2c..39e0013769f 100644
--- a/python/cudf/cudf/_lib/strings/find_multiple.pyx
+++ b/python/cudf/cudf/_lib/strings/find_multiple.pyx
@@ -1,18 +1,11 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
-from libcpp.memory cimport unique_ptr
-from libcpp.utility cimport move
-
 from cudf.core.buffer import acquire_spill_lock
 
-from pylibcudf.libcudf.column.column cimport column
-from pylibcudf.libcudf.column.column_view cimport column_view
-from pylibcudf.libcudf.strings.find_multiple cimport (
-    find_multiple as cpp_find_multiple,
-)
-
 from cudf._lib.column cimport Column
 
+import pylibcudf as plc
+
 
 @acquire_spill_lock()
 def find_multiple(Column source_strings, Column target_strings):
@@ -20,14 +13,8 @@ def find_multiple(Column source_strings, Column target_strings):
     Returns a column with character position values where each
     of the `target_strings` are found in each string of `source_strings`.
     """
-    cdef unique_ptr[column] c_result
-    cdef column_view source_view = source_strings.view()
-    cdef column_view target_view = target_strings.view()
-
-    with nogil:
-        c_result = move(cpp_find_multiple(
-            source_view,
-            target_view
-        ))
-
-    return Column.from_unique_ptr(move(c_result))
+    plc_result = plc.strings.find_multiple.find_multiple(
+        source_strings.to_pylibcudf(mode="read"),
+        target_strings.to_pylibcudf(mode="read")
+    )
+    return Column.from_pylibcudf(plc_result)
diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/find_multiple.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/find_multiple.pxd
index 0491644a10a..3d048c1f50b 100644
--- a/python/pylibcudf/pylibcudf/libcudf/strings/find_multiple.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/strings/find_multiple.pxd
@@ -9,5 +9,5 @@ cdef extern from "cudf/strings/find_multiple.hpp" namespace "cudf::strings" \
         nogil:
 
     cdef unique_ptr[column] find_multiple(
-        column_view source_strings,
+        column_view input,
         column_view targets) except +
diff --git a/python/pylibcudf/pylibcudf/strings/CMakeLists.txt b/python/pylibcudf/pylibcudf/strings/CMakeLists.txt
index 052a0cf3c56..71b1e29afcb 100644
--- a/python/pylibcudf/pylibcudf/strings/CMakeLists.txt
+++ b/python/pylibcudf/pylibcudf/strings/CMakeLists.txt
@@ -20,6 +20,7 @@ set(cython_sources
     contains.pyx
     extract.pyx
     find.pyx
+    find_multiple.pyx
     findall.pyx
     regex_flags.pyx
     regex_program.pyx
diff --git a/python/pylibcudf/pylibcudf/strings/__init__.pxd b/python/pylibcudf/pylibcudf/strings/__init__.pxd
index 142637ff577..e6e6bee2750 100644
--- a/python/pylibcudf/pylibcudf/strings/__init__.pxd
+++ b/python/pylibcudf/pylibcudf/strings/__init__.pxd
@@ -9,6 +9,7 @@ from . cimport (
     convert,
     extract,
     find,
+    find_multiple,
     findall,
     regex_flags,
     regex_program,
diff --git a/python/pylibcudf/pylibcudf/strings/__init__.py b/python/pylibcudf/pylibcudf/strings/__init__.py
index decfadd63a4..7f121279969 100644
--- a/python/pylibcudf/pylibcudf/strings/__init__.py
+++ b/python/pylibcudf/pylibcudf/strings/__init__.py
@@ -9,6 +9,7 @@
     convert,
     extract,
     find,
+    find_multiple,
     findall,
     regex_flags,
     regex_program,
diff --git a/python/pylibcudf/pylibcudf/strings/find_multiple.pxd b/python/pylibcudf/pylibcudf/strings/find_multiple.pxd
new file mode 100644
index 00000000000..b7b3aefa336
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/strings/find_multiple.pxd
@@ -0,0 +1,6 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from pylibcudf.column cimport Column
+
+
+cpdef Column find_multiple(Column input, Column targets)
diff --git a/python/pylibcudf/pylibcudf/strings/find_multiple.pyx b/python/pylibcudf/pylibcudf/strings/find_multiple.pyx
new file mode 100644
index 00000000000..413fc1cb79d
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/strings/find_multiple.pyx
@@ -0,0 +1,39 @@
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
+
+from libcpp.memory cimport unique_ptr
+from libcpp.utility cimport move
+from pylibcudf.column cimport Column
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.strings cimport find_multiple as cpp_find_multiple
+
+
+cpdef Column find_multiple(Column input, Column targets):
+    """
+    Returns a lists column with character position values where each
+    of the target strings are found in each string.
+
+    For details, see :cpp:func:`cudf::strings::find_multiple`.
+
+    Parameters
+    ----------
+    input : Column
+        Strings instance for this operation
+    targets : Column
+        Strings to search for in each string
+
+    Returns
+    -------
+    Column
+        Lists column with character position values
+    """
+    cdef unique_ptr[column] c_result
+
+    with nogil:
+        c_result = move(
+            cpp_find_multiple.find_multiple(
+                input.view(),
+                targets.view()
+            )
+        )
+
+    return Column.from_libcudf(move(c_result))
diff --git a/python/pylibcudf/pylibcudf/tests/test_string_find_multiple.py b/python/pylibcudf/pylibcudf/tests/test_string_find_multiple.py
new file mode 100644
index 00000000000..d6b37a388f0
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/tests/test_string_find_multiple.py
@@ -0,0 +1,22 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+import pyarrow as pa
+import pylibcudf as plc
+from utils import assert_column_eq
+
+
+def test_find_multiple():
+    arr = pa.array(["abc", "def"])
+    targets = pa.array(["a", "c", "e"])
+    result = plc.strings.find_multiple.find_multiple(
+        plc.interop.from_arrow(arr),
+        plc.interop.from_arrow(targets),
+    )
+    expected = pa.array(
+        [
+            [elem.find(target) for target in targets.to_pylist()]
+            for elem in arr.to_pylist()
+        ],
+        type=pa.list_(pa.int32()),
+    )
+    assert_column_eq(expected, result)

From 6c9064ad074351591f8a4ad757b4d4e32789b8e5 Mon Sep 17 00:00:00 2001
From: Vukasin Milovanovic <vmilovanovic@nvidia.com>
Date: Tue, 1 Oct 2024 18:42:10 -0700
Subject: [PATCH 7/9] Refactor the `cuda_memcpy` functions to make them more
 usable (#16945)

As we expanded the use of the `cuda_memcpy` functions, we realized that they are not very ergonomic, as they require caller to query `is_device_accessible` and pass the correct `PAGEABLE`/`PINNED` enum based on this.

This PR aims to make the `cuda_memcpy` functions easier to use, and the call site changes hopefully showcase this. The new implementation takes spans as parameters and relies on the `host_span::is_device_accessible` to enable copy strategies for pinned memory. Host spans set this flag during construction; creating a host span from a `cudf::detail::host_vector` will correctly propagate `is_device_accessible`. Thus, call can simply* call the `cuda_memcpy` functions with their containers as parameters and rely on implicit conversion to `host_span`/`device_span`. Bonus - there's no way to mix up host and device memory pointers :+1:

Sharp edges:
* Conversion prevents template deduction, so calls that pass containers as parameters need to specify the template parameter (see changes in this PR).
* ~The API copies the `min(input.size(), output.size())` bytes, as this is what we can do safely. This might cause surprises to users if they unintentionally pass spans of different sizes. We could instead throw in this case.~

Authors:
  - Vukasin Milovanovic (https://github.com/vuule)

Approvers:
  - Paul Mattione (https://github.com/pmattione-nvidia)
  - Bradley Dice (https://github.com/bdice)
  - MithunR (https://github.com/mythrocks)

URL: https://github.com/rapidsai/cudf/pull/16945
---
 .../cudf/detail/utilities/cuda_memcpy.hpp     | 78 +++++++++++++++----
 .../detail/utilities/vector_factories.hpp     | 16 +---
 cpp/src/io/json/host_tree_algorithms.cu       | 13 +---
 cpp/src/io/utilities/hostdevice_vector.hpp    | 14 +---
 cpp/src/utilities/cuda_memcpy.cu              | 11 +--
 5 files changed, 76 insertions(+), 56 deletions(-)

diff --git a/cpp/include/cudf/detail/utilities/cuda_memcpy.hpp b/cpp/include/cudf/detail/utilities/cuda_memcpy.hpp
index 632d5a732ec..4f0c52c5954 100644
--- a/cpp/include/cudf/detail/utilities/cuda_memcpy.hpp
+++ b/cpp/include/cudf/detail/utilities/cuda_memcpy.hpp
@@ -17,6 +17,7 @@
 #pragma once
 
 #include <cudf/utilities/export.hpp>
+#include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 
@@ -25,33 +26,82 @@ namespace detail {
 
 enum class host_memory_kind : uint8_t { PINNED, PAGEABLE };
 
+void cuda_memcpy_async_impl(
+  void* dst, void const* src, size_t size, host_memory_kind kind, rmm::cuda_stream_view stream);
+
 /**
- * @brief Asynchronously copies data between the host and device.
+ * @brief Asynchronously copies data from host to device memory.
  *
  * Implementation may use different strategies depending on the size and type of host data.
  *
- * @param dst Destination memory address
- * @param src Source memory address
- * @param size Number of bytes to copy
- * @param kind Type of host memory
+ * @param dst Destination device memory
+ * @param src Source host memory
  * @param stream CUDA stream used for the copy
  */
-void cuda_memcpy_async(
-  void* dst, void const* src, size_t size, host_memory_kind kind, rmm::cuda_stream_view stream);
+template <typename T>
+void cuda_memcpy_async(device_span<T> dst, host_span<T const> src, rmm::cuda_stream_view stream)
+{
+  CUDF_EXPECTS(dst.size() == src.size(), "Mismatched sizes in cuda_memcpy_async");
+  auto const is_pinned = src.is_device_accessible();
+  cuda_memcpy_async_impl(dst.data(),
+                         src.data(),
+                         src.size_bytes(),
+                         is_pinned ? host_memory_kind::PINNED : host_memory_kind::PAGEABLE,
+                         stream);
+}
 
 /**
- * @brief Synchronously copies data between the host and device.
+ * @brief Asynchronously copies data from device to host memory.
  *
  * Implementation may use different strategies depending on the size and type of host data.
  *
- * @param dst Destination memory address
- * @param src Source memory address
- * @param size Number of bytes to copy
- * @param kind Type of host memory
+ * @param dst Destination host memory
+ * @param src Source device memory
  * @param stream CUDA stream used for the copy
  */
-void cuda_memcpy(
-  void* dst, void const* src, size_t size, host_memory_kind kind, rmm::cuda_stream_view stream);
+template <typename T>
+void cuda_memcpy_async(host_span<T> dst, device_span<T const> src, rmm::cuda_stream_view stream)
+{
+  CUDF_EXPECTS(dst.size() == src.size(), "Mismatched sizes in cuda_memcpy_async");
+  auto const is_pinned = dst.is_device_accessible();
+  cuda_memcpy_async_impl(dst.data(),
+                         src.data(),
+                         src.size_bytes(),
+                         is_pinned ? host_memory_kind::PINNED : host_memory_kind::PAGEABLE,
+                         stream);
+}
+
+/**
+ * @brief Synchronously copies data from host to device memory.
+ *
+ * Implementation may use different strategies depending on the size and type of host data.
+ *
+ * @param dst Destination device memory
+ * @param src Source host memory
+ * @param stream CUDA stream used for the copy
+ */
+template <typename T>
+void cuda_memcpy(device_span<T> dst, host_span<T const> src, rmm::cuda_stream_view stream)
+{
+  cuda_memcpy_async(dst, src, stream);
+  stream.synchronize();
+}
+
+/**
+ * @brief Synchronously copies data from device to host memory.
+ *
+ * Implementation may use different strategies depending on the size and type of host data.
+ *
+ * @param dst Destination host memory
+ * @param src Source device memory
+ * @param stream CUDA stream used for the copy
+ */
+template <typename T>
+void cuda_memcpy(host_span<T> dst, device_span<T const> src, rmm::cuda_stream_view stream)
+{
+  cuda_memcpy_async(dst, src, stream);
+  stream.synchronize();
+}
 
 }  // namespace detail
 }  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/detail/utilities/vector_factories.hpp b/cpp/include/cudf/detail/utilities/vector_factories.hpp
index 953ae5b9308..1f1e7a2db77 100644
--- a/cpp/include/cudf/detail/utilities/vector_factories.hpp
+++ b/cpp/include/cudf/detail/utilities/vector_factories.hpp
@@ -101,12 +101,7 @@ rmm::device_uvector<T> make_device_uvector_async(host_span<T const> source_data,
                                                  rmm::device_async_resource_ref mr)
 {
   rmm::device_uvector<T> ret(source_data.size(), stream, mr);
-  auto const is_pinned = source_data.is_device_accessible();
-  cuda_memcpy_async(ret.data(),
-                    source_data.data(),
-                    source_data.size() * sizeof(T),
-                    is_pinned ? host_memory_kind::PINNED : host_memory_kind::PAGEABLE,
-                    stream);
+  cuda_memcpy_async<T>(ret, source_data, stream);
   return ret;
 }
 
@@ -405,13 +400,8 @@ host_vector<T> make_empty_host_vector(size_t capacity, rmm::cuda_stream_view str
 template <typename T>
 host_vector<T> make_host_vector_async(device_span<T const> v, rmm::cuda_stream_view stream)
 {
-  auto result          = make_host_vector<T>(v.size(), stream);
-  auto const is_pinned = result.get_allocator().is_device_accessible();
-  cuda_memcpy_async(result.data(),
-                    v.data(),
-                    v.size() * sizeof(T),
-                    is_pinned ? host_memory_kind::PINNED : host_memory_kind::PAGEABLE,
-                    stream);
+  auto result = make_host_vector<T>(v.size(), stream);
+  cuda_memcpy_async<T>(result, v, stream);
   return result;
 }
 
diff --git a/cpp/src/io/json/host_tree_algorithms.cu b/cpp/src/io/json/host_tree_algorithms.cu
index 5855f1b5a5f..f7e8134b68d 100644
--- a/cpp/src/io/json/host_tree_algorithms.cu
+++ b/cpp/src/io/json/host_tree_algorithms.cu
@@ -634,11 +634,8 @@ std::pair<cudf::detail::host_vector<bool>, hashmap_of_device_columns> build_tree
           is_mixed_type_column[this_col_id] == 1)
         column_categories[this_col_id] = NC_STR;
     }
-    cudf::detail::cuda_memcpy_async(d_column_tree.node_categories.begin(),
-                                    column_categories.data(),
-                                    column_categories.size() * sizeof(column_categories[0]),
-                                    cudf::detail::host_memory_kind::PAGEABLE,
-                                    stream);
+    cudf::detail::cuda_memcpy_async<NodeT>(
+      d_column_tree.node_categories, column_categories, stream);
   }
 
   // ignore all children of columns forced as string
@@ -653,11 +650,7 @@ std::pair<cudf::detail::host_vector<bool>, hashmap_of_device_columns> build_tree
         forced_as_string_column[this_col_id])
       column_categories[this_col_id] = NC_STR;
   }
-  cudf::detail::cuda_memcpy_async(d_column_tree.node_categories.begin(),
-                                  column_categories.data(),
-                                  column_categories.size() * sizeof(column_categories[0]),
-                                  cudf::detail::host_memory_kind::PAGEABLE,
-                                  stream);
+  cudf::detail::cuda_memcpy_async<NodeT>(d_column_tree.node_categories, column_categories, stream);
 
   // restore unique_col_ids order
   std::sort(h_range_col_id_it, h_range_col_id_it + num_columns, [](auto const& a, auto const& b) {
diff --git a/cpp/src/io/utilities/hostdevice_vector.hpp b/cpp/src/io/utilities/hostdevice_vector.hpp
index aed745c42dd..634e6d78ebc 100644
--- a/cpp/src/io/utilities/hostdevice_vector.hpp
+++ b/cpp/src/io/utilities/hostdevice_vector.hpp
@@ -125,23 +125,17 @@ class hostdevice_vector {
 
   void host_to_device_async(rmm::cuda_stream_view stream)
   {
-    cuda_memcpy_async(device_ptr(), host_ptr(), size_bytes(), host_memory_kind::PINNED, stream);
+    cuda_memcpy_async<T>(d_data, h_data, stream);
   }
 
-  void host_to_device_sync(rmm::cuda_stream_view stream)
-  {
-    cuda_memcpy(device_ptr(), host_ptr(), size_bytes(), host_memory_kind::PINNED, stream);
-  }
+  void host_to_device_sync(rmm::cuda_stream_view stream) { cuda_memcpy<T>(d_data, h_data, stream); }
 
   void device_to_host_async(rmm::cuda_stream_view stream)
   {
-    cuda_memcpy_async(host_ptr(), device_ptr(), size_bytes(), host_memory_kind::PINNED, stream);
+    cuda_memcpy_async<T>(h_data, d_data, stream);
   }
 
-  void device_to_host_sync(rmm::cuda_stream_view stream)
-  {
-    cuda_memcpy(host_ptr(), device_ptr(), size_bytes(), host_memory_kind::PINNED, stream);
-  }
+  void device_to_host_sync(rmm::cuda_stream_view stream) { cuda_memcpy<T>(h_data, d_data, stream); }
 
   /**
    * @brief Converts a hostdevice_vector into a hostdevice_span.
diff --git a/cpp/src/utilities/cuda_memcpy.cu b/cpp/src/utilities/cuda_memcpy.cu
index 0efb881eb3e..c0af27a1748 100644
--- a/cpp/src/utilities/cuda_memcpy.cu
+++ b/cpp/src/utilities/cuda_memcpy.cu
@@ -30,7 +30,7 @@ namespace cudf::detail {
 namespace {
 
 // Simple kernel to copy between device buffers
-CUDF_KERNEL void copy_kernel(char const* src, char* dst, size_t n)
+CUDF_KERNEL void copy_kernel(char const* __restrict__ src, char* __restrict__ dst, size_t n)
 {
   auto const idx = cudf::detail::grid_1d::global_thread_id();
   if (idx < n) { dst[idx] = src[idx]; }
@@ -61,7 +61,7 @@ void copy_pageable(void* dst, void const* src, std::size_t size, rmm::cuda_strea
 
 };  // namespace
 
-void cuda_memcpy_async(
+void cuda_memcpy_async_impl(
   void* dst, void const* src, size_t size, host_memory_kind kind, rmm::cuda_stream_view stream)
 {
   if (kind == host_memory_kind::PINNED) {
@@ -73,11 +73,4 @@ void cuda_memcpy_async(
   }
 }
 
-void cuda_memcpy(
-  void* dst, void const* src, size_t size, host_memory_kind kind, rmm::cuda_stream_view stream)
-{
-  cuda_memcpy_async(dst, src, size, kind, stream);
-  stream.synchronize();
-}
-
 }  // namespace cudf::detail

From bac81cb8f4c61c9a81e30e79d03c323406bf657a Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Tue, 1 Oct 2024 15:54:05 -1000
Subject: [PATCH 8/9] Add string.split APIs to pylibcudf (#16940)

Contributes to https://github.com/rapidsai/cudf/issues/15162

Includes `split/split.pxd` and `split/partition.pxd`

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - https://github.com/brandon-b-miller

URL: https://github.com/rapidsai/cudf/pull/16940
---
 .../api_docs/pylibcudf/strings/index.rst      |   1 +
 .../api_docs/pylibcudf/strings/split.rst      |   6 +
 .../cudf/_lib/strings/split/partition.pyx     |  59 +---
 python/cudf/cudf/_lib/strings/split/split.pyx | 217 +++---------
 python/cudf/cudf/core/column/string.py        |  12 +-
 .../libcudf/strings/split/partition.pxd       |   4 +-
 .../pylibcudf/libcudf/strings/split/split.pxd |  24 +-
 .../pylibcudf/strings/CMakeLists.txt          |   1 +
 .../pylibcudf/pylibcudf/strings/__init__.pxd  |   2 +
 .../pylibcudf/pylibcudf/strings/__init__.py   |   2 +
 .../pylibcudf/strings/split/CMakeLists.txt    |  22 ++
 .../pylibcudf/strings/split/__init__.pxd      |   2 +
 .../pylibcudf/strings/split/__init__.py       |   2 +
 .../pylibcudf/strings/split/partition.pxd     |  10 +
 .../pylibcudf/strings/split/partition.pyx     |  95 +++++
 .../pylibcudf/strings/split/split.pxd         |  24 ++
 .../pylibcudf/strings/split/split.pyx         | 326 ++++++++++++++++++
 .../tests/test_string_split_partition.py      |  43 +++
 .../tests/test_string_split_split.py          | 130 +++++++
 19 files changed, 750 insertions(+), 232 deletions(-)
 create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/strings/split.rst
 create mode 100644 python/pylibcudf/pylibcudf/strings/split/CMakeLists.txt
 create mode 100644 python/pylibcudf/pylibcudf/strings/split/__init__.pxd
 create mode 100644 python/pylibcudf/pylibcudf/strings/split/__init__.py
 create mode 100644 python/pylibcudf/pylibcudf/strings/split/partition.pxd
 create mode 100644 python/pylibcudf/pylibcudf/strings/split/partition.pyx
 create mode 100644 python/pylibcudf/pylibcudf/strings/split/split.pxd
 create mode 100644 python/pylibcudf/pylibcudf/strings/split/split.pyx
 create mode 100644 python/pylibcudf/pylibcudf/tests/test_string_split_partition.py
 create mode 100644 python/pylibcudf/pylibcudf/tests/test_string_split_split.py

diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst
index 7e0d128cfb2..e73ea3370ec 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst
@@ -16,4 +16,5 @@ strings
     repeat
     replace
     slice
+    split
     strip
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/split.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/split.rst
new file mode 100644
index 00000000000..cba96e86f45
--- /dev/null
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/split.rst
@@ -0,0 +1,6 @@
+=====
+split
+=====
+
+.. automodule:: pylibcudf.strings.split
+   :members:
diff --git a/python/cudf/cudf/_lib/strings/split/partition.pyx b/python/cudf/cudf/_lib/strings/split/partition.pyx
index a81fb18e752..5319addc41c 100644
--- a/python/cudf/cudf/_lib/strings/split/partition.pyx
+++ b/python/cudf/cudf/_lib/strings/split/partition.pyx
@@ -1,21 +1,10 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
-from libcpp.memory cimport unique_ptr
-from libcpp.utility cimport move
-
 from cudf.core.buffer import acquire_spill_lock
 
-from pylibcudf.libcudf.column.column_view cimport column_view
-from pylibcudf.libcudf.scalar.scalar cimport string_scalar
-from pylibcudf.libcudf.strings.split.partition cimport (
-    partition as cpp_partition,
-    rpartition as cpp_rpartition,
-)
-from pylibcudf.libcudf.table.table cimport table
-
 from cudf._lib.column cimport Column
-from cudf._lib.scalar cimport DeviceScalar
-from cudf._lib.utils cimport data_from_unique_ptr
+
+import pylibcudf as plc
 
 
 @acquire_spill_lock()
@@ -25,25 +14,11 @@ def partition(Column source_strings,
     Returns data by splitting the `source_strings`
     column at the first occurrence of the specified `py_delimiter`.
     """
-
-    cdef DeviceScalar delimiter = py_delimiter.device_value
-
-    cdef unique_ptr[table] c_result
-    cdef column_view source_view = source_strings.view()
-    cdef const string_scalar* scalar_str = <const string_scalar*>(
-        delimiter.get_raw_ptr()
-    )
-
-    with nogil:
-        c_result = move(cpp_partition(
-            source_view,
-            scalar_str[0]
-        ))
-
-    return data_from_unique_ptr(
-        move(c_result),
-        column_names=range(0, c_result.get()[0].num_columns())
+    plc_table = plc.strings.split.partition.partition(
+        source_strings.to_pylibcudf(mode="read"),
+        py_delimiter.device_value.c_value
     )
+    return dict(enumerate(Column.from_pylibcudf(col) for col in plc_table.columns()))
 
 
 @acquire_spill_lock()
@@ -53,22 +28,8 @@ def rpartition(Column source_strings,
     Returns a Column by splitting the `source_strings`
     column at the last occurrence of the specified `py_delimiter`.
     """
-
-    cdef DeviceScalar delimiter = py_delimiter.device_value
-
-    cdef unique_ptr[table] c_result
-    cdef column_view source_view = source_strings.view()
-    cdef const string_scalar* scalar_str = <const string_scalar*>(
-        delimiter.get_raw_ptr()
-    )
-
-    with nogil:
-        c_result = move(cpp_rpartition(
-            source_view,
-            scalar_str[0]
-        ))
-
-    return data_from_unique_ptr(
-        move(c_result),
-        column_names=range(0, c_result.get()[0].num_columns())
+    plc_table = plc.strings.split.partition.rpartition(
+        source_strings.to_pylibcudf(mode="read"),
+        py_delimiter.device_value.c_value
     )
+    return dict(enumerate(Column.from_pylibcudf(col) for col in plc_table.columns()))
diff --git a/python/cudf/cudf/_lib/strings/split/split.pyx b/python/cudf/cudf/_lib/strings/split/split.pyx
index f481fea4c51..4ec6c7073d8 100644
--- a/python/cudf/cudf/_lib/strings/split/split.pyx
+++ b/python/cudf/cudf/_lib/strings/split/split.pyx
@@ -1,33 +1,12 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
-from cython.operator cimport dereference
-from libcpp.memory cimport unique_ptr
-from libcpp.string cimport string
-from libcpp.utility cimport move
-
 from cudf.core.buffer import acquire_spill_lock
 
-from pylibcudf.libcudf.column.column cimport column
-from pylibcudf.libcudf.column.column_view cimport column_view
-from pylibcudf.libcudf.scalar.scalar cimport string_scalar
-from pylibcudf.libcudf.strings.regex_flags cimport regex_flags
-from pylibcudf.libcudf.strings.regex_program cimport regex_program
-from pylibcudf.libcudf.strings.split.split cimport (
-    rsplit as cpp_rsplit,
-    rsplit_re as cpp_rsplit_re,
-    rsplit_record as cpp_rsplit_record,
-    rsplit_record_re as cpp_rsplit_record_re,
-    split as cpp_split,
-    split_re as cpp_split_re,
-    split_record as cpp_split_record,
-    split_record_re as cpp_split_record_re,
-)
-from pylibcudf.libcudf.table.table cimport table
 from pylibcudf.libcudf.types cimport size_type
 
 from cudf._lib.column cimport Column
-from cudf._lib.scalar cimport DeviceScalar
-from cudf._lib.utils cimport data_from_unique_ptr
+
+import pylibcudf as plc
 
 
 @acquire_spill_lock()
@@ -39,26 +18,12 @@ def split(Column source_strings,
     column around the specified `py_delimiter`.
     The split happens from beginning.
     """
-
-    cdef DeviceScalar delimiter = py_delimiter.device_value
-
-    cdef unique_ptr[table] c_result
-    cdef column_view source_view = source_strings.view()
-    cdef const string_scalar* scalar_str = <const string_scalar*>(
-        delimiter.get_raw_ptr()
-    )
-
-    with nogil:
-        c_result = move(cpp_split(
-            source_view,
-            scalar_str[0],
-            maxsplit
-        ))
-
-    return data_from_unique_ptr(
-        move(c_result),
-        column_names=range(0, c_result.get()[0].num_columns())
+    plc_table = plc.strings.split.split.split(
+        source_strings.to_pylibcudf(mode="read"),
+        py_delimiter.device_value.c_value,
+        maxsplit,
     )
+    return dict(enumerate(Column.from_pylibcudf(col) for col in plc_table.columns()))
 
 
 @acquire_spill_lock()
@@ -70,25 +35,12 @@ def split_record(Column source_strings,
     column around the specified `py_delimiter`.
     The split happens from beginning.
     """
-
-    cdef DeviceScalar delimiter = py_delimiter.device_value
-
-    cdef unique_ptr[column] c_result
-    cdef column_view source_view = source_strings.view()
-    cdef const string_scalar* scalar_str = <const string_scalar*>(
-        delimiter.get_raw_ptr()
-    )
-
-    with nogil:
-        c_result = move(cpp_split_record(
-            source_view,
-            scalar_str[0],
-            maxsplit
-        ))
-
-    return Column.from_unique_ptr(
-        move(c_result),
+    plc_column = plc.strings.split.split.split_record(
+        source_strings.to_pylibcudf(mode="read"),
+        py_delimiter.device_value.c_value,
+        maxsplit,
     )
+    return Column.from_pylibcudf(plc_column)
 
 
 @acquire_spill_lock()
@@ -100,26 +52,12 @@ def rsplit(Column source_strings,
     column around the specified `py_delimiter`.
     The split happens from the end.
     """
-
-    cdef DeviceScalar delimiter = py_delimiter.device_value
-
-    cdef unique_ptr[table] c_result
-    cdef column_view source_view = source_strings.view()
-    cdef const string_scalar* scalar_str = <const string_scalar*>(
-        delimiter.get_raw_ptr()
-    )
-
-    with nogil:
-        c_result = move(cpp_rsplit(
-            source_view,
-            scalar_str[0],
-            maxsplit
-        ))
-
-    return data_from_unique_ptr(
-        move(c_result),
-        column_names=range(0, c_result.get()[0].num_columns())
+    plc_table = plc.strings.split.split.rsplit(
+        source_strings.to_pylibcudf(mode="read"),
+        py_delimiter.device_value.c_value,
+        maxsplit,
     )
+    return dict(enumerate(Column.from_pylibcudf(col) for col in plc_table.columns()))
 
 
 @acquire_spill_lock()
@@ -131,25 +69,12 @@ def rsplit_record(Column source_strings,
     column around the specified `py_delimiter`.
     The split happens from the end.
     """
-
-    cdef DeviceScalar delimiter = py_delimiter.device_value
-
-    cdef unique_ptr[column] c_result
-    cdef column_view source_view = source_strings.view()
-    cdef const string_scalar* scalar_str = <const string_scalar*>(
-        delimiter.get_raw_ptr()
-    )
-
-    with nogil:
-        c_result = move(cpp_rsplit_record(
-            source_view,
-            scalar_str[0],
-            maxsplit
-        ))
-
-    return Column.from_unique_ptr(
-        move(c_result),
+    plc_column = plc.strings.split.split.rsplit_record(
+        source_strings.to_pylibcudf(mode="read"),
+        py_delimiter.device_value.c_value,
+        maxsplit,
     )
+    return Column.from_pylibcudf(plc_column)
 
 
 @acquire_spill_lock()
@@ -160,24 +85,15 @@ def split_re(Column source_strings,
     Returns data by splitting the `source_strings`
     column around the delimiters identified by `pattern`.
     """
-    cdef unique_ptr[table] c_result
-    cdef column_view source_view = source_strings.view()
-    cdef string pattern_string = <string>str(pattern).encode()
-    cdef regex_flags c_flags = regex_flags.DEFAULT
-    cdef unique_ptr[regex_program] c_prog
-
-    with nogil:
-        c_prog = move(regex_program.create(pattern_string, c_flags))
-        c_result = move(cpp_split_re(
-            source_view,
-            dereference(c_prog),
-            maxsplit
-        ))
-
-    return data_from_unique_ptr(
-        move(c_result),
-        column_names=range(0, c_result.get()[0].num_columns())
+    plc_table = plc.strings.split.split.split_re(
+        source_strings.to_pylibcudf(mode="read"),
+        plc.strings.regex_program.RegexProgram.create(
+            str(pattern),
+            plc.strings.regex_flags.RegexFlags.DEFAULT,
+        ),
+        maxsplit,
     )
+    return dict(enumerate(Column.from_pylibcudf(col) for col in plc_table.columns()))
 
 
 @acquire_spill_lock()
@@ -189,24 +105,15 @@ def rsplit_re(Column source_strings,
     column around the delimiters identified by `pattern`.
     The delimiters are searched starting from the end of each string.
     """
-    cdef unique_ptr[table] c_result
-    cdef column_view source_view = source_strings.view()
-    cdef string pattern_string = <string>str(pattern).encode()
-    cdef regex_flags c_flags = regex_flags.DEFAULT
-    cdef unique_ptr[regex_program] c_prog
-
-    with nogil:
-        c_prog = move(regex_program.create(pattern_string, c_flags))
-        c_result = move(cpp_rsplit_re(
-            source_view,
-            dereference(c_prog),
-            maxsplit
-        ))
-
-    return data_from_unique_ptr(
-        move(c_result),
-        column_names=range(0, c_result.get()[0].num_columns())
+    plc_table = plc.strings.split.split.rsplit_re(
+        source_strings.to_pylibcudf(mode="read"),
+        plc.strings.regex_program.RegexProgram.create(
+            str(pattern),
+            plc.strings.regex_flags.RegexFlags.DEFAULT,
+        ),
+        maxsplit,
     )
+    return dict(enumerate(Column.from_pylibcudf(col) for col in plc_table.columns()))
 
 
 @acquire_spill_lock()
@@ -217,23 +124,15 @@ def split_record_re(Column source_strings,
     Returns a Column by splitting the `source_strings`
     column around the delimiters identified by `pattern`.
     """
-    cdef unique_ptr[column] c_result
-    cdef column_view source_view = source_strings.view()
-    cdef string pattern_string = <string>str(pattern).encode()
-    cdef regex_flags c_flags = regex_flags.DEFAULT
-    cdef unique_ptr[regex_program] c_prog
-
-    with nogil:
-        c_prog = move(regex_program.create(pattern_string, c_flags))
-        c_result = move(cpp_split_record_re(
-            source_view,
-            dereference(c_prog),
-            maxsplit
-        ))
-
-    return Column.from_unique_ptr(
-        move(c_result),
+    plc_column = plc.strings.split.split.split_record_re(
+        source_strings.to_pylibcudf(mode="read"),
+        plc.strings.regex_program.RegexProgram.create(
+            str(pattern),
+            plc.strings.regex_flags.RegexFlags.DEFAULT,
+        ),
+        maxsplit,
     )
+    return Column.from_pylibcudf(plc_column)
 
 
 @acquire_spill_lock()
@@ -245,20 +144,12 @@ def rsplit_record_re(Column source_strings,
     column around the delimiters identified by `pattern`.
     The delimiters are searched starting from the end of each string.
     """
-    cdef unique_ptr[column] c_result
-    cdef column_view source_view = source_strings.view()
-    cdef string pattern_string = <string>str(pattern).encode()
-    cdef regex_flags c_flags = regex_flags.DEFAULT
-    cdef unique_ptr[regex_program] c_prog
-
-    with nogil:
-        c_prog = move(regex_program.create(pattern_string, c_flags))
-        c_result = move(cpp_rsplit_record_re(
-            source_view,
-            dereference(c_prog),
-            maxsplit
-        ))
-
-    return Column.from_unique_ptr(
-        move(c_result),
+    plc_column = plc.strings.split.split.rsplit_record_re(
+        source_strings.to_pylibcudf(mode="read"),
+        plc.strings.regex_program.RegexProgram.create(
+            str(pattern),
+            plc.strings.regex_flags.RegexFlags.DEFAULT,
+        ),
+        maxsplit,
     )
+    return Column.from_pylibcudf(plc_column)
diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py
index 4463e3280df..da422db5eae 100644
--- a/python/cudf/cudf/core/column/string.py
+++ b/python/cudf/cudf/core/column/string.py
@@ -2546,9 +2546,9 @@ def split(
                 result_table = {0: self._column.copy()}
             else:
                 if regex is True:
-                    data, _ = libstrings.split_re(self._column, pat, n)
+                    data = libstrings.split_re(self._column, pat, n)
                 else:
-                    data, _ = libstrings.split(
+                    data = libstrings.split(
                         self._column, cudf.Scalar(pat, "str"), n
                     )
                 if len(data) == 1 and data[0].null_count == len(self._column):
@@ -2719,9 +2719,9 @@ def rsplit(
                 result_table = {0: self._column.copy()}
             else:
                 if regex is True:
-                    data, _ = libstrings.rsplit_re(self._column, pat, n)
+                    data = libstrings.rsplit_re(self._column, pat, n)
                 else:
-                    data, _ = libstrings.rsplit(
+                    data = libstrings.rsplit(
                         self._column, cudf.Scalar(pat, "str"), n
                     )
                 if len(data) == 1 and data[0].null_count == len(self._column):
@@ -2820,7 +2820,7 @@ def partition(self, sep: str = " ", expand: bool = True) -> SeriesOrIndex:
             sep = " "
 
         return self._return_or_inplace(
-            libstrings.partition(self._column, cudf.Scalar(sep, "str"))[0],
+            libstrings.partition(self._column, cudf.Scalar(sep, "str")),
             expand=expand,
         )
 
@@ -2885,7 +2885,7 @@ def rpartition(self, sep: str = " ", expand: bool = True) -> SeriesOrIndex:
             sep = " "
 
         return self._return_or_inplace(
-            libstrings.rpartition(self._column, cudf.Scalar(sep, "str"))[0],
+            libstrings.rpartition(self._column, cudf.Scalar(sep, "str")),
             expand=expand,
         )
 
diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/split/partition.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/split/partition.pxd
index 4162e886a7d..4299cf62e99 100644
--- a/python/pylibcudf/pylibcudf/libcudf/strings/split/partition.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/strings/split/partition.pxd
@@ -12,9 +12,9 @@ cdef extern from "cudf/strings/split/partition.hpp" namespace \
         "cudf::strings" nogil:
 
     cdef unique_ptr[table] partition(
-        column_view source_strings,
+        column_view input,
         string_scalar delimiter) except +
 
     cdef unique_ptr[table] rpartition(
-        column_view source_strings,
+        column_view input,
         string_scalar delimiter) except +
diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/split/split.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/split/split.pxd
index 3046149aebb..a22a79fc7d7 100644
--- a/python/pylibcudf/pylibcudf/libcudf/strings/split/split.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/strings/split/split.pxd
@@ -14,22 +14,22 @@ cdef extern from "cudf/strings/split/split.hpp" namespace \
         "cudf::strings" nogil:
 
     cdef unique_ptr[table] split(
-        column_view source_strings,
+        column_view strings_column,
         string_scalar delimiter,
         size_type maxsplit) except +
 
     cdef unique_ptr[table] rsplit(
-        column_view source_strings,
+        column_view strings_column,
         string_scalar delimiter,
         size_type maxsplit) except +
 
     cdef unique_ptr[column] split_record(
-        column_view source_strings,
+        column_view strings,
         string_scalar delimiter,
         size_type maxsplit) except +
 
     cdef unique_ptr[column] rsplit_record(
-        column_view source_strings,
+        column_view strings,
         string_scalar delimiter,
         size_type maxsplit) except +
 
@@ -38,21 +38,21 @@ cdef extern from "cudf/strings/split/split_re.hpp" namespace \
         "cudf::strings" nogil:
 
     cdef unique_ptr[table] split_re(
-        const column_view& source_strings,
-        regex_program,
+        const column_view& input,
+        regex_program prog,
         size_type maxsplit) except +
 
     cdef unique_ptr[table] rsplit_re(
-        const column_view& source_strings,
-        regex_program,
+        const column_view& input,
+        regex_program prog,
         size_type maxsplit) except +
 
     cdef unique_ptr[column] split_record_re(
-        const column_view& source_strings,
-        regex_program,
+        const column_view& input,
+        regex_program prog,
         size_type maxsplit) except +
 
     cdef unique_ptr[column] rsplit_record_re(
-        const column_view& source_strings,
-        regex_program,
+        const column_view& input,
+        regex_program prog,
         size_type maxsplit) except +
diff --git a/python/pylibcudf/pylibcudf/strings/CMakeLists.txt b/python/pylibcudf/pylibcudf/strings/CMakeLists.txt
index 71b1e29afcb..d92f806efbe 100644
--- a/python/pylibcudf/pylibcudf/strings/CMakeLists.txt
+++ b/python/pylibcudf/pylibcudf/strings/CMakeLists.txt
@@ -40,3 +40,4 @@ rapids_cython_create_modules(
 )
 
 add_subdirectory(convert)
+add_subdirectory(split)
diff --git a/python/pylibcudf/pylibcudf/strings/__init__.pxd b/python/pylibcudf/pylibcudf/strings/__init__.pxd
index e6e6bee2750..788e2c99ab1 100644
--- a/python/pylibcudf/pylibcudf/strings/__init__.pxd
+++ b/python/pylibcudf/pylibcudf/strings/__init__.pxd
@@ -15,6 +15,7 @@ from . cimport (
     regex_program,
     replace,
     slice,
+    split,
     strip,
     translate,
 )
@@ -35,6 +36,7 @@ __all__ = [
     "replace",
     "slice",
     "strip",
+    "split",
     "side_type",
     "translate",
 ]
diff --git a/python/pylibcudf/pylibcudf/strings/__init__.py b/python/pylibcudf/pylibcudf/strings/__init__.py
index 7f121279969..bcaeb073d0b 100644
--- a/python/pylibcudf/pylibcudf/strings/__init__.py
+++ b/python/pylibcudf/pylibcudf/strings/__init__.py
@@ -16,6 +16,7 @@
     repeat,
     replace,
     slice,
+    split,
     strip,
     translate,
 )
@@ -36,6 +37,7 @@
     "replace",
     "slice",
     "strip",
+    "split",
     "SideType",
     "translate",
 ]
diff --git a/python/pylibcudf/pylibcudf/strings/split/CMakeLists.txt b/python/pylibcudf/pylibcudf/strings/split/CMakeLists.txt
new file mode 100644
index 00000000000..8f544f6f537
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/strings/split/CMakeLists.txt
@@ -0,0 +1,22 @@
+# =============================================================================
+# Copyright (c) 2024, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+# in compliance with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software distributed under the License
+# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+# or implied. See the License for the specific language governing permissions and limitations under
+# the License.
+# =============================================================================
+
+set(cython_sources partition.pyx split.pyx)
+
+set(linked_libraries cudf::cudf)
+rapids_cython_create_modules(
+  CXX
+  SOURCE_FILES "${cython_sources}"
+  LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX pylibcudf_strings_ ASSOCIATED_TARGETS cudf
+)
diff --git a/python/pylibcudf/pylibcudf/strings/split/__init__.pxd b/python/pylibcudf/pylibcudf/strings/split/__init__.pxd
new file mode 100644
index 00000000000..72086e57d9f
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/strings/split/__init__.pxd
@@ -0,0 +1,2 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+from . cimport partition, split
diff --git a/python/pylibcudf/pylibcudf/strings/split/__init__.py b/python/pylibcudf/pylibcudf/strings/split/__init__.py
new file mode 100644
index 00000000000..2033e5e275b
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/strings/split/__init__.py
@@ -0,0 +1,2 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+from . import partition, split
diff --git a/python/pylibcudf/pylibcudf/strings/split/partition.pxd b/python/pylibcudf/pylibcudf/strings/split/partition.pxd
new file mode 100644
index 00000000000..c18257a4787
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/strings/split/partition.pxd
@@ -0,0 +1,10 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from pylibcudf.column cimport Column
+from pylibcudf.scalar cimport Scalar
+from pylibcudf.table cimport Table
+
+
+cpdef Table partition(Column input, Scalar delimiter=*)
+
+cpdef Table rpartition(Column input, Scalar delimiter=*)
diff --git a/python/pylibcudf/pylibcudf/strings/split/partition.pyx b/python/pylibcudf/pylibcudf/strings/split/partition.pyx
new file mode 100644
index 00000000000..ecc959e65b0
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/strings/split/partition.pyx
@@ -0,0 +1,95 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+from libcpp.memory cimport unique_ptr
+from libcpp.utility cimport move
+from pylibcudf.column cimport Column
+from pylibcudf.libcudf.scalar.scalar cimport string_scalar
+from pylibcudf.libcudf.scalar.scalar_factories cimport (
+    make_string_scalar as cpp_make_string_scalar,
+)
+from pylibcudf.libcudf.strings.split cimport partition as cpp_partition
+from pylibcudf.libcudf.table.table cimport table
+from pylibcudf.scalar cimport Scalar
+from pylibcudf.table cimport Table
+
+from cython.operator import dereference
+
+
+cpdef Table partition(Column input, Scalar delimiter=None):
+    """
+    Returns a set of 3 columns by splitting each string using the
+    specified delimiter.
+
+    For details, see :cpp:func:`cudf::strings::partition`.
+
+    Parameters
+    ----------
+    input : Column
+        Strings instance for this operation
+
+    delimiter : Scalar
+        UTF-8 encoded string indicating where to split each string.
+
+    Returns
+    -------
+    Table
+        New table of strings columns
+    """
+    cdef unique_ptr[table] c_result
+    cdef const string_scalar* c_delimiter = <const string_scalar*>(
+        delimiter.c_obj.get()
+    )
+
+    if delimiter is None:
+        delimiter = Scalar.from_libcudf(
+            cpp_make_string_scalar("".encode())
+        )
+
+    with nogil:
+        c_result = move(
+            cpp_partition.partition(
+                input.view(),
+                dereference(c_delimiter)
+            )
+        )
+
+    return Table.from_libcudf(move(c_result))
+
+cpdef Table rpartition(Column input, Scalar delimiter=None):
+    """
+    Returns a set of 3 columns by splitting each string using the
+    specified delimiter starting from the end of each string.
+
+    For details, see :cpp:func:`cudf::strings::rpartition`.
+
+    Parameters
+    ----------
+    input : Column
+        Strings instance for this operation
+
+    delimiter : Scalar
+        UTF-8 encoded string indicating where to split each string.
+
+    Returns
+    -------
+    Table
+       New strings columns
+    """
+    cdef unique_ptr[table] c_result
+    cdef const string_scalar* c_delimiter = <const string_scalar*>(
+        delimiter.c_obj.get()
+    )
+
+    if delimiter is None:
+        delimiter = Scalar.from_libcudf(
+            cpp_make_string_scalar("".encode())
+        )
+
+    with nogil:
+        c_result = move(
+            cpp_partition.rpartition(
+                input.view(),
+                dereference(c_delimiter)
+            )
+        )
+
+    return Table.from_libcudf(move(c_result))
diff --git a/python/pylibcudf/pylibcudf/strings/split/split.pxd b/python/pylibcudf/pylibcudf/strings/split/split.pxd
new file mode 100644
index 00000000000..355a1874298
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/strings/split/split.pxd
@@ -0,0 +1,24 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from pylibcudf.column cimport Column
+from pylibcudf.libcudf.types cimport size_type
+from pylibcudf.scalar cimport Scalar
+from pylibcudf.strings.regex_program cimport RegexProgram
+from pylibcudf.table cimport Table
+
+
+cpdef Table split(Column strings_column, Scalar delimiter, size_type maxsplit)
+
+cpdef Table rsplit(Column strings_column, Scalar delimiter, size_type maxsplit)
+
+cpdef Column split_record(Column strings, Scalar delimiter, size_type maxsplit)
+
+cpdef Column rsplit_record(Column strings, Scalar delimiter, size_type maxsplit)
+
+cpdef Table split_re(Column input, RegexProgram prog, size_type maxsplit)
+
+cpdef Table rsplit_re(Column input, RegexProgram prog, size_type maxsplit)
+
+cpdef Column split_record_re(Column input, RegexProgram prog, size_type maxsplit)
+
+cpdef Column rsplit_record_re(Column input, RegexProgram prog, size_type maxsplit)
diff --git a/python/pylibcudf/pylibcudf/strings/split/split.pyx b/python/pylibcudf/pylibcudf/strings/split/split.pyx
new file mode 100644
index 00000000000..a7d7f39fc47
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/strings/split/split.pyx
@@ -0,0 +1,326 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+from libcpp.memory cimport unique_ptr
+from libcpp.utility cimport move
+from pylibcudf.column cimport Column
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.scalar.scalar cimport string_scalar
+from pylibcudf.libcudf.strings.split cimport split as cpp_split
+from pylibcudf.libcudf.table.table cimport table
+from pylibcudf.libcudf.types cimport size_type
+from pylibcudf.scalar cimport Scalar
+from pylibcudf.strings.regex_program cimport RegexProgram
+from pylibcudf.table cimport Table
+
+from cython.operator import dereference
+
+
+cpdef Table split(Column strings_column, Scalar delimiter, size_type maxsplit):
+    """
+    Returns a list of columns by splitting each string using the
+    specified delimiter.
+
+    For details, see :cpp:func:`cudf::strings::split`.
+
+    Parameters
+    ----------
+    strings_column : Column
+        Strings instance for this operation
+
+    delimiter : Scalar
+        UTF-8 encoded string indicating the split points in each string.
+
+    maxsplit : int
+        Maximum number of splits to perform. -1 indicates all possible
+        splits on each string.
+
+    Returns
+    -------
+    Table
+        New table of strings columns
+    """
+    cdef unique_ptr[table] c_result
+    cdef const string_scalar* c_delimiter = <const string_scalar*>(
+        delimiter.c_obj.get()
+    )
+
+    with nogil:
+        c_result = move(
+            cpp_split.split(
+                strings_column.view(),
+                dereference(c_delimiter),
+                maxsplit,
+            )
+        )
+
+    return Table.from_libcudf(move(c_result))
+
+
+cpdef Table rsplit(Column strings_column, Scalar delimiter, size_type maxsplit):
+    """
+    Returns a list of columns by splitting each string using the
+    specified delimiter starting from the end of each string.
+
+    For details, see :cpp:func:`cudf::strings::rsplit`.
+
+    Parameters
+    ----------
+    strings_column : Column
+        Strings instance for this operation
+
+    delimiter : Scalar
+        UTF-8 encoded string indicating the split points in each string.
+
+    maxsplit : int
+        Maximum number of splits to perform. -1 indicates all possible
+        splits on each string.
+
+    Returns
+    -------
+    Table
+        New table of strings columns.
+    """
+    cdef unique_ptr[table] c_result
+    cdef const string_scalar* c_delimiter = <const string_scalar*>(
+        delimiter.c_obj.get()
+    )
+
+    with nogil:
+        c_result = move(
+            cpp_split.rsplit(
+                strings_column.view(),
+                dereference(c_delimiter),
+                maxsplit,
+            )
+        )
+
+    return Table.from_libcudf(move(c_result))
+
+cpdef Column split_record(Column strings, Scalar delimiter, size_type maxsplit):
+    """
+    Splits individual strings elements into a list of strings.
+
+    For details, see :cpp:func:`cudf::strings::split_record`.
+
+    Parameters
+    ----------
+    strings : Column
+        A column of string elements to be split.
+
+    delimiter : Scalar
+        The string to identify split points in each string.
+
+    maxsplit : int
+        Maximum number of splits to perform. -1 indicates all possible
+        splits on each string.
+
+    Returns
+    -------
+    Column
+        Lists column of strings.
+    """
+    cdef unique_ptr[column] c_result
+    cdef const string_scalar* c_delimiter = <const string_scalar*>(
+        delimiter.c_obj.get()
+    )
+
+    with nogil:
+        c_result = move(
+            cpp_split.split_record(
+                strings.view(),
+                dereference(c_delimiter),
+                maxsplit,
+            )
+        )
+
+    return Column.from_libcudf(move(c_result))
+
+
+cpdef Column rsplit_record(Column strings, Scalar delimiter, size_type maxsplit):
+    """
+    Splits individual strings elements into a list of strings starting
+    from the end of each string.
+
+    For details, see :cpp:func:`cudf::strings::rsplit_record`.
+
+    Parameters
+    ----------
+    strings : Column
+        A column of string elements to be split.
+
+    delimiter : Scalar
+        The string to identify split points in each string.
+
+    maxsplit : int
+        Maximum number of splits to perform. -1 indicates all possible
+        splits on each string.
+
+    Returns
+    -------
+    Column
+        Lists column of strings.
+    """
+    cdef unique_ptr[column] c_result
+    cdef const string_scalar* c_delimiter = <const string_scalar*>(
+        delimiter.c_obj.get()
+    )
+
+    with nogil:
+        c_result = move(
+            cpp_split.rsplit_record(
+                strings.view(),
+                dereference(c_delimiter),
+                maxsplit,
+            )
+        )
+
+    return Column.from_libcudf(move(c_result))
+
+
+cpdef Table split_re(Column input, RegexProgram prog, size_type maxsplit):
+    """
+    Splits strings elements into a table of strings columns
+    using a regex_program's pattern to delimit each string.
+
+    For details, see :cpp:func:`cudf::strings::split_re`.
+
+    Parameters
+    ----------
+    input : Column
+        A column of string elements to be split.
+
+    prog : RegexProgram
+        Regex program instance.
+
+    maxsplit : int
+        Maximum number of splits to perform. -1 indicates all possible
+        splits on each string.
+
+    Returns
+    -------
+    Table
+        A table of columns of strings.
+    """
+    cdef unique_ptr[table] c_result
+
+    with nogil:
+        c_result = move(
+            cpp_split.split_re(
+                input.view(),
+                prog.c_obj.get()[0],
+                maxsplit,
+            )
+        )
+
+    return Table.from_libcudf(move(c_result))
+
+cpdef Table rsplit_re(Column input, RegexProgram prog, size_type maxsplit):
+    """
+    Splits strings elements into a table of strings columns
+    using a regex_program's pattern to delimit each string starting from
+    the end of the string.
+
+    For details, see :cpp:func:`cudf::strings::rsplit_re`.
+
+    Parameters
+    ----------
+    input : Column
+        A column of string elements to be split.
+
+    prog : RegexProgram
+        Regex program instance.
+
+    maxsplit : int
+        Maximum number of splits to perform. -1 indicates all possible
+        splits on each string.
+
+    Returns
+    -------
+    Table
+        A table of columns of strings.
+    """
+    cdef unique_ptr[table] c_result
+
+    with nogil:
+        c_result = move(
+            cpp_split.rsplit_re(
+                input.view(),
+                prog.c_obj.get()[0],
+                maxsplit,
+            )
+        )
+
+    return Table.from_libcudf(move(c_result))
+
+cpdef Column split_record_re(Column input, RegexProgram prog, size_type maxsplit):
+    """
+    Splits strings elements into a list column of strings using the given
+    regex_program to delimit each string.
+
+    For details, see :cpp:func:`cudf::strings::split_record_re`.
+
+    Parameters
+    ----------
+    input : Column
+        A column of string elements to be split.
+
+    prog : RegexProgram
+        Regex program instance.
+
+    maxsplit : int
+        Maximum number of splits to perform. -1 indicates all possible
+        splits on each string.
+
+    Returns
+    -------
+    Column
+        Lists column of strings.
+    """
+    cdef unique_ptr[column] c_result
+
+    with nogil:
+        c_result = move(
+            cpp_split.split_record_re(
+                input.view(),
+                prog.c_obj.get()[0],
+                maxsplit,
+            )
+        )
+
+    return Column.from_libcudf(move(c_result))
+
+cpdef Column rsplit_record_re(Column input, RegexProgram prog, size_type maxsplit):
+    """
+    Splits strings elements into a list column of strings using the given
+    regex_program to delimit each string starting from the end of the string.
+
+    For details, see :cpp:func:`cudf::strings::rsplit_record_re`.
+
+    Parameters
+    ----------
+    input : Column
+        A column of string elements to be split.
+
+    prog : RegexProgram
+        Regex program instance.
+
+    maxsplit : int
+        Maximum number of splits to perform. -1 indicates all possible
+        splits on each string.
+
+    Returns
+    -------
+    Column
+        Lists column of strings.
+    """
+    cdef unique_ptr[column] c_result
+
+    with nogil:
+        c_result = move(
+            cpp_split.rsplit_record_re(
+                input.view(),
+                prog.c_obj.get()[0],
+                maxsplit,
+            )
+        )
+
+    return Column.from_libcudf(move(c_result))
diff --git a/python/pylibcudf/pylibcudf/tests/test_string_split_partition.py b/python/pylibcudf/pylibcudf/tests/test_string_split_partition.py
new file mode 100644
index 00000000000..80cae8d1c6b
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/tests/test_string_split_partition.py
@@ -0,0 +1,43 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+import pyarrow as pa
+import pylibcudf as plc
+import pytest
+from utils import assert_table_eq
+
+
+@pytest.fixture
+def data_col():
+    pa_arr = pa.array(["ab_cd", "def_g_h", None])
+    plc_column = plc.interop.from_arrow(pa_arr)
+    return pa_arr, plc_column
+
+
+def test_partition(data_col):
+    pa_arr, plc_column = data_col
+    result = plc.strings.split.partition.partition(
+        plc_column, plc.interop.from_arrow(pa.scalar("_"))
+    )
+    expected = pa.table(
+        {
+            "a": ["ab", "def", None],
+            "b": ["_", "_", None],
+            "c": ["cd", "g_h", None],
+        }
+    )
+    assert_table_eq(expected, result)
+
+
+def test_rpartition(data_col):
+    pa_arr, plc_column = data_col
+    result = plc.strings.split.partition.rpartition(
+        plc_column, plc.interop.from_arrow(pa.scalar("_"))
+    )
+    expected = pa.table(
+        {
+            "a": ["ab", "def_g", None],
+            "b": ["_", "_", None],
+            "c": ["cd", "h", None],
+        }
+    )
+    assert_table_eq(expected, result)
diff --git a/python/pylibcudf/pylibcudf/tests/test_string_split_split.py b/python/pylibcudf/pylibcudf/tests/test_string_split_split.py
new file mode 100644
index 00000000000..2aeffac8209
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/tests/test_string_split_split.py
@@ -0,0 +1,130 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+import pyarrow as pa
+import pyarrow.compute as pc
+import pylibcudf as plc
+import pytest
+from utils import assert_column_eq, assert_table_eq
+
+
+@pytest.fixture
+def data_col():
+    pa_array = pa.array(["a_b_c", "d-e-f", None])
+    plc_column = plc.interop.from_arrow(pa_array)
+    return pa_array, plc_column
+
+
+@pytest.fixture
+def delimiter():
+    delimiter = "_"
+    plc_delimiter = plc.interop.from_arrow(pa.scalar(delimiter))
+    return delimiter, plc_delimiter
+
+
+@pytest.fixture
+def re_delimiter():
+    return "[_-]"
+
+
+def test_split(data_col, delimiter):
+    _, plc_column = data_col
+    _, plc_delimiter = delimiter
+    result = plc.strings.split.split.split(plc_column, plc_delimiter, 1)
+    expected = pa.table(
+        {
+            "a": ["a", "d-e-f", None],
+            "b": ["b_c", None, None],
+        }
+    )
+    assert_table_eq(expected, result)
+
+
+def test_rsplit(data_col, delimiter):
+    _, plc_column = data_col
+    _, plc_delimiter = delimiter
+    result = plc.strings.split.split.rsplit(plc_column, plc_delimiter, 1)
+    expected = pa.table(
+        {
+            "a": ["a_b", "d-e-f", None],
+            "b": ["c", None, None],
+        }
+    )
+    assert_table_eq(expected, result)
+
+
+def test_split_record(data_col, delimiter):
+    pa_array, plc_column = data_col
+    delim, plc_delim = delimiter
+    result = plc.strings.split.split.split_record(plc_column, plc_delim, 1)
+    expected = pc.split_pattern(pa_array, delim, max_splits=1)
+    assert_column_eq(expected, result)
+
+
+def test_rsplit_record(data_col, delimiter):
+    pa_array, plc_column = data_col
+    delim, plc_delim = delimiter
+    result = plc.strings.split.split.split_record(plc_column, plc_delim, 1)
+    expected = pc.split_pattern(pa_array, delim, max_splits=1)
+    assert_column_eq(expected, result)
+
+
+def test_split_re(data_col, re_delimiter):
+    _, plc_column = data_col
+    result = plc.strings.split.split.split_re(
+        plc_column,
+        plc.strings.regex_program.RegexProgram.create(
+            re_delimiter, plc.strings.regex_flags.RegexFlags.DEFAULT
+        ),
+        1,
+    )
+    expected = pa.table(
+        {
+            "a": ["a", "d", None],
+            "b": ["b_c", "e-f", None],
+        }
+    )
+    assert_table_eq(expected, result)
+
+
+def test_rsplit_re(data_col, re_delimiter):
+    _, plc_column = data_col
+    result = plc.strings.split.split.rsplit_re(
+        plc_column,
+        plc.strings.regex_program.RegexProgram.create(
+            re_delimiter, plc.strings.regex_flags.RegexFlags.DEFAULT
+        ),
+        1,
+    )
+    expected = pa.table(
+        {
+            "a": ["a_b", "d-e", None],
+            "b": ["c", "f", None],
+        }
+    )
+    assert_table_eq(expected, result)
+
+
+def test_split_record_re(data_col, re_delimiter):
+    pa_array, plc_column = data_col
+    result = plc.strings.split.split.split_record_re(
+        plc_column,
+        plc.strings.regex_program.RegexProgram.create(
+            re_delimiter, plc.strings.regex_flags.RegexFlags.DEFAULT
+        ),
+        1,
+    )
+    expected = pc.split_pattern_regex(pa_array, re_delimiter, max_splits=1)
+    assert_column_eq(expected, result)
+
+
+def test_rsplit_record_re(data_col, re_delimiter):
+    pa_array, plc_column = data_col
+    result = plc.strings.split.split.rsplit_record_re(
+        plc_column,
+        plc.strings.regex_program.RegexProgram.create(
+            re_delimiter, plc.strings.regex_flags.RegexFlags.DEFAULT
+        ),
+        -1,
+    )
+    expected = pc.split_pattern_regex(pa_array, re_delimiter)
+    assert_column_eq(expected, result)

From a6ca0f0068995e5080e1c8d04410a2a1b9dc8b37 Mon Sep 17 00:00:00 2001
From: Kyle Edwards <kyedwards@nvidia.com>
Date: Wed, 2 Oct 2024 10:09:16 -0400
Subject: [PATCH 9/9] Use nvcomp wheel instead of bundling nvcomp (#16946)

Contributes to https://github.com/rapidsai/rapids-wheels-planning/issues/74

Authors:
  - Kyle Edwards (https://github.com/KyleFromNVIDIA)

Approvers:
  - Robert Maynard (https://github.com/robertmaynard)
  - Bradley Dice (https://github.com/bdice)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/16946
---
 ci/build_wheel_libcudf.sh             |  6 +++++-
 cpp/cmake/thirdparty/get_nvcomp.cmake |  8 ++------
 dependencies.yaml                     | 28 ++++++++++++++++++++++++++-
 python/libcudf/CMakeLists.txt         | 15 +++++++++-----
 python/libcudf/pyproject.toml         |  3 +++
 5 files changed, 47 insertions(+), 13 deletions(-)

diff --git a/ci/build_wheel_libcudf.sh b/ci/build_wheel_libcudf.sh
index 8975381ceba..91bc071583e 100755
--- a/ci/build_wheel_libcudf.sh
+++ b/ci/build_wheel_libcudf.sh
@@ -5,11 +5,15 @@ set -euo pipefail
 
 package_dir="python/libcudf"
 
+export SKBUILD_CMAKE_ARGS="-DUSE_NVCOMP_RUNTIME_WHEEL=ON"
 ./ci/build_wheel.sh ${package_dir}
 
 RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
 
 mkdir -p ${package_dir}/final_dist
-python -m auditwheel repair -w ${package_dir}/final_dist ${package_dir}/dist/*
+python -m auditwheel repair \
+    --exclude libnvcomp.so.4 \
+    -w ${package_dir}/final_dist \
+    ${package_dir}/dist/*
 
 RAPIDS_PY_WHEEL_NAME="libcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 cpp ${package_dir}/final_dist
diff --git a/cpp/cmake/thirdparty/get_nvcomp.cmake b/cpp/cmake/thirdparty/get_nvcomp.cmake
index 41bbf44abc8..1b6a1730161 100644
--- a/cpp/cmake/thirdparty/get_nvcomp.cmake
+++ b/cpp/cmake/thirdparty/get_nvcomp.cmake
@@ -1,5 +1,5 @@
 # =============================================================================
-# Copyright (c) 2021-2022, NVIDIA CORPORATION.
+# Copyright (c) 2021-2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
 # in compliance with the License. You may obtain a copy of the License at
@@ -16,11 +16,7 @@
 function(find_and_configure_nvcomp)
 
   include(${rapids-cmake-dir}/cpm/nvcomp.cmake)
-  rapids_cpm_nvcomp(
-    BUILD_EXPORT_SET cudf-exports
-    INSTALL_EXPORT_SET cudf-exports
-    USE_PROPRIETARY_BINARY ${CUDF_USE_PROPRIETARY_NVCOMP}
-  )
+  rapids_cpm_nvcomp(USE_PROPRIETARY_BINARY ${CUDF_USE_PROPRIETARY_NVCOMP})
 
   # Per-thread default stream
   if(TARGET nvcomp AND CUDF_USE_PER_THREAD_DEFAULT_STREAM)
diff --git a/dependencies.yaml b/dependencies.yaml
index ed36a23e5c3..b192158c4ea 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -15,6 +15,7 @@ files:
       - depends_on_cupy
       - depends_on_libkvikio
       - depends_on_librmm
+      - depends_on_nvcomp
       - depends_on_rmm
       - develop
       - docs
@@ -152,6 +153,13 @@ files:
       - build_cpp
       - depends_on_libkvikio
       - depends_on_librmm
+  py_run_libcudf:
+    output: pyproject
+    pyproject_dir: python/libcudf
+    extras:
+      table: project
+    includes:
+      - depends_on_nvcomp
   py_build_pylibcudf:
     output: pyproject
     pyproject_dir: python/pylibcudf
@@ -367,9 +375,27 @@ dependencies:
           - fmt>=11.0.2,<12
           - flatbuffers==24.3.25
           - librdkafka>=2.5.0,<2.6.0a0
+          - spdlog>=1.14.1,<1.15
+  depends_on_nvcomp:
+    common:
+      - output_types: conda
+        packages:
           # Align nvcomp version with rapids-cmake
           - nvcomp==4.0.1
-          - spdlog>=1.14.1,<1.15
+    specific:
+      - output_types: [requirements, pyproject]
+        matrices:
+          - matrix:
+              cuda: "12.*"
+            packages:
+              - nvidia-nvcomp-cu12==4.0.1
+          - matrix:
+              cuda: "11.*"
+            packages:
+              - nvidia-nvcomp-cu11==4.0.1
+          - matrix:
+            packages:
+              - nvidia-nvcomp==4.0.1
   rapids_build_skbuild:
     common:
       - output_types: [conda, requirements, pyproject]
diff --git a/python/libcudf/CMakeLists.txt b/python/libcudf/CMakeLists.txt
index 0a8f5c4807d..2b208e2e021 100644
--- a/python/libcudf/CMakeLists.txt
+++ b/python/libcudf/CMakeLists.txt
@@ -22,6 +22,8 @@ project(
   LANGUAGES CXX
 )
 
+option(USE_NVCOMP_RUNTIME_WHEEL "Use the nvcomp wheel at runtime instead of the system library" OFF)
+
 # Check if cudf is already available. If so, it is the user's responsibility to ensure that the
 # CMake package is also available at build time of the Python cudf package.
 find_package(cudf "${RAPIDS_VERSION}")
@@ -45,8 +47,11 @@ set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib)
 
 add_subdirectory(../../cpp cudf-cpp)
 
-# Ensure other libraries needed by libcudf.so get installed alongside it.
-include(cmake/Modules/WheelHelpers.cmake)
-install_aliased_imported_targets(
-  TARGETS cudf nvcomp::nvcomp DESTINATION ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}
-)
+if(USE_NVCOMP_RUNTIME_WHEEL)
+  set(rpaths "$ORIGIN/../../nvidia/nvcomp")
+  set_property(
+    TARGET cudf
+    PROPERTY INSTALL_RPATH ${rpaths}
+    APPEND
+  )
+endif()
diff --git a/python/libcudf/pyproject.toml b/python/libcudf/pyproject.toml
index 5bffe9fd96c..84660cbc276 100644
--- a/python/libcudf/pyproject.toml
+++ b/python/libcudf/pyproject.toml
@@ -37,6 +37,9 @@ classifiers = [
     "Programming Language :: C++",
     "Environment :: GPU :: NVIDIA CUDA",
 ]
+dependencies = [
+    "nvidia-nvcomp==4.0.1",
+] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 
 [project.urls]
 Homepage = "https://github.com/rapidsai/cudf"