diff --git a/cpp/include/cudf/detail/aggregation/aggregation.cuh b/cpp/include/cudf/detail/aggregation/aggregation.cuh
index ecf2f610697..de53e7586cd 100644
--- a/cpp/include/cudf/detail/aggregation/aggregation.cuh
+++ b/cpp/include/cudf/detail/aggregation/aggregation.cuh
@@ -18,11 +18,11 @@
 
 #include <cudf/aggregation.hpp>
 #include <cudf/column/column_device_view.cuh>
+#include <cudf/column/column_view.hpp>
 #include <cudf/detail/aggregation/aggregation.hpp>
 #include <cudf/detail/utilities/assert.cuh>
 #include <cudf/detail/utilities/device_atomics.cuh>
-#include <cudf/dictionary/dictionary_column_view.hpp>
-#include <cudf/table/table_device_view.cuh>
+#include <cudf/table/table_view.hpp>
 #include <cudf/utilities/traits.cuh>
 
 #include <rmm/cuda_stream_view.hpp>
@@ -30,8 +30,17 @@
 
 #include <thrust/fill.h>
 
+#include <type_traits>
+#include <vector>
+
 namespace cudf {
 namespace detail {
+template <typename T>  // T: element type being queried
+constexpr bool is_product_supported()  // gates the PRODUCT / SUM_OF_SQUARES device aggregators
+{
+  return is_numeric<T>();  // supported exactly when T is numeric
+}
+
 /**
  * @brief Maps an `aggregation::Kind` value to it's corresponding binary
  * operator.
@@ -113,465 +122,6 @@ constexpr bool has_corresponding_operator()
   return !std::is_same_v<typename corresponding_operator<k>::type, void>;
 }
 
-template <typename Source,
-          aggregation::Kind k,
-          bool target_has_nulls,
-          bool source_has_nulls,
-          typename Enable = void>
-struct update_target_element {
-  __device__ void operator()(mutable_column_device_view target,
-                             size_type target_index,
-                             column_device_view source,
-                             size_type source_index) const noexcept
-  {
-    CUDF_UNREACHABLE("Invalid source type and aggregation combination.");
-  }
-};
-
-template <typename Source, bool target_has_nulls, bool source_has_nulls>
-struct update_target_element<
-  Source,
-  aggregation::MIN,
-  target_has_nulls,
-  source_has_nulls,
-  std::enable_if_t<is_fixed_width<Source>() && cudf::has_atomic_support<Source>() &&
-                   !is_fixed_point<Source>()>> {
-  __device__ void operator()(mutable_column_device_view target,
-                             size_type target_index,
-                             column_device_view source,
-                             size_type source_index) const noexcept
-  {
-    if (source_has_nulls and source.is_null(source_index)) { return; }
-
-    using Target = target_type_t<Source, aggregation::MIN>;
-    cudf::detail::atomic_min(&target.element<Target>(target_index),
-                             static_cast<Target>(source.element<Source>(source_index)));
-
-    if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); }
-  }
-};
-
-template <typename Source, bool target_has_nulls, bool source_has_nulls>
-struct update_target_element<
-  Source,
-  aggregation::MIN,
-  target_has_nulls,
-  source_has_nulls,
-  std::enable_if_t<is_fixed_point<Source>() &&
-                   cudf::has_atomic_support<device_storage_type_t<Source>>()>> {
-  __device__ void operator()(mutable_column_device_view target,
-                             size_type target_index,
-                             column_device_view source,
-                             size_type source_index) const noexcept
-  {
-    if (source_has_nulls and source.is_null(source_index)) { return; }
-
-    using Target       = target_type_t<Source, aggregation::MIN>;
-    using DeviceTarget = device_storage_type_t<Target>;
-    using DeviceSource = device_storage_type_t<Source>;
-
-    cudf::detail::atomic_min(&target.element<DeviceTarget>(target_index),
-                             static_cast<DeviceTarget>(source.element<DeviceSource>(source_index)));
-
-    if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); }
-  }
-};
-
-template <typename Source, bool target_has_nulls, bool source_has_nulls>
-struct update_target_element<
-  Source,
-  aggregation::MAX,
-  target_has_nulls,
-  source_has_nulls,
-  std::enable_if_t<is_fixed_width<Source>() && cudf::has_atomic_support<Source>() &&
-                   !is_fixed_point<Source>()>> {
-  __device__ void operator()(mutable_column_device_view target,
-                             size_type target_index,
-                             column_device_view source,
-                             size_type source_index) const noexcept
-  {
-    if (source_has_nulls and source.is_null(source_index)) { return; }
-
-    using Target = target_type_t<Source, aggregation::MAX>;
-    cudf::detail::atomic_max(&target.element<Target>(target_index),
-                             static_cast<Target>(source.element<Source>(source_index)));
-
-    if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); }
-  }
-};
-
-template <typename Source, bool target_has_nulls, bool source_has_nulls>
-struct update_target_element<
-  Source,
-  aggregation::MAX,
-  target_has_nulls,
-  source_has_nulls,
-  std::enable_if_t<is_fixed_point<Source>() &&
-                   cudf::has_atomic_support<device_storage_type_t<Source>>()>> {
-  __device__ void operator()(mutable_column_device_view target,
-                             size_type target_index,
-                             column_device_view source,
-                             size_type source_index) const noexcept
-  {
-    if (source_has_nulls and source.is_null(source_index)) { return; }
-
-    using Target       = target_type_t<Source, aggregation::MAX>;
-    using DeviceTarget = device_storage_type_t<Target>;
-    using DeviceSource = device_storage_type_t<Source>;
-
-    cudf::detail::atomic_max(&target.element<DeviceTarget>(target_index),
-                             static_cast<DeviceTarget>(source.element<DeviceSource>(source_index)));
-
-    if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); }
-  }
-};
-
-template <typename Source, bool target_has_nulls, bool source_has_nulls>
-struct update_target_element<
-  Source,
-  aggregation::SUM,
-  target_has_nulls,
-  source_has_nulls,
-  std::enable_if_t<cudf::is_fixed_width<Source>() && cudf::has_atomic_support<Source>() &&
-                   !cudf::is_fixed_point<Source>() && !cudf::is_timestamp<Source>()>> {
-  __device__ void operator()(mutable_column_device_view target,
-                             size_type target_index,
-                             column_device_view source,
-                             size_type source_index) const noexcept
-  {
-    if (source_has_nulls and source.is_null(source_index)) { return; }
-
-    using Target = target_type_t<Source, aggregation::SUM>;
-    cudf::detail::atomic_add(&target.element<Target>(target_index),
-                             static_cast<Target>(source.element<Source>(source_index)));
-
-    if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); }
-  }
-};
-
-template <typename Source, bool target_has_nulls, bool source_has_nulls>
-struct update_target_element<
-  Source,
-  aggregation::SUM,
-  target_has_nulls,
-  source_has_nulls,
-  std::enable_if_t<is_fixed_point<Source>() &&
-                   cudf::has_atomic_support<device_storage_type_t<Source>>()>> {
-  __device__ void operator()(mutable_column_device_view target,
-                             size_type target_index,
-                             column_device_view source,
-                             size_type source_index) const noexcept
-  {
-    if (source_has_nulls and source.is_null(source_index)) { return; }
-
-    using Target       = target_type_t<Source, aggregation::SUM>;
-    using DeviceTarget = device_storage_type_t<Target>;
-    using DeviceSource = device_storage_type_t<Source>;
-
-    cudf::detail::atomic_add(&target.element<DeviceTarget>(target_index),
-                             static_cast<DeviceTarget>(source.element<DeviceSource>(source_index)));
-
-    if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); }
-  }
-};
-
-/**
- * @brief Function object to update a single element in a target column using
- * the dictionary key addressed by the specific index.
- *
- * SFINAE is used to prevent recursion for dictionary type. Dictionary keys cannot be a
- * dictionary.
- *
- */
-template <bool target_has_nulls = true>
-struct update_target_from_dictionary {
-  template <typename Source,
-            aggregation::Kind k,
-            std::enable_if_t<!is_dictionary<Source>()>* = nullptr>
-  __device__ void operator()(mutable_column_device_view target,
-                             size_type target_index,
-                             column_device_view source,
-                             size_type source_index) const noexcept
-  {
-    update_target_element<Source, k, target_has_nulls, false>{}(
-      target, target_index, source, source_index);
-  }
-  template <typename Source,
-            aggregation::Kind k,
-            std::enable_if_t<is_dictionary<Source>()>* = nullptr>
-  __device__ void operator()(mutable_column_device_view target,
-                             size_type target_index,
-                             column_device_view source,
-                             size_type source_index) const noexcept
-  {
-  }
-};
-
-/**
- * @brief Specialization function for dictionary type and aggregations.
- *
- * The `source` column is a dictionary type. This functor de-references the
- * dictionary's keys child column and maps the input source index through
- * the dictionary's indices child column to pass to the `update_target_element`
- * in the above `update_target_from_dictionary` using the type-dispatcher to
- * resolve the keys column type.
- *
- * `update_target_element( target, target_index, source.keys(), source.indices()[source_index] )`
- *
- * @tparam target_has_nulls Indicates presence of null elements in `target`
- * @tparam source_has_nulls Indicates presence of null elements in `source`.
- */
-template <aggregation::Kind k, bool target_has_nulls, bool source_has_nulls>
-struct update_target_element<
-  dictionary32,
-  k,
-  target_has_nulls,
-  source_has_nulls,
-  std::enable_if_t<not(k == aggregation::ARGMIN or k == aggregation::ARGMAX or
-                       k == aggregation::COUNT_VALID or k == aggregation::COUNT_ALL)>> {
-  __device__ void operator()(mutable_column_device_view target,
-                             size_type target_index,
-                             column_device_view source,
-                             size_type source_index) const noexcept
-  {
-    if (source_has_nulls and source.is_null(source_index)) { return; }
-
-    dispatch_type_and_aggregation(
-      source.child(cudf::dictionary_column_view::keys_column_index).type(),
-      k,
-      update_target_from_dictionary<target_has_nulls>{},
-      target,
-      target_index,
-      source.child(cudf::dictionary_column_view::keys_column_index),
-      static_cast<cudf::size_type>(source.element<dictionary32>(source_index)));
-  }
-};
-
-template <typename T>
-constexpr bool is_product_supported()
-{
-  return is_numeric<T>();
-}
-
-template <typename Source, bool target_has_nulls, bool source_has_nulls>
-struct update_target_element<Source,
-                             aggregation::SUM_OF_SQUARES,
-                             target_has_nulls,
-                             source_has_nulls,
-                             std::enable_if_t<is_product_supported<Source>()>> {
-  __device__ void operator()(mutable_column_device_view target,
-                             size_type target_index,
-                             column_device_view source,
-                             size_type source_index) const noexcept
-  {
-    if (source_has_nulls and source.is_null(source_index)) { return; }
-
-    using Target = target_type_t<Source, aggregation::SUM_OF_SQUARES>;
-    auto value   = static_cast<Target>(source.element<Source>(source_index));
-    cudf::detail::atomic_add(&target.element<Target>(target_index), value * value);
-    if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); }
-  }
-};
-
-template <typename Source, bool target_has_nulls, bool source_has_nulls>
-struct update_target_element<Source,
-                             aggregation::PRODUCT,
-                             target_has_nulls,
-                             source_has_nulls,
-                             std::enable_if_t<is_product_supported<Source>()>> {
-  __device__ void operator()(mutable_column_device_view target,
-                             size_type target_index,
-                             column_device_view source,
-                             size_type source_index) const noexcept
-  {
-    if (source_has_nulls and source.is_null(source_index)) { return; }
-
-    using Target = target_type_t<Source, aggregation::PRODUCT>;
-    cudf::detail::atomic_mul(&target.element<Target>(target_index),
-                             static_cast<Target>(source.element<Source>(source_index)));
-    if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); }
-  }
-};
-
-template <typename Source, bool target_has_nulls, bool source_has_nulls>
-struct update_target_element<
-  Source,
-  aggregation::COUNT_VALID,
-  target_has_nulls,
-  source_has_nulls,
-  std::enable_if_t<is_valid_aggregation<Source, aggregation::COUNT_VALID>()>> {
-  __device__ void operator()(mutable_column_device_view target,
-                             size_type target_index,
-                             column_device_view source,
-                             size_type source_index) const noexcept
-  {
-    if (source_has_nulls and source.is_null(source_index)) { return; }
-
-    using Target = target_type_t<Source, aggregation::COUNT_VALID>;
-    cudf::detail::atomic_add(&target.element<Target>(target_index), Target{1});
-
-    // It is assumed the output for COUNT_VALID is initialized to be all valid
-  }
-};
-
-template <typename Source, bool target_has_nulls, bool source_has_nulls>
-struct update_target_element<
-  Source,
-  aggregation::COUNT_ALL,
-  target_has_nulls,
-  source_has_nulls,
-  std::enable_if_t<is_valid_aggregation<Source, aggregation::COUNT_ALL>()>> {
-  __device__ void operator()(mutable_column_device_view target,
-                             size_type target_index,
-                             column_device_view source,
-                             size_type source_index) const noexcept
-  {
-    using Target = target_type_t<Source, aggregation::COUNT_ALL>;
-    cudf::detail::atomic_add(&target.element<Target>(target_index), Target{1});
-
-    // It is assumed the output for COUNT_ALL is initialized to be all valid
-  }
-};
-
-template <typename Source, bool target_has_nulls, bool source_has_nulls>
-struct update_target_element<
-  Source,
-  aggregation::ARGMAX,
-  target_has_nulls,
-  source_has_nulls,
-  std::enable_if_t<is_valid_aggregation<Source, aggregation::ARGMAX>() and
-                   cudf::is_relationally_comparable<Source, Source>()>> {
-  __device__ void operator()(mutable_column_device_view target,
-                             size_type target_index,
-                             column_device_view source,
-                             size_type source_index) const noexcept
-  {
-    if (source_has_nulls and source.is_null(source_index)) { return; }
-
-    using Target = target_type_t<Source, aggregation::ARGMAX>;
-    auto old     = cudf::detail::atomic_cas(
-      &target.element<Target>(target_index), ARGMAX_SENTINEL, source_index);
-    if (old != ARGMAX_SENTINEL) {
-      while (source.element<Source>(source_index) > source.element<Source>(old)) {
-        old = cudf::detail::atomic_cas(&target.element<Target>(target_index), old, source_index);
-      }
-    }
-
-    if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); }
-  }
-};
-
-template <typename Source, bool target_has_nulls, bool source_has_nulls>
-struct update_target_element<
-  Source,
-  aggregation::ARGMIN,
-  target_has_nulls,
-  source_has_nulls,
-  std::enable_if_t<is_valid_aggregation<Source, aggregation::ARGMIN>() and
-                   cudf::is_relationally_comparable<Source, Source>()>> {
-  __device__ void operator()(mutable_column_device_view target,
-                             size_type target_index,
-                             column_device_view source,
-                             size_type source_index) const noexcept
-  {
-    if (source_has_nulls and source.is_null(source_index)) { return; }
-
-    using Target = target_type_t<Source, aggregation::ARGMIN>;
-    auto old     = cudf::detail::atomic_cas(
-      &target.element<Target>(target_index), ARGMIN_SENTINEL, source_index);
-    if (old != ARGMIN_SENTINEL) {
-      while (source.element<Source>(source_index) < source.element<Source>(old)) {
-        old = cudf::detail::atomic_cas(&target.element<Target>(target_index), old, source_index);
-      }
-    }
-
-    if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); }
-  }
-};
-
-/**
- * @brief Function object to update a single element in a target column by
- * performing an aggregation operation with a single element from a source
- * column.
- *
- * @tparam target_has_nulls Indicates presence of null elements in `target`
- * @tparam source_has_nulls Indicates presence of null elements in `source`.
- */
-template <bool target_has_nulls = true, bool source_has_nulls = true>
-struct elementwise_aggregator {
-  template <typename Source, aggregation::Kind k>
-  __device__ void operator()(mutable_column_device_view target,
-                             size_type target_index,
-                             column_device_view source,
-                             size_type source_index) const noexcept
-  {
-    update_target_element<Source, k, target_has_nulls, source_has_nulls>{}(
-      target, target_index, source, source_index);
-  }
-};
-
-/**
- * @brief Updates a row in `target` by performing elementwise aggregation
- * operations with a row in `source`.
- *
- * For the row in `target` specified by `target_index`, each element at `i` is
- * updated by:
- * ```c++
- * target_row[i] = aggs[i](target_row[i], source_row[i])
- * ```
- *
- * This function only supports aggregations that can be done in a "single pass",
- * i.e., given an initial value `R`, the aggregation `op` can be computed on a series
- * of elements `e[i] for i in [0,n)` by computing `R = op(e[i],R)` for any order
- * of the values of `i`.
- *
- * The initial value and validity of `R` depends on the aggregation:
- * SUM: 0 and NULL
- * MIN: Max value of type and NULL
- * MAX: Min value of type and NULL
- * COUNT_VALID: 0 and VALID
- * COUNT_ALL:   0 and VALID
- * ARGMAX: `ARGMAX_SENTINEL` and NULL
- * ARGMIN: `ARGMIN_SENTINEL` and NULL
- *
- * It is required that the elements of `target` be initialized with the corresponding
- * initial values and validity specified above.
- *
- * Handling of null elements in both `source` and `target` depends on the aggregation:
- * SUM, MIN, MAX, ARGMIN, ARGMAX:
- *  - `source`: Skipped
- *  - `target`: Updated from null to valid upon first successful aggregation
- * COUNT_VALID, COUNT_ALL:
- *  - `source`: Skipped
- *  - `target`: Cannot be null
- *
- * @param target Table containing the row to update
- * @param target_index Index of the row to update in `target`
- * @param source Table containing the row used to update the row in `target`.
- * The invariant `source.num_columns() >= target.num_columns()` must hold.
- * @param source_index Index of the row to use in `source`
- * @param aggs Array of aggregations to perform between elements of the `target`
- * and `source` rows. Must contain at least `target.num_columns()` valid
- * `aggregation::Kind` values.
- */
-template <bool target_has_nulls = true, bool source_has_nulls = true>
-__device__ inline void aggregate_row(mutable_table_device_view target,
-                                     size_type target_index,
-                                     table_device_view source,
-                                     size_type source_index,
-                                     aggregation::Kind const* aggs)
-{
-  for (auto i = 0; i < target.num_columns(); ++i) {
-    dispatch_type_and_aggregation(source.column(i).type(),
-                                  aggs[i],
-                                  elementwise_aggregator<target_has_nulls, source_has_nulls>{},
-                                  target.column(i),
-                                  target_index,
-                                  source.column(i),
-                                  source_index);
-  }
-}
-
 /**
  * @brief Dispatched functor to initialize a column with the identity of an
  * aggregation operation.
diff --git a/cpp/include/cudf/detail/aggregation/device_aggregators.cuh b/cpp/include/cudf/detail/aggregation/device_aggregators.cuh
new file mode 100644
index 00000000000..10be5e1d36f
--- /dev/null
+++ b/cpp/include/cudf/detail/aggregation/device_aggregators.cuh
@@ -0,0 +1,443 @@
+/*
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cudf/aggregation.hpp>
+#include <cudf/column/column_device_view.cuh>
+#include <cudf/detail/aggregation/aggregation.cuh>
+#include <cudf/detail/aggregation/aggregation.hpp>
+#include <cudf/detail/utilities/assert.cuh>
+#include <cudf/detail/utilities/device_atomics.cuh>
+#include <cudf/dictionary/dictionary_column_view.hpp>
+#include <cudf/table/table_device_view.cuh>
+#include <cudf/utilities/traits.cuh>
+
+#include <cuda/std/type_traits>
+
+namespace cudf::detail {
+template <typename Source, aggregation::Kind k, typename Enable = void>
+struct update_target_element {  // primary template; `Enable` is the SFINAE slot
+  __device__ void operator()(mutable_column_device_view target,
+                             size_type target_index,
+                             column_device_view source,
+                             size_type source_index) const noexcept
+  {  // fallback body: used only when no specialization matches (Source, k)
+    CUDF_UNREACHABLE("Invalid source type and aggregation combination.");
+  }
+};
+
+template <typename Source>  // MIN for fixed-width, non-fixed-point types with atomic support
+struct update_target_element<
+  Source,
+  aggregation::MIN,
+  cuda::std::enable_if_t<is_fixed_width<Source>() && cudf::has_atomic_support<Source>() &&
+                         !is_fixed_point<Source>()>> {
+  __device__ void operator()(mutable_column_device_view target,
+                             size_type target_index,
+                             column_device_view source,
+                             size_type source_index) const noexcept
+  {
+    if (source.is_null(source_index)) { return; }  // null source rows are skipped
+    // Target is the result type selected by target_type_t for a MIN over Source.
+    using Target = target_type_t<Source, aggregation::MIN>;
+    cudf::detail::atomic_min(&target.element<Target>(target_index),
+                             static_cast<Target>(source.element<Source>(source_index)));
+    // A target row still flagged null is marked valid now that a value was applied.
+    if (target.is_null(target_index)) { target.set_valid(target_index); }
+  }
+};
+
+template <typename Source>  // MIN for fixed-point types whose storage type has atomic support
+struct update_target_element<
+  Source,
+  aggregation::MIN,
+  cuda::std::enable_if_t<is_fixed_point<Source>() &&
+                         cudf::has_atomic_support<device_storage_type_t<Source>>()>> {
+  __device__ void operator()(mutable_column_device_view target,
+                             size_type target_index,
+                             column_device_view source,
+                             size_type source_index) const noexcept
+  {
+    if (source.is_null(source_index)) { return; }  // null source rows are skipped
+    // Elements are read and written as their device_storage_type_t representations.
+    using Target       = target_type_t<Source, aggregation::MIN>;
+    using DeviceTarget = device_storage_type_t<Target>;
+    using DeviceSource = device_storage_type_t<Source>;
+
+    cudf::detail::atomic_min(&target.element<DeviceTarget>(target_index),
+                             static_cast<DeviceTarget>(source.element<DeviceSource>(source_index)));
+    // A target row still flagged null is marked valid now that a value was applied.
+    if (target.is_null(target_index)) { target.set_valid(target_index); }
+  }
+};
+
+template <typename Source>  // MAX for fixed-width, non-fixed-point types with atomic support
+struct update_target_element<
+  Source,
+  aggregation::MAX,
+  cuda::std::enable_if_t<is_fixed_width<Source>() && cudf::has_atomic_support<Source>() &&
+                         !is_fixed_point<Source>()>> {
+  __device__ void operator()(mutable_column_device_view target,
+                             size_type target_index,
+                             column_device_view source,
+                             size_type source_index) const noexcept
+  {
+    if (source.is_null(source_index)) { return; }  // null source rows are skipped
+    // Target is the result type selected by target_type_t for a MAX over Source.
+    using Target = target_type_t<Source, aggregation::MAX>;
+    cudf::detail::atomic_max(&target.element<Target>(target_index),
+                             static_cast<Target>(source.element<Source>(source_index)));
+    // A target row still flagged null is marked valid now that a value was applied.
+    if (target.is_null(target_index)) { target.set_valid(target_index); }
+  }
+};
+
+template <typename Source>  // MAX for fixed-point types whose storage type has atomic support
+struct update_target_element<
+  Source,
+  aggregation::MAX,
+  cuda::std::enable_if_t<is_fixed_point<Source>() &&
+                         cudf::has_atomic_support<device_storage_type_t<Source>>()>> {
+  __device__ void operator()(mutable_column_device_view target,
+                             size_type target_index,
+                             column_device_view source,
+                             size_type source_index) const noexcept
+  {
+    if (source.is_null(source_index)) { return; }  // null source rows are skipped
+    // Elements are read and written as their device_storage_type_t representations.
+    using Target       = target_type_t<Source, aggregation::MAX>;
+    using DeviceTarget = device_storage_type_t<Target>;
+    using DeviceSource = device_storage_type_t<Source>;
+
+    cudf::detail::atomic_max(&target.element<DeviceTarget>(target_index),
+                             static_cast<DeviceTarget>(source.element<DeviceSource>(source_index)));
+    // A target row still flagged null is marked valid now that a value was applied.
+    if (target.is_null(target_index)) { target.set_valid(target_index); }
+  }
+};
+
+template <typename Source>  // SUM for fixed-width atomic types; no fixed-point or timestamp
+struct update_target_element<
+  Source,
+  aggregation::SUM,
+  cuda::std::enable_if_t<cudf::is_fixed_width<Source>() && cudf::has_atomic_support<Source>() &&
+                         !cudf::is_fixed_point<Source>() && !cudf::is_timestamp<Source>()>> {
+  __device__ void operator()(mutable_column_device_view target,
+                             size_type target_index,
+                             column_device_view source,
+                             size_type source_index) const noexcept
+  {
+    if (source.is_null(source_index)) { return; }  // null source rows are skipped
+    // The source value is converted to the SUM result type before being added.
+    using Target = target_type_t<Source, aggregation::SUM>;
+    cudf::detail::atomic_add(&target.element<Target>(target_index),
+                             static_cast<Target>(source.element<Source>(source_index)));
+    // A target row still flagged null is marked valid now that a value was applied.
+    if (target.is_null(target_index)) { target.set_valid(target_index); }
+  }
+};
+
+template <typename Source>  // SUM for fixed-point types whose storage type has atomic support
+struct update_target_element<
+  Source,
+  aggregation::SUM,
+  cuda::std::enable_if_t<is_fixed_point<Source>() &&
+                         cudf::has_atomic_support<device_storage_type_t<Source>>()>> {
+  __device__ void operator()(mutable_column_device_view target,
+                             size_type target_index,
+                             column_device_view source,
+                             size_type source_index) const noexcept
+  {
+    if (source.is_null(source_index)) { return; }  // null source rows are skipped
+    // Elements are read and written as their device_storage_type_t representations.
+    using Target       = target_type_t<Source, aggregation::SUM>;
+    using DeviceTarget = device_storage_type_t<Target>;
+    using DeviceSource = device_storage_type_t<Source>;
+
+    cudf::detail::atomic_add(&target.element<DeviceTarget>(target_index),
+                             static_cast<DeviceTarget>(source.element<DeviceSource>(source_index)));
+    // A target row still flagged null is marked valid now that a value was applied.
+    if (target.is_null(target_index)) { target.set_valid(target_index); }
+  }
+};
+
+/**
+ * @brief Function object that updates a single element of a target column from
+ * the dictionary key addressed by the given index.
+ *
+ * SFINAE on `is_dictionary<Source>()` selects between two overloads: non-dictionary key
+ * types forward to `update_target_element`, while the dictionary overload is an empty no-op.
+ * This prevents recursion for dictionary type, because dictionary keys cannot be a dictionary.
+ */
+struct update_target_from_dictionary {
+  template <typename Source,
+            aggregation::Kind k,
+            cuda::std::enable_if_t<!is_dictionary<Source>()>* = nullptr>
+  __device__ void operator()(mutable_column_device_view target,
+                             size_type target_index,
+                             column_device_view source,
+                             size_type source_index) const noexcept
+  {
+    update_target_element<Source, k>{}(target, target_index, source, source_index);
+  }
+  template <typename Source,
+            aggregation::Kind k,
+            cuda::std::enable_if_t<is_dictionary<Source>()>* = nullptr>
+  __device__ void operator()(mutable_column_device_view target,
+                             size_type target_index,
+                             column_device_view source,
+                             size_type source_index) const noexcept
+  {  // empty body: a dictionary-typed Source performs no update here
+  }
+};
+
+/**
+ * @brief Specialization of `update_target_element` for `dictionary32` sources.
+ *
+ * Enabled for every aggregation kind except ARGMIN, ARGMAX, COUNT_VALID and COUNT_ALL.
+ * Null source rows are skipped. Otherwise the functor takes the dictionary's keys child
+ * column, reads the element at `source_index` as the index into that keys column, and uses
+ * `dispatch_type_and_aggregation` with `update_target_from_dictionary` to resolve the keys
+ * column type and apply the aggregation.
+ *
+ * `update_target_element( target, target_index, source.keys(), source.indices()[source_index] )`
+ */
+template <aggregation::Kind k>
+struct update_target_element<
+  dictionary32,
+  k,
+  cuda::std::enable_if_t<not(k == aggregation::ARGMIN or k == aggregation::ARGMAX or
+                             k == aggregation::COUNT_VALID or k == aggregation::COUNT_ALL)>> {
+  __device__ void operator()(mutable_column_device_view target,
+                             size_type target_index,
+                             column_device_view source,
+                             size_type source_index) const noexcept
+  {
+    if (source.is_null(source_index)) { return; }  // null source rows are skipped
+    // Re-dispatch on the keys column type; the row's dictionary32 value becomes the source index.
+    dispatch_type_and_aggregation(
+      source.child(cudf::dictionary_column_view::keys_column_index).type(),
+      k,
+      update_target_from_dictionary{},
+      target,
+      target_index,
+      source.child(cudf::dictionary_column_view::keys_column_index),
+      static_cast<cudf::size_type>(source.element<dictionary32>(source_index)));
+  }
+};
+
+template <typename Source>  // SUM_OF_SQUARES for types accepted by is_product_supported
+struct update_target_element<Source,
+                             aggregation::SUM_OF_SQUARES,
+                             cuda::std::enable_if_t<is_product_supported<Source>()>> {
+  __device__ void operator()(mutable_column_device_view target,
+                             size_type target_index,
+                             column_device_view source,
+                             size_type source_index) const noexcept
+  {
+    if (source.is_null(source_index)) { return; }  // null source rows are skipped
+    // Convert to the result type first so the square is computed in Target.
+    using Target = target_type_t<Source, aggregation::SUM_OF_SQUARES>;
+    auto value   = static_cast<Target>(source.element<Source>(source_index));
+    cudf::detail::atomic_add(&target.element<Target>(target_index), value * value);
+    if (target.is_null(target_index)) { target.set_valid(target_index); }  // null -> valid
+  }
+};
+
+template <typename Source>  // PRODUCT for types accepted by is_product_supported
+struct update_target_element<Source,
+                             aggregation::PRODUCT,
+                             cuda::std::enable_if_t<is_product_supported<Source>()>> {
+  __device__ void operator()(mutable_column_device_view target,
+                             size_type target_index,
+                             column_device_view source,
+                             size_type source_index) const noexcept
+  {
+    if (source.is_null(source_index)) { return; }  // null source rows are skipped
+    // The source value is converted to the PRODUCT result type before atomic_mul is applied.
+    using Target = target_type_t<Source, aggregation::PRODUCT>;
+    cudf::detail::atomic_mul(&target.element<Target>(target_index),
+                             static_cast<Target>(source.element<Source>(source_index)));
+    if (target.is_null(target_index)) { target.set_valid(target_index); }  // null -> valid
+  }
+};
+
+template <typename Source>
+struct update_target_element<
+  Source,
+  aggregation::COUNT_VALID,
+  cuda::std::enable_if_t<is_valid_aggregation<Source, aggregation::COUNT_VALID>()>> {
+  __device__ void operator()(mutable_column_device_view target,
+                             size_type target_index,
+                             column_device_view source,
+                             size_type source_index) const noexcept
+  {
+    if (source.is_null(source_index)) { return; }
+
+    using Target = target_type_t<Source, aggregation::COUNT_VALID>;
+    cudf::detail::atomic_add(&target.element<Target>(target_index), Target{1});
+
+    // It is assumed the output for COUNT_VALID is initialized to be all valid
+  }
+};
+
+template <typename Source>
+struct update_target_element<
+  Source,
+  aggregation::COUNT_ALL,
+  cuda::std::enable_if_t<is_valid_aggregation<Source, aggregation::COUNT_ALL>()>> {
+  __device__ void operator()(mutable_column_device_view target,
+                             size_type target_index,
+                             column_device_view source,
+                             size_type source_index) const noexcept
+  {
+    using Target = target_type_t<Source, aggregation::COUNT_ALL>;
+    cudf::detail::atomic_add(&target.element<Target>(target_index), Target{1});
+
+    // It is assumed the output for COUNT_ALL is initialized to be all valid
+  }
+};
+
+template <typename Source>
+struct update_target_element<
+  Source,
+  aggregation::ARGMAX,
+  cuda::std::enable_if_t<is_valid_aggregation<Source, aggregation::ARGMAX>() and
+                         cudf::is_relationally_comparable<Source, Source>()>> {
+  __device__ void operator()(mutable_column_device_view target,
+                             size_type target_index,
+                             column_device_view source,
+                             size_type source_index) const noexcept
+  {
+    if (source.is_null(source_index)) { return; }
+
+    using Target = target_type_t<Source, aggregation::ARGMAX>;
+    auto old     = cudf::detail::atomic_cas(
+      &target.element<Target>(target_index), ARGMAX_SENTINEL, source_index);
+    if (old != ARGMAX_SENTINEL) {
+      while (source.element<Source>(source_index) > source.element<Source>(old)) {
+        old = cudf::detail::atomic_cas(&target.element<Target>(target_index), old, source_index);
+      }
+    }
+
+    if (target.is_null(target_index)) { target.set_valid(target_index); }
+  }
+};
+
+template <typename Source>
+struct update_target_element<
+  Source,
+  aggregation::ARGMIN,
+  cuda::std::enable_if_t<is_valid_aggregation<Source, aggregation::ARGMIN>() and
+                         cudf::is_relationally_comparable<Source, Source>()>> {
+  __device__ void operator()(mutable_column_device_view target,
+                             size_type target_index,
+                             column_device_view source,
+                             size_type source_index) const noexcept
+  {
+    if (source.is_null(source_index)) { return; }
+
+    using Target = target_type_t<Source, aggregation::ARGMIN>;
+    auto old     = cudf::detail::atomic_cas(
+      &target.element<Target>(target_index), ARGMIN_SENTINEL, source_index);
+    if (old != ARGMIN_SENTINEL) {
+      while (source.element<Source>(source_index) < source.element<Source>(old)) {
+        old = cudf::detail::atomic_cas(&target.element<Target>(target_index), old, source_index);
+      }
+    }
+
+    if (target.is_null(target_index)) { target.set_valid(target_index); }
+  }
+};
+
+/**
+ * @brief Function object to update a single element in a target column by
+ * performing an aggregation operation with a single element from a source
+ * column.
+ */
+struct elementwise_aggregator {
+  template <typename Source, aggregation::Kind k>
+  __device__ void operator()(mutable_column_device_view target,
+                             size_type target_index,
+                             column_device_view source,
+                             size_type source_index) const noexcept
+  {
+    update_target_element<Source, k>{}(target, target_index, source, source_index);
+  }
+};
+
+/**
+ * @brief Updates a row in `target` by performing elementwise aggregation
+ * operations with a row in `source`.
+ *
+ * For the row in `target` specified by `target_index`, each element at `i` is
+ * updated by:
+ * ```c++
+ * target_row[i] = aggs[i](target_row[i], source_row[i])
+ * ```
+ *
+ * This function only supports aggregations that can be done in a "single pass",
+ * i.e., given an initial value `R`, the aggregation `op` can be computed on a series
+ * of elements `e[i] for i in [0,n)` by computing `R = op(e[i],R)` for any order
+ * of the values of `i`.
+ *
+ * The initial value and validity of `R` depends on the aggregation:
+ * SUM: 0 and NULL
+ * MIN: Max value of type and NULL
+ * MAX: Min value of type and NULL
+ * COUNT_VALID: 0 and VALID
+ * COUNT_ALL:   0 and VALID
+ * ARGMAX: `ARGMAX_SENTINEL` and NULL
+ * ARGMIN: `ARGMIN_SENTINEL` and NULL
+ *
+ * It is required that the elements of `target` be initialized with the corresponding
+ * initial values and validity specified above.
+ *
+ * Handling of null elements in both `source` and `target` depends on the aggregation:
+ * SUM, PRODUCT, SUM_OF_SQUARES, MIN, MAX, ARGMIN, ARGMAX:
+ *  - `source`: Skipped
+ *  - `target`: Updated from null to valid upon first successful aggregation
+ * COUNT_VALID, COUNT_ALL:
+ *  - `source`: Skipped for COUNT_VALID; counted like any other row for COUNT_ALL
+ *  - `target`: Cannot be null
+ *
+ * @param target Table containing the row to update
+ * @param target_index Index of the row to update in `target`
+ * @param source Table containing the row used to update the row in `target`.
+ * The invariant `source.num_columns() >= target.num_columns()` must hold.
+ * @param source_index Index of the row to use in `source`
+ * @param aggs Array of aggregations to perform between elements of the `target`
+ * and `source` rows. Must contain at least `target.num_columns()` valid
+ * `aggregation::Kind` values.
+ */
+__device__ inline void aggregate_row(mutable_table_device_view target,
+                                     size_type target_index,
+                                     table_device_view source,
+                                     size_type source_index,
+                                     aggregation::Kind const* aggs)
+{
+  for (auto i = 0; i < target.num_columns(); ++i) {
+    dispatch_type_and_aggregation(source.column(i).type(),
+                                  aggs[i],
+                                  elementwise_aggregator{},
+                                  target.column(i),
+                                  target_index,
+                                  source.column(i),
+                                  source_index);
+  }
+}
+}  // namespace cudf::detail
diff --git a/cpp/include/cudf/detail/tdigest/tdigest.hpp b/cpp/include/cudf/detail/tdigest/tdigest.hpp
index 80a4460023f..4295f5e6ddd 100644
--- a/cpp/include/cudf/detail/tdigest/tdigest.hpp
+++ b/cpp/include/cudf/detail/tdigest/tdigest.hpp
@@ -143,28 +143,30 @@ std::unique_ptr<column> make_tdigest_column(size_type num_rows,
                                             rmm::device_async_resource_ref mr);
 
 /**
- * @brief Create an empty tdigest column.
+ * @brief Create a tdigest column of empty tdigests.
  *
- * An empty tdigest column contains a single row of length 0
+ * The column created contains the specified number of rows of empty tdigests.
  *
+ * @param num_rows The number of rows in the output column.
  * @param stream CUDA stream used for device memory operations and kernel launches.
  * @param mr Device memory resource used to allocate the returned column's device memory.
  *
- * @returns An empty tdigest column.
+ * @returns A tdigest column of empty clusters.
  */
 CUDF_EXPORT
-std::unique_ptr<column> make_empty_tdigest_column(rmm::cuda_stream_view stream,
-                                                  rmm::device_async_resource_ref mr);
+std::unique_ptr<column> make_empty_tdigests_column(size_type num_rows,
+                                                   rmm::cuda_stream_view stream,
+                                                   rmm::device_async_resource_ref mr);
 
 /**
- * @brief Create an empty tdigest scalar.
+ * @brief Create a scalar of an empty tdigest cluster.
  *
- * An empty tdigest scalar is a struct_scalar that contains a single row of length 0
+ * The returned scalar is a struct_scalar that contains a single row of an empty cluster.
  *
  * @param stream CUDA stream used for device memory operations and kernel launches.
  * @param mr Device memory resource used to allocate the returned column's device memory.
  *
- * @returns An empty tdigest scalar.
+ * @returns A scalar of an empty tdigest cluster.
  */
 std::unique_ptr<scalar> make_empty_tdigest_scalar(rmm::cuda_stream_view stream,
                                                   rmm::device_async_resource_ref mr);
diff --git a/cpp/include/cudf_test/tdigest_utilities.cuh b/cpp/include/cudf_test/tdigest_utilities.cuh
index 1758790cd64..c259d61060b 100644
--- a/cpp/include/cudf_test/tdigest_utilities.cuh
+++ b/cpp/include/cudf_test/tdigest_utilities.cuh
@@ -270,8 +270,8 @@ void tdigest_simple_all_nulls_aggregation(Func op)
     static_cast<column_view>(values).type(), tdigest_gen{}, op, values, delta);
 
   // NOTE: an empty tdigest column still has 1 row.
-  auto expected = cudf::tdigest::detail::make_empty_tdigest_column(
-    cudf::get_default_stream(), cudf::get_current_device_resource_ref());
+  auto expected = cudf::tdigest::detail::make_empty_tdigests_column(
+    1, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*result, *expected);
 }
@@ -562,12 +562,12 @@ template <typename MergeFunc>
 void tdigest_merge_empty(MergeFunc merge_op)
 {
   // 3 empty tdigests all in the same group
-  auto a = cudf::tdigest::detail::make_empty_tdigest_column(
-    cudf::get_default_stream(), cudf::get_current_device_resource_ref());
-  auto b = cudf::tdigest::detail::make_empty_tdigest_column(
-    cudf::get_default_stream(), cudf::get_current_device_resource_ref());
-  auto c = cudf::tdigest::detail::make_empty_tdigest_column(
-    cudf::get_default_stream(), cudf::get_current_device_resource_ref());
+  auto a = cudf::tdigest::detail::make_empty_tdigests_column(
+    1, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
+  auto b = cudf::tdigest::detail::make_empty_tdigests_column(
+    1, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
+  auto c = cudf::tdigest::detail::make_empty_tdigests_column(
+    1, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
   std::vector<column_view> cols;
   cols.push_back(*a);
   cols.push_back(*b);
@@ -577,8 +577,8 @@ void tdigest_merge_empty(MergeFunc merge_op)
   auto const delta = 1000;
   auto result      = merge_op(*values, delta);
 
-  auto expected = cudf::tdigest::detail::make_empty_tdigest_column(
-    cudf::get_default_stream(), cudf::get_current_device_resource_ref());
+  auto expected = cudf::tdigest::detail::make_empty_tdigests_column(
+    1, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*expected, *result);
 }
diff --git a/cpp/src/aggregation/aggregation.cu b/cpp/src/aggregation/aggregation.cu
index 02998b84ffd..d915c85bf85 100644
--- a/cpp/src/aggregation/aggregation.cu
+++ b/cpp/src/aggregation/aggregation.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,9 +15,13 @@
  */
 
 #include <cudf/detail/aggregation/aggregation.cuh>
+#include <cudf/detail/aggregation/aggregation.hpp>
+#include <cudf/table/table_view.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 
+#include <vector>
+
 namespace cudf {
 namespace detail {
 void initialize_with_identity(mutable_table_view& table,
diff --git a/cpp/src/groupby/hash/groupby_kernels.cuh b/cpp/src/groupby/hash/groupby_kernels.cuh
index 9abfe22950a..188d0cff3f1 100644
--- a/cpp/src/groupby/hash/groupby_kernels.cuh
+++ b/cpp/src/groupby/hash/groupby_kernels.cuh
@@ -18,8 +18,8 @@
 
 #include "multi_pass_kernels.cuh"
 
-#include <cudf/detail/aggregation/aggregation.cuh>
 #include <cudf/detail/aggregation/aggregation.hpp>
+#include <cudf/detail/aggregation/device_aggregators.cuh>
 #include <cudf/groupby.hpp>
 #include <cudf/utilities/bit.hpp>
 
@@ -100,7 +100,7 @@ struct compute_single_pass_aggs_fn {
     if (not skip_rows_with_nulls or cudf::bit_is_set(row_bitmask, i)) {
       auto const result = set.insert_and_find(i);
 
-      cudf::detail::aggregate_row<true, true>(output_values, *result.first, input_values, i, aggs);
+      cudf::detail::aggregate_row(output_values, *result.first, input_values, i, aggs);
     }
   }
 };
diff --git a/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh b/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh
index 2358f47bbbb..f9adfc6060e 100644
--- a/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh
+++ b/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh
@@ -25,6 +25,7 @@
 #include <cudf/detail/iterator.cuh>
 #include <cudf/detail/utilities/element_argminmax.cuh>
 #include <cudf/detail/valid_if.cuh>
+#include <cudf/dictionary/dictionary_column_view.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/span.hpp>
diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu
index bf81162a0ac..76816071d8c 100644
--- a/cpp/src/io/json/nested_json_gpu.cu
+++ b/cpp/src/io/json/nested_json_gpu.cu
@@ -620,10 +620,12 @@ struct PdaSymbolToSymbolGroupId {
     // We map the delimiter character to LINE_BREAK symbol group id, and the newline character
     // to WHITE_SPACE. Note that delimiter cannot be any of opening(closing) brace, bracket, quote,
     // escape, comma, colon or whitespace characters.
+    auto constexpr newline    = '\n';
+    auto constexpr whitespace = ' ';
     auto const symbol_position =
       symbol == delimiter
-        ? static_cast<int32_t>('\n')
-        : (symbol == '\n' ? static_cast<int32_t>(' ') : static_cast<int32_t>(symbol));
+        ? static_cast<int32_t>(newline)
+        : (symbol == newline ? static_cast<int32_t>(whitespace) : static_cast<int32_t>(symbol));
     PdaSymbolGroupIdT symbol_gid =
       tos_sg_to_pda_sgid[min(symbol_position, pda_sgid_lookup_size - 1)];
     return stack_idx * static_cast<PdaSymbolGroupIdT>(symbol_group_id::NUM_PDA_INPUT_SGS) +
diff --git a/cpp/src/quantiles/tdigest/tdigest.cu b/cpp/src/quantiles/tdigest/tdigest.cu
index 0d017cf1f13..43c3b0a291b 100644
--- a/cpp/src/quantiles/tdigest/tdigest.cu
+++ b/cpp/src/quantiles/tdigest/tdigest.cu
@@ -292,32 +292,33 @@ std::unique_ptr<column> make_tdigest_column(size_type num_rows,
   return make_structs_column(num_rows, std::move(children), 0, {}, stream, mr);
 }
 
-std::unique_ptr<column> make_empty_tdigest_column(rmm::cuda_stream_view stream,
-                                                  rmm::device_async_resource_ref mr)
+std::unique_ptr<column> make_empty_tdigests_column(size_type num_rows,
+                                                   rmm::cuda_stream_view stream,
+                                                   rmm::device_async_resource_ref mr)
 {
   auto offsets = cudf::make_fixed_width_column(
-    data_type(type_id::INT32), 2, mask_state::UNALLOCATED, stream, mr);
+    data_type(type_id::INT32), num_rows + 1, mask_state::UNALLOCATED, stream, mr);
   thrust::fill(rmm::exec_policy(stream),
                offsets->mutable_view().begin<size_type>(),
                offsets->mutable_view().end<size_type>(),
                0);
 
-  auto min_col =
-    cudf::make_numeric_column(data_type(type_id::FLOAT64), 1, mask_state::UNALLOCATED, stream, mr);
+  auto min_col = cudf::make_numeric_column(
+    data_type(type_id::FLOAT64), num_rows, mask_state::UNALLOCATED, stream, mr);
   thrust::fill(rmm::exec_policy(stream),
                min_col->mutable_view().begin<double>(),
                min_col->mutable_view().end<double>(),
                0);
-  auto max_col =
-    cudf::make_numeric_column(data_type(type_id::FLOAT64), 1, mask_state::UNALLOCATED, stream, mr);
+  auto max_col = cudf::make_numeric_column(
+    data_type(type_id::FLOAT64), num_rows, mask_state::UNALLOCATED, stream, mr);
   thrust::fill(rmm::exec_policy(stream),
                max_col->mutable_view().begin<double>(),
                max_col->mutable_view().end<double>(),
                0);
 
-  return make_tdigest_column(1,
-                             make_empty_column(type_id::FLOAT64),
-                             make_empty_column(type_id::FLOAT64),
+  return make_tdigest_column(num_rows,
+                             cudf::make_empty_column(type_id::FLOAT64),
+                             cudf::make_empty_column(type_id::FLOAT64),
                              std::move(offsets),
                              std::move(min_col),
                              std::move(max_col),
@@ -338,7 +339,7 @@ std::unique_ptr<column> make_empty_tdigest_column(rmm::cuda_stream_view stream,
 std::unique_ptr<scalar> make_empty_tdigest_scalar(rmm::cuda_stream_view stream,
                                                   rmm::device_async_resource_ref mr)
 {
-  auto contents = make_empty_tdigest_column(stream, mr)->release();
+  auto contents = make_empty_tdigests_column(1, stream, mr)->release();
   return std::make_unique<struct_scalar>(
     std::move(*std::make_unique<table>(std::move(contents.children))), true, stream, mr);
 }
diff --git a/cpp/src/quantiles/tdigest/tdigest_aggregation.cu b/cpp/src/quantiles/tdigest/tdigest_aggregation.cu
index e1c1d2e3002..b0a84a6d50c 100644
--- a/cpp/src/quantiles/tdigest/tdigest_aggregation.cu
+++ b/cpp/src/quantiles/tdigest/tdigest_aggregation.cu
@@ -169,19 +169,19 @@ struct nearest_value_scalar_weights {
  */
 template <typename GroupOffsetsIter>
 struct nearest_value_centroid_weights {
-  double const* cumulative_weights;
-  GroupOffsetsIter outer_offsets;  // groups
-  size_type const* inner_offsets;  // tdigests within a group
+  double const* cumulative_weights;  // cumulative weights of non-empty clusters
+  GroupOffsetsIter group_offsets;    // groups
+  size_type const* tdigest_offsets;  // tdigests within a group
 
   thrust::pair<double, int> operator() __device__(double next_limit, size_type group_index) const
   {
-    auto const tdigest_begin = outer_offsets[group_index];
-    auto const tdigest_end   = outer_offsets[group_index + 1];
-    auto const num_weights   = inner_offsets[tdigest_end] - inner_offsets[tdigest_begin];
+    auto const tdigest_begin = group_offsets[group_index];
+    auto const tdigest_end   = group_offsets[group_index + 1];
+    auto const num_weights   = tdigest_offsets[tdigest_end] - tdigest_offsets[tdigest_begin];
     // NOTE: as it is today, this functor will never be called for any digests that are empty, but
     // I'll leave this check here for safety.
     if (num_weights == 0) { return thrust::pair<double, int>{0, 0}; }
-    double const* group_cumulative_weights = cumulative_weights + inner_offsets[tdigest_begin];
+    double const* group_cumulative_weights = cumulative_weights + tdigest_offsets[tdigest_begin];
 
     auto const index = ((thrust::lower_bound(thrust::seq,
                                              group_cumulative_weights,
@@ -235,21 +235,26 @@ struct cumulative_scalar_weight {
  */
 template <typename GroupLabelsIter, typename GroupOffsetsIter>
 struct cumulative_centroid_weight {
-  double const* cumulative_weights;
-  GroupLabelsIter group_labels;
-  GroupOffsetsIter outer_offsets;                    // groups
-  cudf::device_span<size_type const> inner_offsets;  // tdigests with a group
-
+  double const* cumulative_weights;  // cumulative weights of non-empty clusters
+  GroupLabelsIter group_labels;      // group labels for each tdigest including empty ones
+  GroupOffsetsIter group_offsets;    // groups
+  cudf::device_span<size_type const> tdigest_offsets;  // tdigests within a group
+
+  /**
+   * @brief Returns the cumulative weight for a given value index. The index `n` is the index of
+   * the `n`-th non-empty cluster.
+   */
   std::tuple<size_type, size_type, double> operator() __device__(size_type value_index) const
   {
     auto const tdigest_index =
       static_cast<size_type>(
-        thrust::upper_bound(thrust::seq, inner_offsets.begin(), inner_offsets.end(), value_index) -
-        inner_offsets.begin()) -
+        thrust::upper_bound(
+          thrust::seq, tdigest_offsets.begin(), tdigest_offsets.end(), value_index) -
+        tdigest_offsets.begin()) -
       1;
     auto const group_index                 = group_labels[tdigest_index];
-    auto const first_tdigest_index         = outer_offsets[group_index];
-    auto const first_weight_index          = inner_offsets[first_tdigest_index];
+    auto const first_tdigest_index         = group_offsets[group_index];
+    auto const first_weight_index          = tdigest_offsets[first_tdigest_index];
     auto const relative_value_index        = value_index - first_weight_index;
     double const* group_cumulative_weights = cumulative_weights + first_weight_index;
 
@@ -284,15 +289,15 @@ struct scalar_group_info {
 // retrieve group info of centroid inputs by group index
 template <typename GroupOffsetsIter>
 struct centroid_group_info {
-  double const* cumulative_weights;
-  GroupOffsetsIter outer_offsets;
-  size_type const* inner_offsets;
+  double const* cumulative_weights;  // cumulative weights of non-empty clusters
+  GroupOffsetsIter group_offsets;
+  size_type const* tdigest_offsets;
 
   __device__ thrust::tuple<double, size_type, size_type> operator()(size_type group_index) const
   {
     // if there's no weights in this group of digests at all, return 0.
-    auto const group_start       = inner_offsets[outer_offsets[group_index]];
-    auto const group_end         = inner_offsets[outer_offsets[group_index + 1]];
+    auto const group_start       = tdigest_offsets[group_offsets[group_index]];
+    auto const group_end         = tdigest_offsets[group_offsets[group_index + 1]];
     auto const num_weights       = group_end - group_start;
     auto const last_weight_index = group_end - 1;
     return num_weights == 0
@@ -367,7 +372,6 @@ std::unique_ptr<scalar> to_tdigest_scalar(std::unique_ptr<column>&& tdigest,
  * @param group_num_clusters  Output.  The number of output clusters for each input group.
  * @param group_cluster_offsets  Offsets per-group to the start of it's clusters
  * @param has_nulls Whether or not the input contains nulls
- *
  */
 
 template <typename GroupInfo, typename NearestWeightFunc, typename CumulativeWeight>
@@ -661,6 +665,10 @@ std::unique_ptr<column> build_output_column(size_type num_rows,
                                                     mr);
 }
 
+/**
+ * @brief A functor which returns the cluster index within a group that the value at
+ * the given value index falls into.
+ */
 template <typename CumulativeWeight>
 struct compute_tdigests_keys_fn {
   int const delta;
@@ -706,8 +714,8 @@ struct compute_tdigests_keys_fn {
  * boundaries.
  *
  * @param delta              tdigest compression level
- * @param values_begin       Beginning of the range of input values.
- * @param values_end         End of the range of input values.
+ * @param centroids_begin    Beginning of the range of centroids.
+ * @param centroids_end      End of the range of centroids.
  * @param cumulative_weight  Functor which returns cumulative weight and group information for
  * an absolute input value index.
  * @param min_col            Column containing the minimum value per group.
@@ -750,7 +758,9 @@ std::unique_ptr<column> compute_tdigests(int delta,
   //   double       // max
   // }
   //
-  if (total_clusters == 0) { return cudf::tdigest::detail::make_empty_tdigest_column(stream, mr); }
+  if (total_clusters == 0) {
+    return cudf::tdigest::detail::make_empty_tdigests_column(1, stream, mr);
+  }
 
   // each input group represents an individual tdigest.  within each tdigest, we want the keys
   // to represent cluster indices (for example, if a tdigest had 100 clusters, the keys should fall
@@ -983,38 +993,54 @@ struct typed_reduce_tdigest {
   }
 };
 
-// utility for merge_tdigests.
+/**
+ * @brief Functor to compute the number of clusters in each group.
+ *
+ * Used in `merge_tdigests`.
+ */
 template <typename GroupOffsetsIter>
-struct group_num_weights_func {
-  GroupOffsetsIter outer_offsets;
-  size_type const* inner_offsets;
+struct group_num_clusters_func {
+  GroupOffsetsIter group_offsets;
+  size_type const* tdigest_offsets;
 
   __device__ size_type operator()(size_type group_index)
   {
-    auto const tdigest_begin = outer_offsets[group_index];
-    auto const tdigest_end   = outer_offsets[group_index + 1];
-    return inner_offsets[tdigest_end] - inner_offsets[tdigest_begin];
+    auto const tdigest_begin = group_offsets[group_index];
+    auto const tdigest_end   = group_offsets[group_index + 1];
+    return tdigest_offsets[tdigest_end] - tdigest_offsets[tdigest_begin];
   }
 };
 
-// utility for merge_tdigests.
+/**
+ * @brief Functor to determine if a group is empty.
+ *
+ * Used in `merge_tdigests`.
+ */
 struct group_is_empty {
   __device__ bool operator()(size_type group_size) { return group_size == 0; }
 };
 
-// utility for merge_tdigests.
+/**
+ * @brief Functor that returns the grouping key for each tdigest cluster.
+ *
+ * Used in `merge_tdigests`.
+ */
 template <typename GroupLabelsIter>
 struct group_key_func {
   GroupLabelsIter group_labels;
-  size_type const* inner_offsets;
-  size_type num_inner_offsets;
+  size_type const* tdigest_offsets;
+  size_type num_tdigest_offsets;
 
+  /**
+   * @brief Returns the group index for an absolute cluster index. The index `n` is the index of the
+   * `n`-th non-empty cluster.
+   */
   __device__ size_type operator()(size_type index)
   {
     // what -original- tdigest index this absolute index corresponds to
-    auto const iter = thrust::prev(
-      thrust::upper_bound(thrust::seq, inner_offsets, inner_offsets + num_inner_offsets, index));
-    auto const tdigest_index = thrust::distance(inner_offsets, iter);
+    auto const iter          = thrust::prev(thrust::upper_bound(
+      thrust::seq, tdigest_offsets, tdigest_offsets + num_tdigest_offsets, index));
+    auto const tdigest_index = thrust::distance(tdigest_offsets, iter);
 
     // what group index the original tdigest belongs to
     return group_labels[tdigest_index];
@@ -1040,8 +1066,8 @@ std::pair<rmm::device_uvector<double>, rmm::device_uvector<double>> generate_mer
 
   // each group represents a collection of tdigest columns. each row is 1 tdigest.
   // within each group, we want to sort all the centroids within all the tdigests
-  // in that group, using the means as the key. the "outer offsets" represent the indices of the
-  // tdigests, and the "inner offsets" represents the list of centroids for a particular tdigest.
+  // in that group, using the means as the key. the "group offsets" represent the indices of the
+  // tdigests, and the "tdigest offsets" represents the list of centroids for a particular tdigest.
   //
   //  rows
   //  ----        centroid 0 ---------
@@ -1054,12 +1080,12 @@ std::pair<rmm::device_uvector<double>, rmm::device_uvector<double>> generate_mer
   //  tdigest 3   centroid 7
   //              centroid 8
   //  ----        centroid 9 --------
-  auto inner_offsets    = tdv.centroids().offsets();
+  auto tdigest_offsets  = tdv.centroids().offsets();
   auto centroid_offsets = cudf::detail::make_counting_transform_iterator(
     0,
     cuda::proclaim_return_type<size_type>(
-      [group_offsets, inner_offsets = tdv.centroids().offsets().begin<size_type>()] __device__(
-        size_type i) { return inner_offsets[group_offsets[i]]; }));
+      [group_offsets, tdigest_offsets = tdv.centroids().offsets().begin<size_type>()] __device__(
+        size_type i) { return tdigest_offsets[group_offsets[i]]; }));
 
   // perform the sort using the means as the key
   size_t temp_size;
@@ -1091,9 +1117,34 @@ std::pair<rmm::device_uvector<double>, rmm::device_uvector<double>> generate_mer
   return {std::move(output_means), std::move(output_weights)};
 }
 
+/**
+ * @brief Perform a merge aggregation of tdigests. This function usually takes the input as the
+ * outputs of multiple `typed_group_tdigest` calls, and merges them.
+ *
+ * A tdigest can be empty in the input, which means that there was no valid input data to generate
+ * it. These empty tdigests will have no centroids (means or weights) and will have a `min` and
+ * `max` of 0.
+ *
+ * @param tdv input tdigests. The tdigests within this column are grouped by key.
+ * @param h_group_offsets a host iterator of the offsets to the start of each group. A group is
+ * counted as one even when the cluster is empty in it. The offsets should have the same values as
+ * the ones in `group_offsets`.
+ * @param group_offsets a device iterator of the offsets to the start of each group. A group is
+ * counted as one even when the cluster is empty in it. The offsets should have the same values as
+ * the ones in `h_group_offsets`.
+ * @param group_labels a device iterator of the group label for each tdigest cluster including
+ * empty clusters.
+ * @param num_group_labels the number of unique group labels.
+ * @param num_groups the number of groups.
+ * @param max_centroids the maximum number of centroids (clusters) in the output (merged) tdigest.
+ * @param stream CUDA stream
+ * @param mr device memory resource
+ *
+ * @return A column containing the merged tdigests.
+ */
 template <typename HGroupOffsetIter, typename GroupOffsetIter, typename GroupLabelIter>
 std::unique_ptr<column> merge_tdigests(tdigest_column_view const& tdv,
-                                       HGroupOffsetIter h_outer_offsets,
+                                       HGroupOffsetIter h_group_offsets,
                                        GroupOffsetIter group_offsets,
                                        GroupLabelIter group_labels,
                                        size_t num_group_labels,
@@ -1133,22 +1184,24 @@ std::unique_ptr<column> merge_tdigests(tdigest_column_view const& tdv,
                         thrust::equal_to{},  // key equality check
                         thrust::maximum{});
 
+  auto tdigest_offsets = tdv.centroids().offsets();
+
   // for any empty groups, set the min and max to be 0. not technically necessary but it makes
   // testing simpler.
-  auto group_num_weights = cudf::detail::make_counting_transform_iterator(
+  auto group_num_clusters = cudf::detail::make_counting_transform_iterator(
     0,
-    group_num_weights_func<decltype(group_offsets)>{group_offsets,
-                                                    tdv.centroids().offsets().begin<size_type>()});
+    group_num_clusters_func<decltype(group_offsets)>{group_offsets,
+                                                     tdigest_offsets.begin<size_type>()});
   thrust::replace_if(rmm::exec_policy(stream),
                      merged_min_col->mutable_view().begin<double>(),
                      merged_min_col->mutable_view().end<double>(),
-                     group_num_weights,
+                     group_num_clusters,
                      group_is_empty{},
                      0);
   thrust::replace_if(rmm::exec_policy(stream),
                      merged_max_col->mutable_view().begin<double>(),
                      merged_max_col->mutable_view().end<double>(),
-                     group_num_weights,
+                     group_num_clusters,
                      group_is_empty{},
                      0);
 
@@ -1166,14 +1219,13 @@ std::unique_ptr<column> merge_tdigests(tdigest_column_view const& tdv,
 
   // generate group keys for all centroids in the entire column
   rmm::device_uvector<size_type> group_keys(num_centroids, stream, temp_mr);
-  auto iter          = thrust::make_counting_iterator(0);
-  auto inner_offsets = tdv.centroids().offsets();
+  auto iter = thrust::make_counting_iterator(0);
   thrust::transform(rmm::exec_policy(stream),
                     iter,
                     iter + num_centroids,
                     group_keys.begin(),
                     group_key_func<decltype(group_labels)>{
-                      group_labels, inner_offsets.begin<size_type>(), inner_offsets.size()});
+                      group_labels, tdigest_offsets.begin<size_type>(), tdigest_offsets.size()});
   thrust::inclusive_scan_by_key(rmm::exec_policy(stream),
                                 group_keys.begin(),
                                 group_keys.begin() + num_centroids,
@@ -1182,20 +1234,24 @@ std::unique_ptr<column> merge_tdigests(tdigest_column_view const& tdv,
 
   auto const delta = max_centroids;
 
+  // TDigest merge takes the output of typed_group_tdigest as its input, which must not have
+  // any nulls.
+  auto const has_nulls = false;
+
   // generate cluster info
   auto [group_cluster_wl, group_cluster_offsets, total_clusters] = generate_group_cluster_info(
     delta,
     num_groups,
     nearest_value_centroid_weights<decltype(group_offsets)>{
-      cumulative_weights.begin(), group_offsets, inner_offsets.begin<size_type>()},
+      cumulative_weights.begin(), group_offsets, tdigest_offsets.begin<size_type>()},
     centroid_group_info<decltype(group_offsets)>{
-      cumulative_weights.begin(), group_offsets, inner_offsets.begin<size_type>()},
+      cumulative_weights.begin(), group_offsets, tdigest_offsets.begin<size_type>()},
     cumulative_centroid_weight<decltype(group_labels), decltype(group_offsets)>{
       cumulative_weights.begin(),
       group_labels,
       group_offsets,
-      {inner_offsets.begin<size_type>(), static_cast<size_t>(inner_offsets.size())}},
-    false,
+      {tdigest_offsets.begin<size_type>(), static_cast<size_t>(tdigest_offsets.size())}},
+    has_nulls,
     stream,
     mr);
 
@@ -1212,13 +1268,13 @@ std::unique_ptr<column> merge_tdigests(tdigest_column_view const& tdv,
       cumulative_weights.begin(),
       group_labels,
       group_offsets,
-      {inner_offsets.begin<size_type>(), static_cast<size_t>(inner_offsets.size())}},
+      {tdigest_offsets.begin<size_type>(), static_cast<size_t>(tdigest_offsets.size())}},
     std::move(merged_min_col),
     std::move(merged_max_col),
     group_cluster_wl,
     std::move(group_cluster_offsets),
     total_clusters,
-    false,
+    has_nulls,
     stream,
     mr);
 }
@@ -1283,7 +1339,7 @@ std::unique_ptr<column> group_tdigest(column_view const& col,
                                       rmm::cuda_stream_view stream,
                                       rmm::device_async_resource_ref mr)
 {
-  if (col.size() == 0) { return cudf::tdigest::detail::make_empty_tdigest_column(stream, mr); }
+  if (col.size() == 0) { return cudf::tdigest::detail::make_empty_tdigests_column(1, stream, mr); }
 
   auto const delta = max_centroids;
   return cudf::type_dispatcher(col.type(),
@@ -1309,7 +1365,15 @@ std::unique_ptr<column> group_merge_tdigest(column_view const& input,
   tdigest_column_view tdv(input);
 
   if (num_groups == 0 || input.size() == 0) {
-    return cudf::tdigest::detail::make_empty_tdigest_column(stream, mr);
+    return cudf::tdigest::detail::make_empty_tdigests_column(1, stream, mr);
+  }
+
+  if (tdv.means().size() == 0) {
+    // `group_merge_tdigest` takes the output of `typed_group_tdigest` as its input, which wipes
+    // out the means and weights for empty clusters. Thus, no mean here indicates that all clusters
+    // are empty in the input. Let's skip all complex computation in the below, but just return
+    // an empty tdigest per group.
+    return cudf::tdigest::detail::make_empty_tdigests_column(num_groups, stream, mr);
   }
 
   // bring group offsets back to the host
diff --git a/cpp/tests/groupby/tdigest_tests.cu b/cpp/tests/groupby/tdigest_tests.cu
index baa59026b07..4ae5d06b214 100644
--- a/cpp/tests/groupby/tdigest_tests.cu
+++ b/cpp/tests/groupby/tdigest_tests.cu
@@ -469,16 +469,16 @@ TEST_F(TDigestMergeTest, EmptyGroups)
   cudf::test::fixed_width_column_wrapper<int> keys{0, 0, 0, 0, 0, 0, 0};
   int const delta = 1000;
 
-  auto a = cudf::tdigest::detail::make_empty_tdigest_column(
-    cudf::get_default_stream(), cudf::get_current_device_resource_ref());
+  auto a = cudf::tdigest::detail::make_empty_tdigests_column(
+    1, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
   auto b = cudf::type_dispatcher(
     static_cast<cudf::column_view>(values_b).type(), tdigest_gen_grouped{}, keys, values_b, delta);
-  auto c = cudf::tdigest::detail::make_empty_tdigest_column(
-    cudf::get_default_stream(), cudf::get_current_device_resource_ref());
+  auto c = cudf::tdigest::detail::make_empty_tdigests_column(
+    1, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
   auto d = cudf::type_dispatcher(
     static_cast<cudf::column_view>(values_d).type(), tdigest_gen_grouped{}, keys, values_d, delta);
-  auto e = cudf::tdigest::detail::make_empty_tdigest_column(
-    cudf::get_default_stream(), cudf::get_current_device_resource_ref());
+  auto e = cudf::tdigest::detail::make_empty_tdigests_column(
+    1, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
 
   std::vector<cudf::column_view> cols;
   cols.push_back(*a);
@@ -507,3 +507,126 @@ TEST_F(TDigestMergeTest, EmptyGroups)
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*expected, *result.second[0].results[0]);
 }
+
+// Groups `val` by `key`, applies the single aggregation built by `make_agg`, and returns a
+// table made of the key table returned by `aggregate` followed by the one result column.
+std::unique_ptr<cudf::table> do_agg(
+  cudf::column_view key,
+  cudf::column_view val,
+  std::function<std::unique_ptr<cudf::groupby_aggregation>()> make_agg)
+{
+  cudf::table_view const key_table(std::vector<cudf::column_view>{key});
+  cudf::groupby::groupby gb(key_table);
+
+  // A single request carrying exactly one aggregation over `val`.
+  cudf::groupby::aggregation_request req;
+  req.values = val;
+  req.aggregations.push_back(make_agg());
+  std::vector<cudf::groupby::aggregation_request> requests;
+  requests.push_back(std::move(req));
+
+  auto result = gb.aggregate(std::move(requests));
+
+  // Start from the returned key columns, then append the lone aggregation result.
+  auto result_columns = result.first->release();
+
+  EXPECT_EQ(result.second.size(), 1);
+  EXPECT_EQ(result.second[0].results.size(), 1);
+  auto& agg_results = result.second[0].results;
+  result_columns.push_back(std::move(agg_results[0]));
+
+  return std::make_unique<cudf::table>(std::move(result_columns));
+}
+
+TEST_F(TDigestMergeTest, AllValuesAreNull)
+{
+  // The input must be sorted by the key.
+  // See `aggregate_result_functor::operator()<aggregation::TDIGEST>` for details.
+  auto const keys      = cudf::test::fixed_width_column_wrapper<int32_t>{{0, 0, 1, 1, 2}};
+  auto const keys_view = cudf::column_view(keys);
+  auto val_elems  = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i; });
+  auto val_valids = cudf::detail::make_counting_transform_iterator(0, [](auto i) {
+    // All values are null
+    return false;
+  });
+  auto const vals = cudf::test::fixed_width_column_wrapper<int32_t>{
+    val_elems, val_elems + keys_view.size(), val_valids};
+
+  auto const delta = 1000;
+
+  // Compute tdigest. The result should have 3 empty tdigests, one per group.
+  auto const compute_result = do_agg(keys_view, cudf::column_view(vals), [&delta]() {
+    return cudf::make_tdigest_aggregation<cudf::groupby_aggregation>(delta);
+  });
+
+  auto const expected_computed_keys = cudf::test::fixed_width_column_wrapper<int32_t>{{0, 1, 2}};
+  cudf::column_view const expected_computed_keys_view{expected_computed_keys};
+  auto const expected_computed_vals =
+    cudf::tdigest::detail::make_empty_tdigests_column(expected_computed_keys_view.size(),
+                                                      cudf::get_default_stream(),
+                                                      cudf::get_current_device_resource_ref());
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_computed_keys_view, compute_result->get_column(0).view());
+  // Every input value is null, so each group must produce an empty tdigest.
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_computed_vals->view(),
+                                 compute_result->get_column(1).view());
+
+  // Merge tdigest. Merging the 3 empty tdigests must again give one empty tdigest per group.
+  auto const merge_result =
+    do_agg(compute_result->get_column(0).view(), compute_result->get_column(1).view(), [&delta]() {
+      return cudf::make_merge_tdigest_aggregation<cudf::groupby_aggregation>(delta);
+    });
+
+  auto const expected_merged_keys = cudf::test::fixed_width_column_wrapper<int32_t>{{0, 1, 2}};
+  cudf::column_view const expected_merged_keys_view{expected_merged_keys};
+  auto const expected_merged_vals =
+    cudf::tdigest::detail::make_empty_tdigests_column(expected_merged_keys_view.size(),
+                                                      cudf::get_default_stream(),
+                                                      cudf::get_current_device_resource_ref());
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_merged_keys_view, merge_result->get_column(0).view());
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_merged_vals->view(), merge_result->get_column(1).view());
+}
+
+TEST_F(TDigestMergeTest, AllValuesInOneGroupIsNull)
+{
+  cudf::test::fixed_width_column_wrapper<int> keys{0, 1, 2, 2, 3};
+  cudf::test::fixed_width_column_wrapper<double> vals{{10.0, 20.0, {}, {}, 30.0},
+                                                      {true, true, false, false, true}};
+
+  auto const delta = 1000;
+
+  // Compute tdigest. Only the all-null group (key 2) should end up with an empty tdigest.
+  auto const compute_result = do_agg(cudf::column_view(keys), cudf::column_view(vals), [&delta]() {
+    return cudf::make_tdigest_aggregation<cudf::groupby_aggregation>(delta);
+  });
+
+  auto const expected_keys = cudf::test::fixed_width_column_wrapper<int32_t>{{0, 1, 2, 3}};
+
+  cudf::test::fixed_width_column_wrapper<double> expected_means{10, 20, 30};
+  cudf::test::fixed_width_column_wrapper<double> expected_weights{1, 1, 1};
+  cudf::test::fixed_width_column_wrapper<cudf::size_type> expected_offsets{0, 1, 2, 2, 3};
+  cudf::test::fixed_width_column_wrapper<double> expected_mins{10.0, 20.0, 0.0, 30.0};
+  cudf::test::fixed_width_column_wrapper<double> expected_maxes{10.0, 20.0, 0.0, 30.0};
+  auto const expected_values =
+    cudf::tdigest::detail::make_tdigest_column(4,
+                                               std::make_unique<cudf::column>(expected_means),
+                                               std::make_unique<cudf::column>(expected_weights),
+                                               std::make_unique<cudf::column>(expected_offsets),
+                                               std::make_unique<cudf::column>(expected_mins),
+                                               std::make_unique<cudf::column>(expected_maxes),
+                                               cudf::get_default_stream(),
+                                               cudf::get_current_device_resource_ref());
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(cudf::column_view{expected_keys},
+                                 compute_result->get_column(0).view());
+  // The all-null group is expected to have no centroids and a min and max of 0.
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_values->view(), compute_result->get_column(1).view());
+
+  // Merge tdigest. Merging the per-group tdigests must reproduce the computed result as-is.
+  auto const merge_result =
+    do_agg(compute_result->get_column(0).view(), compute_result->get_column(1).view(), [&delta]() {
+      return cudf::make_merge_tdigest_aggregation<cudf::groupby_aggregation>(delta);
+    });
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(cudf::column_view{expected_keys},
+                                 merge_result->get_column(0).view());
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_values->view(), merge_result->get_column(1).view());
+}
diff --git a/cpp/tests/quantiles/percentile_approx_test.cpp b/cpp/tests/quantiles/percentile_approx_test.cpp
index 915717713df..37414eb3fba 100644
--- a/cpp/tests/quantiles/percentile_approx_test.cpp
+++ b/cpp/tests/quantiles/percentile_approx_test.cpp
@@ -371,8 +371,8 @@ struct PercentileApproxTest : public cudf::test::BaseFixture {};
 
 TEST_F(PercentileApproxTest, EmptyInput)
 {
-  auto empty_ = cudf::tdigest::detail::make_empty_tdigest_column(
-    cudf::get_default_stream(), cudf::get_current_device_resource_ref());
+  auto empty_ = cudf::tdigest::detail::make_empty_tdigests_column(
+    1, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
   cudf::test::fixed_width_column_wrapper<double> percentiles{0.0, 0.25, 0.3};
 
   std::vector<cudf::column_view> input;