Replace raw streams with rmm::cuda_stream_view (part 1) #6646

Merged: 21 commits, Nov 13, 2020
Changes from 8 commits
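For context before the diff: a minimal sketch of the `rmm::cuda_stream_view` pattern this PR adopts, assuming the RMM API of this release (`value()`, `synchronize()`, and the `rmm::cuda_stream_default` constant). The `scale` kernel and `scale_async` wrapper are invented for illustration and are not part of this PR.

```c++
#include <rmm/cuda_stream_view.hpp>

// rmm::cuda_stream_view is a non-owning, trivially copyable wrapper around
// cudaStream_t: implicitly constructible from a raw stream, exposing the raw
// handle via value() and wrapping cudaStreamSynchronize via synchronize().
__global__ void scale(float* data, int n, float factor)
{
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) data[i] *= factor;
}

void scale_async(float* d_data, int n, float factor,
                 rmm::cuda_stream_view stream = rmm::cuda_stream_default)
{
  int const block = 256;
  int const grid  = (n + block - 1) / block;
  // Kernel launch syntax requires the raw cudaStream_t, hence value().
  scale<<<grid, block, 0, stream.value()>>>(d_data, n, factor);
  stream.synchronize();
}
```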
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -44,6 +44,7 @@
- PR #6514 Initial work for decimal type in Java/JNI
- PR #6608 Improve subword tokenizer docs
- PR #6612 Update JNI to new RMM cuda_stream_view API
+- PR #6646 Replace `cudaStream_t` with `rmm::cuda_stream_view` (part 1)
- PR #6579 Update scatter APIs to use reference wrapper / const scalar
- PR #6614 Add support for conversion to Pandas nullable dtypes and fix related issue in `cudf.to_json`
- PR #6622 Update `to_pandas` api docs
10 changes: 6 additions & 4 deletions cpp/benchmarks/common/generate_benchmark_input.cpp
@@ -24,6 +24,7 @@
#include <cudf_test/column_utilities.hpp>
#include <cudf_test/column_wrapper.hpp>

+ #include <rmm/cuda_stream_view.hpp>
#include <rmm/device_buffer.hpp>

#include <future>
@@ -296,9 +297,9 @@ std::unique_ptr<cudf::column> create_random_column(data_profile const& profile,
return std::make_unique<cudf::column>(
cudf::data_type{cudf::type_to_id<T>()},
num_rows,
- rmm::device_buffer(data.data(), num_rows * sizeof(stored_Type), cudaStream_t(0)),
+ rmm::device_buffer(data.data(), num_rows * sizeof(stored_Type), rmm::cuda_stream_default),
rmm::device_buffer(
- null_mask.data(), null_mask.size() * sizeof(cudf::bitmask_type), cudaStream_t(0)));
+ null_mask.data(), null_mask.size() * sizeof(cudf::bitmask_type), rmm::cuda_stream_default));
}

/**
@@ -483,15 +484,16 @@ std::unique_ptr<cudf::column> create_random_column<cudf::list_view>(data_profile
auto offsets_column = std::make_unique<cudf::column>(
cudf::data_type{cudf::type_id::INT32},
offsets.size(),
- rmm::device_buffer(offsets.data(), offsets.size() * sizeof(int32_t), cudaStream_t(0)));
+ rmm::device_buffer(
+ offsets.data(), offsets.size() * sizeof(int32_t), rmm::cuda_stream_default));

list_column = cudf::make_lists_column(
num_rows,
std::move(offsets_column),
std::move(current_child_column),
cudf::UNKNOWN_NULL_COUNT,
rmm::device_buffer(
- null_mask.data(), null_mask.size() * sizeof(cudf::bitmask_type), cudaStream_t(0)));
+ null_mask.data(), null_mask.size() * sizeof(cudf::bitmask_type), rmm::cuda_stream_default));
}
return list_column; // return the top-level column
}
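The call sites above construct `rmm::device_buffer` from host data; a standalone sketch of that pattern, with the (source pointer, byte count, stream) argument order assumed from the diff. `copy_to_device` is an invented helper:

```c++
#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_buffer.hpp>

#include <cstdint>
#include <vector>

// Allocates device memory and copies host data asynchronously on the given
// stream, as the benchmark code above now does explicitly.
rmm::device_buffer copy_to_device(std::vector<int32_t> const& host)
{
  return rmm::device_buffer(
    host.data(), host.size() * sizeof(int32_t), rmm::cuda_stream_default);
}
```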
13 changes: 2 additions & 11 deletions cpp/benchmarks/copying/shift_benchmark.cu
@@ -14,17 +14,8 @@

template <typename T, typename ScalarType = cudf::scalar_type_t<T>>
std::unique_ptr<cudf::scalar> make_scalar(
- cudaStream_t stream = 0,
- rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
- {
- auto s = new ScalarType(0, false, stream, mr);
- return std::unique_ptr<cudf::scalar>(s);
- }
-
- template <typename T, typename ScalarType = cudf::scalar_type_t<T>>
- std::unique_ptr<cudf::scalar> make_scalar(
- T value,
- cudaStream_t stream = 0,
+ T value = 0,
+ rmm::cuda_stream_view stream = rmm::cuda_stream_default,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
{
auto s = new ScalarType(value, true, stream, mr);
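A note on the consolidation above: with `value` defaulted to `0`, a single overload serves both former call forms, but both now produce a *valid* scalar (the deleted zero-argument overload constructed an invalid one). Illustrative calls, assuming the merged signature shown in the diff:

```c++
void example()
{
  auto zero  = make_scalar<int32_t>();    // value 0 on the default stream; now a valid scalar
  auto forty = make_scalar<int32_t>(42);  // value 42 on the default stream; valid scalar
}
```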
4 changes: 2 additions & 2 deletions cpp/benchmarks/null_mask/set_null_mask_benchmark.cpp
@@ -31,7 +31,7 @@ void BM_setnullmask(benchmark::State& state)

for (auto _ : state) {
cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0
- cudf::set_null_mask(static_cast<cudf::bitmask_type*>(mask.data()), begin, end, true, 0);
+ cudf::set_null_mask(static_cast<cudf::bitmask_type*>(mask.data()), begin, end, true);
}

state.SetBytesProcessed(static_cast<int64_t>(state.iterations()) * size / 8);
@@ -44,4 +44,4 @@ void BM_setnullmask(benchmark::State& state)
->Range(1 << 10, 1 << 30) \
->UseManualTime();

- NBM_BENCHMARK_DEFINE(SetNullMaskKernel);
\ No newline at end of file
+ NBM_BENCHMARK_DEFINE(SetNullMaskKernel);
6 changes: 3 additions & 3 deletions cpp/benchmarks/type_dispatcher/type_dispatcher_benchmark.cu
@@ -90,13 +90,13 @@ struct ColumnHandle {
template <typename ColumnType>
void operator()(mutable_column_device_view source_column,
int work_per_thread,
- cudaStream_t stream = 0)
+ rmm::cuda_stream_view stream = rmm::cuda_stream_default)
{
cudf::detail::grid_1d grid_config{source_column.size(), block_size};
int grid_size = grid_config.num_blocks;
// Launch the kernel.
host_dispatching_kernel<functor_type, ColumnType>
- <<<grid_size, block_size, 0, stream>>>(source_column);
+ <<<grid_size, block_size, 0, stream.value()>>>(source_column);
}
};

@@ -160,7 +160,7 @@ void launch_kernel(mutable_table_view input, T** d_ptr, int work_per_thread)
}

template <class TypeParam, FunctorType functor_type, DispatchingType dispatching_type>
- void type_dispatcher_benchmark(benchmark::State& state)
+ void type_dispatcher_benchmark(::benchmark::State& state)
{
const cudf::size_type source_size = static_cast<cudf::size_type>(state.range(1));

4 changes: 2 additions & 2 deletions cpp/docs/DOCUMENTATION.md
@@ -225,7 +225,7 @@ You can use the `@copydoc` tag to avoid duplicating the comment block for a func
*/
```

- Also, `@copydoc` is useful when documenting a `detail` function that differs only by the `cudaStream_t` parameter.
+ Also, `@copydoc` is useful when documenting a `detail` function that differs only by the `stream` parameter.

```c++
/**
Expand All @@ -235,7 +235,7 @@ Also, `@copydoc` is useful when documenting a `detail` function that differs onl
*/
std::vector<size_type> segmented_count_set_bits(bitmask_type const* bitmask,
std::vector<size_type> const& indices,
- cudaStream_t stream = 0);
+ rmm::cuda_stream_view stream = rmm::cuda_stream_default);
```

Note, you must specify the whole signature of the function, including optional parameters, so that doxygen will be able to locate it.
10 changes: 5 additions & 5 deletions cpp/docs/TRANSITIONGUIDE.md
@@ -131,7 +131,7 @@ A *mutable*, non-owning view of a table.
We do not yet expose CUDA streams in external libcudf APIs.
However, in order to ease the transition to future use of streams, all libcudf APIs that allocate device memory or execute a kernel should be implemented using asynchronous APIs on the default stream (e.g., stream 0).

- The recommended pattern for doing this is to make the definition of the external API invoke an internal API in the `detail` namespace. The internal `detail` API will have all the same parameters, plus a `cudaStream_t` parameter at the end defaulted to `0`.
+ The recommended pattern for doing this is to make the definition of the external API invoke an internal API in the `detail` namespace. The internal `detail` API will have all the same parameters, plus a `rmm::cuda_stream_view` parameter at the end defaulted to `rmm::cuda_stream_default`.
The implementation should be wholly contained in the `detail` API definition and use only asynchronous versions of CUDA APIs with the defaulted stream parameter.

In order to make the `detail` API callable from other libcudf functions, it should be exposed in a header placed in the `cudf/cpp/include/detail/` directory.
@@ -144,19 +144,19 @@ void external_function(...);

// cpp/include/cudf/detail/header.hpp
namespace detail{
- void external_function(..., cudaStream_t stream = 0)
+ void external_function(..., rmm::cuda_stream_view stream = rmm::cuda_stream_default)
} // namespace detail

// cudf/src/implementation.cpp
namespace detail{
// defaulted stream parameter
- void external_function(..., cudaStream_t stream){
+ void external_function(..., rmm::cuda_stream_view stream){
// implementation uses stream w/ async APIs
RMM_ALLOC(...,stream);
- CUDA_TRY(cudaMemcpyAsync(...,stream));
+ CUDA_TRY(cudaMemcpyAsync(...,stream.value()));
kernel<<<..., stream>>>(...);
thrust::algorithm(rmm::exec_policy(stream)->on(stream), ...);
- CUDA_TRY(cudaStreamSynchronize(stream));
+ stream.synchronize();
RMM_FREE(...,stream);
}
} // namespace detail
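To round out the guide's example above, a sketch of the public wrapper it implies; `gather_rows` and its signature are hypothetical, only the forwarding shape matters:

```c++
#include <cudf/column/column.hpp>
#include <cudf/column/column_view.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/mr/device/per_device_resource.hpp>

#include <memory>

namespace cudf {
namespace detail {
// Hypothetical detail API with the trailing defaulted stream parameter.
std::unique_ptr<column> gather_rows(
  column_view const& input,
  rmm::cuda_stream_view stream        = rmm::cuda_stream_default,
  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
}  // namespace detail

// Public API: no stream parameter yet; forwards to detail on the default stream.
std::unique_ptr<column> gather_rows(
  column_view const& input,
  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
{
  return detail::gather_rows(input, rmm::cuda_stream_default, mr);
}
}  // namespace cudf
```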
3 changes: 2 additions & 1 deletion cpp/include/cudf/ast/detail/transform.cuh
@@ -27,6 +27,7 @@

#include <cstring>
#include <numeric>
#include "rmm/cuda_stream_view.hpp"

namespace cudf {

Expand Down Expand Up @@ -369,7 +370,7 @@ struct ast_plan {
std::unique_ptr<column> compute_column(
table_view const table,
expression const& expr,
- cudaStream_t stream = 0,
+ rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
} // namespace detail

11 changes: 5 additions & 6 deletions cpp/include/cudf/column/column.hpp
@@ -15,10 +15,12 @@
*/
#pragma once

#include "column_view.hpp"

#include <cudf/null_mask.hpp>
#include <cudf/types.hpp>
#include "column_view.hpp"

+ #include <rmm/cuda_stream_view.hpp>
#include <rmm/device_buffer.hpp>

#include <memory>
@@ -50,9 +52,6 @@ class column {
/**
* @brief Construct a new column by deep copying the contents of `other`.
*
- * All device memory allocation and copying is done using the
- * `device_memory_resource` and `stream` from `other`.
- *
* @param other The column to copy
**/
column(column const& other);
@@ -69,7 +68,7 @@
* @param mr Device memory resource to use for all device memory allocations
*/
column(column const& other,
- cudaStream_t stream,
+ rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
@@ -124,7 +123,7 @@
* @param mr Device memory resource to use for all device memory allocations
*/
explicit column(column_view view,
- cudaStream_t stream = 0,
+ rmm::cuda_stream_view stream = rmm::cuda_stream_default,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
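A sketch of deep-copying a column on an explicit stream via the constructor above; this assumes RMM's owning `rmm::cuda_stream` wrapper from the same release, and `copy_on_stream` is an invented helper:

```c++
#include <cudf/column/column.hpp>

#include <rmm/cuda_stream.hpp>
#include <rmm/mr/device/per_device_resource.hpp>

cudf::column copy_on_stream(cudf::column const& original)
{
  rmm::cuda_stream stream;  // creates and owns a new CUDA stream
  cudf::column copy{original, stream.view(), rmm::mr::get_current_device_resource()};
  stream.synchronize();  // ensure the copy finishes before the stream is destroyed
  return copy;
}
```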
6 changes: 4 additions & 2 deletions cpp/include/cudf/column/column_device_view.cuh
@@ -28,6 +28,8 @@
#include <cudf/utilities/traits.hpp>
#include <cudf/utilities/type_dispatcher.hpp>

+ #include <rmm/cuda_stream_view.hpp>

/**
* @file column_device_view.cuh
* @brief Column device view class definitions
@@ -386,7 +388,7 @@ class alignas(16) column_device_view : public detail::column_device_view_base {
*`source_view` available in device memory.
*/
static std::unique_ptr<column_device_view, std::function<void(column_device_view*)>> create(
- column_view source_view, cudaStream_t stream = 0);
+ column_view source_view, rmm::cuda_stream_view stream = rmm::cuda_stream_default);

/**
* @brief Destroy the `column_device_view` object.
@@ -480,7 +482,7 @@ class alignas(16) mutable_column_device_view : public detail::column_device_view
*/
static std::unique_ptr<mutable_column_device_view,
std::function<void(mutable_column_device_view*)>>
- create(mutable_column_view source_view, cudaStream_t stream = 0);
+ create(mutable_column_view source_view, rmm::cuda_stream_view stream = rmm::cuda_stream_default);

/**
* @brief Returns pointer to the base device memory allocation casted to
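A sketch of the factory above in use: materialize the device view on the caller's stream, then pass it by value to a kernel. `my_kernel` and `launch` are hypothetical; `create()` is the factory declared above:

```c++
#include <cudf/column/column_device_view.cuh>

#include <rmm/cuda_stream_view.hpp>

__global__ void my_kernel(cudf::column_device_view col);  // hypothetical kernel

void launch(cudf::column_view input, rmm::cuda_stream_view stream)
{
  // create() returns a unique_ptr whose deleter frees the device-side copy.
  auto d_input = cudf::column_device_view::create(input, stream);
  my_kernel<<<1, 256, 0, stream.value()>>>(*d_input);
}
```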
81 changes: 40 additions & 41 deletions cpp/include/cudf/copying.hpp
@@ -286,6 +286,46 @@ std::unique_ptr<column> copy_range(
size_type target_begin,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

+ /**
+ * @brief Creates a new column by shifting all values by an offset.
+ *
+ * @ingroup copy_shift
+ *
+ * Elements will be determined by `output[idx] = input[idx - offset]`.
+ * Some elements in the output may be indeterminable from the input. For those
+ * elements, the value will be determined by `fill_values`.
+ *
+ * @code{.pseudo}
+ * Examples
+ * -------------------------------------------------
+ * input = [0, 1, 2, 3, 4]
+ * offset = 3
+ * fill_values = @
+ * return = [@, @, @, 0, 1]
+ * -------------------------------------------------
+ * input = [5, 4, 3, 2, 1]
+ * offset = -2
+ * fill_values = 7
+ * return = [3, 2, 1, 7, 7]
+ * @endcode
+ *
+ * @note if the input is nullable, the output will be nullable.
+ * @note if the fill value is null, the output will be nullable.
+ *
+ * @param input Column to be shifted.
+ * @param offset The offset by which to shift the input.
+ * @param fill_value Fill value for indeterminable outputs.
+ * @param mr Device memory resource used to allocate the returned result's device memory
+ *
+ * @throw cudf::logic_error if @p input dtype is not fixed-width.
+ * @throw cudf::logic_error if @p fill_value dtype does not match @p input dtype.
+ */
+ std::unique_ptr<column> shift(
+ column_view const& input,
+ size_type offset,
+ scalar const& fill_value,
+ rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @brief Slices a `column_view` into a set of `column_view`s according to a set of indices.
*
@@ -479,7 +519,6 @@ struct contiguous_split_result {
* @param input View of a table to split
* @param splits A vector of indices where the view will be split
* @param[in] mr Device memory resource used to allocate the returned result's device memory
- * @param[in] stream CUDA stream used for device memory operations and kernel launches.
* @return The set of requested views of `input` indicated by the `splits` and the viewed memory
* buffer.
*/
@@ -513,46 +552,6 @@ std::unique_ptr<column> copy_if_else(
column_view const& boolean_mask,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

- /**
- * @brief Creates a new column by shifting all values by an offset.
- *
- * @ingroup copy_shift
- *
- * Elements will be determined by `output[idx] = input[idx - offset]`.
- * Some elements in the output may be indeterminable from the input. For those
- * elements, the value will be determined by `fill_values`.
- *
- * @code{.pseudo}
- * Examples
- * -------------------------------------------------
- * input = [0, 1, 2, 3, 4]
- * offset = 3
- * fill_values = @
- * return = [@, @, @, 0, 1]
- * -------------------------------------------------
- * input = [5, 4, 3, 2, 1]
- * offset = -2
- * fill_values = 7
- * return = [3, 2, 1, 7, 7]
- * @endcode
- *
- * @note if the input is nullable, the output will be nullable.
- * @note if the fill value is null, the output will be nullable.
- *
- * @param input Column to be shifted.
- * @param offset The offset by which to shift the input.
- * @param fill_value Fill value for indeterminable outputs.
- *
- * @throw cudf::logic_error if @p input dtype is not fixed-with.
- * @throw cudf::logic_error if @p fill_value dtype does not match @p input dtype.
- */
- std::unique_ptr<column> shift(
- column_view const& input,
- size_type offset,
- scalar const& fill_value,
- rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(),
- cudaStream_t stream = 0);

/**
* @brief Returns a new column, where each element is selected from either @p lhs or
* @p rhs based on the value of the corresponding element in @p boolean_mask
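An illustrative call to the relocated public `shift` API above; note the external overload no longer takes a stream. `shift_right_two` is an invented helper:

```c++
#include <cudf/copying.hpp>
#include <cudf/scalar/scalar.hpp>

#include <memory>

std::unique_ptr<cudf::column> shift_right_two(cudf::column_view const& input)
{
  cudf::numeric_scalar<int32_t> fill{7};  // fill value for the two vacated leading slots
  return cudf::shift(input, 2, fill);     // allocates from the current device resource
}
```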