Replace raw streams with rmm::cuda_stream_view (part 1) #6646

Merged: 21 commits, Nov 13, 2020
Changes from 8 commits
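For context before the diff: a minimal sketch of the `rmm::cuda_stream_view` pattern this PR adopts, assuming the RMM API of this release (`value()`, `synchronize()`, and the `rmm::cuda_stream_default` constant). The `scale` kernel and `scale_async` wrapper are invented for illustration and are not part of this PR.

```c++
#include <rmm/cuda_stream_view.hpp>

// rmm::cuda_stream_view is a non-owning, trivially copyable wrapper around
// cudaStream_t: implicitly constructible from a raw stream, exposing the raw
// handle via value() and wrapping cudaStreamSynchronize via synchronize().
__global__ void scale(float* data, int n, float factor)
{
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) data[i] *= factor;
}

void scale_async(float* d_data, int n, float factor,
                 rmm::cuda_stream_view stream = rmm::cuda_stream_default)
{
  int const block = 256;
  int const grid  = (n + block - 1) / block;
  // Kernel launch syntax requires the raw cudaStream_t, hence value().
  scale<<<grid, block, 0, stream.value()>>>(d_data, n, factor);
  stream.synchronize();
}
```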
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -44,6 +44,7 @@
- PR #6514 Initial work for decimal type in Java/JNI
- PR #6608 Improve subword tokenizer docs
- PR #6612 Update JNI to new RMM cuda_stream_view API
+- PR #6646 Replace `cudaStream_t` with `rmm::cuda_stream_view` (part 1)
- PR #6579 Update scatter APIs to use reference wrapper / const scalar
- PR #6614 Add support for conversion to Pandas nullable dtypes and fix related issue in `cudf.to_json`
- PR #6622 Update `to_pandas` api docs
10 changes: 6 additions & 4 deletions cpp/benchmarks/common/generate_benchmark_input.cpp
@@ -24,6 +24,7 @@
#include <cudf_test/column_utilities.hpp>
#include <cudf_test/column_wrapper.hpp>

+ #include <rmm/cuda_stream_view.hpp>
#include <rmm/device_buffer.hpp>

#include <future>
@@ -296,9 +297,9 @@ std::unique_ptr<cudf::column> create_random_column(data_profile const& profile,
return std::make_unique<cudf::column>(
cudf::data_type{cudf::type_to_id<T>()},
num_rows,
- rmm::device_buffer(data.data(), num_rows * sizeof(stored_Type), cudaStream_t(0)),
+ rmm::device_buffer(data.data(), num_rows * sizeof(stored_Type), rmm::cuda_stream_default),
rmm::device_buffer(
- null_mask.data(), null_mask.size() * sizeof(cudf::bitmask_type), cudaStream_t(0)));
+ null_mask.data(), null_mask.size() * sizeof(cudf::bitmask_type), rmm::cuda_stream_default));
}

/**
@@ -483,15 +484,16 @@ std::unique_ptr<cudf::column> create_random_column<cudf::list_view>(data_profile
auto offsets_column = std::make_unique<cudf::column>(
cudf::data_type{cudf::type_id::INT32},
offsets.size(),
- rmm::device_buffer(offsets.data(), offsets.size() * sizeof(int32_t), cudaStream_t(0)));
+ rmm::device_buffer(
+ offsets.data(), offsets.size() * sizeof(int32_t), rmm::cuda_stream_default));

list_column = cudf::make_lists_column(
num_rows,
std::move(offsets_column),
std::move(current_child_column),
cudf::UNKNOWN_NULL_COUNT,
rmm::device_buffer(
- null_mask.data(), null_mask.size() * sizeof(cudf::bitmask_type), cudaStream_t(0)));
+ null_mask.data(), null_mask.size() * sizeof(cudf::bitmask_type), rmm::cuda_stream_default));
}
return list_column; // return the top-level column
}
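The call sites above construct `rmm::device_buffer` from host data; a standalone sketch of that pattern, with the (source pointer, byte count, stream) argument order assumed from the diff. `copy_to_device` is an invented helper:

```c++
#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_buffer.hpp>

#include <cstdint>
#include <vector>

// Allocates device memory and copies host data asynchronously on the given
// stream, as the benchmark code above now does explicitly.
rmm::device_buffer copy_to_device(std::vector<int32_t> const& host)
{
  return rmm::device_buffer(
    host.data(), host.size() * sizeof(int32_t), rmm::cuda_stream_default);
}
```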
13 changes: 2 additions & 11 deletions cpp/benchmarks/copying/shift_benchmark.cu
@@ -14,17 +14,8 @@

template <typename T, typename ScalarType = cudf::scalar_type_t<T>>
std::unique_ptr<cudf::scalar> make_scalar(
- cudaStream_t stream = 0,
- rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
- {
- auto s = new ScalarType(0, false, stream, mr);
- return std::unique_ptr<cudf::scalar>(s);
- }
-
- template <typename T, typename ScalarType = cudf::scalar_type_t<T>>
- std::unique_ptr<cudf::scalar> make_scalar(
- T value,
- cudaStream_t stream = 0,
+ T value = 0,
+ rmm::cuda_stream_view stream = rmm::cuda_stream_default,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
{
auto s = new ScalarType(value, true, stream, mr);
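A note on the consolidation above: with `value` defaulted to `0`, a single overload serves both former call forms, but both now produce a *valid* scalar (the deleted zero-argument overload constructed an invalid one). Illustrative calls, assuming the merged signature shown in the diff:

```c++
void example()
{
  auto zero  = make_scalar<int32_t>();    // value 0 on the default stream; now a valid scalar
  auto forty = make_scalar<int32_t>(42);  // value 42 on the default stream; valid scalar
}
```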
4 changes: 2 additions & 2 deletions cpp/benchmarks/null_mask/set_null_mask_benchmark.cpp
@@ -31,7 +31,7 @@ void BM_setnullmask(benchmark::State& state)

for (auto _ : state) {
cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0
- cudf::set_null_mask(static_cast<cudf::bitmask_type*>(mask.data()), begin, end, true, 0);
+ cudf::set_null_mask(static_cast<cudf::bitmask_type*>(mask.data()), begin, end, true);
}

state.SetBytesProcessed(static_cast<int64_t>(state.iterations()) * size / 8);
@@ -44,4 +44,4 @@ void BM_setnullmask(benchmark::State& state)
->Range(1 << 10, 1 << 30) \
->UseManualTime();

- NBM_BENCHMARK_DEFINE(SetNullMaskKernel);
\ No newline at end of file
+ NBM_BENCHMARK_DEFINE(SetNullMaskKernel);
6 changes: 3 additions & 3 deletions cpp/benchmarks/type_dispatcher/type_dispatcher_benchmark.cu
@@ -90,13 +90,13 @@ struct ColumnHandle {
template <typename ColumnType>
void operator()(mutable_column_device_view source_column,
int work_per_thread,
- cudaStream_t stream = 0)
+ rmm::cuda_stream_view stream = rmm::cuda_stream_default)
{
cudf::detail::grid_1d grid_config{source_column.size(), block_size};
int grid_size = grid_config.num_blocks;
// Launch the kernel.
host_dispatching_kernel<functor_type, ColumnType>
- <<<grid_size, block_size, 0, stream>>>(source_column);
+ <<<grid_size, block_size, 0, stream.value()>>>(source_column);
}
};

@@ -160,7 +160,7 @@ void launch_kernel(mutable_table_view input, T** d_ptr, int work_per_thread)
}

template <class TypeParam, FunctorType functor_type, DispatchingType dispatching_type>
- void type_dispatcher_benchmark(benchmark::State& state)
+ void type_dispatcher_benchmark(::benchmark::State& state)
{
const cudf::size_type source_size = static_cast<cudf::size_type>(state.range(1));

4 changes: 2 additions & 2 deletions cpp/docs/DOCUMENTATION.md
@@ -225,7 +225,7 @@ You can use the `@copydoc` tag to avoid duplicating the comment block for a func
*/
```

- Also, `@copydoc` is useful when documenting a `detail` function that differs only by the `cudaStream_t` parameter.
+ Also, `@copydoc` is useful when documenting a `detail` function that differs only by the `stream` parameter.

```c++
/**
Expand All @@ -235,7 +235,7 @@ Also, `@copydoc` is useful when documenting a `detail` function that differs onl
*/
std::vector<size_type> segmented_count_set_bits(bitmask_type const* bitmask,
std::vector<size_type> const& indices,
- cudaStream_t stream = 0);
+ rmm::cuda_stream_view stream = rmm::cuda_stream_default);
```

Note, you must specify the whole signature of the function, including optional parameters, so that doxygen will be able to locate it.
10 changes: 5 additions & 5 deletions cpp/docs/TRANSITIONGUIDE.md
@@ -131,7 +131,7 @@ A *mutable*, non-owning view of a table.
We do not yet expose CUDA streams in external libcudf APIs.
However, in order to ease the transition to future use of streams, all libcudf APIs that allocate device memory or execute a kernel should be implemented using asynchronous APIs on the default stream (e.g., stream 0).

- The recommended pattern for doing this is to make the definition of the external API invoke an internal API in the `detail` namespace. The internal `detail` API will have all the same parameters, plus a `cudaStream_t` parameter at the end defaulted to `0`.
+ The recommended pattern for doing this is to make the definition of the external API invoke an internal API in the `detail` namespace. The internal `detail` API will have all the same parameters, plus a `rmm::cuda_stream_view` parameter at the end defaulted to `rmm::cuda_stream_default`.
The implementation should be wholly contained in the `detail` API definition and use only asynchronous versions of CUDA APIs with the defaulted stream parameter.

In order to make the `detail` API callable from other libcudf functions, it should be exposed in a header placed in the `cudf/cpp/include/detail/` directory.
@@ -144,19 +144,19 @@ void external_function(...);

// cpp/include/cudf/detail/header.hpp
namespace detail{
- void external_function(..., cudaStream_t stream = 0)
+ void external_function(..., rmm::cuda_stream_view stream = rmm::cuda_stream_default)
} // namespace detail

// cudf/src/implementation.cpp
namespace detail{
// defaulted stream parameter
- void external_function(..., cudaStream_t stream){
+ void external_function(..., rmm::cuda_stream_view stream){
// implementation uses stream w/ async APIs
RMM_ALLOC(...,stream);
- CUDA_TRY(cudaMemcpyAsync(...,stream));
+ CUDA_TRY(cudaMemcpyAsync(...,stream.value()));
kernel<<<..., stream>>>(...);
thrust::algorithm(rmm::exec_policy(stream)->on(stream), ...);
- CUDA_TRY(cudaStreamSynchronize(stream));
+ stream.synchronize();
RMM_FREE(...,stream);
}
} // namespace detail
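To round out the guide's example above, a sketch of the public wrapper it implies; `gather_rows` and its signature are hypothetical, only the forwarding shape matters:

```c++
#include <cudf/column/column.hpp>
#include <cudf/column/column_view.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/mr/device/per_device_resource.hpp>

#include <memory>

namespace cudf {
namespace detail {
// Hypothetical detail API with the trailing defaulted stream parameter.
std::unique_ptr<column> gather_rows(
  column_view const& input,
  rmm::cuda_stream_view stream        = rmm::cuda_stream_default,
  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
}  // namespace detail

// Public API: no stream parameter yet; forwards to detail on the default stream.
std::unique_ptr<column> gather_rows(
  column_view const& input,
  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
{
  return detail::gather_rows(input, rmm::cuda_stream_default, mr);
}
}  // namespace cudf
```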
3 changes: 2 additions & 1 deletion cpp/include/cudf/ast/detail/transform.cuh
@@ -27,6 +27,7 @@

#include <cstring>
#include <numeric>
#include "rmm/cuda_stream_view.hpp"

namespace cudf {

Expand Down Expand Up @@ -369,7 +370,7 @@ struct ast_plan {
std::unique_ptr<column> compute_column(
table_view const table,
expression const& expr,
- cudaStream_t stream = 0,
+ rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
} // namespace detail

11 changes: 5 additions & 6 deletions cpp/include/cudf/column/column.hpp
@@ -15,10 +15,12 @@
*/
#pragma once

#include "column_view.hpp"

#include <cudf/null_mask.hpp>
#include <cudf/types.hpp>
#include "column_view.hpp"

+ #include <rmm/cuda_stream_view.hpp>
#include <rmm/device_buffer.hpp>

#include <memory>
@@ -50,9 +52,6 @@ class column {
/**
* @brief Construct a new column by deep copying the contents of `other`.
*
- * All device memory allocation and copying is done using the
- * `device_memory_resource` and `stream` from `other`.
- *
* @param other The column to copy
**/
column(column const& other);
@@ -69,7 +68,7 @@
* @param mr Device memory resource to use for all device memory allocations
*/
column(column const& other,
- cudaStream_t stream,
+ rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
@@ -124,7 +123,7 @@
* @param mr Device memory resource to use for all device memory allocations
*/
explicit column(column_view view,
- cudaStream_t stream = 0,
+ rmm::cuda_stream_view stream = rmm::cuda_stream_default,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
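A sketch of deep-copying a column on an explicit stream via the constructor above; this assumes RMM's owning `rmm::cuda_stream` wrapper from the same release, and `copy_on_stream` is an invented helper:

```c++
#include <cudf/column/column.hpp>

#include <rmm/cuda_stream.hpp>
#include <rmm/mr/device/per_device_resource.hpp>

cudf::column copy_on_stream(cudf::column const& original)
{
  rmm::cuda_stream stream;  // creates and owns a new CUDA stream
  cudf::column copy{original, stream.view(), rmm::mr::get_current_device_resource()};
  stream.synchronize();  // ensure the copy finishes before the stream is destroyed
  return copy;
}
```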
6 changes: 4 additions & 2 deletions cpp/include/cudf/column/column_device_view.cuh
@@ -28,6 +28,8 @@
#include <cudf/utilities/traits.hpp>
#include <cudf/utilities/type_dispatcher.hpp>

+ #include <rmm/cuda_stream_view.hpp>

/**
* @file column_device_view.cuh
* @brief Column device view class definitions
@@ -386,7 +388,7 @@ class alignas(16) column_device_view : public detail::column_device_view_base {
*`source_view` available in device memory.
*/
static std::unique_ptr<column_device_view, std::function<void(column_device_view*)>> create(
- column_view source_view, cudaStream_t stream = 0);
+ column_view source_view, rmm::cuda_stream_view stream = rmm::cuda_stream_default);

/**
* @brief Destroy the `column_device_view` object.
@@ -480,7 +482,7 @@ class alignas(16) mutable_column_device_view : public detail::column_device_view
*/
static std::unique_ptr<mutable_column_device_view,
std::function<void(mutable_column_device_view*)>>
- create(mutable_column_view source_view, cudaStream_t stream = 0);
+ create(mutable_column_view source_view, rmm::cuda_stream_view stream = rmm::cuda_stream_default);

/**
* @brief Returns pointer to the base device memory allocation casted to
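A sketch of the factory above in use: materialize the device view on the caller's stream, then pass it by value to a kernel. `my_kernel` and `launch` are hypothetical; `create()` is the factory declared above:

```c++
#include <cudf/column/column_device_view.cuh>

#include <rmm/cuda_stream_view.hpp>

__global__ void my_kernel(cudf::column_device_view col);  // hypothetical kernel

void launch(cudf::column_view input, rmm::cuda_stream_view stream)
{
  // create() returns a unique_ptr whose deleter frees the device-side copy.
  auto d_input = cudf::column_device_view::create(input, stream);
  my_kernel<<<1, 256, 0, stream.value()>>>(*d_input);
}
```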
81 changes: 40 additions & 41 deletions cpp/include/cudf/copying.hpp
@@ -286,6 +286,46 @@ std::unique_ptr<column> copy_range(
size_type target_begin,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

+ /**
+ * @brief Creates a new column by shifting all values by an offset.
+ *
+ * @ingroup copy_shift
+ *
+ * Elements will be determined by `output[idx] = input[idx - offset]`.
+ * Some elements in the output may be indeterminable from the input. For those
+ * elements, the value will be determined by `fill_values`.
+ *
+ * @code{.pseudo}
+ * Examples
+ * -------------------------------------------------
+ * input = [0, 1, 2, 3, 4]
+ * offset = 3
+ * fill_values = @
+ * return = [@, @, @, 0, 1]
+ * -------------------------------------------------
+ * input = [5, 4, 3, 2, 1]
+ * offset = -2
+ * fill_values = 7
+ * return = [3, 2, 1, 7, 7]
+ * @endcode
+ *
+ * @note if the input is nullable, the output will be nullable.
+ * @note if the fill value is null, the output will be nullable.
+ *
+ * @param input Column to be shifted.
+ * @param offset The offset by which to shift the input.
+ * @param fill_value Fill value for indeterminable outputs.
+ * @param mr Device memory resource used to allocate the returned result's device memory
+ *
+ * @throw cudf::logic_error if @p input dtype is not fixed-width.
+ * @throw cudf::logic_error if @p fill_value dtype does not match @p input dtype.
+ */
+ std::unique_ptr<column> shift(
+ column_view const& input,
+ size_type offset,
+ scalar const& fill_value,
+ rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @brief Slices a `column_view` into a set of `column_view`s according to a set of indices.
*
@@ -479,7 +519,6 @@ struct contiguous_split_result {
* @param input View of a table to split
* @param splits A vector of indices where the view will be split
* @param[in] mr Device memory resource used to allocate the returned result's device memory
- * @param[in] stream CUDA stream used for device memory operations and kernel launches.
* @return The set of requested views of `input` indicated by the `splits` and the viewed memory
* buffer.
*/
@@ -513,46 +552,6 @@ std::unique_ptr<column> copy_if_else(
column_view const& boolean_mask,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

- /**
- * @brief Creates a new column by shifting all values by an offset.
- *
- * @ingroup copy_shift
- *
- * Elements will be determined by `output[idx] = input[idx - offset]`.
- * Some elements in the output may be indeterminable from the input. For those
- * elements, the value will be determined by `fill_values`.
- *
- * @code{.pseudo}
- * Examples
- * -------------------------------------------------
- * input = [0, 1, 2, 3, 4]
- * offset = 3
- * fill_values = @
- * return = [@, @, @, 0, 1]
- * -------------------------------------------------
- * input = [5, 4, 3, 2, 1]
- * offset = -2
- * fill_values = 7
- * return = [3, 2, 1, 7, 7]
- * @endcode
- *
- * @note if the input is nullable, the output will be nullable.
- * @note if the fill value is null, the output will be nullable.
- *
- * @param input Column to be shifted.
- * @param offset The offset by which to shift the input.
- * @param fill_value Fill value for indeterminable outputs.
- *
- * @throw cudf::logic_error if @p input dtype is not fixed-with.
- * @throw cudf::logic_error if @p fill_value dtype does not match @p input dtype.
- */
- std::unique_ptr<column> shift(
- column_view const& input,
- size_type offset,
- scalar const& fill_value,
- rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(),
- cudaStream_t stream = 0);

/**
* @brief Returns a new column, where each element is selected from either @p lhs or
* @p rhs based on the value of the corresponding element in @p boolean_mask
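An illustrative call to the relocated public `shift` API above; note the external overload no longer takes a stream. `shift_right_two` is an invented helper:

```c++
#include <cudf/copying.hpp>
#include <cudf/scalar/scalar.hpp>

#include <memory>

std::unique_ptr<cudf::column> shift_right_two(cudf::column_view const& input)
{
  cudf::numeric_scalar<int32_t> fill{7};  // fill value for the two vacated leading slots
  return cudf::shift(input, 2, fill);     // allocates from the current device resource
}
```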