Update bitmask_and and bitmask_or to return a pair of resulting mask and count of unset bits #9616

Merged 17 commits on Nov 11, 2021. The diff below shows changes from 5 commits.
97 changes: 57 additions & 40 deletions cpp/include/cudf/detail/null_mask.cuh
@@ -23,26 +23,34 @@
#include <cudf/utilities/span.hpp>

#include <rmm/cuda_stream_view.hpp>
+ #include <rmm/device_scalar.hpp>

namespace cudf {
namespace detail {
/**
* @brief Computes the merger of an array of bitmasks using a binary operator
*
+ * @tparam block_size Number of threads in each thread block
+ * @tparam Binop Type of binary operator
+ *
* @param op The binary operator used to combine the bitmasks
* @param destination The bitmask to write result into
* @param source Array of source mask pointers. All masks must be of same size
* @param source_begin_bits Array of offsets into corresponding @p source masks.
* Must be same size as source array
* @param source_size_bits Number of bits in each mask in @p source
+ * @param valid_count_ptr Pointer to valid-bit counter
*/
- template <typename Binop>
+ template <int block_size, typename Binop>
__global__ void offset_bitmask_binop(Binop op,
device_span<bitmask_type> destination,
device_span<bitmask_type const*> source,
device_span<size_type const> source_begin_bits,
- size_type source_size_bits)
+ size_type source_size_bits,
+ size_type* valid_count_ptr)
{
+ size_type thread_valid_count = 0;

for (size_type destination_word_index = threadIdx.x + blockIdx.x * blockDim.x;
destination_word_index < destination.size();
destination_word_index += blockDim.x * gridDim.x) {
@@ -52,17 +60,22 @@ __global__ void offset_bitmask_binop(Binop op,
source_begin_bits[0],
source_begin_bits[0] + source_size_bits);
for (size_type i = 1; i < source.size(); i++) {
- destination_word =
-   op(destination_word,
-      detail::get_mask_offset_word(source[i],
-                                   destination_word_index,
-                                   source_begin_bits[i],
-                                   source_begin_bits[i] + source_size_bits));
+ destination_word = op(destination_word,
+                       detail::get_mask_offset_word(source[i],
+                                                    destination_word_index,
+                                                    source_begin_bits[i],
+                                                    source_begin_bits[i] + source_size_bits));
}

destination[destination_word_index] = destination_word;
+ thread_valid_count += __popc(destination_word);
}

+ using BlockReduce = cub::BlockReduce<size_type, block_size>;
+ __shared__ typename BlockReduce::TempStorage temp_storage;
+ size_type block_valid_count = BlockReduce(temp_storage).Sum(thread_valid_count);
+
+ if (threadIdx.x == 0) { atomicAdd(valid_count_ptr, block_valid_count); }
}

/**
@@ -72,43 +85,44 @@ __global__ void offset_bitmask_binop(Binop op,
* @param stream CUDA stream used for device memory operations and kernel launches
*/
template <typename Binop>
- rmm::device_buffer bitmask_binop(
-   Binop op,
-   host_span<bitmask_type const*> masks,
-   host_span<size_type const> masks_begin_bits,
-   size_type mask_size_bits,
-   rmm::cuda_stream_view stream,
-   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
+ bitmask bitmask_binop(Binop op,
+                       host_span<bitmask_type const*> masks,
+                       host_span<size_type const> masks_begin_bits,
+                       size_type mask_size_bits,
+                       rmm::cuda_stream_view stream,
+                       rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
{
auto dest_mask = rmm::device_buffer{bitmask_allocation_size_bytes(mask_size_bits), stream, mr};

- inplace_bitmask_binop(op,
-                       device_span<bitmask_type>(static_cast<bitmask_type*>(dest_mask.data()),
-                                                 num_bitmask_words(mask_size_bits)),
-                       masks,
-                       masks_begin_bits,
-                       mask_size_bits,
-                       stream,
-                       mr);
-
- return dest_mask;
+ auto valid_count =
+   inplace_bitmask_binop(op,
+                         device_span<bitmask_type>(static_cast<bitmask_type*>(dest_mask.data()),
+                                                   num_bitmask_words(mask_size_bits)),
+                         masks,
+                         masks_begin_bits,
+                         mask_size_bits,
+                         stream,
+                         mr);
+ auto null_count = mask_size_bits - valid_count;
+
+ return bitmask{std::move(dest_mask), valid_count, null_count};
}

/**
* @brief Performs a merge of the specified bitmasks using the binary operator
- * provided, and writes in place to destination
+ * provided, writes in place to destination and returns count of valid bits
*
- * @param op The binary operator used to combine the bitmasks
- * @param dest_mask Destination to which the merged result is written
- * @param masks The list of data pointers of the bitmasks to be merged
- * @param masks_begin_bits The bit offsets from which each mask is to be merged
- * @param mask_size_bits The number of bits to be ANDed in each mask
- * @param stream CUDA stream used for device memory operations and kernel launches
- * @param mr Device memory resource used to allocate the returned device_buffer
- * @return rmm::device_buffer Output bitmask
+ * @param[in] op The binary operator used to combine the bitmasks
+ * @param[out] dest_mask Destination to which the merged result is written
+ * @param[in] masks The list of data pointers of the bitmasks to be merged
+ * @param[in] masks_begin_bits The bit offsets from which each mask is to be merged
+ * @param[in] mask_size_bits The number of bits to be ANDed in each mask
+ * @param[in] stream CUDA stream used for device memory operations and kernel launches
+ * @param[in] mr Device memory resource used to allocate the returned device_buffer
+ * @return size_type Count of valid bits
*/
template <typename Binop>
void inplace_bitmask_binop(
size_type inplace_bitmask_binop(
Binop op,
device_span<bitmask_type> dest_mask,
host_span<bitmask_type const*> masks,
@@ -124,6 +138,7 @@ void inplace_bitmask_binop(
CUDF_EXPECTS(std::all_of(masks.begin(), masks.end(), [](auto p) { return p != nullptr; }),
"Mask pointer cannot be null");

+ rmm::device_scalar<size_type> d_counter{0, stream, mr};
rmm::device_uvector<bitmask_type const*> d_masks(masks.size(), stream, mr);
rmm::device_uvector<size_type> d_begin_bits(masks_begin_bits.size(), stream, mr);

@@ -135,11 +150,13 @@ void inplace_bitmask_binop(
cudaMemcpyHostToDevice,
stream.value()));

- cudf::detail::grid_1d config(dest_mask.size(), 256);
- offset_bitmask_binop<<<config.num_blocks, config.num_threads_per_block, 0, stream.value()>>>(
-   op, dest_mask, d_masks, d_begin_bits, mask_size_bits);
+ auto constexpr block_size = 256;
+ cudf::detail::grid_1d config(dest_mask.size(), block_size);
+ offset_bitmask_binop<block_size>
+   <<<config.num_blocks, config.num_threads_per_block, 0, stream.value()>>>(
+     op, dest_mask, d_masks, d_begin_bits, mask_size_bits, d_counter.data());
CHECK_CUDA(stream.value());
+ stream.synchronize();
+ return d_counter.value(stream);
}

/**
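
For reference, the counting pattern added to offset_bitmask_binop above can be illustrated in isolation. The following is a minimal, self-contained CUDA sketch (not part of this PR; the kernel and parameter names are illustrative only): each thread popcounts the mask words it processes in a grid-stride loop, cub::BlockReduce sums the per-thread counts within a block, and thread 0 of each block atomically adds the block total into a global counter.

#include <cub/block/block_reduce.cuh>

#include <cstdint>

template <int block_size>
__global__ void count_set_bits_kernel(uint32_t const* words, int num_words, int* d_count)
{
  // d_count must be zero-initialized before launch (the PR uses an rmm::device_scalar for this).
  int thread_count = 0;

  // Grid-stride loop: each thread popcounts the 32-bit words it owns.
  for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < num_words;
       i += blockDim.x * gridDim.x) {
    thread_count += __popc(words[i]);
  }

  // Sum the per-thread counts within this block.
  using BlockReduce = cub::BlockReduce<int, block_size>;
  __shared__ typename BlockReduce::TempStorage temp_storage;
  int block_count = BlockReduce(temp_storage).Sum(thread_count);

  // A single atomic per block accumulates into the global counter.
  if (threadIdx.x == 0) { atomicAdd(d_count, block_count); }
}

This mirrors the kernel change above: one popcount per written word and one atomic per block, so the valid count comes out of the same launch that writes the merged mask.
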
21 changes: 11 additions & 10 deletions cpp/include/cudf/detail/null_mask.hpp
@@ -23,6 +23,9 @@
#include <vector>

namespace cudf {

+ struct bitmask;

namespace detail {

/**
@@ -114,22 +117,20 @@ rmm::device_buffer copy_bitmask(
*
* @param stream CUDA stream used for device memory operations and kernel launches
*/
- rmm::device_buffer bitmask_and(
-   host_span<bitmask_type const*> masks,
-   host_span<size_type const> masks_begin_bits,
-   size_type mask_size_bits,
-   rmm::cuda_stream_view stream,
-   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+ bitmask bitmask_and(host_span<bitmask_type const*> masks,
+                     host_span<size_type const> masks_begin_bits,
+                     size_type mask_size_bits,
+                     rmm::cuda_stream_view stream,
+                     rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @copydoc cudf::bitmask_and
*
* @param[in] stream CUDA stream used for device memory operations and kernel launches.
*/
- rmm::device_buffer bitmask_and(
-   table_view const& view,
-   rmm::cuda_stream_view stream,
-   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+ bitmask bitmask_and(table_view const& view,
+                     rmm::cuda_stream_view stream,
+                     rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @copydoc cudf::bitmask_or
19 changes: 14 additions & 5 deletions cpp/include/cudf/null_mask.hpp
@@ -24,6 +24,15 @@

namespace cudf {

+ /**
+  * @brief Bitmask output type.
+  */
+ struct bitmask {
+   rmm::device_buffer mask;   ///< Resulting bitmask
+   size_type num_set_bits;    ///< Number of set bits
+   size_type num_unset_bits;  ///< Number of unset bits
+ };

/**
* @addtogroup column_nullmask
* @{
@@ -202,18 +211,18 @@ rmm::device_buffer copy_bitmask(
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
- * @brief Returns a bitwise AND of the bitmasks of columns of a table
+ * @brief Returns a struct containing the bitwise AND of the bitmasks of the columns of a table,
+ * along with the counts of valid and null bits
*
* If any of the columns isn't nullable, it is considered all valid.
* If no column in the table is nullable, an empty bitmask is returned.
*
* @param view The table of columns
* @param mr Device memory resource used to allocate the returned device_buffer
- * @return rmm::device_buffer Output bitmask
+ * @return A bitmask struct containing the resulting mask, the count of valid bits, and the count of null bits
*/
- rmm::device_buffer bitmask_and(
-   table_view const& view,
-   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+ bitmask bitmask_and(table_view const& view,
+                     rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @brief Returns a bitwise OR of the bitmasks of columns of a table
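
A brief usage sketch of the new public interface declared above (assuming the bitmask struct and the bitmask_and(table_view, mr) overload from this diff; the helper function below is hypothetical and not part of libcudf):

#include <cudf/null_mask.hpp>
#include <cudf/table/table_view.hpp>
#include <cudf/types.hpp>

#include <rmm/device_buffer.hpp>

#include <utility>

// Hypothetical helper: AND together the bitmasks of all columns in `tbl` and
// report the null count of the merged mask, without a second pass over the bits.
std::pair<rmm::device_buffer, cudf::size_type> merged_mask_and_null_count(
  cudf::table_view const& tbl)
{
  cudf::bitmask result = cudf::bitmask_and(tbl);
  return {std::move(result.mask), result.num_unset_bits};
}

The caller gets both the merged mask and its null count from a single kernel launch, which is the point of this change.
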
20 changes: 14 additions & 6 deletions cpp/src/binaryop/binaryop.cpp
@@ -392,9 +392,13 @@ std::unique_ptr<column> make_fixed_width_column_for_output(column_view const& lhs
if (binops::is_null_dependent(op)) {
return make_fixed_width_column(output_type, rhs.size(), mask_state::ALL_VALID, stream, mr);
} else {
- auto new_mask = cudf::detail::bitmask_and(table_view({lhs, rhs}), stream, mr);
- return make_fixed_width_column(
-   output_type, lhs.size(), std::move(new_mask), cudf::UNKNOWN_NULL_COUNT, stream, mr);
+ auto bitmask_output = cudf::detail::bitmask_and(table_view({lhs, rhs}), stream, mr);
+ return make_fixed_width_column(output_type,
+                                lhs.size(),
+                                std::move(bitmask_output.mask),
+                                bitmask_output.num_unset_bits,
+                                stream,
+                                mr);
}
};

@@ -799,9 +803,13 @@ std::unique_ptr<column> binary_operation(column_view const& lhs,

CUDF_EXPECTS((lhs.size() == rhs.size()), "Column sizes don't match");

- auto new_mask = bitmask_and(table_view({lhs, rhs}), stream, mr);
- auto out = make_fixed_width_column(
-   output_type, lhs.size(), std::move(new_mask), cudf::UNKNOWN_NULL_COUNT, stream, mr);
+ auto bitmask_output = bitmask_and(table_view({lhs, rhs}), stream, mr);
+ auto out = make_fixed_width_column(output_type,
+                                    lhs.size(),
+                                    std::move(bitmask_output.mask),
+                                    bitmask_output.num_unset_bits,
+                                    stream,
+                                    mr);

// Check for 0 sized data
if (lhs.is_empty() or rhs.is_empty()) return out;
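
For context on why these call sites now pass bitmask_output.num_unset_bits instead of cudf::UNKNOWN_NULL_COUNT: with an unknown count, the null count is only resolved later by an extra pass over the mask. Below is a simplified sketch of that resolution step (not libcudf's actual implementation; it assumes the public cudf::count_unset_bits(mask, start, stop) and cudf::UNKNOWN_NULL_COUNT declarations):

#include <cudf/null_mask.hpp>
#include <cudf/types.hpp>

// If a precomputed count is available (as with this PR), return it directly;
// otherwise fall back to counting unset bits, which costs another pass over the mask.
cudf::size_type resolve_null_count(cudf::bitmask_type const* mask,
                                   cudf::size_type size,
                                   cudf::size_type cached_null_count)
{
  if (cached_null_count != cudf::UNKNOWN_NULL_COUNT) { return cached_null_count; }
  return mask != nullptr ? cudf::count_unset_bits(mask, 0, size) : 0;
}

Passing the count computed by the AND kernel means the fallback path is never taken for these columns.
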