Update bitmask_and and bitmask_or to return a pair of resulting mask and count of unset bits (#9616)

Closes #9176

- [x] Update `bitmask_and` and `bitmask_or` to return both resulting mask and count of unset bits
- [x] Refactor related implementations to use new `bitmask_and/or`
- [x] Update unit tests
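
For reference, a minimal sketch of how a caller consumes the new pair-returning API (illustrative only — the helper name and the public factory overload used here are assumptions, mirroring the `binaryop.cpp` change shown further down):

```cpp
#include <cudf/column/column_factories.hpp>
#include <cudf/null_mask.hpp>
#include <cudf/table/table_view.hpp>

#include <memory>
#include <utility>

// Hypothetical helper: AND the null masks of two columns and attach the result
// with the exact null count returned by the updated API, instead of passing
// cudf::UNKNOWN_NULL_COUNT.
std::unique_ptr<cudf::column> make_anded_column(cudf::column_view const& lhs,
                                                cudf::column_view const& rhs,
                                                cudf::data_type output_type)
{
  auto [new_mask, null_count] = cudf::bitmask_and(cudf::table_view({lhs, rhs}));
  return cudf::make_fixed_width_column(
    output_type, lhs.size(), std::move(new_mask), null_count);
}
```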

Authors:
  - Yunsong Wang (https://github.com/PointKernel)

Approvers:
  - Mike Wilson (https://github.com/hyperbolic2346)
  - Bradley Dice (https://github.com/bdice)
  - Jason Lowe (https://github.com/jlowe)

URL: #9616
PointKernel authored Nov 11, 2021
1 parent 5402787 commit ba2b51d
Showing 15 changed files with 187 additions and 139 deletions.
103 changes: 66 additions & 37 deletions cpp/include/cudf/detail/null_mask.cuh
@@ -23,46 +23,71 @@
#include <cudf/utilities/span.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_scalar.hpp>

namespace cudf {
namespace detail {
/**
* @brief Computes the merger of an array of bitmasks using a binary operator
*
* @tparam block_size Number of threads in each thread block
* @tparam Binop Type of binary operator
*
* @param op The binary operator used to combine the bitmasks
* @param destination The bitmask to write result into
* @param source Array of source mask pointers. All masks must be of same size
* @param source_begin_bits Array of offsets into corresponding @p source masks.
* Must be same size as source array
* @param source_size_bits Number of bits in each mask in @p source
* @param count_ptr Pointer to counter of set bits
*/
template <typename Binop>
template <int block_size, typename Binop>
__global__ void offset_bitmask_binop(Binop op,
device_span<bitmask_type> destination,
device_span<bitmask_type const*> source,
device_span<size_type const> source_begin_bits,
size_type source_size_bits)
size_type source_size_bits,
size_type* count_ptr)
{
for (size_type destination_word_index = threadIdx.x + blockIdx.x * blockDim.x;
destination_word_index < destination.size();
constexpr auto const word_size{detail::size_in_bits<bitmask_type>()};
auto const tid = threadIdx.x + blockIdx.x * blockDim.x;

size_type thread_count = 0;

for (size_type destination_word_index = tid; destination_word_index < destination.size();
destination_word_index += blockDim.x * gridDim.x) {
bitmask_type destination_word =
detail::get_mask_offset_word(source[0],
destination_word_index,
source_begin_bits[0],
source_begin_bits[0] + source_size_bits);
for (size_type i = 1; i < source.size(); i++) {
destination_word =

op(destination_word,
detail::get_mask_offset_word(source[i],
destination_word_index,
source_begin_bits[i],
source_begin_bits[i] + source_size_bits));
destination_word = op(destination_word,
detail::get_mask_offset_word(source[i],
destination_word_index,
source_begin_bits[i],
source_begin_bits[i] + source_size_bits));
}

destination[destination_word_index] = destination_word;
thread_count += __popc(destination_word);
}

// Subtract any slack bits from the last word
if (tid == 0) {
size_type const last_bit_index = source_size_bits - 1;
size_type const num_slack_bits = word_size - (last_bit_index % word_size) - 1;
if (num_slack_bits > 0) {
size_type const word_index = cudf::word_index(last_bit_index);
thread_count -= __popc(destination[word_index] & set_most_significant_bits(num_slack_bits));
}
}

using BlockReduce = cub::BlockReduce<size_type, block_size>;
__shared__ typename BlockReduce::TempStorage temp_storage;
size_type block_count = BlockReduce(temp_storage).Sum(thread_count);

if (threadIdx.x == 0) { atomicAdd(count_ptr, block_count); }
}

/**
@@ -72,7 +97,7 @@ __global__ void offset_bitmask_binop(Binop op,
* @param stream CUDA stream used for device memory operations and kernel launches
*/
template <typename Binop>
rmm::device_buffer bitmask_binop(
std::pair<rmm::device_buffer, size_type> bitmask_binop(
Binop op,
host_span<bitmask_type const*> masks,
host_span<size_type const> masks_begin_bits,
@@ -81,34 +106,35 @@ rmm::device_buffer bitmask_binop(
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
{
auto dest_mask = rmm::device_buffer{bitmask_allocation_size_bytes(mask_size_bits), stream, mr};

inplace_bitmask_binop(op,
device_span<bitmask_type>(static_cast<bitmask_type*>(dest_mask.data()),
num_bitmask_words(mask_size_bits)),
masks,
masks_begin_bits,
mask_size_bits,
stream,
mr);

return dest_mask;
auto null_count =
mask_size_bits -
inplace_bitmask_binop(op,
device_span<bitmask_type>(static_cast<bitmask_type*>(dest_mask.data()),
num_bitmask_words(mask_size_bits)),
masks,
masks_begin_bits,
mask_size_bits,
stream,
mr);

return std::make_pair(std::move(dest_mask), null_count);
}

/**
* @brief Performs a merge of the specified bitmasks using the binary operator
* provided, and writes in place to destination
* provided, writes in place to destination and returns count of set bits
*
* @param op The binary operator used to combine the bitmasks
* @param dest_mask Destination to which the merged result is written
* @param masks The list of data pointers of the bitmasks to be merged
* @param masks_begin_bits The bit offsets from which each mask is to be merged
* @param mask_size_bits The number of bits to be ANDed in each mask
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned device_buffer
* @return rmm::device_buffer Output bitmask
* @param[in] op The binary operator used to combine the bitmasks
* @param[out] dest_mask Destination to which the merged result is written
* @param[in] masks The list of data pointers of the bitmasks to be merged
* @param[in] masks_begin_bits The bit offsets from which each mask is to be merged
* @param[in] mask_size_bits The number of bits to be ANDed in each mask
* @param[in] stream CUDA stream used for device memory operations and kernel launches
* @param[in] mr Device memory resource used to allocate the returned device_buffer
* @return size_type Count of set bits
*/
template <typename Binop>
void inplace_bitmask_binop(
size_type inplace_bitmask_binop(
Binop op,
device_span<bitmask_type> dest_mask,
host_span<bitmask_type const*> masks,
@@ -124,6 +150,7 @@ void inplace_bitmask_binop(
CUDF_EXPECTS(std::all_of(masks.begin(), masks.end(), [](auto p) { return p != nullptr; }),
"Mask pointer cannot be null");

rmm::device_scalar<size_type> d_counter{0, stream, mr};
rmm::device_uvector<bitmask_type const*> d_masks(masks.size(), stream, mr);
rmm::device_uvector<size_type> d_begin_bits(masks_begin_bits.size(), stream, mr);

@@ -135,11 +162,13 @@ void inplace_bitmask_binop(
cudaMemcpyHostToDevice,
stream.value()));

cudf::detail::grid_1d config(dest_mask.size(), 256);
offset_bitmask_binop<<<config.num_blocks, config.num_threads_per_block, 0, stream.value()>>>(
op, dest_mask, d_masks, d_begin_bits, mask_size_bits);
auto constexpr block_size = 256;
cudf::detail::grid_1d config(dest_mask.size(), block_size);
offset_bitmask_binop<block_size>
<<<config.num_blocks, config.num_threads_per_block, 0, stream.value()>>>(
op, dest_mask, d_masks, d_begin_bits, mask_size_bits, d_counter.data());
CHECK_CUDA(stream.value());
stream.synchronize();
return d_counter.value(stream);
}

/**
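
The counting logic added to `offset_bitmask_binop` follows a standard pattern: each thread popcounts the words it writes in a grid-stride loop, `cub::BlockReduce` sums the per-thread partials, and thread 0 of each block issues a single `atomicAdd` into a global counter. A standalone sketch of that pattern (illustration only; names and sizes are made up, and the slack-bit correction for the last word is omitted):

```cuda
#include <cub/block/block_reduce.cuh>

#include <cstdio>

constexpr int block_size = 256;

__global__ void count_set_bits(unsigned const* words, int num_words, int* count_ptr)
{
  // Per-thread partial count over a grid-stride loop, as in the kernel above.
  int thread_count = 0;
  for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < num_words;
       i += blockDim.x * gridDim.x) {
    thread_count += __popc(words[i]);
  }

  // Block-wide reduction of the per-thread partial counts.
  using BlockReduce = cub::BlockReduce<int, block_size>;
  __shared__ typename BlockReduce::TempStorage temp_storage;
  int block_count = BlockReduce(temp_storage).Sum(thread_count);

  // One atomic per block keeps contention on the global counter low.
  if (threadIdx.x == 0) { atomicAdd(count_ptr, block_count); }
}

int main()
{
  constexpr int num_words = 1024;
  unsigned* d_words;
  int* d_count;
  cudaMalloc(&d_words, num_words * sizeof(unsigned));
  cudaMemset(d_words, 0xFF, num_words * sizeof(unsigned));  // every bit set
  cudaMalloc(&d_count, sizeof(int));
  cudaMemset(d_count, 0, sizeof(int));

  count_set_bits<<<(num_words + block_size - 1) / block_size, block_size>>>(
    d_words, num_words, d_count);

  int h_count = 0;
  cudaMemcpy(&h_count, d_count, sizeof(int), cudaMemcpyDeviceToHost);
  printf("set bits: %d (expected %d)\n", h_count, num_words * 32);

  cudaFree(d_words);
  cudaFree(d_count);
  return 0;
}
```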
6 changes: 3 additions & 3 deletions cpp/include/cudf/detail/null_mask.hpp
@@ -114,7 +114,7 @@ rmm::device_buffer copy_bitmask(
*
* @param stream CUDA stream used for device memory operations and kernel launches
*/
rmm::device_buffer bitmask_and(
std::pair<rmm::device_buffer, size_type> bitmask_and(
host_span<bitmask_type const*> masks,
host_span<size_type const> masks_begin_bits,
size_type mask_size_bits,
@@ -126,7 +126,7 @@ rmm::device_buffer bitmask_and(
*
* @param[in] stream CUDA stream used for device memory operations and kernel launches.
*/
rmm::device_buffer bitmask_and(
std::pair<rmm::device_buffer, size_type> bitmask_and(
table_view const& view,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
@@ -136,7 +136,7 @@ rmm::device_buffer bitmask_and(
*
* @param[in] stream CUDA stream used for device memory operations and kernel launches.
*/
rmm::device_buffer bitmask_or(
std::pair<rmm::device_buffer, size_type> bitmask_or(
table_view const& view,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
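
A sketch of how the span-based detail overload above might be called on an explicit stream (illustration only; the wrapper function, the zero bit offsets, and the trailing stream/mr arguments beyond the lines shown here are assumptions):

```cpp
#include <cudf/detail/null_mask.hpp>
#include <cudf/types.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/mr/device/per_device_resource.hpp>

#include <utility>
#include <vector>

// Hypothetical wrapper: AND two device bitmasks of num_bits bits, both starting
// at bit offset 0, returning the merged mask and its unset-bit count.
std::pair<rmm::device_buffer, cudf::size_type> and_two_masks(
  cudf::bitmask_type const* mask_a,
  cudf::bitmask_type const* mask_b,
  cudf::size_type num_bits,
  rmm::cuda_stream_view stream)
{
  std::vector<cudf::bitmask_type const*> masks{mask_a, mask_b};
  std::vector<cudf::size_type> begin_bits{0, 0};
  return cudf::detail::bitmask_and(
    masks, begin_bits, num_bits, stream, rmm::mr::get_current_device_resource());
}
```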
14 changes: 8 additions & 6 deletions cpp/include/cudf/null_mask.hpp
@@ -202,30 +202,32 @@ rmm::device_buffer copy_bitmask(
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @brief Returns a bitwise AND of the bitmasks of columns of a table
* @brief Performs bitwise AND of the bitmasks of columns of a table. Returns
* a pair of resulting mask and count of unset bits.
*
* If any of the columns isn't nullable, it is considered all valid.
* If no column in the table is nullable, an empty bitmask is returned.
*
* @param view The table of columns
* @param mr Device memory resource used to allocate the returned device_buffer
* @return rmm::device_buffer Output bitmask
* @return A pair of resulting bitmask and count of unset bits
*/
rmm::device_buffer bitmask_and(
std::pair<rmm::device_buffer, size_type> bitmask_and(
table_view const& view,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @brief Returns a bitwise OR of the bitmasks of columns of a table
* @brief Performs bitwise OR of the bitmasks of columns of a table. Returns
* a pair of resulting mask and count of unset bits.
*
* If any of the columns isn't nullable, it is considered all valid.
* If no column in the table is nullable, an empty bitmask is returned.
*
* @param view The table of columns
* @param mr Device memory resource used to allocate the returned device_buffer
* @return rmm::device_buffer Output bitmask
* @return A pair of resulting bitmask and count of unset bits
*/
rmm::device_buffer bitmask_or(
std::pair<rmm::device_buffer, size_type> bitmask_or(
table_view const& view,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

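
As the doc comments above note, an empty bitmask is returned when no input column is nullable. A hedged sketch of a caller handling both outcomes (the helper and the use of `column::set_null_mask` are illustrative assumptions, not code from this commit):

```cpp
#include <cudf/column/column.hpp>
#include <cudf/null_mask.hpp>
#include <cudf/table/table_view.hpp>

#include <utility>

// Hypothetical helper: attach the OR of a table's null masks to an existing column.
// When no column in the table is nullable, the returned buffer is empty and the
// unset-bit count is zero, so the column is left fully valid.
void attach_or_mask(cudf::column& result, cudf::table_view const& input_table)
{
  auto [mask, null_count] = cudf::bitmask_or(input_table);
  if (mask.size() > 0) { result.set_null_mask(std::move(mask), null_count); }
}
```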
10 changes: 5 additions & 5 deletions cpp/src/binaryop/binaryop.cpp
@@ -392,9 +392,9 @@ std::unique_ptr<column> make_fixed_width_column_for_output(column_view const& lhs
if (binops::is_null_dependent(op)) {
return make_fixed_width_column(output_type, rhs.size(), mask_state::ALL_VALID, stream, mr);
} else {
auto new_mask = cudf::detail::bitmask_and(table_view({lhs, rhs}), stream, mr);
auto [new_mask, null_count] = cudf::detail::bitmask_and(table_view({lhs, rhs}), stream, mr);
return make_fixed_width_column(
output_type, lhs.size(), std::move(new_mask), cudf::UNKNOWN_NULL_COUNT, stream, mr);
output_type, lhs.size(), std::move(new_mask), null_count, stream, mr);
}
};

@@ -799,9 +799,9 @@ std::unique_ptr<column> binary_operation(column_view const& lhs,

CUDF_EXPECTS((lhs.size() == rhs.size()), "Column sizes don't match");

auto new_mask = bitmask_and(table_view({lhs, rhs}), stream, mr);
auto out = make_fixed_width_column(
output_type, lhs.size(), std::move(new_mask), cudf::UNKNOWN_NULL_COUNT, stream, mr);
auto [new_mask, null_count] = bitmask_and(table_view({lhs, rhs}), stream, mr);
auto out =
make_fixed_width_column(output_type, lhs.size(), std::move(new_mask), null_count, stream, mr);

// Check for 0 sized data
if (lhs.is_empty() or rhs.is_empty()) return out;
