Strings batch construction #16727

Draft
wants to merge 12 commits into base: branch-24.10
Changes from 9 commits
1 change: 1 addition & 0 deletions cpp/benchmarks/CMakeLists.txt
@@ -339,6 +339,7 @@ ConfigureNVBench(
string/join_strings.cpp
string/lengths.cpp
string/like.cpp
string/make_strings_column.cu
string/replace_re.cpp
string/reverse.cpp
string/split.cpp
116 changes: 116 additions & 0 deletions cpp/benchmarks/string/make_strings_column.cu
@@ -0,0 +1,116 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <benchmarks/common/generate_input.hpp>

#include <cudf/column/column_device_view.cuh>
#include <cudf/column/column_factories.hpp>

#include <rmm/device_uvector.hpp>
#include <rmm/exec_policy.hpp>

#include <thrust/pair.h>
#include <thrust/tabulate.h>

#include <nvbench/nvbench.cuh>

#include <vector>

static void BM_make_strings_column(nvbench::state& state)
{
auto const num_rows = static_cast<cudf::size_type>(state.get_int64("num_rows"));
auto const has_nulls = static_cast<bool>(state.get_int64("has_nulls"));

constexpr int min_row_width = 0;
constexpr int max_row_width = 50;
data_profile const table_profile =
data_profile_builder()
.distribution(cudf::type_id::STRING, distribution_id::NORMAL, min_row_width, max_row_width)
.null_probability(has_nulls ? std::optional<double>{0.1} : std::nullopt);
auto const data_table =
create_random_table({cudf::type_id::STRING}, row_count{num_rows}, table_profile);

using string_index_pair = thrust::pair<char const*, cudf::size_type>;
auto const stream = cudf::get_default_stream();
auto input = rmm::device_uvector<string_index_pair>(data_table->num_rows(), stream);
auto const d_data_ptr =
cudf::column_device_view::create(data_table->get_column(0).view(), stream);
thrust::tabulate(rmm::exec_policy(stream),
input.begin(),
input.end(),
[data_col = *d_data_ptr] __device__(auto const idx) {
auto const row = data_col.element<cudf::string_view>(idx);
return string_index_pair{row.data(), row.size_bytes()};
});

state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value()));
state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
[[maybe_unused]] auto const output = cudf::make_strings_column(input, stream);
});
}

static void BM_make_strings_column_batch(nvbench::state& state)
{
auto const num_rows = static_cast<cudf::size_type>(state.get_int64("num_rows"));
auto const has_nulls = static_cast<bool>(state.get_int64("has_nulls"));
auto const batch_size = static_cast<cudf::size_type>(state.get_int64("batch_size"));

constexpr int min_row_width = 0;
constexpr int max_row_width = 50;
data_profile const table_profile =
data_profile_builder()
.distribution(cudf::type_id::STRING, distribution_id::NORMAL, min_row_width, max_row_width)
.null_probability(has_nulls ? std::optional<double>{0.1} : std::nullopt);
auto const data_table = create_random_table(
cycle_dtypes({cudf::type_id::STRING}, batch_size), row_count{num_rows}, table_profile);

using string_index_pair = thrust::pair<char const*, cudf::size_type>;
auto const stream = cudf::get_default_stream();
auto input_data = std::vector<rmm::device_uvector<string_index_pair>>{};
auto input = std::vector<cudf::device_span<string_index_pair const>>{};
input_data.reserve(batch_size);
input.reserve(batch_size);
for (auto i = 0; i < batch_size; ++i) {
auto const d_data_ptr =
cudf::column_device_view::create(data_table->get_column(i).view(), stream);
auto batch_input = rmm::device_uvector<string_index_pair>(data_table->num_rows(), stream);
thrust::tabulate(rmm::exec_policy(stream),
batch_input.begin(),
batch_input.end(),
[data_col = *d_data_ptr] __device__(auto const idx) {
auto const row = data_col.element<cudf::string_view>(idx);
return string_index_pair{row.data(), row.size_bytes()};
});
input_data.emplace_back(std::move(batch_input));
input.emplace_back(input_data.back());
}

state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value()));
state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
[[maybe_unused]] auto const output = cudf::make_strings_column_batch(input, stream);
});
}

NVBENCH_BENCH(BM_make_strings_column)
.set_name("make_strings_column")
.add_int64_axis("num_rows", {100'000, 1'000'000, 10'000'000, 100'000'000, 200'000'000})
.add_int64_axis("has_nulls", {0, 1});

NVBENCH_BENCH(BM_make_strings_column_batch)
.set_name("make_strings_column_batch")
.add_int64_axis("num_rows", {1'000'000, 10'000'000, 20'000'000})
.add_int64_axis("has_nulls", {0, 1})
.add_int64_axis("batch_size", {5, 10, 15, 20});
12 changes: 12 additions & 0 deletions cpp/include/cudf/column/column_factories.hpp
@@ -379,6 +379,18 @@ std::unique_ptr<column> make_strings_column(
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());

/**
 * @brief Construct a batch of STRING type columns from vectors of pointer/size pairs
 *
 * This behaves like the pointer/size-pair overload of make_strings_column() above, but constructs
 * several strings columns in a single batched pass.
 *
 * @param strings_batch Vector of device spans of pointer/size pairs, one span per output column
 * @param stream CUDA stream used for device memory operations and kernel launches
 * @param mr Device memory resource used to allocate the returned columns' device memory
 * @return Vector of constructed strings columns, one per input span
 */
std::vector<std::unique_ptr<column>> make_strings_column_batch(
std::vector<cudf::device_span<thrust::pair<char const*, size_type> const>> strings_batch,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
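For reference, a minimal usage sketch of the proposed batch API, mirroring the benchmark above; the names d_pairs1, d_pairs2, stream, and mr are illustrative, not part of the PR:

```cpp
// Each entry spans {pointer, length} pairs already resident in device memory, e.g. an
// rmm::device_uvector<thrust::pair<char const*, cudf::size_type>> per output column.
auto batch = std::vector<cudf::device_span<thrust::pair<char const*, cudf::size_type> const>>{};
batch.emplace_back(d_pairs1);
batch.emplace_back(d_pairs2);

// One strings column is returned per input span, constructed in a single batched pass.
std::vector<std::unique_ptr<cudf::column>> columns =
  cudf::make_strings_column_batch(batch, stream, mr);
```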

/**
* @brief Construct a STRING type column given a device span of string_view.
*
22 changes: 22 additions & 0 deletions cpp/include/cudf/detail/sizes_to_offsets_iterator.cuh
@@ -226,6 +226,28 @@ static sizes_to_offsets_iterator<ScanIterator, LastType> make_sizes_to_offsets_iterator(
return sizes_to_offsets_iterator<ScanIterator, LastType>(begin, end, last);
}


/**
 * @brief Performs an exclusive-scan of the input sizes and returns the total as a device scalar
 *
 * Unlike sizes_to_offsets(), the final total is returned as an rmm::device_scalar without
 * synchronizing the stream, so a caller processing a batch of columns can defer the host copy.
 */
template <typename SizesIterator, typename OffsetsIterator>
auto sizes_to_offsets_batch(SizesIterator begin,
SizesIterator end,
OffsetsIterator result,
rmm::cuda_stream_view stream)
{
using SizeType = typename thrust::iterator_traits<SizesIterator>::value_type;
static_assert(std::is_integral_v<SizeType>,
"Only numeric types are supported by sizes_to_offsets");

using LastType = std::conditional_t<std::is_signed_v<SizeType>, int64_t, uint64_t>;
auto last_element = rmm::device_scalar<LastType>(0, stream);
auto output_itr =
make_sizes_to_offsets_iterator(result, result + std::distance(begin, end), last_element.data());
// This function uses the type of the initialization parameter as the accumulator type
// when computing the individual scan output elements.
thrust::exclusive_scan(rmm::exec_policy(stream), begin, end, output_itr, LastType{0});
return last_element;
}


/**
* @brief Perform an exclusive-scan and capture the final element value
*
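A hedged usage sketch of the new sizes_to_offsets_batch() helper added above; d_sizes, num_rows, and stream are illustrative names and the surrounding includes are assumed:

```cpp
// d_sizes: device array of per-row sizes (length num_rows); produce 0 past the last row so the
// exclusive scan yields num_rows + 1 offsets.
auto offsets = rmm::device_uvector<cudf::size_type>(num_rows + 1, stream);
auto map_fn  = cuda::proclaim_return_type<cudf::size_type>(
  [sizes = d_sizes.data(), num_rows] __device__(cudf::size_type idx) {
    return idx < num_rows ? sizes[idx] : cudf::size_type{0};
  });
auto input_itr = cudf::detail::make_counting_transform_iterator(0, map_fn);

// Enqueues the scan and hands back the running total as an rmm::device_scalar<int64_t>
// without synchronizing the stream.
auto d_total = cudf::detail::sizes_to_offsets_batch(
  input_itr, input_itr + num_rows + 1, offsets.begin(), stream);

// Reading the value later performs the synchronization.
auto const total_bytes = d_total.value(stream);
```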
42 changes: 42 additions & 0 deletions cpp/include/cudf/detail/valid_if.cuh
@@ -188,5 +188,47 @@ CUDF_KERNEL void valid_if_n_kernel(InputIterator1 begin1,
}
}

template <typename InputIterator, typename Predicate>
std::pair<std::vector<rmm::device_buffer>, std::vector<size_type>> valid_if_n_kernel(
  std::vector<InputIterator> strings,
  std::vector<int64_t> sizes,
  Predicate p,
  rmm::cuda_stream_view stream,
  rmm::device_async_resource_ref mr)
{
  rmm::device_uvector<size_type> valid_counts(strings.size(), stream);

  // Allocate one null mask per input column. create_null_mask() is a host API, so the masks are
  // created on the host and the resulting device_buffers are kept in a host-side vector.
  std::vector<rmm::device_buffer> null_masks;
  null_masks.reserve(sizes.size());
  std::transform(sizes.begin(), sizes.end(), std::back_inserter(null_masks), [stream, mr](auto size) {
    return cudf::create_null_mask(size, mask_state::UNINITIALIZED, stream, mr);
  });

  constexpr size_type block_size{256};
  grid_1d grid{static_cast<size_type>(strings.size()), block_size};
  // FIXME: draft launch -- the second input iterator is still a placeholder here, and the null-mask
  // pointers need to be gathered into a device array of bitmask_type* before they can be passed.
  valid_if_n_kernel<block_size><<<grid.num_blocks, grid.num_threads_per_block, 0, stream.value()>>>(
    strings.begin(), strings.begin() /*placeholder*/, p,
    reinterpret_cast<bitmask_type**>(null_masks.data()) /*placeholder*/,
    static_cast<size_type>(strings.size()), 8, valid_counts.data());

  auto const host_valid_counts = make_std_vector_sync(valid_counts, stream);

  // null count = row count minus valid count, computed per column on the host
  std::vector<size_type> null_counts(strings.size());
  std::transform(
    thrust::make_zip_iterator(thrust::make_tuple(host_valid_counts.begin(), sizes.begin())),
    thrust::make_zip_iterator(thrust::make_tuple(host_valid_counts.end(), sizes.end())),
    null_counts.begin(),
    [](auto const& elem) {
      return static_cast<size_type>(thrust::get<1>(elem) - thrust::get<0>(elem));
    });
Contributor review comment: These are all unnecessary. Just return the vector of null masks and the device_uvector of valid_counts, so there will not be any stream sync in this function. This function (valid_if_n_kernel) should not have that name; instead, we can call it batch_valid_if_async and put it in the source file strings_column_factories.cu. (A hedged sketch of that suggested interface follows this file's diff.)
  return {std::move(null_masks), std::move(null_counts)};
}

} // namespace detail
} // namespace cudf
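To make the suggestion above concrete, here is a minimal sketch of a batch_valid_if_async along the lines the reviewer describes; the kernel launch that fills the masks and counts is intentionally omitted, and everything apart from the existing cudf/rmm types is an assumption:

```cpp
// Sketch only: returns one null mask per column plus the per-column valid counts still in device
// memory, so no stream synchronization happens inside the function.
template <typename InputIterator, typename Predicate>
std::pair<std::vector<rmm::device_buffer>, rmm::device_uvector<cudf::size_type>> batch_valid_if_async(
  std::vector<InputIterator> const& inputs,
  std::vector<cudf::size_type> const& sizes,
  Predicate p,
  rmm::cuda_stream_view stream,
  rmm::device_async_resource_ref mr)
{
  auto valid_counts = rmm::device_uvector<cudf::size_type>(inputs.size(), stream, mr);
  std::vector<rmm::device_buffer> null_masks;
  null_masks.reserve(sizes.size());
  for (auto const size : sizes) {
    null_masks.emplace_back(cudf::create_null_mask(size, cudf::mask_state::UNINITIALIZED, stream, mr));
  }
  // ... launch a batched valid_if kernel here to populate null_masks and valid_counts ...
  return {std::move(null_masks), std::move(valid_counts)};
}
```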
67 changes: 67 additions & 0 deletions cpp/include/cudf/strings/detail/strings_children.cuh
@@ -38,6 +38,73 @@ namespace cudf {
namespace strings {
namespace detail {

Contributor review comment: This should also be put in strings_column_factories.cu, because this function is not designed to be used anywhere else except for the column batch construction.

template <typename TransformIterator>
std::pair<std::vector<std::unique_ptr<column>>, std::vector<int64_t>> make_offsets_child_column_batch(
  std::vector<TransformIterator> offsets_transformer_itr,
  std::vector<size_type> strings_sizes,
  rmm::cuda_stream_view stream,
  rmm::device_async_resource_ref mr)
{
  std::vector<std::unique_ptr<column>> offsets_columns;
  offsets_columns.reserve(strings_sizes.size());
  // One device scalar per column holds that column's total byte count; the vector itself is host-side.
  std::vector<rmm::device_scalar<int64_t>> total_bytes;
  total_bytes.reserve(strings_sizes.size());
  auto constexpr size_type_max = static_cast<int64_t>(std::numeric_limits<size_type>::max());

  // Allocate an INT32 offsets column with (strings_count + 1) entries for each column in the batch.
  std::transform(
    strings_sizes.begin(),
    strings_sizes.end(),
    std::back_inserter(offsets_columns),
    [stream, mr](auto const strings_count) {
      return make_numeric_column(
        data_type{type_id::INT32}, strings_count + 1, mask_state::UNALLOCATED, stream, mr);
    });

  // Compute each column's offsets with an exclusive scan of its sizes; the returned device scalar
  // carries that column's total byte count without synchronizing the stream.
  for (std::size_t col = 0; col < strings_sizes.size(); ++col) {
    auto const strings_count = strings_sizes[col];
    auto d_offsets           = offsets_columns[col]->mutable_view().data<int32_t>();
    auto map_fn              = cuda::proclaim_return_type<size_type>(
      [begin = offsets_transformer_itr[col], strings_count] __device__(size_type idx) -> size_type {
        return idx < strings_count ? static_cast<size_type>(begin[idx]) : size_type{0};
      });
    auto input_itr = cudf::detail::make_counting_transform_iterator(0, map_fn);
    total_bytes.emplace_back(cudf::detail::sizes_to_offsets_batch(
      input_itr, input_itr + strings_count + 1, d_offsets, stream));
  }

  // Copy each column's total byte count to the host; device_scalar::value() synchronizes the stream.
  std::vector<int64_t> host_total_bytes(total_bytes.size());
  std::transform(total_bytes.begin(), total_bytes.end(), host_total_bytes.begin(), [stream](auto& d_total) {
    return d_total.value(stream);
  });

  // Validate each column's total size and, when it exceeds the large-strings threshold, recompute
  // that column's offsets as INT64.
  auto const threshold = cudf::strings::get_offset64_threshold();
  for (std::size_t col = 0; col < offsets_columns.size(); ++col) {
    auto const total = host_total_bytes[col];
    CUDF_EXPECTS(cudf::strings::is_large_strings_enabled() || (total < threshold),
                 "Size of output exceeds the column size limit",
                 std::overflow_error);
    if (total >= threshold) {
      // recompute as int64 offsets when above the threshold
      auto const strings_count = strings_sizes[col];
      auto map_fn              = cuda::proclaim_return_type<size_type>(
        [begin = offsets_transformer_itr[col], strings_count] __device__(size_type idx) -> size_type {
          return idx < strings_count ? static_cast<size_type>(begin[idx]) : size_type{0};
        });
      auto input_itr       = cudf::detail::make_counting_transform_iterator(0, map_fn);
      offsets_columns[col] = make_numeric_column(
        data_type{type_id::INT64}, strings_count + 1, mask_state::UNALLOCATED, stream, mr);
      auto d_offsets64 = offsets_columns[col]->mutable_view().data<int64_t>();
      cudf::detail::sizes_to_offsets(input_itr, input_itr + strings_count + 1, d_offsets64, stream);
    }
  }

  return std::pair(std::move(offsets_columns), std::move(host_total_bytes));
}
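For context, a hedged sketch of how this batch offsets helper might be driven; size_iterators, row_counts, stream, and mr are illustrative names rather than the PR's actual call site:

```cpp
// One counting-transform iterator per input column, each producing that column's per-row byte
// sizes, plus a matching vector of per-column row counts.
auto [offsets_columns, total_bytes] = make_offsets_child_column_batch(
  size_iterators,  // std::vector of per-column size iterators
  row_counts,      // std::vector<cudf::size_type> of per-column row counts
  stream,
  mr);

// offsets_columns[i] is the offsets child for column i; total_bytes[i] is its chars size in bytes.
```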


/**
* @brief Create an offsets column to be a child of a compound column
*