Strings batch construction #16727

Draft: wants to merge 12 commits into base branch-24.10
1 change: 1 addition & 0 deletions cpp/benchmarks/CMakeLists.txt
@@ -339,6 +339,7 @@ ConfigureNVBench(
string/join_strings.cpp
string/lengths.cpp
string/like.cpp
string/make_strings_column.cu
string/replace_re.cpp
string/reverse.cpp
string/split.cpp
116 changes: 116 additions & 0 deletions cpp/benchmarks/string/make_strings_column.cu
@@ -0,0 +1,116 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <benchmarks/common/generate_input.hpp>

#include <cudf/column/column_device_view.cuh>
#include <cudf/column/column_factories.hpp>

#include <rmm/device_uvector.hpp>
#include <rmm/exec_policy.hpp>

#include <thrust/pair.h>
#include <thrust/tabulate.h>

#include <nvbench/nvbench.cuh>

#include <vector>

static void BM_make_strings_column(nvbench::state& state)
{
auto const num_rows = static_cast<cudf::size_type>(state.get_int64("num_rows"));
auto const has_nulls = static_cast<bool>(state.get_int64("has_nulls"));

constexpr int min_row_width = 0;
constexpr int max_row_width = 50;
data_profile const table_profile =
data_profile_builder()
.distribution(cudf::type_id::STRING, distribution_id::NORMAL, min_row_width, max_row_width)
.null_probability(has_nulls ? std::optional<double>{0.1} : std::nullopt);
auto const data_table =
create_random_table({cudf::type_id::STRING}, row_count{num_rows}, table_profile);

using string_index_pair = thrust::pair<char const*, cudf::size_type>;
auto const stream = cudf::get_default_stream();
auto input = rmm::device_uvector<string_index_pair>(data_table->num_rows(), stream);
auto const d_data_ptr =
cudf::column_device_view::create(data_table->get_column(0).view(), stream);
thrust::tabulate(rmm::exec_policy(stream),
input.begin(),
input.end(),
[data_col = *d_data_ptr] __device__(auto const idx) {
auto const row = data_col.element<cudf::string_view>(idx);
return string_index_pair{row.data(), row.size_bytes()};
});

state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value()));
state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
[[maybe_unused]] auto const output = cudf::make_strings_column(input, stream);
});
}

static void BM_make_strings_column_batch(nvbench::state& state)
{
auto const num_rows = static_cast<cudf::size_type>(state.get_int64("num_rows"));
auto const has_nulls = static_cast<bool>(state.get_int64("has_nulls"));
auto const batch_size = static_cast<cudf::size_type>(state.get_int64("batch_size"));

constexpr int min_row_width = 0;
constexpr int max_row_width = 50;
data_profile const table_profile =
data_profile_builder()
.distribution(cudf::type_id::STRING, distribution_id::NORMAL, min_row_width, max_row_width)
.null_probability(has_nulls ? std::optional<double>{0.1} : std::nullopt);
auto const data_table = create_random_table(
cycle_dtypes({cudf::type_id::STRING}, batch_size), row_count{num_rows}, table_profile);

using string_index_pair = thrust::pair<char const*, cudf::size_type>;
auto const stream = cudf::get_default_stream();
auto input_data = std::vector<rmm::device_uvector<string_index_pair>>{};
auto input = std::vector<cudf::device_span<string_index_pair const>>{};
input_data.reserve(batch_size);
input.reserve(batch_size);
for (auto i = 0; i < batch_size; ++i) {
auto const d_data_ptr =
cudf::column_device_view::create(data_table->get_column(i).view(), stream);
auto batch_input = rmm::device_uvector<string_index_pair>(data_table->num_rows(), stream);
thrust::tabulate(rmm::exec_policy(stream),
batch_input.begin(),
batch_input.end(),
[data_col = *d_data_ptr] __device__(auto const idx) {
auto const row = data_col.element<cudf::string_view>(idx);
return string_index_pair{row.data(), row.size_bytes()};
});
input_data.emplace_back(std::move(batch_input));
input.emplace_back(input_data.back());
}

state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value()));
state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
[[maybe_unused]] auto const output = cudf::make_strings_column_batch(input, stream);
});
}

NVBENCH_BENCH(BM_make_strings_column)
.set_name("make_strings_column")
.add_int64_axis("num_rows", {100'000, 1'000'000, 10'000'000, 100'000'000, 200'000'000})
.add_int64_axis("has_nulls", {0, 1});

NVBENCH_BENCH(BM_make_strings_column_batch)
.set_name("make_strings_column_batch")
.add_int64_axis("num_rows", {1'000'000, 10'000'000, 20'000'000})
.add_int64_axis("has_nulls", {0, 1})
.add_int64_axis("batch_size", {5, 10, 15, 20});
12 changes: 12 additions & 0 deletions cpp/include/cudf/column/column_factories.hpp
@@ -379,6 +379,12 @@ std::unique_ptr<column> make_strings_column(
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());

/**
 * @brief Construct a batch of STRING type columns given a vector of device spans of pointer/size
 * pairs
 *
 * Each span in `strings_batch` produces one output strings column; rows whose pointer is null
 * become null rows in the corresponding output.
 *
 * @param strings_batch Vector of device spans of pointer/size pairs, one per output column
 * @param stream CUDA stream used for device memory operations and kernel launches
 * @param mr Device memory resource used to allocate the returned columns' device memory
 * @return Vector of constructed strings columns
 */
std::vector<std::unique_ptr<column>> make_strings_column_batch(
std::vector<cudf::device_span<thrust::pair<char const*, size_type> const>> strings_batch,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());

/**
* @brief Construct a STRING type column given a device span of string_view.
*
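For context, a minimal usage sketch of the proposed batch API (not part of this diff). The helper name build_batch and the two input spans are hypothetical; it assumes the spans of (pointer, size) pairs were already populated on the default stream.

#include <cudf/column/column_factories.hpp>
#include <cudf/utilities/default_stream.hpp>
#include <cudf/utilities/span.hpp>

#include <thrust/pair.h>

#include <memory>
#include <vector>

// Hypothetical helper: builds one strings column per input span in a single batched call.
std::vector<std::unique_ptr<cudf::column>> build_batch(
  cudf::device_span<thrust::pair<char const*, cudf::size_type> const> d_input1,
  cudf::device_span<thrust::pair<char const*, cudf::size_type> const> d_input2)
{
  auto const stream = cudf::get_default_stream();
  std::vector<cudf::device_span<thrust::pair<char const*, cudf::size_type> const>> batch{d_input1,
                                                                                          d_input2};
  // One output column per input span; rows whose pointer is null become null rows.
  return cudf::make_strings_column_batch(batch, stream);
}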
37 changes: 37 additions & 0 deletions cpp/include/cudf/detail/sizes_to_offsets_iterator.cuh
@@ -226,6 +226,43 @@ static sizes_to_offsets_iterator<ScanIterator, LastType> make_sizes_to_offsets_i
return sizes_to_offsets_iterator<ScanIterator, LastType>(begin, end, last);
}

void sizes_to_offsets_batch(
  std::vector<cudf::device_span<thrust::pair<char const*, size_type> const>> const& strings_batch,
  std::vector<std::unique_ptr<column>>& offsets_columns,
  int64_t* total_bytes,
  rmm::cuda_stream_view stream)
{
  for (std::size_t batch_idx = 0; batch_idx < strings_batch.size(); ++batch_idx) {
    auto const strings       = strings_batch[batch_idx];
    auto const strings_count = static_cast<size_type>(strings.size());

    // Map each (pointer, size) pair to its size in bytes; a null pointer contributes zero.
    auto offsets_transformer = cuda::proclaim_return_type<size_type>(
      [] __device__(thrust::pair<char const*, size_type> item) -> size_type {
        return item.first != nullptr ? item.second : size_type{0};
      });
    auto offsets_transformer_itr =
      thrust::make_transform_iterator(strings.data(), offsets_transformer);

    auto map_fn = cuda::proclaim_return_type<size_type>(
      [begin = offsets_transformer_itr, strings_count] __device__(size_type idx) -> size_type {
        return idx < strings_count ? static_cast<size_type>(begin[idx]) : size_type{0};
      });
    auto input_itr = cudf::detail::make_counting_transform_iterator(0, map_fn);

    auto d_offsets = offsets_columns[batch_idx]->mutable_view().data<int32_t>();

    // The scan writes int32 offsets and captures the final value (total bytes of this entry)
    // into total_bytes[batch_idx].
    auto output_itr = make_sizes_to_offsets_iterator(
      d_offsets, d_offsets + strings_count + 1, total_bytes + batch_idx);
    // This function uses the type of the initialization parameter as the accumulator type
    // when computing the individual scan output elements.
    thrust::exclusive_scan(rmm::exec_policy_nosync(stream),
                           input_itr,
                           input_itr + strings_count + 1,
                           output_itr,
                           int64_t{0});
  }
}

/**
* @brief Perform an exclusive-scan and capture the final element value
*
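As a quick host-side illustration of the per-entry scan above (not part of the diff): an exclusive scan over the row sizes, padded with one trailing zero, yields the strings_count + 1 offsets, and the final element is the total byte count that sizes_to_offsets_batch captures into total_bytes. A minimal sketch using only the standard library:

#include <cstdint>
#include <cstdio>
#include <numeric>
#include <vector>

int main()
{
  // Row sizes for one batch entry (a null row contributes 0), plus one trailing 0 so the
  // scan produces strings_count + 1 offsets.
  std::vector<int64_t> sizes{3, 0, 5, 2, 0};
  std::vector<int64_t> offsets(sizes.size());
  std::exclusive_scan(sizes.begin(), sizes.end(), offsets.begin(), int64_t{0});
  // offsets == {0, 3, 3, 8, 10}; the last element (10) is the total bytes for this entry.
  for (auto o : offsets) { std::printf("%lld ", static_cast<long long>(o)); }
  return 0;
}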
32 changes: 32 additions & 0 deletions cpp/include/cudf/detail/valid_if.cuh
@@ -18,6 +18,7 @@

#include <cudf/detail/null_mask.hpp>
#include <cudf/detail/utilities/cuda.cuh>
#include <cudf/detail/utilities/vector_factories.hpp>
#include <cudf/types.hpp>
#include <cudf/utilities/bit.hpp>
#include <cudf/utilities/default_stream.hpp>
@@ -188,5 +189,36 @@ CUDF_KERNEL void valid_if_n_kernel(InputIterator1 begin1,
}
}

template <typename InputIterator, typename Predicate>
std::pair<std::vector<rmm::device_buffer>, rmm::device_uvector<size_type>> valid_if_n_kernel(
  std::vector<InputIterator> strings,
  std::vector<int64_t> sizes,
  Predicate p,
  rmm::cuda_stream_view stream,
  rmm::device_async_resource_ref mr)
{
  rmm::device_uvector<size_type> valid_counts(strings.size(), stream);

  // Allocate one null mask per input; the returned device_buffers own the allocations.
  std::vector<rmm::device_buffer> null_masks;
  std::vector<bitmask_type*> h_mask_ptrs;
  null_masks.reserve(sizes.size());
  h_mask_ptrs.reserve(sizes.size());
  for (auto size : sizes) {
    null_masks.emplace_back(
      cudf::detail::create_null_mask(size, mask_state::UNINITIALIZED, stream, mr));
    h_mask_ptrs.push_back(static_cast<bitmask_type*>(null_masks.back().data()));
  }

  // Copy the raw mask pointers to the device for the kernel launch.
  auto device_null_masks = cudf::detail::make_device_uvector_async(h_mask_ptrs, stream, mr);

  auto counting_iter = thrust::make_counting_iterator<size_type>(0);
  constexpr size_type block_size{256};
  grid_1d grid{static_cast<thread_index_type>(strings.size()), block_size};
  valid_if_n_kernel<decltype(counting_iter), decltype(counting_iter), Predicate, block_size>
    <<<grid.num_blocks, grid.num_threads_per_block, 0, stream.value()>>>(
      counting_iter,
      counting_iter,
      p,
      device_null_masks.data(),
      static_cast<size_type>(strings.size()),
      8,
      valid_counts.data());

  return std::pair{std::move(null_masks), std::move(valid_counts)};
}

} // namespace detail
} // namespace cudf
63 changes: 63 additions & 0 deletions cpp/include/cudf/strings/detail/strings_children.cuh
@@ -20,6 +20,7 @@
#include <cudf/detail/offsets_iterator_factory.cuh>
#include <cudf/detail/sizes_to_offsets_iterator.cuh>
#include <cudf/detail/utilities/cuda.cuh>
#include <cudf/detail/utilities/vector_factories.hpp>
#include <cudf/strings/detail/utilities.hpp>
#include <cudf/strings/utilities.hpp>
#include <cudf/utilities/default_stream.hpp>
@@ -28,6 +29,7 @@
#include <rmm/cuda_stream_view.hpp>
#include <rmm/exec_policy.hpp>
#include <rmm/resource_ref.hpp>
#include <rmm/device_uvector.hpp>

#include <thrust/for_each.h>
#include <thrust/iterator/counting_iterator.h>
@@ -38,6 +40,67 @@ namespace cudf {
namespace strings {
namespace detail {

[Contributor review comment on make_offsets_child_column_batch] This should also be put in strings_column_factories.cu too, because this function is not designed to be used anywhere else except for the column batch construction.

std::pair<std::vector<std::unique_ptr<column>>, std::vector<int64_t>> make_offsets_child_column_batch(
  std::vector<cudf::device_span<thrust::pair<char const*, size_type> const>> const& strings_batch,
  rmm::cuda_stream_view stream,
  rmm::device_async_resource_ref mr)
{
  std::vector<std::unique_ptr<column>> offsets_columns;
  offsets_columns.reserve(strings_batch.size());
  rmm::device_uvector<int64_t> total_bytes(strings_batch.size(), stream);

  // Start with int32 offsets columns, one per input span.
  std::transform(strings_batch.begin(),
                 strings_batch.end(),
                 std::back_inserter(offsets_columns),
                 [stream, mr](auto const& strings) {
                   return make_numeric_column(data_type{type_id::INT32},
                                              static_cast<size_type>(strings.size()) + 1,
                                              mask_state::UNALLOCATED,
                                              stream,
                                              mr);
                 });

  // Compute the offsets and capture the total bytes for each entry in the batch.
  sizes_to_offsets_batch(strings_batch, offsets_columns, total_bytes.data(), stream);

  auto host_total_bytes = cudf::detail::make_std_vector_sync(total_bytes, stream);

  auto const threshold = cudf::strings::get_offset64_threshold();
  for (std::size_t batch_idx = 0; batch_idx < strings_batch.size(); ++batch_idx) {
    if (host_total_bytes[batch_idx] < threshold) { continue; }

    // Recompute as int64 offsets when the total size is above the threshold.
    auto const strings       = strings_batch[batch_idx];
    auto const strings_count = static_cast<size_type>(strings.size());

    auto offsets_transformer = cuda::proclaim_return_type<size_type>(
      [] __device__(thrust::pair<char const*, size_type> item) -> size_type {
        return item.first != nullptr ? item.second : size_type{0};
      });
    auto offsets_transformer_itr =
      thrust::make_transform_iterator(strings.data(), offsets_transformer);

    auto map_fn = cuda::proclaim_return_type<size_type>(
      [begin = offsets_transformer_itr, strings_count] __device__(size_type idx) -> size_type {
        return idx < strings_count ? static_cast<size_type>(begin[idx]) : size_type{0};
      });
    auto input_itr = cudf::detail::make_counting_transform_iterator(0, map_fn);

    auto offsets64 = make_numeric_column(
      data_type{type_id::INT64}, strings_count + 1, mask_state::UNALLOCATED, stream, mr);
    auto d_offsets64 = offsets64->mutable_view().data<int64_t>();

    cudf::detail::sizes_to_offsets(input_itr, input_itr + strings_count + 1, d_offsets64, stream);

    offsets_columns[batch_idx] = std::move(offsets64);
  }

  return std::pair{std::move(offsets_columns), std::move(host_total_bytes)};
}

/**
* @brief Create an offsets column to be a child of a compound column
*