Strings batch construction #16727

Draft: wants to merge 12 commits into base branch-24.10
1 change: 1 addition & 0 deletions cpp/benchmarks/CMakeLists.txt
@@ -339,6 +339,7 @@ ConfigureNVBench(
string/join_strings.cpp
string/lengths.cpp
string/like.cpp
string/make_strings_column.cu
string/replace_re.cpp
string/reverse.cpp
string/split.cpp
116 changes: 116 additions & 0 deletions cpp/benchmarks/string/make_strings_column.cu
@@ -0,0 +1,116 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <benchmarks/common/generate_input.hpp>

#include <cudf/column/column_device_view.cuh>
#include <cudf/column/column_factories.hpp>

#include <rmm/device_uvector.hpp>
#include <rmm/exec_policy.hpp>

#include <thrust/pair.h>
#include <thrust/tabulate.h>

#include <nvbench/nvbench.cuh>

#include <vector>

static void BM_make_strings_column(nvbench::state& state)
{
auto const num_rows = static_cast<cudf::size_type>(state.get_int64("num_rows"));
auto const has_nulls = static_cast<bool>(state.get_int64("has_nulls"));

constexpr int min_row_width = 0;
constexpr int max_row_width = 50;
data_profile const table_profile =
data_profile_builder()
.distribution(cudf::type_id::STRING, distribution_id::NORMAL, min_row_width, max_row_width)
.null_probability(has_nulls ? std::optional<double>{0.1} : std::nullopt);
auto const data_table =
create_random_table({cudf::type_id::STRING}, row_count{num_rows}, table_profile);

using string_index_pair = thrust::pair<char const*, cudf::size_type>;
auto const stream = cudf::get_default_stream();
auto input = rmm::device_uvector<string_index_pair>(data_table->num_rows(), stream);
auto const d_data_ptr =
cudf::column_device_view::create(data_table->get_column(0).view(), stream);
thrust::tabulate(rmm::exec_policy(stream),
input.begin(),
input.end(),
[data_col = *d_data_ptr] __device__(auto const idx) {
auto const row = data_col.element<cudf::string_view>(idx);
return string_index_pair{row.data(), row.size_bytes()};
});

state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value()));
state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
[[maybe_unused]] auto const output = cudf::make_strings_column(input, stream);
});
}

static void BM_make_strings_column_batch(nvbench::state& state)
{
auto const num_rows = static_cast<cudf::size_type>(state.get_int64("num_rows"));
auto const has_nulls = static_cast<bool>(state.get_int64("has_nulls"));
auto const batch_size = static_cast<cudf::size_type>(state.get_int64("batch_size"));

constexpr int min_row_width = 0;
constexpr int max_row_width = 50;
data_profile const table_profile =
data_profile_builder()
.distribution(cudf::type_id::STRING, distribution_id::NORMAL, min_row_width, max_row_width)
.null_probability(has_nulls ? std::optional<double>{0.1} : std::nullopt);
auto const data_table = create_random_table(
cycle_dtypes({cudf::type_id::STRING}, batch_size), row_count{num_rows}, table_profile);

using string_index_pair = thrust::pair<char const*, cudf::size_type>;
auto const stream = cudf::get_default_stream();
auto input_data = std::vector<rmm::device_uvector<string_index_pair>>{};
auto input = std::vector<cudf::device_span<string_index_pair const>>{};
input_data.reserve(batch_size);
input.reserve(batch_size);
for (auto i = 0; i < batch_size; ++i) {
auto const d_data_ptr =
cudf::column_device_view::create(data_table->get_column(i).view(), stream);
auto batch_input = rmm::device_uvector<string_index_pair>(data_table->num_rows(), stream);
thrust::tabulate(rmm::exec_policy(stream),
batch_input.begin(),
batch_input.end(),
[data_col = *d_data_ptr] __device__(auto const idx) {
auto const row = data_col.element<cudf::string_view>(idx);
return string_index_pair{row.data(), row.size_bytes()};
});
input_data.emplace_back(std::move(batch_input));
input.emplace_back(input_data.back());
}

state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value()));
state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
[[maybe_unused]] auto const output = cudf::make_strings_column_batch(input, stream);
});
}

NVBENCH_BENCH(BM_make_strings_column)
.set_name("make_strings_column")
.add_int64_axis("num_rows", {100'000, 1'000'000, 10'000'000, 100'000'000, 200'000'000})
.add_int64_axis("has_nulls", {0, 1});

NVBENCH_BENCH(BM_make_strings_column_batch)
.set_name("make_strings_column_batch")
.add_int64_axis("num_rows", {1'000'000, 10'000'000, 20'000'000})
.add_int64_axis("has_nulls", {0, 1})
.add_int64_axis("batch_size", {5, 10, 15, 20});
12 changes: 12 additions & 0 deletions cpp/include/cudf/column/column_factories.hpp
@@ -379,6 +379,12 @@ std::unique_ptr<column> make_strings_column(
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());

/**
 * @brief Construct a batch of STRING type columns given a vector of device spans of pointer/size
 * pairs
 *
 * Each span in `strings_batch` produces one output strings column; rows whose pointer is null
 * become null rows in the corresponding output.
 *
 * @param strings_batch Vector of device spans of pointer/size pairs, one per output column
 * @param stream CUDA stream used for device memory operations and kernel launches
 * @param mr Device memory resource used to allocate the returned columns' device memory
 * @return Vector of constructed strings columns
 */
std::vector<std::unique_ptr<column>> make_strings_column_batch(
std::vector<cudf::device_span<thrust::pair<char const*, size_type> const>> strings_batch,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());

/**
* @brief Construct a STRING type column given a device span of string_view.
*
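For context, a minimal usage sketch of the proposed batch API (not part of this diff). The helper name build_batch and the two input spans are hypothetical; it assumes the spans of (pointer, size) pairs were already populated on the default stream.

#include <cudf/column/column_factories.hpp>
#include <cudf/utilities/default_stream.hpp>
#include <cudf/utilities/span.hpp>

#include <thrust/pair.h>

#include <memory>
#include <vector>

// Hypothetical helper: builds one strings column per input span in a single batched call.
std::vector<std::unique_ptr<cudf::column>> build_batch(
  cudf::device_span<thrust::pair<char const*, cudf::size_type> const> d_input1,
  cudf::device_span<thrust::pair<char const*, cudf::size_type> const> d_input2)
{
  auto const stream = cudf::get_default_stream();
  std::vector<cudf::device_span<thrust::pair<char const*, cudf::size_type> const>> batch{d_input1,
                                                                                          d_input2};
  // One output column per input span; rows whose pointer is null become null rows.
  return cudf::make_strings_column_batch(batch, stream);
}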
37 changes: 37 additions & 0 deletions cpp/include/cudf/detail/sizes_to_offsets_iterator.cuh
@@ -226,6 +226,43 @@ static sizes_to_offsets_iterator<ScanIterator, LastType> make_sizes_to_offsets_i
return sizes_to_offsets_iterator<ScanIterator, LastType>(begin, end, last);
}

void sizes_to_offsets_batch(
  std::vector<cudf::device_span<thrust::pair<char const*, size_type> const>> const& strings_batch,
  std::vector<std::unique_ptr<column>>& offsets_columns,
  int64_t* total_bytes,
  rmm::cuda_stream_view stream)
{
  for (std::size_t batch_idx = 0; batch_idx < strings_batch.size(); ++batch_idx) {
    auto const strings       = strings_batch[batch_idx];
    auto const strings_count = static_cast<size_type>(strings.size());

    // Map each (pointer, size) pair to its size in bytes; a null pointer contributes zero.
    auto offsets_transformer = cuda::proclaim_return_type<size_type>(
      [] __device__(thrust::pair<char const*, size_type> item) -> size_type {
        return item.first != nullptr ? item.second : size_type{0};
      });
    auto offsets_transformer_itr =
      thrust::make_transform_iterator(strings.data(), offsets_transformer);

    auto map_fn = cuda::proclaim_return_type<size_type>(
      [begin = offsets_transformer_itr, strings_count] __device__(size_type idx) -> size_type {
        return idx < strings_count ? static_cast<size_type>(begin[idx]) : size_type{0};
      });
    auto input_itr = cudf::detail::make_counting_transform_iterator(0, map_fn);

    auto d_offsets = offsets_columns[batch_idx]->mutable_view().data<int32_t>();

    // The scan writes int32 offsets and captures the final value (total bytes of this entry)
    // into total_bytes[batch_idx].
    auto output_itr = make_sizes_to_offsets_iterator(
      d_offsets, d_offsets + strings_count + 1, total_bytes + batch_idx);
    // This function uses the type of the initialization parameter as the accumulator type
    // when computing the individual scan output elements.
    thrust::exclusive_scan(rmm::exec_policy_nosync(stream),
                           input_itr,
                           input_itr + strings_count + 1,
                           output_itr,
                           int64_t{0});
  }
}

/**
* @brief Perform an exclusive-scan and capture the final element value
*
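As a quick host-side illustration of the per-entry scan above (not part of the diff): an exclusive scan over the row sizes, padded with one trailing zero, yields the strings_count + 1 offsets, and the final element is the total byte count that sizes_to_offsets_batch captures into total_bytes. A minimal sketch using only the standard library:

#include <cstdint>
#include <cstdio>
#include <numeric>
#include <vector>

int main()
{
  // Row sizes for one batch entry (a null row contributes 0), plus one trailing 0 so the
  // scan produces strings_count + 1 offsets.
  std::vector<int64_t> sizes{3, 0, 5, 2, 0};
  std::vector<int64_t> offsets(sizes.size());
  std::exclusive_scan(sizes.begin(), sizes.end(), offsets.begin(), int64_t{0});
  // offsets == {0, 3, 3, 8, 10}; the last element (10) is the total bytes for this entry.
  for (auto o : offsets) { std::printf("%lld ", static_cast<long long>(o)); }
  return 0;
}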
32 changes: 32 additions & 0 deletions cpp/include/cudf/detail/valid_if.cuh
@@ -18,6 +18,7 @@

#include <cudf/detail/null_mask.hpp>
#include <cudf/detail/utilities/cuda.cuh>
#include <cudf/detail/utilities/vector_factories.hpp>
#include <cudf/types.hpp>
#include <cudf/utilities/bit.hpp>
#include <cudf/utilities/default_stream.hpp>
@@ -188,5 +189,36 @@ CUDF_KERNEL void valid_if_n_kernel(InputIterator1 begin1,
}
}

template <typename InputIterator, typename Predicate>
std::pair<std::vector<rmm::device_buffer>, rmm::device_uvector<size_type>> valid_if_n_kernel(
  std::vector<InputIterator> strings,
  std::vector<int64_t> sizes,
  Predicate p,
  rmm::cuda_stream_view stream,
  rmm::device_async_resource_ref mr)
{
  rmm::device_uvector<size_type> valid_counts(strings.size(), stream);

  // Allocate one null mask per input; the returned device_buffers own the allocations.
  std::vector<rmm::device_buffer> null_masks;
  std::vector<bitmask_type*> h_mask_ptrs;
  null_masks.reserve(sizes.size());
  h_mask_ptrs.reserve(sizes.size());
  for (auto size : sizes) {
    null_masks.emplace_back(
      cudf::detail::create_null_mask(size, mask_state::UNINITIALIZED, stream, mr));
    h_mask_ptrs.push_back(static_cast<bitmask_type*>(null_masks.back().data()));
  }

  // Copy the raw mask pointers to the device for the kernel launch.
  auto device_null_masks = cudf::detail::make_device_uvector_async(h_mask_ptrs, stream, mr);

  auto counting_iter = thrust::make_counting_iterator<size_type>(0);
  constexpr size_type block_size{256};
  grid_1d grid{static_cast<thread_index_type>(strings.size()), block_size};
  valid_if_n_kernel<decltype(counting_iter), decltype(counting_iter), Predicate, block_size>
    <<<grid.num_blocks, grid.num_threads_per_block, 0, stream.value()>>>(
      counting_iter,
      counting_iter,
      p,
      device_null_masks.data(),
      static_cast<size_type>(strings.size()),
      8,
      valid_counts.data());

  return std::pair{std::move(null_masks), std::move(valid_counts)};
}

} // namespace detail
} // namespace cudf
63 changes: 63 additions & 0 deletions cpp/include/cudf/strings/detail/strings_children.cuh
@@ -20,6 +20,7 @@
#include <cudf/detail/offsets_iterator_factory.cuh>
#include <cudf/detail/sizes_to_offsets_iterator.cuh>
#include <cudf/detail/utilities/cuda.cuh>
#include <cudf/detail/utilities/vector_factories.hpp>
#include <cudf/strings/detail/utilities.hpp>
#include <cudf/strings/utilities.hpp>
#include <cudf/utilities/default_stream.hpp>
@@ -28,6 +29,7 @@
#include <rmm/cuda_stream_view.hpp>
#include <rmm/exec_policy.hpp>
#include <rmm/resource_ref.hpp>
#include <rmm/device_uvector.hpp>

#include <thrust/for_each.h>
#include <thrust/iterator/counting_iterator.h>
@@ -38,6 +40,67 @@ namespace cudf {
namespace strings {
namespace detail {

[Contributor review comment on make_offsets_child_column_batch] This should also be put in strings_column_factories.cu too, because this function is not designed to be used anywhere else except for the column batch construction.

std::pair<std::vector<std::unique_ptr<column>>, std::vector<int64_t>> make_offsets_child_column_batch(
  std::vector<cudf::device_span<thrust::pair<char const*, size_type> const>> const& strings_batch,
  rmm::cuda_stream_view stream,
  rmm::device_async_resource_ref mr)
{
  std::vector<std::unique_ptr<column>> offsets_columns;
  offsets_columns.reserve(strings_batch.size());
  rmm::device_uvector<int64_t> total_bytes(strings_batch.size(), stream);

  // Start with int32 offsets columns, one per input span.
  std::transform(strings_batch.begin(),
                 strings_batch.end(),
                 std::back_inserter(offsets_columns),
                 [stream, mr](auto const& strings) {
                   return make_numeric_column(data_type{type_id::INT32},
                                              static_cast<size_type>(strings.size()) + 1,
                                              mask_state::UNALLOCATED,
                                              stream,
                                              mr);
                 });

  // Compute the offsets and capture the total bytes for each entry in the batch.
  sizes_to_offsets_batch(strings_batch, offsets_columns, total_bytes.data(), stream);

  auto host_total_bytes = cudf::detail::make_std_vector_sync(total_bytes, stream);

  auto const threshold = cudf::strings::get_offset64_threshold();
  for (std::size_t batch_idx = 0; batch_idx < strings_batch.size(); ++batch_idx) {
    if (host_total_bytes[batch_idx] < threshold) { continue; }

    // Recompute as int64 offsets when the total size is above the threshold.
    auto const strings       = strings_batch[batch_idx];
    auto const strings_count = static_cast<size_type>(strings.size());

    auto offsets_transformer = cuda::proclaim_return_type<size_type>(
      [] __device__(thrust::pair<char const*, size_type> item) -> size_type {
        return item.first != nullptr ? item.second : size_type{0};
      });
    auto offsets_transformer_itr =
      thrust::make_transform_iterator(strings.data(), offsets_transformer);

    auto map_fn = cuda::proclaim_return_type<size_type>(
      [begin = offsets_transformer_itr, strings_count] __device__(size_type idx) -> size_type {
        return idx < strings_count ? static_cast<size_type>(begin[idx]) : size_type{0};
      });
    auto input_itr = cudf::detail::make_counting_transform_iterator(0, map_fn);

    auto offsets64 = make_numeric_column(
      data_type{type_id::INT64}, strings_count + 1, mask_state::UNALLOCATED, stream, mr);
    auto d_offsets64 = offsets64->mutable_view().data<int64_t>();

    cudf::detail::sizes_to_offsets(input_itr, input_itr + strings_count + 1, d_offsets64, stream);

    offsets_columns[batch_idx] = std::move(offsets64);
  }

  return std::pair{std::move(offsets_columns), std::move(host_total_bytes)};
}

/**
* @brief Create an offsets column to be a child of a compound column
*