Strings batch construction #16727

Draft
wants to merge 12 commits into base: branch-24.10
Changes from 9 commits
1 change: 1 addition & 0 deletions cpp/benchmarks/CMakeLists.txt
@@ -339,6 +339,7 @@ ConfigureNVBench(
string/join_strings.cpp
string/lengths.cpp
string/like.cpp
string/make_strings_column.cu
string/replace_re.cpp
string/reverse.cpp
string/split.cpp
116 changes: 116 additions & 0 deletions cpp/benchmarks/string/make_strings_column.cu
@@ -0,0 +1,116 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <benchmarks/common/generate_input.hpp>

#include <cudf/column/column_device_view.cuh>
#include <cudf/column/column_factories.hpp>

#include <rmm/device_uvector.hpp>
#include <rmm/exec_policy.hpp>

#include <thrust/pair.h>
#include <thrust/tabulate.h>

#include <nvbench/nvbench.cuh>

#include <vector>

static void BM_make_strings_column(nvbench::state& state)
{
auto const num_rows = static_cast<cudf::size_type>(state.get_int64("num_rows"));
auto const has_nulls = static_cast<bool>(state.get_int64("has_nulls"));

constexpr int min_row_width = 0;
constexpr int max_row_width = 50;
data_profile const table_profile =
data_profile_builder()
.distribution(cudf::type_id::STRING, distribution_id::NORMAL, min_row_width, max_row_width)
.null_probability(has_nulls ? std::optional<double>{0.1} : std::nullopt);
auto const data_table =
create_random_table({cudf::type_id::STRING}, row_count{num_rows}, table_profile);

using string_index_pair = thrust::pair<char const*, cudf::size_type>;
auto const stream = cudf::get_default_stream();
auto input = rmm::device_uvector<string_index_pair>(data_table->num_rows(), stream);
auto const d_data_ptr =
cudf::column_device_view::create(data_table->get_column(0).view(), stream);
thrust::tabulate(rmm::exec_policy(stream),
input.begin(),
input.end(),
[data_col = *d_data_ptr] __device__(auto const idx) {
auto const row = data_col.element<cudf::string_view>(idx);
return string_index_pair{row.data(), row.size_bytes()};
});

state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value()));
state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
[[maybe_unused]] auto const output = cudf::make_strings_column(input, stream);
});
}

static void BM_make_strings_column_batch(nvbench::state& state)
{
auto const num_rows = static_cast<cudf::size_type>(state.get_int64("num_rows"));
auto const has_nulls = static_cast<bool>(state.get_int64("has_nulls"));
auto const batch_size = static_cast<cudf::size_type>(state.get_int64("batch_size"));

constexpr int min_row_width = 0;
constexpr int max_row_width = 50;
data_profile const table_profile =
data_profile_builder()
.distribution(cudf::type_id::STRING, distribution_id::NORMAL, min_row_width, max_row_width)
.null_probability(has_nulls ? std::optional<double>{0.1} : std::nullopt);
auto const data_table = create_random_table(
cycle_dtypes({cudf::type_id::STRING}, batch_size), row_count{num_rows}, table_profile);

using string_index_pair = thrust::pair<char const*, cudf::size_type>;
auto const stream = cudf::get_default_stream();
auto input_data = std::vector<rmm::device_uvector<string_index_pair>>{};
auto input = std::vector<cudf::device_span<string_index_pair const>>{};
input_data.reserve(batch_size);
input.reserve(batch_size);
for (auto i = 0; i < batch_size; ++i) {
auto const d_data_ptr =
cudf::column_device_view::create(data_table->get_column(i).view(), stream);
auto batch_input = rmm::device_uvector<string_index_pair>(data_table->num_rows(), stream);
thrust::tabulate(rmm::exec_policy(stream),
batch_input.begin(),
batch_input.end(),
[data_col = *d_data_ptr] __device__(auto const idx) {
auto const row = data_col.element<cudf::string_view>(idx);
return string_index_pair{row.data(), row.size_bytes()};
});
input_data.emplace_back(std::move(batch_input));
input.emplace_back(input_data.back());
}

state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value()));
state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
[[maybe_unused]] auto const output = cudf::make_strings_column_batch(input, stream);
});
}

NVBENCH_BENCH(BM_make_strings_column)
.set_name("make_strings_column")
.add_int64_axis("num_rows", {100'000, 1'000'000, 10'000'000, 100'000'000, 200'000'000})
.add_int64_axis("has_nulls", {0, 1});

NVBENCH_BENCH(BM_make_strings_column_batch)
.set_name("make_strings_column_batch")
.add_int64_axis("num_rows", {1'000'000, 10'000'000, 20'000'000})
.add_int64_axis("has_nulls", {0, 1})
.add_int64_axis("batch_size", {5, 10, 15, 20});
12 changes: 12 additions & 0 deletions cpp/include/cudf/column/column_factories.hpp
@@ -379,6 +379,18 @@ std::unique_ptr<column> make_strings_column(
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());

/**
 * @brief Construct a batch of STRING type columns from vectors of pointer/size pairs
 *
 * This behaves like the pointer/size-pair overload of make_strings_column() above, but constructs
 * several strings columns in a single batched pass.
 *
 * @param strings_batch Vector of device spans of pointer/size pairs, one span per output column
 * @param stream CUDA stream used for device memory operations and kernel launches
 * @param mr Device memory resource used to allocate the returned columns' device memory
 * @return Vector of constructed strings columns, one per input span
 */
std::vector<std::unique_ptr<column>> make_strings_column_batch(
std::vector<cudf::device_span<thrust::pair<char const*, size_type> const>> strings_batch,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
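For reference, a minimal usage sketch of the proposed batch API, mirroring the benchmark above; the names d_pairs1, d_pairs2, stream, and mr are illustrative, not part of the PR:

```cpp
// Each entry spans {pointer, length} pairs already resident in device memory, e.g. an
// rmm::device_uvector<thrust::pair<char const*, cudf::size_type>> per output column.
auto batch = std::vector<cudf::device_span<thrust::pair<char const*, cudf::size_type> const>>{};
batch.emplace_back(d_pairs1);
batch.emplace_back(d_pairs2);

// One strings column is returned per input span, constructed in a single batched pass.
std::vector<std::unique_ptr<cudf::column>> columns =
  cudf::make_strings_column_batch(batch, stream, mr);
```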

/**
* @brief Construct a STRING type column given a device span of string_view.
*
22 changes: 22 additions & 0 deletions cpp/include/cudf/detail/sizes_to_offsets_iterator.cuh
@@ -226,6 +226,28 @@ static sizes_to_offsets_iterator<ScanIterator, LastType> make_sizes_to_offsets_iterator(
return sizes_to_offsets_iterator<ScanIterator, LastType>(begin, end, last);
}


/**
 * @brief Performs an exclusive-scan of the input sizes and returns the total as a device scalar
 *
 * Unlike sizes_to_offsets(), the final total is returned as an rmm::device_scalar without
 * synchronizing the stream, so a caller processing a batch of columns can defer the host copy.
 */
template <typename SizesIterator, typename OffsetsIterator>
auto sizes_to_offsets_batch(SizesIterator begin,
SizesIterator end,
OffsetsIterator result,
rmm::cuda_stream_view stream)
{
using SizeType = typename thrust::iterator_traits<SizesIterator>::value_type;
static_assert(std::is_integral_v<SizeType>,
"Only numeric types are supported by sizes_to_offsets");

using LastType = std::conditional_t<std::is_signed_v<SizeType>, int64_t, uint64_t>;
auto last_element = rmm::device_scalar<LastType>(0, stream);
auto output_itr =
make_sizes_to_offsets_iterator(result, result + std::distance(begin, end), last_element.data());
// This function uses the type of the initialization parameter as the accumulator type
// when computing the individual scan output elements.
thrust::exclusive_scan(rmm::exec_policy(stream), begin, end, output_itr, LastType{0});
return last_element;
}


/**
* @brief Perform an exclusive-scan and capture the final element value
*
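A hedged usage sketch of the new sizes_to_offsets_batch() helper added above; d_sizes, num_rows, and stream are illustrative names and the surrounding includes are assumed:

```cpp
// d_sizes: device array of per-row sizes (length num_rows); produce 0 past the last row so the
// exclusive scan yields num_rows + 1 offsets.
auto offsets = rmm::device_uvector<cudf::size_type>(num_rows + 1, stream);
auto map_fn  = cuda::proclaim_return_type<cudf::size_type>(
  [sizes = d_sizes.data(), num_rows] __device__(cudf::size_type idx) {
    return idx < num_rows ? sizes[idx] : cudf::size_type{0};
  });
auto input_itr = cudf::detail::make_counting_transform_iterator(0, map_fn);

// Enqueues the scan and hands back the running total as an rmm::device_scalar<int64_t>
// without synchronizing the stream.
auto d_total = cudf::detail::sizes_to_offsets_batch(
  input_itr, input_itr + num_rows + 1, offsets.begin(), stream);

// Reading the value later performs the synchronization.
auto const total_bytes = d_total.value(stream);
```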
42 changes: 42 additions & 0 deletions cpp/include/cudf/detail/valid_if.cuh
@@ -188,5 +188,47 @@ CUDF_KERNEL void valid_if_n_kernel(InputIterator1 begin1,
}
}

template <typename InputIterator, typename Predicate>
std::pair<std::vector<rmm::device_buffer>, std::vector<size_type>> valid_if_n_kernel(
  std::vector<InputIterator> strings,
  std::vector<int64_t> sizes,
  Predicate p,
  rmm::cuda_stream_view stream,
  rmm::device_async_resource_ref mr)
{
  rmm::device_uvector<size_type> valid_counts(strings.size(), stream);

  // Allocate one null mask per input column. create_null_mask() is a host API, so the masks are
  // created on the host and the resulting device_buffers are kept in a host-side vector.
  std::vector<rmm::device_buffer> null_masks;
  null_masks.reserve(sizes.size());
  std::transform(sizes.begin(), sizes.end(), std::back_inserter(null_masks), [stream, mr](auto size) {
    return cudf::create_null_mask(size, mask_state::UNINITIALIZED, stream, mr);
  });

  constexpr size_type block_size{256};
  grid_1d grid{static_cast<size_type>(strings.size()), block_size};
  // FIXME: draft launch -- the second input iterator is still a placeholder here, and the null-mask
  // pointers need to be gathered into a device array of bitmask_type* before they can be passed.
  valid_if_n_kernel<block_size><<<grid.num_blocks, grid.num_threads_per_block, 0, stream.value()>>>(
    strings.begin(), strings.begin() /*placeholder*/, p,
    reinterpret_cast<bitmask_type**>(null_masks.data()) /*placeholder*/,
    static_cast<size_type>(strings.size()), 8, valid_counts.data());

  auto const host_valid_counts = make_std_vector_sync(valid_counts, stream);

  // null count = row count minus valid count, computed per column on the host
  std::vector<size_type> null_counts(strings.size());
  std::transform(
    thrust::make_zip_iterator(thrust::make_tuple(host_valid_counts.begin(), sizes.begin())),
    thrust::make_zip_iterator(thrust::make_tuple(host_valid_counts.end(), sizes.end())),
    null_counts.begin(),
    [](auto const& elem) {
      return static_cast<size_type>(thrust::get<1>(elem) - thrust::get<0>(elem));
    });
Contributor review comment: These are all unnecessary. Just return the vector of null masks and the device_uvector of valid_counts, so there will not be any stream sync in this function. This function (valid_if_n_kernel) should not have that name; instead, we can call it batch_valid_if_async and put it in the source file strings_column_factories.cu. (A hedged sketch of that suggested interface follows this file's diff.)
  return {std::move(null_masks), std::move(null_counts)};
}

} // namespace detail
} // namespace cudf
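To make the suggestion above concrete, here is a minimal sketch of a batch_valid_if_async along the lines the reviewer describes; the kernel launch that fills the masks and counts is intentionally omitted, and everything apart from the existing cudf/rmm types is an assumption:

```cpp
// Sketch only: returns one null mask per column plus the per-column valid counts still in device
// memory, so no stream synchronization happens inside the function.
template <typename InputIterator, typename Predicate>
std::pair<std::vector<rmm::device_buffer>, rmm::device_uvector<cudf::size_type>> batch_valid_if_async(
  std::vector<InputIterator> const& inputs,
  std::vector<cudf::size_type> const& sizes,
  Predicate p,
  rmm::cuda_stream_view stream,
  rmm::device_async_resource_ref mr)
{
  auto valid_counts = rmm::device_uvector<cudf::size_type>(inputs.size(), stream, mr);
  std::vector<rmm::device_buffer> null_masks;
  null_masks.reserve(sizes.size());
  for (auto const size : sizes) {
    null_masks.emplace_back(cudf::create_null_mask(size, cudf::mask_state::UNINITIALIZED, stream, mr));
  }
  // ... launch a batched valid_if kernel here to populate null_masks and valid_counts ...
  return {std::move(null_masks), std::move(valid_counts)};
}
```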
67 changes: 67 additions & 0 deletions cpp/include/cudf/strings/detail/strings_children.cuh
@@ -38,6 +38,73 @@ namespace cudf {
namespace strings {
namespace detail {

Contributor review comment: This should also be put in strings_column_factories.cu, because this function is not designed to be used anywhere else except for the column batch construction.

template <typename TransformIterator>
std::pair<std::vector<std::unique_ptr<column>>, std::vector<int64_t>> make_offsets_child_column_batch(
  std::vector<TransformIterator> offsets_transformer_itr,
  std::vector<size_type> strings_sizes,
  rmm::cuda_stream_view stream,
  rmm::device_async_resource_ref mr)
{
  std::vector<std::unique_ptr<column>> offsets_columns;
  offsets_columns.reserve(strings_sizes.size());
  // One device scalar per column holds that column's total byte count; the vector itself is host-side.
  std::vector<rmm::device_scalar<int64_t>> total_bytes;
  total_bytes.reserve(strings_sizes.size());
  auto constexpr size_type_max = static_cast<int64_t>(std::numeric_limits<size_type>::max());

  // Allocate an INT32 offsets column with (strings_count + 1) entries for each column in the batch.
  std::transform(
    strings_sizes.begin(),
    strings_sizes.end(),
    std::back_inserter(offsets_columns),
    [stream, mr](auto const strings_count) {
      return make_numeric_column(
        data_type{type_id::INT32}, strings_count + 1, mask_state::UNALLOCATED, stream, mr);
    });

  // Compute each column's offsets with an exclusive scan of its sizes; the returned device scalar
  // carries that column's total byte count without synchronizing the stream.
  for (std::size_t col = 0; col < strings_sizes.size(); ++col) {
    auto const strings_count = strings_sizes[col];
    auto d_offsets           = offsets_columns[col]->mutable_view().data<int32_t>();
    auto map_fn              = cuda::proclaim_return_type<size_type>(
      [begin = offsets_transformer_itr[col], strings_count] __device__(size_type idx) -> size_type {
        return idx < strings_count ? static_cast<size_type>(begin[idx]) : size_type{0};
      });
    auto input_itr = cudf::detail::make_counting_transform_iterator(0, map_fn);
    total_bytes.emplace_back(cudf::detail::sizes_to_offsets_batch(
      input_itr, input_itr + strings_count + 1, d_offsets, stream));
  }

  // Copy each column's total byte count to the host; device_scalar::value() synchronizes the stream.
  std::vector<int64_t> host_total_bytes(total_bytes.size());
  std::transform(total_bytes.begin(), total_bytes.end(), host_total_bytes.begin(), [stream](auto& d_total) {
    return d_total.value(stream);
  });

  // Validate each column's total size and, when it exceeds the large-strings threshold, recompute
  // that column's offsets as INT64.
  auto const threshold = cudf::strings::get_offset64_threshold();
  for (std::size_t col = 0; col < offsets_columns.size(); ++col) {
    auto const total = host_total_bytes[col];
    CUDF_EXPECTS(cudf::strings::is_large_strings_enabled() || (total < threshold),
                 "Size of output exceeds the column size limit",
                 std::overflow_error);
    if (total >= threshold) {
      // recompute as int64 offsets when above the threshold
      auto const strings_count = strings_sizes[col];
      auto map_fn              = cuda::proclaim_return_type<size_type>(
        [begin = offsets_transformer_itr[col], strings_count] __device__(size_type idx) -> size_type {
          return idx < strings_count ? static_cast<size_type>(begin[idx]) : size_type{0};
        });
      auto input_itr       = cudf::detail::make_counting_transform_iterator(0, map_fn);
      offsets_columns[col] = make_numeric_column(
        data_type{type_id::INT64}, strings_count + 1, mask_state::UNALLOCATED, stream, mr);
      auto d_offsets64 = offsets_columns[col]->mutable_view().data<int64_t>();
      cudf::detail::sizes_to_offsets(input_itr, input_itr + strings_count + 1, d_offsets64, stream);
    }
  }

  return std::pair(std::move(offsets_columns), std::move(host_total_bytes));
}
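For context, a hedged sketch of how this batch offsets helper might be driven; size_iterators, row_counts, stream, and mr are illustrative names rather than the PR's actual call site:

```cpp
// One counting-transform iterator per input column, each producing that column's per-row byte
// sizes, plus a matching vector of per-column row counts.
auto [offsets_columns, total_bytes] = make_offsets_child_column_batch(
  size_iterators,  // std::vector of per-column size iterators
  row_counts,      // std::vector<cudf::size_type> of per-column row counts
  stream,
  mr);

// offsets_columns[i] is the offsets child for column i; total_bytes[i] is its chars size in bytes.
```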


/**
* @brief Create an offsets column to be a child of a compound column
*