Merge branch 'branch-23.10' into feature/fix_encodings
ttnghia authored Aug 26, 2023
2 parents 1b94dc7 + b6d08ca commit 0f41a7b
Showing 36 changed files with 388 additions and 272 deletions.
18 changes: 9 additions & 9 deletions cpp/src/text/normalize.cu
@@ -107,8 +107,8 @@ constexpr uint32_t UTF8_3BYTE = 0x01'0000;
struct codepoint_to_utf8_fn {
cudf::column_device_view const d_strings; // input strings
uint32_t const* cp_data; // full code-point array
-int32_t const* d_cp_offsets{}; // offsets to each string's code-point array
-int32_t* d_offsets{}; // offsets for the output strings
+cudf::size_type const* d_cp_offsets{}; // offsets to each string's code-point array
+cudf::size_type* d_offsets{}; // offsets for the output strings
char* d_chars{}; // buffer for the output strings column

/**
@@ -118,15 +118,15 @@ struct codepoint_to_utf8_fn {
* @param count number of code-points in `str_cps`
* @return Number of bytes required for the output
*/
-__device__ int32_t compute_output_size(uint32_t const* str_cps, uint32_t count)
+__device__ cudf::size_type compute_output_size(uint32_t const* str_cps, uint32_t count)
{
return thrust::transform_reduce(
thrust::seq,
str_cps,
str_cps + count,
[](auto cp) { return 1 + (cp >= UTF8_1BYTE) + (cp >= UTF8_2BYTE) + (cp >= UTF8_3BYTE); },
0,
-thrust::plus<int32_t>());
+thrust::plus());
}

__device__ void operator()(cudf::size_type idx)
@@ -208,9 +208,9 @@ std::unique_ptr<cudf::column> normalize_characters(cudf::strings_column_view con
auto const aux_table = get_aux_codepoint_data(stream);
auto const normalizer = data_normalizer(cp_metadata.data(), aux_table.data(), do_lower_case);
auto const offsets = strings.offsets();
-auto const d_offsets = offsets.data<uint32_t>() + strings.offset();
-auto const offset = cudf::detail::get_value<int32_t>(offsets, strings.offset(), stream);
-auto const d_chars = strings.chars().data<char>() + offset;
+auto const d_offsets = offsets.data<cudf::size_type>() + strings.offset();
+auto const offset = cudf::detail::get_value<cudf::size_type>(offsets, strings.offset(), stream);
+auto const d_chars = strings.chars().data<char>() + offset;
return normalizer.normalize(d_chars, d_offsets, strings.size(), stream);
}();

@@ -222,8 +222,8 @@ std::unique_ptr<cudf::column> normalize_characters(cudf::strings_column_view con
// convert the result into a strings column
// - the cp_chars are the new 4-byte code-point values for all the characters in the output
// - the cp_offsets identify which code-points go with which strings
-uint32_t const* cp_chars = result.first->data();
-int32_t const* cp_offsets = reinterpret_cast<int32_t const*>(result.second->data());
+uint32_t const* cp_chars = result.first->data();
+cudf::size_type const* cp_offsets = result.second->data();

auto d_strings = cudf::column_device_view::create(strings.parent(), stream);

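A note on the sizing logic above: compute_output_size derives each code point's UTF-8 byte count from three threshold comparisons instead of branching. A minimal host-side sketch of the same formula, assuming the standard UTF-8 boundaries 0x80, 0x800, and 0x10000 for the UTF8_1BYTE/UTF8_2BYTE/UTF8_3BYTE constants:

```cpp
#include <cassert>
#include <cstdint>

// Bytes needed to encode one code point in UTF-8; mirrors the comparison
// chain in codepoint_to_utf8_fn::compute_output_size, where each satisfied
// threshold adds one byte on top of the minimum of 1.
int utf8_bytes(uint32_t cp)
{
  return 1 + (cp >= 0x80) + (cp >= 0x800) + (cp >= 0x10000);
}

int main()
{
  assert(utf8_bytes(0x41) == 1);     // 'A' (ASCII)
  assert(utf8_bytes(0xE9) == 2);     // U+00E9, e with acute
  assert(utf8_bytes(0x4E2D) == 3);   // U+4E2D, CJK ideograph
  assert(utf8_bytes(0x1F600) == 4);  // U+1F600, supplementary plane
  return 0;
}
```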
2 changes: 1 addition & 1 deletion cpp/src/text/replace.cu
@@ -47,7 +47,7 @@ using replace_result = thrust::pair<bool, cudf::string_view>;
struct base_token_replacer_fn {
cudf::column_device_view const d_strings; ///< strings to tokenize
cudf::string_view const d_delimiter; ///< delimiter characters for tokenizing
-int32_t* d_offsets{}; ///< for locating output string in d_chars
+cudf::size_type* d_offsets{}; ///< for locating output string in d_chars
char* d_chars{}; ///< output buffer

/**
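The d_offsets/d_chars pair in base_token_replacer_fn follows the usual two-pass pattern for building a strings column: the functor is first run with d_chars unset to record each output string's size, the sizes are scanned into offsets, and a second run writes the characters at those offsets. A simplified host-side stand-in for that idiom (hypothetical names; libcudf performs both passes on the device):

```cpp
#include <algorithm>
#include <cstddef>
#include <numeric>
#include <string>
#include <vector>

// Hypothetical host-side sketch of the size-then-write idiom implied by the
// d_offsets/d_chars members above.
void build_strings(std::vector<std::string> const& inputs,
                   std::vector<int>& offsets,
                   std::string& chars)
{
  offsets.assign(inputs.size() + 1, 0);
  for (std::size_t i = 0; i < inputs.size(); ++i) {  // pass 1: record sizes
    offsets[i + 1] = static_cast<int>(inputs[i].size());
  }
  std::partial_sum(offsets.begin(), offsets.end(), offsets.begin());
  chars.resize(offsets.back());
  for (std::size_t i = 0; i < inputs.size(); ++i) {  // pass 2: write chars
    std::copy(inputs[i].begin(), inputs[i].end(), chars.begin() + offsets[i]);
  }
}
```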
26 changes: 15 additions & 11 deletions cpp/src/text/stemmer.cu
@@ -1,5 +1,5 @@
/*
-* Copyright (c) 2020-2022, NVIDIA CORPORATION.
+* Copyright (c) 2020-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -184,17 +184,19 @@ struct dispatch_is_letter_fn {
struct porter_stemmer_measure_fn {
cudf::column_device_view const d_strings; // strings to measure

-__device__ int32_t operator()(cudf::size_type idx) const
+__device__ cudf::size_type operator()(cudf::size_type idx) const
{
-if (d_strings.is_null(idx)) return 0;
+if (d_strings.is_null(idx)) { return 0; }
cudf::string_view d_str = d_strings.element<cudf::string_view>(idx);
-if (d_str.empty()) return 0;
-int32_t measure = 0;
-auto itr = d_str.begin();
-bool vowel_run = !is_consonant(itr);
+if (d_str.empty()) { return 0; }
+
+cudf::size_type measure = 0;
+
+auto itr = d_str.begin();
+bool vowel_run = !is_consonant(itr);
while (itr != d_str.end()) {
if (is_consonant(itr)) {
-if (vowel_run) measure++;
+if (vowel_run) { measure++; }
vowel_run = false;
} else {
vowel_run = true;
@@ -211,11 +213,13 @@ std::unique_ptr<cudf::column> porter_stemmer_measure(cudf::strings_column_view c
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
-if (strings.is_empty()) return cudf::make_empty_column(cudf::data_type{cudf::type_id::INT32});
+if (strings.is_empty()) {
+return cudf::make_empty_column(cudf::data_type{cudf::type_to_id<cudf::size_type>()});
+}

// create empty output column
auto results =
-cudf::make_fixed_width_column(cudf::data_type{cudf::type_id::INT32},
+cudf::make_fixed_width_column(cudf::data_type{cudf::type_to_id<cudf::size_type>()},
strings.size(),
cudf::detail::copy_bitmask(strings.parent(), stream, mr),
strings.null_count(),
@@ -226,7 +230,7 @@ std::unique_ptr<cudf::column> porter_stemmer_measure(cudf::strings_column_view c
thrust::transform(rmm::exec_policy(stream),
thrust::make_counting_iterator<cudf::size_type>(0),
thrust::make_counting_iterator<cudf::size_type>(strings.size()),
-results->mutable_view().data<int32_t>(),
+results->mutable_view().data<cudf::size_type>(),
porter_stemmer_measure_fn{*strings_column});
results->set_null_count(strings.null_count());
return results;
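For readers unfamiliar with the quantity computed above: the Porter measure is the m in the word form [C](VC)^m[V], i.e. the number of vowel-run to consonant transitions. A host-side sketch with a simplified a/e/i/o/u vowel test (the device code's is_consonant also handles 'y' contextually and works over cudf::string_view iterators):

```cpp
#include <cassert>
#include <string>

// Simplified ASCII analogue of porter_stemmer_measure_fn::operator():
// increment the measure each time a run of vowels is followed by a consonant.
int porter_measure(std::string const& word)
{
  if (word.empty()) { return 0; }
  auto const is_vowel = [](char c) {
    return c == 'a' || c == 'e' || c == 'i' || c == 'o' || c == 'u';
  };
  int measure    = 0;
  bool vowel_run = is_vowel(word.front());
  for (char c : word) {
    if (!is_vowel(c)) {
      if (vowel_run) { measure++; }
      vowel_run = false;
    } else {
      vowel_run = true;
    }
  }
  return measure;
}

int main()
{
  assert(porter_measure("tree") == 0);     // tr-ee: no vowel->consonant pair
  assert(porter_measure("trouble") == 1);  // ou -> b
  assert(porter_measure("oaten") == 2);    // oa -> t, e -> n
  return 0;
}
```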
45 changes: 25 additions & 20 deletions cpp/src/text/subword/data_normalizer.cu
@@ -124,9 +124,10 @@ __device__ bool is_head_byte(unsigned char utf8_byte) { return (utf8_byte >> 6)
* @param start_byte_for_thread Which byte to start analyzing
* @return New code point value for this byte.
*/
-__device__ uint32_t extract_code_points_from_utf8(unsigned char const* strings,
-size_t const total_bytes,
-uint32_t const start_byte_for_thread)
+__device__ uint32_t
+extract_code_points_from_utf8(unsigned char const* strings,
+size_t const total_bytes,
+cudf::thread_index_type const start_byte_for_thread)
{
constexpr uint8_t max_utf8_blocks_for_char = 4;
uint8_t utf8_blocks[max_utf8_blocks_for_char] = {0};
@@ -214,8 +215,9 @@ __global__ void kernel_data_normalizer(unsigned char const* strings,
constexpr uint32_t init_val = (1 << FILTER_BIT);
uint32_t replacement_code_points[MAX_NEW_CHARS] = {init_val, init_val, init_val};

-uint32_t const char_for_thread = blockDim.x * blockIdx.x + threadIdx.x;
-uint32_t num_new_chars = 0;
+cudf::thread_index_type const char_for_thread =
+threadIdx.x + cudf::thread_index_type(blockIdx.x) * cudf::thread_index_type(blockDim.x);
+uint32_t num_new_chars = 0;

if (char_for_thread < total_bytes) {
auto const code_point = extract_code_points_from_utf8(strings, total_bytes, char_for_thread);
@@ -273,31 +275,34 @@ data_normalizer::data_normalizer(codepoint_metadata_type const* cp_metadata,
}

uvector_pair data_normalizer::normalize(char const* d_strings,
-uint32_t const* d_offsets,
-uint32_t num_strings,
+cudf::size_type const* d_offsets,
+cudf::size_type num_strings,
rmm::cuda_stream_view stream) const
{
-if (num_strings == 0)
-return std::pair(std::make_unique<rmm::device_uvector<uint32_t>>(0, stream),
-std::make_unique<rmm::device_uvector<uint32_t>>(0, stream));
+if (num_strings == 0) {
+return uvector_pair{std::make_unique<rmm::device_uvector<uint32_t>>(0, stream),
+std::make_unique<rmm::device_uvector<cudf::size_type>>(0, stream)};
+}

// copy offsets to working memory
-size_t const num_offsets = num_strings + 1;
-auto d_strings_offsets = std::make_unique<rmm::device_uvector<uint32_t>>(num_offsets, stream);
+auto const num_offsets = num_strings + 1;
+auto d_strings_offsets =
+std::make_unique<rmm::device_uvector<cudf::size_type>>(num_offsets, stream);
thrust::transform(rmm::exec_policy(stream),
-thrust::make_counting_iterator<uint32_t>(0),
-thrust::make_counting_iterator<uint32_t>(num_offsets),
+thrust::counting_iterator<cudf::size_type>(0),
+thrust::counting_iterator<cudf::size_type>(num_offsets),
d_strings_offsets->begin(),
[d_offsets] __device__(auto idx) {
auto const offset = d_offsets[0]; // adjust for any offset to the offsets
return d_offsets[idx] - offset;
});
-uint32_t const bytes_count = d_strings_offsets->element(num_strings, stream);
-if (bytes_count == 0) // if no bytes, nothing to do
-return std::pair(std::make_unique<rmm::device_uvector<uint32_t>>(0, stream),
-std::make_unique<rmm::device_uvector<uint32_t>>(0, stream));
+auto const bytes_count = d_strings_offsets->element(num_strings, stream);
+if (bytes_count == 0) { // if no bytes, nothing to do
+return uvector_pair{std::make_unique<rmm::device_uvector<uint32_t>>(0, stream),
+std::make_unique<rmm::device_uvector<cudf::size_type>>(0, stream)};
+}

-cudf::detail::grid_1d const grid{static_cast<cudf::size_type>(bytes_count), THREADS_PER_BLOCK, 1};
+cudf::detail::grid_1d const grid{bytes_count, THREADS_PER_BLOCK, 1};
size_t const threads_on_device = grid.num_threads_per_block * grid.num_blocks;
size_t const max_new_char_total = MAX_NEW_CHARS * threads_on_device;

Expand Down Expand Up @@ -333,7 +338,7 @@ uvector_pair data_normalizer::normalize(char const* d_strings,
num_strings,
update_strings_lengths_fn{d_chars_per_thread.data(), d_strings_offsets->data()});

-uint32_t const num_chars = d_strings_offsets->element(num_strings, stream);
+auto const num_chars = d_strings_offsets->element(num_strings, stream);
d_code_points->resize(num_chars, stream); // should be smaller than original allocated size

// return the normalized code points and the new offsets
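The motivation for cudf::thread_index_type in the kernel above: in 32-bit arithmetic, blockDim.x * blockIdx.x wraps once a grid addresses more than 2^32 elements, so the index must be widened before the multiply. A minimal sketch of the pattern, assuming thread_index_type is a 64-bit signed integer as in libcudf:

```cuda
#include <cstdint>

using thread_index_type = int64_t;  // stand-in for cudf::thread_index_type

__global__ void touch_bytes(unsigned char const* bytes, int64_t total_bytes)
{
  // Widen before multiplying so blockIdx.x * blockDim.x cannot wrap at 2^32.
  thread_index_type const idx =
    threadIdx.x + static_cast<thread_index_type>(blockIdx.x) *
                    static_cast<thread_index_type>(blockDim.x);
  if (idx >= total_bytes) { return; }
  // ... process bytes[idx] ...
}
```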
10 changes: 6 additions & 4 deletions cpp/src/text/subword/detail/data_normalizer.hpp
@@ -1,5 +1,5 @@
/*
-* Copyright (c) 2020-2022, NVIDIA CORPORATION.
+* Copyright (c) 2020-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -18,11 +18,13 @@

#include <text/subword/detail/cp_data.h>

+#include <cudf/types.hpp>
+
#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>

using uvector_pair = std::pair<std::unique_ptr<rmm::device_uvector<uint32_t>>,
-std::unique_ptr<rmm::device_uvector<uint32_t>>>;
+std::unique_ptr<rmm::device_uvector<cudf::size_type>>>;

namespace nvtext {
namespace detail {
@@ -85,8 +87,8 @@ class data_normalizer {
* used to locate the code points for each string.
*/
uvector_pair normalize(char const* d_strings,
-uint32_t const* d_offsets,
-uint32_t num_strings,
+cudf::size_type const* d_offsets,
+cudf::size_type num_strings,
rmm::cuda_stream_view stream) const;

private:
9 changes: 6 additions & 3 deletions cpp/src/text/subword/detail/tokenizer_utils.cuh
@@ -1,5 +1,5 @@
/*
-* Copyright (c) 2020-2022, NVIDIA CORPORATION.
+* Copyright (c) 2020-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -18,6 +18,8 @@

#include <text/subword/detail/cp_data.h>

+#include <cudf/types.hpp>
+
#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>

@@ -47,8 +49,9 @@ constexpr int THREADS_PER_BLOCK = 64;
*/
struct update_strings_lengths_fn {
uint32_t const* d_chars_up_to_idx;
-uint32_t* d_offsets;
-__device__ void operator()(uint32_t idx)
+cudf::size_type* d_offsets;
+
+__device__ void operator()(cudf::size_type idx)
{
auto const offset = d_offsets[idx];
d_offsets[idx] = offset > 0 ? d_chars_up_to_idx[offset - 1] : 0;
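What update_strings_lengths_fn does, in miniature: d_chars_up_to_idx holds an inclusive scan of how many code points were emitted up to each input byte, so indexing it with a string's byte offset (minus one) rebases that offset into code-point space. A hypothetical host-side walk-through with made-up numbers:

```cpp
#include <cassert>
#include <vector>

int main()
{
  // Two strings of 3 and 2 input bytes; byte offsets {0, 3, 5}.
  std::vector<int> offsets = {0, 3, 5};
  // Inclusive scan of code points emitted per byte; byte 1 emitted nothing
  // (e.g. a filtered character), so the first string yields 2 code points.
  std::vector<int> const chars_up_to_idx = {1, 1, 2, 3, 4};

  for (auto& offset : offsets) {  // same update as the device functor
    offset = offset > 0 ? chars_up_to_idx[offset - 1] : 0;
  }
  assert((offsets == std::vector<int>{0, 2, 4}));
  return 0;
}
```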
4 changes: 2 additions & 2 deletions cpp/src/text/subword/detail/wordpiece_tokenizer.hpp
@@ -82,8 +82,8 @@ class wordpiece_tokenizer {
* @return Pointer to token-ids and token-id offsets
*/
uvector_pair tokenize(char const* d_strings,
-uint32_t const* d_offsets,
-uint32_t num_strings,
+cudf::size_type const* d_offsets,
+cudf::size_type num_strings,
rmm::cuda_stream_view stream);

private:
21 changes: 13 additions & 8 deletions cpp/src/text/subword/subword_tokenize.cu
@@ -59,7 +59,7 @@ namespace {
__global__ void kernel_compute_tensor_metadata(
// input
uint32_t const* token_ids,
-uint32_t const* offsets,
+cudf::size_type const* offsets,
uint32_t const* row2tensor,
uint32_t const* row2row_within_tensor,
uint32_t max_sequence_length,
@@ -71,8 +71,8 @@ __global__ void kernel_compute_tensor_metadata(
uint32_t* attn_mask,
uint32_t* metadata)
{
-uint32_t const output_idx = threadIdx.x + blockIdx.x * blockDim.x;
-if (output_idx >= (nrows_tensor_token_ids * max_sequence_length)) return;
+cudf::thread_index_type const output_idx =
+threadIdx.x + static_cast<cudf::thread_index_type>(blockIdx.x) *
+static_cast<cudf::thread_index_type>(blockDim.x);
+if (output_idx >= (static_cast<cudf::thread_index_type>(nrows_tensor_token_ids) *
+static_cast<cudf::thread_index_type>(max_sequence_length))) {
+return;
+}

uint32_t const absolute_row_id = output_idx / max_sequence_length;
uint32_t const tensor_id = row2tensor[absolute_row_id];
@@ -179,18 +184,18 @@ tokenizer_result subword_tokenize(cudf::strings_column_view const& strings,
std::overflow_error);

auto const offsets = strings.offsets();
-auto const d_offsets = offsets.data<uint32_t>() + strings.offset();
-auto const offset = cudf::detail::get_value<int32_t>(offsets, strings.offset(), stream);
-auto const d_chars = strings.chars().data<char>() + offset;
+auto const d_offsets = offsets.data<cudf::size_type>() + strings.offset();
+auto const offset = cudf::detail::get_value<cudf::size_type>(offsets, strings.offset(), stream);
+auto const d_chars = strings.chars().data<char>() + offset;

// Create tokenizer
wordpiece_tokenizer tokenizer(
vocab_table, max_sequence_length, stride, do_truncate, do_lower_case);
// Run tokenizer
auto const tokens = tokenizer.tokenize(d_chars, d_offsets, strings_count, stream);
// assign output components
-uint32_t const* device_token_ids = tokens.first->data();
-uint32_t const* device_offsets = tokens.second->data();
+auto device_token_ids = tokens.first->data();
+auto device_offsets = tokens.second->data();

// Format output from tokenizer
// Each string can create 1 or more tensor entries.
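The reworked bounds check in kernel_compute_tensor_metadata exists because nrows_tensor_token_ids * max_sequence_length is a product of two 32-bit values that can exceed 2^32. The hazard is easy to reproduce on the host:

```cpp
#include <cstdint>
#include <iostream>

int main()
{
  uint32_t const nrows   = 9'000'000;  // hypothetical row count
  uint32_t const max_len = 512;        // hypothetical max_sequence_length

  uint32_t const wrapped = nrows * max_len;  // 32-bit product wraps mod 2^32
  int64_t const exact    = static_cast<int64_t>(nrows) * max_len;

  std::cout << wrapped << " vs " << exact << '\n';  // 313032704 vs 4608000000
  return 0;
}
```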
26 changes: 16 additions & 10 deletions cpp/src/text/subword/wordpiece_tokenizer.cu
@@ -82,7 +82,9 @@ __global__ void init_data_and_mark_word_start_and_ends(uint32_t const* code_poin
uint32_t* token_ids,
uint8_t* tokens_per_word)
{
-uint32_t char_for_thread = blockDim.x * blockIdx.x + threadIdx.x;
+cudf::thread_index_type char_for_thread = static_cast<cudf::thread_index_type>(blockDim.x) *
+static_cast<cudf::thread_index_type>(blockIdx.x) +
+threadIdx.x;

// Deal with the start_word_indices array
if (char_for_thread < num_code_points) {
@@ -130,12 +132,14 @@ * @param num_strings The total number of strings to be processed.
* @param num_strings The total number of strings to be processed.
*/
__global__ void mark_string_start_and_ends(uint32_t const* code_points,
-uint32_t const* strings_offsets,
+cudf::size_type const* strings_offsets,
uint32_t* start_word_indices,
uint32_t* end_word_indices,
uint32_t num_strings)
{
-uint32_t idx = blockDim.x * blockIdx.x + threadIdx.x;
+cudf::thread_index_type idx = static_cast<cudf::thread_index_type>(blockDim.x) *
+static_cast<cudf::thread_index_type>(blockIdx.x) +
+threadIdx.x;
// Ensure the starting character of each string is written to the word start array.
if (idx <= num_strings) {
auto const offset = strings_offsets[idx];
@@ -330,7 +334,9 @@ __global__ void kernel_wordpiece_tokenizer(uint32_t const* code_points,
uint32_t* token_ids,
uint8_t* tokens_per_word)
{
-uint32_t const word_to_tokenize = blockDim.x * blockIdx.x + threadIdx.x;
+cudf::thread_index_type word_to_tokenize = static_cast<cudf::thread_index_type>(blockDim.x) *
+static_cast<cudf::thread_index_type>(blockIdx.x) +
+threadIdx.x;

if (word_to_tokenize >= total_words) return;
// Each thread gets the start code_point offset for each word and resets the token_id memory to
@@ -414,8 +420,8 @@ wordpiece_tokenizer::wordpiece_tokenizer(hashed_vocabulary const& vocab_table,
}

uvector_pair wordpiece_tokenizer::tokenize(char const* d_strings,
-uint32_t const* d_offsets,
-uint32_t num_strings,
+cudf::size_type const* d_offsets,
+cudf::size_type num_strings,
rmm::cuda_stream_view stream)
{
auto cps_and_offsets = normalizer.normalize(d_strings, d_offsets, num_strings, stream);
@@ -433,10 +439,10 @@ struct tranform_fn { // just converting uint8 value to uint32

void wordpiece_tokenizer::tokenize(uvector_pair& cps_and_offsets, rmm::cuda_stream_view stream)
{
-uint32_t* device_code_points = cps_and_offsets.first->data();
-size_t const num_code_points = cps_and_offsets.first->size();
-uint32_t* device_strings_offsets = cps_and_offsets.second->data();
-uint32_t const num_strings = cps_and_offsets.second->size() - 1;
+auto device_code_points = cps_and_offsets.first->data();
+auto const num_code_points = cps_and_offsets.first->size();
+auto device_strings_offsets = cps_and_offsets.second->data();
+auto const num_strings = cps_and_offsets.second->size() - 1;

size_t const four_byte_cp_chunks = 1 + (num_code_points - 1) / sizeof(uint32_t);
size_t const rounded_num_cps = sizeof(uint32_t) * four_byte_cp_chunks;
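One last note on the chunk arithmetic at the end of this hunk: 1 + (n - 1) / k is the usual integer ceiling division, used here to round the code-point count up to whole uint32_t-sized chunks. It assumes n >= 1; with n == 0 the unsigned subtraction would wrap:

```cpp
#include <cstddef>

// Integer ceil(n / k) without floating point; requires n >= 1.
constexpr std::size_t ceil_div(std::size_t n, std::size_t k)
{
  return 1 + (n - 1) / k;
}

static_assert(ceil_div(1, 4) == 1, "1 byte still occupies one 4-byte chunk");
static_assert(ceil_div(4, 4) == 1, "exact multiple needs no extra chunk");
static_assert(ceil_div(5, 4) == 2, "one byte over rounds up");

int main() { return 0; }
```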