Remove unused max_rows_tensor parameter from subword tokenizer (#13463)
Removes the `max_rows_tensor` parameter from the `nvtext::subword_tokenize` API since it is no longer required. The parameter was intended to size the temporary working memory for the internal functions, but after some general rework it was no longer used and had simply never been removed from the API.
Also updates the Python/Cython calls, which had been hard-coding a default value anyway.

Found while investigating issue #13458.
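
For reference, a minimal sketch of a call against the updated C++ signature (the helper name, include set, and parameter values below are illustrative assumptions, not part of this change):

// Hypothetical usage sketch: the trailing max_rows_tensor argument is simply dropped.
#include <cudf/strings/strings_column_view.hpp>
#include <nvtext/subword_tokenize.hpp>

nvtext::tokenizer_result tokenize_example(cudf::strings_column_view const& input,
                                          nvtext::hashed_vocabulary const& vocabulary)
{
  uint32_t const max_sequence_length = 64;  // illustrative value
  uint32_t const stride              = 48;  // illustrative value
  return nvtext::subword_tokenize(input,
                                  vocabulary,
                                  max_sequence_length,
                                  stride,
                                  true,    // do_lower_case
                                  false);  // do_truncate -- no max_rows_tensor argument
}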

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Divye Gala (https://github.com/divyegala)
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Matthew Roeschke (https://github.com/mroeschke)

URL: #13463
davidwendt authored Jun 1, 2023
1 parent 8a70802 commit ebb5875
Showing 9 changed files with 39 additions and 73 deletions.
5 changes: 1 addition & 4 deletions cpp/benchmarks/text/subword.cpp
@@ -27,8 +27,6 @@
#include <iostream>
#include <vector>

#define MAX_ROWS_TENSOR 300

static std::string create_hash_vocab_file()
{
std::string dir_template{std::filesystem::temp_directory_path().string()};
@@ -74,8 +72,7 @@ static void BM_subword_tokenizer(benchmark::State& state)
max_sequence_length,
stride,
do_lower,
do_truncate,
MAX_ROWS_TENSOR);
do_truncate);
}
}

5 changes: 0 additions & 5 deletions cpp/include/nvtext/subword_tokenize.hpp
@@ -145,10 +145,6 @@ struct tokenizer_result {
* @param do_truncate If true, the tokenizer will discard all the token-ids after
* `max_sequence_length` for each input string. If false, it will use a new row
* in the output token-ids to continue generating the output.
* @param max_rows_tensor Maximum number of rows for the output token-ids expected
* to be generated by the tokenizer.
* Used for allocating temporary working memory on the GPU device.
* If the output generates a larger number of rows, behavior is undefined.
* @param mr Memory resource to allocate any returned objects.
* @return token-ids, attention-mask, and metadata
*/
@@ -159,7 +155,6 @@ tokenizer_result subword_tokenize(
uint32_t stride,
bool do_lower_case,
bool do_truncate,
uint32_t max_rows_tensor,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/** @} */ // end of group
6 changes: 1 addition & 5 deletions cpp/src/text/subword/detail/wordpiece_tokenizer.hpp
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2022, NVIDIA CORPORATION.
* Copyright (c) 2020-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -49,9 +49,6 @@ class wordpiece_tokenizer {
* @brief Creates a full tokenizer that cleans the text and splits it into tokens.
*
* @param vocab_table The preprocessed hashed vocabulary data.
* @param max_rows_final_tensor Maximum number of rows in tensor_token-ids expected by tokenizer.
* Used to allocate temporary working memory on the GPU.
* If the output contains a larger number of rows, behavior is undefined.
* @param max_sequence_length Limit the number of token-ids per row in the output
* @param stride Each row in tensor-token-ids will replicate `max_sequence_length - stride`
* token-ids from the previous row, unless it is the first string.
@@ -66,7 +63,6 @@
* specified in the `vocab_file`.
*/
wordpiece_tokenizer(hashed_vocabulary const& vocab_table,
uint32_t max_rows_final_tensor,
uint32_t max_sequence_length,
uint32_t stride,
bool do_truncate,
15 changes: 6 additions & 9 deletions cpp/src/text/subword/subword_tokenize.cu
@@ -159,17 +159,11 @@ tokenizer_result subword_tokenize(cudf::strings_column_view const& strings,
uint32_t stride,
bool do_lower_case,
bool do_truncate,
uint32_t max_rows_tensor,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
CUDF_EXPECTS(stride <= max_sequence_length,
"stride must be less than or equal to max_sequence_length");
CUDF_EXPECTS(
max_sequence_length <=
(static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max()) / max_rows_tensor),
"max_sequence_length times max_rows_tensor exceeds the column size limit",
std::overflow_error);
auto const strings_count = strings.size();
if (strings_count == strings.null_count()) { // empty or all-null returns empty
return tokenizer_result{0,
@@ -178,6 +172,11 @@ tokenizer_result subword_tokenize(cudf::strings_column_view const& strings,
cudf::make_empty_column(cudf::data_type{cudf::type_id::UINT32}),
cudf::make_empty_column(cudf::data_type{cudf::type_id::UINT32})};
}
CUDF_EXPECTS(
max_sequence_length <=
(static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max()) / strings_count),
"max_sequence_length times number of input rows exceeds the column size limit",
std::overflow_error);

auto const offsets = strings.offsets();
auto const d_offsets = offsets.data<uint32_t>() + strings.offset();
@@ -186,7 +185,7 @@ tokenizer_result subword_tokenize(cudf::strings_column_view const& strings,

// Create tokenizer
wordpiece_tokenizer tokenizer(
vocab_table, max_rows_tensor, max_sequence_length, stride, do_truncate, do_lower_case);
vocab_table, max_sequence_length, stride, do_truncate, do_lower_case);
// Run tokenizer
auto const tokens = tokenizer.tokenize(d_chars, d_offsets, strings_count, stream);
// assign output components
@@ -292,7 +291,6 @@ tokenizer_result subword_tokenize(cudf::strings_column_view const& strings,
uint32_t stride,
bool do_lower_case,
bool do_truncate,
uint32_t max_rows_tensor,
rmm::mr::device_memory_resource* mr)
{
CUDF_FUNC_RANGE();
@@ -302,7 +300,6 @@ tokenizer_result subword_tokenize(cudf::strings_column_view const& strings,
stride,
do_lower_case,
do_truncate,
max_rows_tensor,
cudf::get_default_stream(),
mr);
}
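
With max_rows_tensor gone, the overflow guard above is keyed to the actual number of input rows rather than a caller-supplied tensor-row limit. A standalone sketch of the equivalent check (hypothetical helper; assumes cudf::size_type is a 32-bit signed integer):

#include <cstddef>
#include <cstdint>
#include <limits>
#include <stdexcept>

// The output tensors hold up to strings_count * max_sequence_length entries,
// so that product must stay within the column size limit.
void check_output_size(std::size_t strings_count, uint32_t max_sequence_length)
{
  auto const limit = static_cast<std::size_t>(std::numeric_limits<int32_t>::max());
  if (strings_count != 0 && max_sequence_length > limit / strings_count) {
    throw std::overflow_error(
      "max_sequence_length times number of input rows exceeds the column size limit");
  }
}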
1 change: 0 additions & 1 deletion cpp/src/text/subword/wordpiece_tokenizer.cu
@@ -397,7 +397,6 @@ __global__ void kernel_wordpiece_tokenizer(uint32_t const* code_points,
} // namespace

wordpiece_tokenizer::wordpiece_tokenizer(hashed_vocabulary const& vocab_table,
uint32_t max_rows_final_tensor,
uint32_t max_sequence_length,
uint32_t stride,
bool do_truncate,
65 changes: 26 additions & 39 deletions cpp/tests/text/subword_tests.cpp
@@ -28,8 +28,6 @@
#include <iostream>
#include <vector>

#define MAX_ROWS_TENSOR 300

// Global environment for temporary files
auto const temp_env = static_cast<cudf::test::TempDirTestEnvironment*>(
::testing::AddGlobalTestEnvironment(new cudf::test::TempDirTestEnvironment));
@@ -75,9 +73,8 @@ TEST(TextSubwordTest, Tokenize)
*vocab,
max_sequence_length,
stride,
true, // do_lower_case
false, // do_truncate
MAX_ROWS_TENSOR);
true, // do_lower_case
false); // do_truncate

EXPECT_EQ(nrows, result.nrows_tensor);

@@ -128,9 +125,8 @@ TEST(TextSubwordTest, TokenizeMultiRow)
*vocab,
max_sequence_length,
stride,
true, // do_lower_case
false, // do_truncate
MAX_ROWS_TENSOR);
true, // do_lower_case
false); // do_truncate

EXPECT_EQ(uint32_t{3}, result.nrows_tensor);
cudf::test::fixed_width_column_wrapper<uint32_t> expected_tokens(
@@ -159,8 +155,7 @@ TEST(TextSubwordTest, TokenizeWithEmptyRow)
bool const lower = true;
bool const truncate = false;

auto result =
nvtext::subword_tokenize(input, *vocab, max_seq, stride, lower, truncate, MAX_ROWS_TENSOR);
auto result = nvtext::subword_tokenize(input, *vocab, max_seq, stride, lower, truncate);

EXPECT_EQ(uint32_t{4}, result.nrows_tensor);

@@ -201,9 +196,8 @@ TEST(TextSubwordTest, TokenizeMaxEqualsTokens)
*vocab,
max_sequence_length,
stride,
true, // do_lower_case
false, // do_truncate
MAX_ROWS_TENSOR);
true, // do_lower_case
false); // do_truncate

EXPECT_EQ(uint32_t{1}, result.nrows_tensor);
cudf::test::fixed_width_column_wrapper<uint32_t> expected_tokens({2023, 2003, 1037, 3231, 1012});
@@ -216,28 +210,26 @@

TEST(TextSubwordTest, ParameterErrors)
{
std::vector<const char*> h_strings{"This is a test.", "This is a test. This is a tést."};
std::vector<const char*> h_strings{"This is a test.", "This is a test. This is a tést.", "", ""};
cudf::test::strings_column_wrapper strings(h_strings.begin(), h_strings.end());
std::string hash_file = temp_env->get_temp_filepath("hashed_vocab.txt");
create_hashed_vocab(hash_file);
auto vocab = nvtext::load_vocabulary_file(hash_file);

EXPECT_THROW(nvtext::subword_tokenize(cudf::strings_column_view{strings},
*vocab,
12, // max_sequence_length
13, // stride <= max_sequence_length
true, // do_lower_case
true, // do_truncate
MAX_ROWS_TENSOR),
12, // max_sequence_length
13, // stride <= max_sequence_length
true, // do_lower_case
true), // do_truncate
cudf::logic_error);

EXPECT_THROW(nvtext::subword_tokenize(cudf::strings_column_view{strings},
*vocab,
858993459,
5,
5,
true, // do_lower_case
true, // do_truncate
858993459),
true, // do_lower_case
true), // do_truncate
std::overflow_error);
}

@@ -251,9 +243,8 @@ TEST(TextSubwordTest, EmptyStrings)
*vocab,
16,
16,
true, // do_lower_case
false, // do_truncate
MAX_ROWS_TENSOR);
true, // do_lower_case
false); // do_truncate
EXPECT_EQ(uint32_t{0}, result.nrows_tensor);
EXPECT_EQ(0, result.tensor_token_ids->size());
EXPECT_EQ(0, result.tensor_attention_mask->size());
@@ -270,9 +261,8 @@ TEST(TextSubwordTest, AllNullStrings)
*vocab,
16,
16,
true, // do_lower_case
false, // do_truncate
MAX_ROWS_TENSOR);
true, // do_lower_case
false); // do_truncate
EXPECT_EQ(uint32_t{0}, result.nrows_tensor);
EXPECT_EQ(0, result.tensor_token_ids->size());
EXPECT_EQ(0, result.tensor_attention_mask->size());
@@ -293,7 +283,7 @@ TEST(TextSubwordTest, NoTokens)
bool const lower = true;
bool const truncate = true;

auto result = nvtext::subword_tokenize(input, *vocab, max_seq, stride, lower, truncate, 2);
auto result = nvtext::subword_tokenize(input, *vocab, max_seq, stride, lower, truncate);

std::vector<uint32_t> zeros(max_seq * input.size(), 0);

@@ -319,9 +309,8 @@ TEST(TextSubwordTest, TokenizeFromVocabStruct)
*vocab,
8,
6,
true, // do_lower_case
true, // do_truncate
MAX_ROWS_TENSOR);
true, // do_lower_case
true); // do_truncate

EXPECT_EQ(uint32_t{2}, result.nrows_tensor);
cudf::test::fixed_width_column_wrapper<uint32_t> expected_tokens(
@@ -389,9 +378,8 @@ TEST(TextSubwordTest, TokenizeWithSpecialTokens)
*vocab,
8,
6,
true, // do_lower_case
true, // do_truncate
MAX_ROWS_TENSOR);
true, // do_lower_case
true); // do_truncate

EXPECT_EQ(static_cast<uint32_t>(h_strings.size()), result.nrows_tensor);
// clang-format off
@@ -439,9 +427,8 @@ TEST(TextSubwordTest, ZeroHashBinCoefficient)
*vocab,
8,
8,
true, // do_lower_case
true, // do_truncate
MAX_ROWS_TENSOR);
true, // do_lower_case
true); // do_truncate

// clang-format off
cudf::test::fixed_width_column_wrapper<uint32_t> expected_tokens({7, 0, 0, 0, 0, 0, 0, 0});
8 changes: 3 additions & 5 deletions python/cudf/cudf/_lib/cpp/nvtext/subword_tokenize.pxd
@@ -1,4 +1,4 @@
# Copyright (c) 2020, NVIDIA CORPORATION.
# Copyright (c) 2020-2023, NVIDIA CORPORATION.

from libc.stdint cimport uint16_t, uint32_t
from libcpp cimport bool
@@ -38,8 +38,7 @@ cdef extern from "nvtext/subword_tokenize.hpp" namespace "nvtext" nogil:
uint32_t max_sequence_length,
uint32_t stride,
bool do_lower,
bool do_truncate,
uint32_t max_rows_tensor
bool do_truncate
) except +

cdef tokenizer_result subword_tokenize(
@@ -48,8 +47,7 @@ cdef extern from "nvtext/subword_tokenize.hpp" namespace "nvtext" nogil:
uint32_t max_sequence_length,
uint32_t stride,
bool do_lower,
bool do_truncate,
uint32_t max_rows_tensor
bool do_truncate
) except +

cdef extern from "<utility>" namespace "std" nogil:
4 changes: 1 addition & 3 deletions python/cudf/cudf/_lib/nvtext/subword_tokenize.pyx
@@ -1,4 +1,4 @@
# Copyright (c) 2020-2022, NVIDIA CORPORATION.
# Copyright (c) 2020-2023, NVIDIA CORPORATION.

from libc.stdint cimport uint32_t

@@ -37,7 +37,6 @@ def subword_tokenize_inmem_hash(
uint32_t stride=48,
bool do_lower=True,
bool do_truncate=False,
uint32_t max_rows_tensor=500
):
"""
Subword tokenizes text series by using the pre-loaded hashed vocabulary
@@ -53,7 +52,6 @@ def subword_tokenize_inmem_hash(
stride,
do_lower,
do_truncate,
max_rows_tensor
)
)
# return the 3 tensor components
3 changes: 1 addition & 2 deletions python/cudf/cudf/core/subword_tokenizer.py
@@ -1,4 +1,4 @@
# Copyright (c) 2021-2022, NVIDIA CORPORATION.
# Copyright (c) 2021-2023, NVIDIA CORPORATION.

from __future__ import annotations

@@ -216,7 +216,6 @@ def __call__(
stride=stride,
do_lower=self.do_lower_case,
do_truncate=truncation,
max_rows_tensor=max_num_rows,
)

tokenizer_output = {
