Merge branch 'branch-23.10' into feature/fix_encodings
ttnghia authored Aug 26, 2023
2 parents 1b94dc7 + b6d08ca commit 0f41a7b
Showing 36 changed files with 388 additions and 272 deletions.
18 changes: 9 additions & 9 deletions cpp/src/text/normalize.cu
@@ -107,8 +107,8 @@ constexpr uint32_t UTF8_3BYTE = 0x01'0000;
struct codepoint_to_utf8_fn {
cudf::column_device_view const d_strings; // input strings
uint32_t const* cp_data; // full code-point array
-int32_t const* d_cp_offsets{}; // offsets to each string's code-point array
-int32_t* d_offsets{}; // offsets for the output strings
+cudf::size_type const* d_cp_offsets{}; // offsets to each string's code-point array
+cudf::size_type* d_offsets{}; // offsets for the output strings
char* d_chars{}; // buffer for the output strings column

/**
@@ -118,15 +118,15 @@ struct codepoint_to_utf8_fn {
* @param count number of code-points in `str_cps`
* @return Number of bytes required for the output
*/
-__device__ int32_t compute_output_size(uint32_t const* str_cps, uint32_t count)
+__device__ cudf::size_type compute_output_size(uint32_t const* str_cps, uint32_t count)
{
return thrust::transform_reduce(
thrust::seq,
str_cps,
str_cps + count,
[](auto cp) { return 1 + (cp >= UTF8_1BYTE) + (cp >= UTF8_2BYTE) + (cp >= UTF8_3BYTE); },
0,
-thrust::plus<int32_t>());
+thrust::plus());
}

__device__ void operator()(cudf::size_type idx)
@@ -208,9 +208,9 @@ std::unique_ptr<cudf::column> normalize_characters(cudf::strings_column_view con
auto const aux_table = get_aux_codepoint_data(stream);
auto const normalizer = data_normalizer(cp_metadata.data(), aux_table.data(), do_lower_case);
auto const offsets = strings.offsets();
-auto const d_offsets = offsets.data<uint32_t>() + strings.offset();
-auto const offset = cudf::detail::get_value<int32_t>(offsets, strings.offset(), stream);
-auto const d_chars = strings.chars().data<char>() + offset;
+auto const d_offsets = offsets.data<cudf::size_type>() + strings.offset();
+auto const offset = cudf::detail::get_value<cudf::size_type>(offsets, strings.offset(), stream);
+auto const d_chars = strings.chars().data<char>() + offset;
return normalizer.normalize(d_chars, d_offsets, strings.size(), stream);
}();

@@ -222,8 +222,8 @@ std::unique_ptr<cudf::column> normalize_characters(cudf::strings_column_view con
// convert the result into a strings column
// - the cp_chars are the new 4-byte code-point values for all the characters in the output
// - the cp_offsets identify which code-points go with which strings
-uint32_t const* cp_chars = result.first->data();
-int32_t const* cp_offsets = reinterpret_cast<int32_t const*>(result.second->data());
+uint32_t const* cp_chars = result.first->data();
+cudf::size_type const* cp_offsets = result.second->data();

auto d_strings = cudf::column_device_view::create(strings.parent(), stream);

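A note on the sizing logic above: compute_output_size derives each code point's UTF-8 byte count from three threshold comparisons instead of branching. A minimal host-side sketch of the same formula, assuming the standard UTF-8 boundaries 0x80, 0x800, and 0x10000 for the UTF8_1BYTE/UTF8_2BYTE/UTF8_3BYTE constants:

```cpp
#include <cassert>
#include <cstdint>

// Bytes needed to encode one code point in UTF-8; mirrors the comparison
// chain in codepoint_to_utf8_fn::compute_output_size, where each satisfied
// threshold adds one byte on top of the minimum of 1.
int utf8_bytes(uint32_t cp)
{
  return 1 + (cp >= 0x80) + (cp >= 0x800) + (cp >= 0x10000);
}

int main()
{
  assert(utf8_bytes(0x41) == 1);     // 'A' (ASCII)
  assert(utf8_bytes(0xE9) == 2);     // U+00E9, e with acute
  assert(utf8_bytes(0x4E2D) == 3);   // U+4E2D, CJK ideograph
  assert(utf8_bytes(0x1F600) == 4);  // U+1F600, supplementary plane
  return 0;
}
```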
2 changes: 1 addition & 1 deletion cpp/src/text/replace.cu
@@ -47,7 +47,7 @@ using replace_result = thrust::pair<bool, cudf::string_view>;
struct base_token_replacer_fn {
cudf::column_device_view const d_strings; ///< strings to tokenize
cudf::string_view const d_delimiter; ///< delimiter characters for tokenizing
-int32_t* d_offsets{}; ///< for locating output string in d_chars
+cudf::size_type* d_offsets{}; ///< for locating output string in d_chars
char* d_chars{}; ///< output buffer

/**
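The d_offsets/d_chars pair in base_token_replacer_fn follows the usual two-pass pattern for building a strings column: the functor is first run with d_chars unset to record each output string's size, the sizes are scanned into offsets, and a second run writes the characters at those offsets. A simplified host-side stand-in for that idiom (hypothetical names; libcudf performs both passes on the device):

```cpp
#include <algorithm>
#include <cstddef>
#include <numeric>
#include <string>
#include <vector>

// Hypothetical host-side sketch of the size-then-write idiom implied by the
// d_offsets/d_chars members above.
void build_strings(std::vector<std::string> const& inputs,
                   std::vector<int>& offsets,
                   std::string& chars)
{
  offsets.assign(inputs.size() + 1, 0);
  for (std::size_t i = 0; i < inputs.size(); ++i) {  // pass 1: record sizes
    offsets[i + 1] = static_cast<int>(inputs[i].size());
  }
  std::partial_sum(offsets.begin(), offsets.end(), offsets.begin());
  chars.resize(offsets.back());
  for (std::size_t i = 0; i < inputs.size(); ++i) {  // pass 2: write chars
    std::copy(inputs[i].begin(), inputs[i].end(), chars.begin() + offsets[i]);
  }
}
```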
26 changes: 15 additions & 11 deletions cpp/src/text/stemmer.cu
@@ -1,5 +1,5 @@
/*
-* Copyright (c) 2020-2022, NVIDIA CORPORATION.
+* Copyright (c) 2020-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -184,17 +184,19 @@ struct dispatch_is_letter_fn {
struct porter_stemmer_measure_fn {
cudf::column_device_view const d_strings; // strings to measure

-__device__ int32_t operator()(cudf::size_type idx) const
+__device__ cudf::size_type operator()(cudf::size_type idx) const
{
-if (d_strings.is_null(idx)) return 0;
+if (d_strings.is_null(idx)) { return 0; }
cudf::string_view d_str = d_strings.element<cudf::string_view>(idx);
-if (d_str.empty()) return 0;
-int32_t measure = 0;
-auto itr = d_str.begin();
-bool vowel_run = !is_consonant(itr);
+if (d_str.empty()) { return 0; }
+
+cudf::size_type measure = 0;
+
+auto itr = d_str.begin();
+bool vowel_run = !is_consonant(itr);
while (itr != d_str.end()) {
if (is_consonant(itr)) {
-if (vowel_run) measure++;
+if (vowel_run) { measure++; }
vowel_run = false;
} else {
vowel_run = true;
@@ -211,11 +213,13 @@ std::unique_ptr<cudf::column> porter_stemmer_measure(cudf::strings_column_view c
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
-if (strings.is_empty()) return cudf::make_empty_column(cudf::data_type{cudf::type_id::INT32});
+if (strings.is_empty()) {
+return cudf::make_empty_column(cudf::data_type{cudf::type_to_id<cudf::size_type>()});
+}

// create empty output column
auto results =
-cudf::make_fixed_width_column(cudf::data_type{cudf::type_id::INT32},
+cudf::make_fixed_width_column(cudf::data_type{cudf::type_to_id<cudf::size_type>()},
strings.size(),
cudf::detail::copy_bitmask(strings.parent(), stream, mr),
strings.null_count(),
@@ -226,7 +230,7 @@ std::unique_ptr<cudf::column> porter_stemmer_measure(cudf::strings_column_view c
thrust::transform(rmm::exec_policy(stream),
thrust::make_counting_iterator<cudf::size_type>(0),
thrust::make_counting_iterator<cudf::size_type>(strings.size()),
-results->mutable_view().data<int32_t>(),
+results->mutable_view().data<cudf::size_type>(),
porter_stemmer_measure_fn{*strings_column});
results->set_null_count(strings.null_count());
return results;
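For readers unfamiliar with the quantity computed above: the Porter measure is the m in the word form [C](VC)^m[V], i.e. the number of vowel-run to consonant transitions. A host-side sketch with a simplified a/e/i/o/u vowel test (the device code's is_consonant also handles 'y' contextually and works over cudf::string_view iterators):

```cpp
#include <cassert>
#include <string>

// Simplified ASCII analogue of porter_stemmer_measure_fn::operator():
// increment the measure each time a run of vowels is followed by a consonant.
int porter_measure(std::string const& word)
{
  if (word.empty()) { return 0; }
  auto const is_vowel = [](char c) {
    return c == 'a' || c == 'e' || c == 'i' || c == 'o' || c == 'u';
  };
  int measure    = 0;
  bool vowel_run = is_vowel(word.front());
  for (char c : word) {
    if (!is_vowel(c)) {
      if (vowel_run) { measure++; }
      vowel_run = false;
    } else {
      vowel_run = true;
    }
  }
  return measure;
}

int main()
{
  assert(porter_measure("tree") == 0);     // tr-ee: no vowel->consonant pair
  assert(porter_measure("trouble") == 1);  // ou -> b
  assert(porter_measure("oaten") == 2);    // oa -> t, e -> n
  return 0;
}
```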
45 changes: 25 additions & 20 deletions cpp/src/text/subword/data_normalizer.cu
@@ -124,9 +124,10 @@ __device__ bool is_head_byte(unsigned char utf8_byte) { return (utf8_byte >> 6)
* @param start_byte_for_thread Which byte to start analyzing
* @return New code point value for this byte.
*/
-__device__ uint32_t extract_code_points_from_utf8(unsigned char const* strings,
-size_t const total_bytes,
-uint32_t const start_byte_for_thread)
+__device__ uint32_t
+extract_code_points_from_utf8(unsigned char const* strings,
+size_t const total_bytes,
+cudf::thread_index_type const start_byte_for_thread)
{
constexpr uint8_t max_utf8_blocks_for_char = 4;
uint8_t utf8_blocks[max_utf8_blocks_for_char] = {0};
@@ -214,8 +215,9 @@ __global__ void kernel_data_normalizer(unsigned char const* strings,
constexpr uint32_t init_val = (1 << FILTER_BIT);
uint32_t replacement_code_points[MAX_NEW_CHARS] = {init_val, init_val, init_val};

-uint32_t const char_for_thread = blockDim.x * blockIdx.x + threadIdx.x;
-uint32_t num_new_chars = 0;
+cudf::thread_index_type const char_for_thread =
+threadIdx.x + cudf::thread_index_type(blockIdx.x) * cudf::thread_index_type(blockDim.x);
+uint32_t num_new_chars = 0;

if (char_for_thread < total_bytes) {
auto const code_point = extract_code_points_from_utf8(strings, total_bytes, char_for_thread);
@@ -273,31 +275,34 @@ data_normalizer::data_normalizer(codepoint_metadata_type const* cp_metadata,
}

uvector_pair data_normalizer::normalize(char const* d_strings,
-uint32_t const* d_offsets,
-uint32_t num_strings,
+cudf::size_type const* d_offsets,
+cudf::size_type num_strings,
rmm::cuda_stream_view stream) const
{
-if (num_strings == 0)
-return std::pair(std::make_unique<rmm::device_uvector<uint32_t>>(0, stream),
-std::make_unique<rmm::device_uvector<uint32_t>>(0, stream));
+if (num_strings == 0) {
+return uvector_pair{std::make_unique<rmm::device_uvector<uint32_t>>(0, stream),
+std::make_unique<rmm::device_uvector<cudf::size_type>>(0, stream)};
+}

// copy offsets to working memory
-size_t const num_offsets = num_strings + 1;
-auto d_strings_offsets = std::make_unique<rmm::device_uvector<uint32_t>>(num_offsets, stream);
+auto const num_offsets = num_strings + 1;
+auto d_strings_offsets =
+std::make_unique<rmm::device_uvector<cudf::size_type>>(num_offsets, stream);
thrust::transform(rmm::exec_policy(stream),
-thrust::make_counting_iterator<uint32_t>(0),
-thrust::make_counting_iterator<uint32_t>(num_offsets),
+thrust::counting_iterator<cudf::size_type>(0),
+thrust::counting_iterator<cudf::size_type>(num_offsets),
d_strings_offsets->begin(),
[d_offsets] __device__(auto idx) {
auto const offset = d_offsets[0]; // adjust for any offset to the offsets
return d_offsets[idx] - offset;
});
-uint32_t const bytes_count = d_strings_offsets->element(num_strings, stream);
-if (bytes_count == 0) // if no bytes, nothing to do
-return std::pair(std::make_unique<rmm::device_uvector<uint32_t>>(0, stream),
-std::make_unique<rmm::device_uvector<uint32_t>>(0, stream));
+auto const bytes_count = d_strings_offsets->element(num_strings, stream);
+if (bytes_count == 0) { // if no bytes, nothing to do
+return uvector_pair{std::make_unique<rmm::device_uvector<uint32_t>>(0, stream),
+std::make_unique<rmm::device_uvector<cudf::size_type>>(0, stream)};
+}

-cudf::detail::grid_1d const grid{static_cast<cudf::size_type>(bytes_count), THREADS_PER_BLOCK, 1};
+cudf::detail::grid_1d const grid{bytes_count, THREADS_PER_BLOCK, 1};
size_t const threads_on_device = grid.num_threads_per_block * grid.num_blocks;
size_t const max_new_char_total = MAX_NEW_CHARS * threads_on_device;

Expand Down Expand Up @@ -333,7 +338,7 @@ uvector_pair data_normalizer::normalize(char const* d_strings,
num_strings,
update_strings_lengths_fn{d_chars_per_thread.data(), d_strings_offsets->data()});

-uint32_t const num_chars = d_strings_offsets->element(num_strings, stream);
+auto const num_chars = d_strings_offsets->element(num_strings, stream);
d_code_points->resize(num_chars, stream); // should be smaller than original allocated size

// return the normalized code points and the new offsets
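The motivation for cudf::thread_index_type in the kernel above: in 32-bit arithmetic, blockDim.x * blockIdx.x wraps once a grid addresses more than 2^32 elements, so the index must be widened before the multiply. A minimal sketch of the pattern, assuming thread_index_type is a 64-bit signed integer as in libcudf:

```cuda
#include <cstdint>

using thread_index_type = int64_t;  // stand-in for cudf::thread_index_type

__global__ void touch_bytes(unsigned char const* bytes, int64_t total_bytes)
{
  // Widen before multiplying so blockIdx.x * blockDim.x cannot wrap at 2^32.
  thread_index_type const idx =
    threadIdx.x + static_cast<thread_index_type>(blockIdx.x) *
                    static_cast<thread_index_type>(blockDim.x);
  if (idx >= total_bytes) { return; }
  // ... process bytes[idx] ...
}
```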
10 changes: 6 additions & 4 deletions cpp/src/text/subword/detail/data_normalizer.hpp
@@ -1,5 +1,5 @@
/*
-* Copyright (c) 2020-2022, NVIDIA CORPORATION.
+* Copyright (c) 2020-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -18,11 +18,13 @@

#include <text/subword/detail/cp_data.h>

+#include <cudf/types.hpp>
+
#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>

using uvector_pair = std::pair<std::unique_ptr<rmm::device_uvector<uint32_t>>,
-std::unique_ptr<rmm::device_uvector<uint32_t>>>;
+std::unique_ptr<rmm::device_uvector<cudf::size_type>>>;

namespace nvtext {
namespace detail {
@@ -85,8 +87,8 @@ class data_normalizer {
* used to locate the code points for each string.
*/
uvector_pair normalize(char const* d_strings,
-uint32_t const* d_offsets,
-uint32_t num_strings,
+cudf::size_type const* d_offsets,
+cudf::size_type num_strings,
rmm::cuda_stream_view stream) const;

private:
9 changes: 6 additions & 3 deletions cpp/src/text/subword/detail/tokenizer_utils.cuh
@@ -1,5 +1,5 @@
/*
-* Copyright (c) 2020-2022, NVIDIA CORPORATION.
+* Copyright (c) 2020-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -18,6 +18,8 @@

#include <text/subword/detail/cp_data.h>

+#include <cudf/types.hpp>
+
#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>

@@ -47,8 +49,9 @@ constexpr int THREADS_PER_BLOCK = 64;
*/
struct update_strings_lengths_fn {
uint32_t const* d_chars_up_to_idx;
-uint32_t* d_offsets;
-__device__ void operator()(uint32_t idx)
+cudf::size_type* d_offsets;
+
+__device__ void operator()(cudf::size_type idx)
{
auto const offset = d_offsets[idx];
d_offsets[idx] = offset > 0 ? d_chars_up_to_idx[offset - 1] : 0;
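What update_strings_lengths_fn does, in miniature: d_chars_up_to_idx holds an inclusive scan of how many code points were emitted up to each input byte, so indexing it with a string's byte offset (minus one) rebases that offset into code-point space. A hypothetical host-side walk-through with made-up numbers:

```cpp
#include <cassert>
#include <vector>

int main()
{
  // Two strings of 3 and 2 input bytes; byte offsets {0, 3, 5}.
  std::vector<int> offsets = {0, 3, 5};
  // Inclusive scan of code points emitted per byte; byte 1 emitted nothing
  // (e.g. a filtered character), so the first string yields 2 code points.
  std::vector<int> const chars_up_to_idx = {1, 1, 2, 3, 4};

  for (auto& offset : offsets) {  // same update as the device functor
    offset = offset > 0 ? chars_up_to_idx[offset - 1] : 0;
  }
  assert((offsets == std::vector<int>{0, 2, 4}));
  return 0;
}
```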
4 changes: 2 additions & 2 deletions cpp/src/text/subword/detail/wordpiece_tokenizer.hpp
@@ -82,8 +82,8 @@ class wordpiece_tokenizer {
* @return Pointer to token-ids and token-id offsets
*/
uvector_pair tokenize(char const* d_strings,
-uint32_t const* d_offsets,
-uint32_t num_strings,
+cudf::size_type const* d_offsets,
+cudf::size_type num_strings,
rmm::cuda_stream_view stream);

private:
21 changes: 13 additions & 8 deletions cpp/src/text/subword/subword_tokenize.cu
@@ -59,7 +59,7 @@ namespace {
__global__ void kernel_compute_tensor_metadata(
// input
uint32_t const* token_ids,
-uint32_t const* offsets,
+cudf::size_type const* offsets,
uint32_t const* row2tensor,
uint32_t const* row2row_within_tensor,
uint32_t max_sequence_length,
@@ -71,8 +71,8 @@ __global__ void kernel_compute_tensor_metadata(
uint32_t* attn_mask,
uint32_t* metadata)
{
-uint32_t const output_idx = threadIdx.x + blockIdx.x * blockDim.x;
-if (output_idx >= (nrows_tensor_token_ids * max_sequence_length)) return;
+cudf::thread_index_type const output_idx =
+threadIdx.x + static_cast<cudf::thread_index_type>(blockIdx.x) *
+static_cast<cudf::thread_index_type>(blockDim.x);
+if (output_idx >= (static_cast<cudf::thread_index_type>(nrows_tensor_token_ids) *
+static_cast<cudf::thread_index_type>(max_sequence_length))) {
+return;
+}

uint32_t const absolute_row_id = output_idx / max_sequence_length;
uint32_t const tensor_id = row2tensor[absolute_row_id];
@@ -179,18 +184,18 @@ tokenizer_result subword_tokenize(cudf::strings_column_view const& strings,
std::overflow_error);

auto const offsets = strings.offsets();
-auto const d_offsets = offsets.data<uint32_t>() + strings.offset();
-auto const offset = cudf::detail::get_value<int32_t>(offsets, strings.offset(), stream);
-auto const d_chars = strings.chars().data<char>() + offset;
+auto const d_offsets = offsets.data<cudf::size_type>() + strings.offset();
+auto const offset = cudf::detail::get_value<cudf::size_type>(offsets, strings.offset(), stream);
+auto const d_chars = strings.chars().data<char>() + offset;

// Create tokenizer
wordpiece_tokenizer tokenizer(
vocab_table, max_sequence_length, stride, do_truncate, do_lower_case);
// Run tokenizer
auto const tokens = tokenizer.tokenize(d_chars, d_offsets, strings_count, stream);
// assign output components
-uint32_t const* device_token_ids = tokens.first->data();
-uint32_t const* device_offsets = tokens.second->data();
+auto device_token_ids = tokens.first->data();
+auto device_offsets = tokens.second->data();

// Format output from tokenizer
// Each string can create 1 or more tensor entries.
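The reworked bounds check in kernel_compute_tensor_metadata exists because nrows_tensor_token_ids * max_sequence_length is a product of two 32-bit values that can exceed 2^32. The hazard is easy to reproduce on the host:

```cpp
#include <cstdint>
#include <iostream>

int main()
{
  uint32_t const nrows   = 9'000'000;  // hypothetical row count
  uint32_t const max_len = 512;        // hypothetical max_sequence_length

  uint32_t const wrapped = nrows * max_len;  // 32-bit product wraps mod 2^32
  int64_t const exact    = static_cast<int64_t>(nrows) * max_len;

  std::cout << wrapped << " vs " << exact << '\n';  // 313032704 vs 4608000000
  return 0;
}
```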
26 changes: 16 additions & 10 deletions cpp/src/text/subword/wordpiece_tokenizer.cu
@@ -82,7 +82,9 @@ __global__ void init_data_and_mark_word_start_and_ends(uint32_t const* code_poin
uint32_t* token_ids,
uint8_t* tokens_per_word)
{
-uint32_t char_for_thread = blockDim.x * blockIdx.x + threadIdx.x;
+cudf::thread_index_type char_for_thread = static_cast<cudf::thread_index_type>(blockDim.x) *
+static_cast<cudf::thread_index_type>(blockIdx.x) +
+threadIdx.x;

// Deal with the start_word_indices array
if (char_for_thread < num_code_points) {
@@ -130,12 +132,14 @@ * @param num_strings The total number of strings to be processed.
* @param num_strings The total number of strings to be processed.
*/
__global__ void mark_string_start_and_ends(uint32_t const* code_points,
-uint32_t const* strings_offsets,
+cudf::size_type const* strings_offsets,
uint32_t* start_word_indices,
uint32_t* end_word_indices,
uint32_t num_strings)
{
-uint32_t idx = blockDim.x * blockIdx.x + threadIdx.x;
+cudf::thread_index_type idx = static_cast<cudf::thread_index_type>(blockDim.x) *
+static_cast<cudf::thread_index_type>(blockIdx.x) +
+threadIdx.x;
// Ensure the starting character of each string is written to the word start array.
if (idx <= num_strings) {
auto const offset = strings_offsets[idx];
@@ -330,7 +334,9 @@ __global__ void kernel_wordpiece_tokenizer(uint32_t const* code_points,
uint32_t* token_ids,
uint8_t* tokens_per_word)
{
-uint32_t const word_to_tokenize = blockDim.x * blockIdx.x + threadIdx.x;
+cudf::thread_index_type word_to_tokenize = static_cast<cudf::thread_index_type>(blockDim.x) *
+static_cast<cudf::thread_index_type>(blockIdx.x) +
+threadIdx.x;

if (word_to_tokenize >= total_words) return;
// Each thread gets the start code_point offset for each word and resets the token_id memory to
@@ -414,8 +420,8 @@ wordpiece_tokenizer::wordpiece_tokenizer(hashed_vocabulary const& vocab_table,
}

uvector_pair wordpiece_tokenizer::tokenize(char const* d_strings,
-uint32_t const* d_offsets,
-uint32_t num_strings,
+cudf::size_type const* d_offsets,
+cudf::size_type num_strings,
rmm::cuda_stream_view stream)
{
auto cps_and_offsets = normalizer.normalize(d_strings, d_offsets, num_strings, stream);
@@ -433,10 +439,10 @@ struct tranform_fn { // just converting uint8 value to uint32

void wordpiece_tokenizer::tokenize(uvector_pair& cps_and_offsets, rmm::cuda_stream_view stream)
{
-uint32_t* device_code_points = cps_and_offsets.first->data();
-size_t const num_code_points = cps_and_offsets.first->size();
-uint32_t* device_strings_offsets = cps_and_offsets.second->data();
-uint32_t const num_strings = cps_and_offsets.second->size() - 1;
+auto device_code_points = cps_and_offsets.first->data();
+auto const num_code_points = cps_and_offsets.first->size();
+auto device_strings_offsets = cps_and_offsets.second->data();
+auto const num_strings = cps_and_offsets.second->size() - 1;

size_t const four_byte_cp_chunks = 1 + (num_code_points - 1) / sizeof(uint32_t);
size_t const rounded_num_cps = sizeof(uint32_t) * four_byte_cp_chunks;
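One last note on the chunk arithmetic at the end of this hunk: 1 + (n - 1) / k is the usual integer ceiling division, used here to round the code-point count up to whole uint32_t-sized chunks. It assumes n >= 1; with n == 0 the unsigned subtraction would wrap:

```cpp
#include <cstddef>

// Integer ceil(n / k) without floating point; requires n >= 1.
constexpr std::size_t ceil_div(std::size_t n, std::size_t k)
{
  return 1 + (n - 1) / k;
}

static_assert(ceil_div(1, 4) == 1, "1 byte still occupies one 4-byte chunk");
static_assert(ceil_div(4, 4) == 1, "exact multiple needs no extra chunk");
static_assert(ceil_div(5, 4) == 2, "one byte over rounds up");

int main() { return 0; }
```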