From 5d5f4f5ff8bfb3b9989de8d25a7edfccfd8a4ba8 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Fri, 25 Aug 2023 12:34:16 -0500 Subject: [PATCH 1/7] Fix type metadata issue preservation with `Column.unique` (#13957) closes #13953 This PR fixes an issue with `Column.unique` where the type-metadata wasn't being preserved in the end before returning the unique values. This led to `IntervalColumn` being returned as a `StructColumn`. Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - https://github.com/brandon-b-miller URL: https://github.com/rapidsai/cudf/pull/13957 --- python/cudf/cudf/core/column/column.py | 4 +++- python/cudf/cudf/tests/test_interval.py | 22 +++++++++++++++++++++- 2 files changed, 24 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index c1ad5de1181..446f01ef419 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -1162,7 +1162,9 @@ def unique(self) -> ColumnBase: """ Get unique values in the data """ - return drop_duplicates([self], keep="first")[0] + return drop_duplicates([self], keep="first")[0]._with_type_metadata( + self.dtype + ) def serialize(self) -> Tuple[dict, list]: # data model: diff --git a/python/cudf/cudf/tests/test_interval.py b/python/cudf/cudf/tests/test_interval.py index e1104829914..18454172289 100644 --- a/python/cudf/cudf/tests/test_interval.py +++ b/python/cudf/cudf/tests/test_interval.py @@ -1,5 +1,7 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. 
+ +import numpy as np import pandas as pd import pytest @@ -132,3 +134,21 @@ def test_create_interval_df(data1, data2, data3, data4, closed): dtype="interval", ) assert_eq(expect_three, got_three) + + +def test_interval_index_unique(): + interval_list = [ + np.nan, + pd.Interval(2.0, 3.0, closed="right"), + pd.Interval(3.0, 4.0, closed="right"), + np.nan, + pd.Interval(3.0, 4.0, closed="right"), + pd.Interval(3.0, 4.0, closed="right"), + ] + pi = pd.Index(interval_list) + gi = cudf.from_pandas(pi) + + expected = pi.unique() + actual = gi.unique() + + assert_eq(expected, actual) From 80d9b1aabd8caeca1d40318dc427d898f2608e21 Mon Sep 17 00:00:00 2001 From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com> Date: Fri, 25 Aug 2023 12:52:54 -0500 Subject: [PATCH 2/7] Handle `as_index` in `GroupBy.apply` (#13951) Closes https://github.com/rapidsai/cudf/issues/13897 Authors: - https://github.com/brandon-b-miller Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/13951 --- python/cudf/cudf/core/groupby/groupby.py | 3 +++ python/cudf/cudf/tests/test_groupby.py | 10 ++++++++++ 2 files changed, 13 insertions(+) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 63c9dd837a8..cf4c861c28f 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -1436,6 +1436,9 @@ def mult(df): if self._sort: result = result.sort_index() + if self._as_index is False: + result = result.reset_index() + result[None] = result.pop(0) return result @_cudf_nvtx_annotate diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index 784cabaa542..b48ce210104 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -180,6 +180,16 @@ def test_groupby_as_index_single_agg(pdf, gdf, as_index): assert_groupby_results_equal(pdf, gdf) 
+@pytest.mark.parametrize("engine", ["cudf", "jit"]) +@pytest.mark.parametrize("as_index", [True, False]) +def test_groupby_as_index_apply(pdf, gdf, as_index, engine): + gdf = gdf.groupby("y", as_index=as_index).apply( + lambda df: df["x"].mean(), engine=engine + ) + pdf = pdf.groupby("y", as_index=as_index).apply(lambda df: df["x"].mean()) + assert_groupby_results_equal(pdf, gdf) + + @pytest.mark.parametrize("as_index", [True, False]) def test_groupby_as_index_multiindex(pdf, gdf, as_index): pdf = pd.DataFrame( From 4591dd3f14701da061872eb868641964383fece5 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Fri, 25 Aug 2023 14:18:45 -0400 Subject: [PATCH 3/7] Use cudf::size_type instead of int32 where appropriate in nvtext functions (#13915) Updates code to use `size_type` instead of `int32_t` where appropriate (i.e. offsets). Also changes some code logic for resolving a thread-index in a custom kernel to use the `cudf::thread_index_type` to help avoid overflow of 32-bit integer types. 
Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Bradley Dice (https://github.com/bdice) - Karthikeyan (https://github.com/karthikeyann) URL: https://github.com/rapidsai/cudf/pull/13915 --- cpp/src/text/normalize.cu | 18 ++++---- cpp/src/text/replace.cu | 2 +- cpp/src/text/stemmer.cu | 26 ++++++----- cpp/src/text/subword/data_normalizer.cu | 45 ++++++++++--------- .../text/subword/detail/data_normalizer.hpp | 10 +++-- .../text/subword/detail/tokenizer_utils.cuh | 9 ++-- .../subword/detail/wordpiece_tokenizer.hpp | 4 +- cpp/src/text/subword/subword_tokenize.cu | 21 +++++---- cpp/src/text/subword/wordpiece_tokenizer.cu | 26 ++++++----- cpp/src/text/tokenize.cu | 44 +++++++++--------- cpp/src/text/utilities/tokenize_ops.cuh | 4 +- 11 files changed, 118 insertions(+), 91 deletions(-) diff --git a/cpp/src/text/normalize.cu b/cpp/src/text/normalize.cu index bc2b0607193..78dfb6bf1a6 100644 --- a/cpp/src/text/normalize.cu +++ b/cpp/src/text/normalize.cu @@ -107,8 +107,8 @@ constexpr uint32_t UTF8_3BYTE = 0x01'0000; struct codepoint_to_utf8_fn { cudf::column_device_view const d_strings; // input strings uint32_t const* cp_data; // full code-point array - int32_t const* d_cp_offsets{}; // offsets to each string's code-point array - int32_t* d_offsets{}; // offsets for the output strings + cudf::size_type const* d_cp_offsets{}; // offsets to each string's code-point array + cudf::size_type* d_offsets{}; // offsets for the output strings char* d_chars{}; // buffer for the output strings column /** @@ -118,7 +118,7 @@ struct codepoint_to_utf8_fn { * @param count number of code-points in `str_cps` * @return Number of bytes required for the output */ - __device__ int32_t compute_output_size(uint32_t const* str_cps, uint32_t count) + __device__ cudf::size_type compute_output_size(uint32_t const* str_cps, uint32_t count) { return thrust::transform_reduce( thrust::seq, @@ -126,7 +126,7 @@ struct codepoint_to_utf8_fn { str_cps + count, [](auto cp) { return 1 + (cp 
>= UTF8_1BYTE) + (cp >= UTF8_2BYTE) + (cp >= UTF8_3BYTE); }, 0, - thrust::plus()); + thrust::plus()); } __device__ void operator()(cudf::size_type idx) @@ -208,9 +208,9 @@ std::unique_ptr normalize_characters(cudf::strings_column_view con auto const aux_table = get_aux_codepoint_data(stream); auto const normalizer = data_normalizer(cp_metadata.data(), aux_table.data(), do_lower_case); auto const offsets = strings.offsets(); - auto const d_offsets = offsets.data() + strings.offset(); - auto const offset = cudf::detail::get_value(offsets, strings.offset(), stream); - auto const d_chars = strings.chars().data() + offset; + auto const d_offsets = offsets.data() + strings.offset(); + auto const offset = cudf::detail::get_value(offsets, strings.offset(), stream); + auto const d_chars = strings.chars().data() + offset; return normalizer.normalize(d_chars, d_offsets, strings.size(), stream); }(); @@ -222,8 +222,8 @@ std::unique_ptr normalize_characters(cudf::strings_column_view con // convert the result into a strings column // - the cp_chars are the new 4-byte code-point values for all the characters in the output // - the cp_offsets identify which code-points go with which strings - uint32_t const* cp_chars = result.first->data(); - int32_t const* cp_offsets = reinterpret_cast(result.second->data()); + uint32_t const* cp_chars = result.first->data(); + cudf::size_type const* cp_offsets = result.second->data(); auto d_strings = cudf::column_device_view::create(strings.parent(), stream); diff --git a/cpp/src/text/replace.cu b/cpp/src/text/replace.cu index 3cfaece64d7..d122f048a4e 100644 --- a/cpp/src/text/replace.cu +++ b/cpp/src/text/replace.cu @@ -47,7 +47,7 @@ using replace_result = thrust::pair; struct base_token_replacer_fn { cudf::column_device_view const d_strings; ///< strings to tokenize cudf::string_view const d_delimiter; ///< delimiter characters for tokenizing - int32_t* d_offsets{}; ///< for locating output string in d_chars + cudf::size_type* d_offsets{}; 
///< for locating output string in d_chars char* d_chars{}; ///< output buffer /** diff --git a/cpp/src/text/stemmer.cu b/cpp/src/text/stemmer.cu index 6aad75bef71..2b2b8429d9c 100644 --- a/cpp/src/text/stemmer.cu +++ b/cpp/src/text/stemmer.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -184,17 +184,19 @@ struct dispatch_is_letter_fn { struct porter_stemmer_measure_fn { cudf::column_device_view const d_strings; // strings to measure - __device__ int32_t operator()(cudf::size_type idx) const + __device__ cudf::size_type operator()(cudf::size_type idx) const { - if (d_strings.is_null(idx)) return 0; + if (d_strings.is_null(idx)) { return 0; } cudf::string_view d_str = d_strings.element(idx); - if (d_str.empty()) return 0; - int32_t measure = 0; - auto itr = d_str.begin(); - bool vowel_run = !is_consonant(itr); + if (d_str.empty()) { return 0; } + + cudf::size_type measure = 0; + + auto itr = d_str.begin(); + bool vowel_run = !is_consonant(itr); while (itr != d_str.end()) { if (is_consonant(itr)) { - if (vowel_run) measure++; + if (vowel_run) { measure++; } vowel_run = false; } else { vowel_run = true; @@ -211,11 +213,13 @@ std::unique_ptr porter_stemmer_measure(cudf::strings_column_view c rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - if (strings.is_empty()) return cudf::make_empty_column(cudf::data_type{cudf::type_id::INT32}); + if (strings.is_empty()) { + return cudf::make_empty_column(cudf::data_type{cudf::type_to_id()}); + } // create empty output column auto results = - cudf::make_fixed_width_column(cudf::data_type{cudf::type_id::INT32}, + cudf::make_fixed_width_column(cudf::data_type{cudf::type_to_id()}, strings.size(), cudf::detail::copy_bitmask(strings.parent(), stream, mr), strings.null_count(), @@ -226,7 +230,7 @@ 
std::unique_ptr porter_stemmer_measure(cudf::strings_column_view c thrust::transform(rmm::exec_policy(stream), thrust::make_counting_iterator(0), thrust::make_counting_iterator(strings.size()), - results->mutable_view().data(), + results->mutable_view().data(), porter_stemmer_measure_fn{*strings_column}); results->set_null_count(strings.null_count()); return results; diff --git a/cpp/src/text/subword/data_normalizer.cu b/cpp/src/text/subword/data_normalizer.cu index 596c8d26e65..34eb95bea5c 100644 --- a/cpp/src/text/subword/data_normalizer.cu +++ b/cpp/src/text/subword/data_normalizer.cu @@ -124,9 +124,10 @@ __device__ bool is_head_byte(unsigned char utf8_byte) { return (utf8_byte >> 6) * @param start_byte_for_thread Which byte to start analyzing * @return New code point value for this byte. */ -__device__ uint32_t extract_code_points_from_utf8(unsigned char const* strings, - size_t const total_bytes, - uint32_t const start_byte_for_thread) +__device__ uint32_t +extract_code_points_from_utf8(unsigned char const* strings, + size_t const total_bytes, + cudf::thread_index_type const start_byte_for_thread) { constexpr uint8_t max_utf8_blocks_for_char = 4; uint8_t utf8_blocks[max_utf8_blocks_for_char] = {0}; @@ -214,8 +215,9 @@ __global__ void kernel_data_normalizer(unsigned char const* strings, constexpr uint32_t init_val = (1 << FILTER_BIT); uint32_t replacement_code_points[MAX_NEW_CHARS] = {init_val, init_val, init_val}; - uint32_t const char_for_thread = blockDim.x * blockIdx.x + threadIdx.x; - uint32_t num_new_chars = 0; + cudf::thread_index_type const char_for_thread = + threadIdx.x + cudf::thread_index_type(blockIdx.x) * cudf::thread_index_type(blockDim.x); + uint32_t num_new_chars = 0; if (char_for_thread < total_bytes) { auto const code_point = extract_code_points_from_utf8(strings, total_bytes, char_for_thread); @@ -273,31 +275,34 @@ data_normalizer::data_normalizer(codepoint_metadata_type const* cp_metadata, } uvector_pair data_normalizer::normalize(char 
const* d_strings, - uint32_t const* d_offsets, - uint32_t num_strings, + cudf::size_type const* d_offsets, + cudf::size_type num_strings, rmm::cuda_stream_view stream) const { - if (num_strings == 0) - return std::pair(std::make_unique>(0, stream), - std::make_unique>(0, stream)); + if (num_strings == 0) { + return uvector_pair{std::make_unique>(0, stream), + std::make_unique>(0, stream)}; + } // copy offsets to working memory - size_t const num_offsets = num_strings + 1; - auto d_strings_offsets = std::make_unique>(num_offsets, stream); + auto const num_offsets = num_strings + 1; + auto d_strings_offsets = + std::make_unique>(num_offsets, stream); thrust::transform(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(num_offsets), + thrust::counting_iterator(0), + thrust::counting_iterator(num_offsets), d_strings_offsets->begin(), [d_offsets] __device__(auto idx) { auto const offset = d_offsets[0]; // adjust for any offset to the offsets return d_offsets[idx] - offset; }); - uint32_t const bytes_count = d_strings_offsets->element(num_strings, stream); - if (bytes_count == 0) // if no bytes, nothing to do - return std::pair(std::make_unique>(0, stream), - std::make_unique>(0, stream)); + auto const bytes_count = d_strings_offsets->element(num_strings, stream); + if (bytes_count == 0) { // if no bytes, nothing to do + return uvector_pair{std::make_unique>(0, stream), + std::make_unique>(0, stream)}; + } - cudf::detail::grid_1d const grid{static_cast(bytes_count), THREADS_PER_BLOCK, 1}; + cudf::detail::grid_1d const grid{bytes_count, THREADS_PER_BLOCK, 1}; size_t const threads_on_device = grid.num_threads_per_block * grid.num_blocks; size_t const max_new_char_total = MAX_NEW_CHARS * threads_on_device; @@ -333,7 +338,7 @@ uvector_pair data_normalizer::normalize(char const* d_strings, num_strings, update_strings_lengths_fn{d_chars_per_thread.data(), d_strings_offsets->data()}); - uint32_t const num_chars = 
d_strings_offsets->element(num_strings, stream); + auto const num_chars = d_strings_offsets->element(num_strings, stream); d_code_points->resize(num_chars, stream); // should be smaller than original allocated size // return the normalized code points and the new offsets diff --git a/cpp/src/text/subword/detail/data_normalizer.hpp b/cpp/src/text/subword/detail/data_normalizer.hpp index 927de5a74f9..fb507b88e7e 100644 --- a/cpp/src/text/subword/detail/data_normalizer.hpp +++ b/cpp/src/text/subword/detail/data_normalizer.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -18,11 +18,13 @@ #include +#include + #include #include using uvector_pair = std::pair>, - std::unique_ptr>>; + std::unique_ptr>>; namespace nvtext { namespace detail { @@ -85,8 +87,8 @@ class data_normalizer { * used to locate the code points for each string. */ uvector_pair normalize(char const* d_strings, - uint32_t const* d_offsets, - uint32_t num_strings, + cudf::size_type const* d_offsets, + cudf::size_type num_strings, rmm::cuda_stream_view stream) const; private: diff --git a/cpp/src/text/subword/detail/tokenizer_utils.cuh b/cpp/src/text/subword/detail/tokenizer_utils.cuh index 5e8de1ba244..7cc0e7c0e24 100644 --- a/cpp/src/text/subword/detail/tokenizer_utils.cuh +++ b/cpp/src/text/subword/detail/tokenizer_utils.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -18,6 +18,8 @@ #include +#include + #include #include @@ -47,8 +49,9 @@ constexpr int THREADS_PER_BLOCK = 64; */ struct update_strings_lengths_fn { uint32_t const* d_chars_up_to_idx; - uint32_t* d_offsets; - __device__ void operator()(uint32_t idx) + cudf::size_type* d_offsets; + + __device__ void operator()(cudf::size_type idx) { auto const offset = d_offsets[idx]; d_offsets[idx] = offset > 0 ? d_chars_up_to_idx[offset - 1] : 0; diff --git a/cpp/src/text/subword/detail/wordpiece_tokenizer.hpp b/cpp/src/text/subword/detail/wordpiece_tokenizer.hpp index 2f528dce897..e191890eeca 100644 --- a/cpp/src/text/subword/detail/wordpiece_tokenizer.hpp +++ b/cpp/src/text/subword/detail/wordpiece_tokenizer.hpp @@ -82,8 +82,8 @@ class wordpiece_tokenizer { * @return Pointer to token-ids and token-id offsets */ uvector_pair tokenize(char const* d_strings, - uint32_t const* d_offsets, - uint32_t num_strings, + cudf::size_type const* d_offsets, + cudf::size_type num_strings, rmm::cuda_stream_view stream); private: diff --git a/cpp/src/text/subword/subword_tokenize.cu b/cpp/src/text/subword/subword_tokenize.cu index 47f602362f2..1a3084a257f 100644 --- a/cpp/src/text/subword/subword_tokenize.cu +++ b/cpp/src/text/subword/subword_tokenize.cu @@ -59,7 +59,7 @@ namespace { __global__ void kernel_compute_tensor_metadata( // input uint32_t const* token_ids, - uint32_t const* offsets, + cudf::size_type const* offsets, uint32_t const* row2tensor, uint32_t const* row2row_within_tensor, uint32_t max_sequence_length, @@ -71,8 +71,13 @@ __global__ void kernel_compute_tensor_metadata( uint32_t* attn_mask, uint32_t* metadata) { - uint32_t const output_idx = threadIdx.x + blockIdx.x * blockDim.x; - if (output_idx >= (nrows_tensor_token_ids * max_sequence_length)) return; + cudf::thread_index_type const output_idx = + threadIdx.x + static_cast(blockIdx.x) * + static_cast(blockDim.x); + if (output_idx >= (static_cast(nrows_tensor_token_ids) * + static_cast(max_sequence_length))) { + return; + } 
uint32_t const absolute_row_id = output_idx / max_sequence_length; uint32_t const tensor_id = row2tensor[absolute_row_id]; @@ -179,9 +184,9 @@ tokenizer_result subword_tokenize(cudf::strings_column_view const& strings, std::overflow_error); auto const offsets = strings.offsets(); - auto const d_offsets = offsets.data() + strings.offset(); - auto const offset = cudf::detail::get_value(offsets, strings.offset(), stream); - auto const d_chars = strings.chars().data() + offset; + auto const d_offsets = offsets.data() + strings.offset(); + auto const offset = cudf::detail::get_value(offsets, strings.offset(), stream); + auto const d_chars = strings.chars().data() + offset; // Create tokenizer wordpiece_tokenizer tokenizer( @@ -189,8 +194,8 @@ tokenizer_result subword_tokenize(cudf::strings_column_view const& strings, // Run tokenizer auto const tokens = tokenizer.tokenize(d_chars, d_offsets, strings_count, stream); // assign output components - uint32_t const* device_token_ids = tokens.first->data(); - uint32_t const* device_offsets = tokens.second->data(); + auto device_token_ids = tokens.first->data(); + auto device_offsets = tokens.second->data(); // Format output from tokenizer // Each string can create 1 or more tensor entries. 
diff --git a/cpp/src/text/subword/wordpiece_tokenizer.cu b/cpp/src/text/subword/wordpiece_tokenizer.cu index b6f6b7eda25..3b912017320 100644 --- a/cpp/src/text/subword/wordpiece_tokenizer.cu +++ b/cpp/src/text/subword/wordpiece_tokenizer.cu @@ -82,7 +82,9 @@ __global__ void init_data_and_mark_word_start_and_ends(uint32_t const* code_poin uint32_t* token_ids, uint8_t* tokens_per_word) { - uint32_t char_for_thread = blockDim.x * blockIdx.x + threadIdx.x; + cudf::thread_index_type char_for_thread = static_cast(blockDim.x) * + static_cast(blockIdx.x) + + threadIdx.x; // Deal with the start_word_indices array if (char_for_thread < num_code_points) { @@ -130,12 +132,14 @@ __global__ void init_data_and_mark_word_start_and_ends(uint32_t const* code_poin * @param num_strings The total number of strings to be processed. */ __global__ void mark_string_start_and_ends(uint32_t const* code_points, - uint32_t const* strings_offsets, + cudf::size_type const* strings_offsets, uint32_t* start_word_indices, uint32_t* end_word_indices, uint32_t num_strings) { - uint32_t idx = blockDim.x * blockIdx.x + threadIdx.x; + cudf::thread_index_type idx = static_cast(blockDim.x) * + static_cast(blockIdx.x) + + threadIdx.x; // Ensure the starting character of each strings is written to the word start array. 
if (idx <= num_strings) { auto const offset = strings_offsets[idx]; @@ -330,7 +334,9 @@ __global__ void kernel_wordpiece_tokenizer(uint32_t const* code_points, uint32_t* token_ids, uint8_t* tokens_per_word) { - uint32_t const word_to_tokenize = blockDim.x * blockIdx.x + threadIdx.x; + cudf::thread_index_type word_to_tokenize = static_cast(blockDim.x) * + static_cast(blockIdx.x) + + threadIdx.x; if (word_to_tokenize >= total_words) return; // Each thread gets the start code_point offset for each word and resets the token_id memory to @@ -414,8 +420,8 @@ wordpiece_tokenizer::wordpiece_tokenizer(hashed_vocabulary const& vocab_table, } uvector_pair wordpiece_tokenizer::tokenize(char const* d_strings, - uint32_t const* d_offsets, - uint32_t num_strings, + cudf::size_type const* d_offsets, + cudf::size_type num_strings, rmm::cuda_stream_view stream) { auto cps_and_offsets = normalizer.normalize(d_strings, d_offsets, num_strings, stream); @@ -433,10 +439,10 @@ struct tranform_fn { // just converting uint8 value to uint32 void wordpiece_tokenizer::tokenize(uvector_pair& cps_and_offsets, rmm::cuda_stream_view stream) { - uint32_t* device_code_points = cps_and_offsets.first->data(); - size_t const num_code_points = cps_and_offsets.first->size(); - uint32_t* device_strings_offsets = cps_and_offsets.second->data(); - uint32_t const num_strings = cps_and_offsets.second->size() - 1; + auto device_code_points = cps_and_offsets.first->data(); + auto const num_code_points = cps_and_offsets.first->size(); + auto device_strings_offsets = cps_and_offsets.second->data(); + auto const num_strings = cps_and_offsets.second->size() - 1; size_t const four_byte_cp_chunks = 1 + (num_code_points - 1) / sizeof(uint32_t); size_t const rounded_num_cps = sizeof(uint32_t) * four_byte_cp_chunks; diff --git a/cpp/src/text/tokenize.cu b/cpp/src/text/tokenize.cu index 8604152099c..16b9f25b802 100644 --- a/cpp/src/text/tokenize.cu +++ b/cpp/src/text/tokenize.cu @@ -52,12 +52,13 @@ std::unique_ptr 
token_count_fn(cudf::size_type strings_count, rmm::mr::device_memory_resource* mr) { // create output column - auto token_counts = cudf::make_numeric_column(cudf::data_type{cudf::type_id::INT32}, - strings_count, - cudf::mask_state::UNALLOCATED, - stream, - mr); - auto d_token_counts = token_counts->mutable_view().data(); + auto token_counts = + cudf::make_numeric_column(cudf::data_type{cudf::type_to_id()}, + strings_count, + cudf::mask_state::UNALLOCATED, + stream, + mr); + auto d_token_counts = token_counts->mutable_view().data(); // add the counts to the column thrust::transform(rmm::exec_policy(stream), thrust::make_counting_iterator(0), @@ -79,10 +80,10 @@ std::unique_ptr tokenize_fn(cudf::size_type strings_count, token_count_fn(strings_count, tokenizer, stream, rmm::mr::get_current_device_resource()); auto d_token_counts = token_counts->view(); // create token-index offsets from the counts - rmm::device_uvector token_offsets(strings_count + 1, stream); + rmm::device_uvector token_offsets(strings_count + 1, stream); thrust::inclusive_scan(rmm::exec_policy(stream), - d_token_counts.template begin(), - d_token_counts.template end(), + d_token_counts.template begin(), + d_token_counts.template end(), token_offsets.begin() + 1); token_offsets.set_element_to_zero_async(0, stream); auto const total_tokens = token_offsets.back_element(stream); @@ -177,10 +178,10 @@ std::unique_ptr character_tokenize(cudf::strings_column_view const } auto offsets = strings_column.offsets(); - auto offset = cudf::detail::get_value(offsets, strings_column.offset(), stream); - auto chars_bytes = - cudf::detail::get_value(offsets, strings_column.offset() + strings_count, stream) - - offset; + auto offset = cudf::detail::get_value(offsets, strings_column.offset(), stream); + auto chars_bytes = cudf::detail::get_value( + offsets, strings_column.offset() + strings_count, stream) - + offset; auto d_chars = strings_column.chars().data(); // unsigned is necessary for checking bits d_chars += 
offset; @@ -200,16 +201,17 @@ std::unique_ptr character_tokenize(cudf::strings_column_view const // create output offsets column // -- conditionally copy a counting iterator where // the first byte of each character is located - auto offsets_column = cudf::make_numeric_column(cudf::data_type{cudf::type_id::INT32}, - num_characters + 1, - cudf::mask_state::UNALLOCATED, - stream, - mr); - auto d_new_offsets = offsets_column->mutable_view().begin(); + auto offsets_column = + cudf::make_numeric_column(cudf::data_type{cudf::type_to_id()}, + num_characters + 1, + cudf::mask_state::UNALLOCATED, + stream, + mr); + auto d_new_offsets = offsets_column->mutable_view().begin(); thrust::copy_if( rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(chars_bytes + 1), + thrust::counting_iterator(0), + thrust::counting_iterator(chars_bytes + 1), d_new_offsets, [d_chars, chars_bytes] __device__(auto idx) { // this will also set the final value to the size chars_bytes diff --git a/cpp/src/text/utilities/tokenize_ops.cuh b/cpp/src/text/utilities/tokenize_ops.cuh index 89825e31e5c..fbd2d1efcff 100644 --- a/cpp/src/text/utilities/tokenize_ops.cuh +++ b/cpp/src/text/utilities/tokenize_ops.cuh @@ -149,7 +149,7 @@ struct characters_tokenizer { struct strings_tokenizer { cudf::column_device_view const d_strings; ///< strings to tokenize cudf::string_view const d_delimiter; ///< delimiter characters to tokenize around - int32_t* d_offsets{}; ///< offsets into the d_tokens vector for each string + cudf::size_type* d_offsets{}; ///< offsets into the d_tokens vector for each string string_index_pair* d_tokens{}; ///< token positions in device memory /** @@ -194,7 +194,7 @@ struct multi_delimiter_strings_tokenizer { cudf::column_device_view const d_strings; ///< strings column to tokenize delimiterator delimiters_begin; ///< first delimiter delimiterator delimiters_end; ///< last delimiter - int32_t* d_offsets{}; ///< offsets into the d_tokens output 
vector + cudf::size_type* d_offsets{}; ///< offsets into the d_tokens output vector string_index_pair* d_tokens{}; ///< token positions found for each string /** From ec1e73f8d04563c95fb5e0eb775c2e8c65ee0d64 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Fri, 25 Aug 2023 13:49:28 -0500 Subject: [PATCH 4/7] Fix an issue with `IntervalIndex.repr` when null values are present (#13958) closes #13954 This PR fixes an issue with `IntervalIndex.repr`, where there was a silent failure because of no dedicated `_clean_nulls_from_index` and the `GenericIndex._clean_nulls_from_index` wouldn't work because a type-cast to `str` isn't implemented. Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Matthew Roeschke (https://github.com/mroeschke) URL: https://github.com/rapidsai/cudf/pull/13958 --- python/cudf/cudf/core/dtypes.py | 5 ++++- python/cudf/cudf/core/index.py | 5 ++++- python/cudf/cudf/tests/test_repr.py | 13 +++++++++++++ 3 files changed, 21 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index a83c1f7b3c9..5fb092c7cbc 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -905,9 +905,12 @@ def __init__(self, subtype, closed="right"): def subtype(self): return self.fields["left"] - def __repr__(self): + def __repr__(self) -> str: return f"interval[{self.subtype}, {self.closed}]" + def __str__(self) -> str: + return self.__repr__() + @classmethod def from_arrow(cls, typ): return IntervalDtype(typ.subtype.to_pandas_dtype(), typ.closed) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 458ef2df02d..c7e25cdc430 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -3187,7 +3187,7 @@ def from_breaks(breaks, closed="right", name=None, copy=False, dtype=None): >>> import cudf >>> import pandas as pd >>> cudf.IntervalIndex.from_breaks([0, 1, 2, 3]) - IntervalIndex([(0, 1], (1, 2], (2, 3]], 
dtype='interval') + IntervalIndex([(0, 1], (1, 2], (2, 3]], dtype='interval[int64, right]') """ if copy: breaks = column.as_column(breaks, dtype=dtype).copy() @@ -3211,6 +3211,9 @@ def _is_interval(self): def _is_boolean(self): return False + def _clean_nulls_from_index(self): + return self + class StringIndex(GenericIndex): """String defined indices into another Column diff --git a/python/cudf/cudf/tests/test_repr.py b/python/cudf/cudf/tests/test_repr.py index b944e0483d0..a36cc1b3819 100644 --- a/python/cudf/cudf/tests/test_repr.py +++ b/python/cudf/cudf/tests/test_repr.py @@ -1469,3 +1469,16 @@ def test_repr_struct_after_concat(): pdf = df.to_pandas() assert repr(df) == repr(pdf) + + +def test_interval_index_repr(): + pi = pd.Index( + [ + np.nan, + pd.Interval(2.0, 3.0, closed="right"), + pd.Interval(3.0, 4.0, closed="right"), + ] + ) + gi = cudf.from_pandas(pi) + + assert repr(pi) == repr(gi) From 6d10a82076ffbd5530d0d0b5f4c6277d2a4f9d7a Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Fri, 25 Aug 2023 14:35:48 -0500 Subject: [PATCH 5/7] Fix `MultiIndex.to_numpy` to return numpy array with tuples (#13966) closes #13961 The absence of `MultiIndex.to_numpy` resulted in calling of `Frame.to_numpy` that returns a numpy array without any tuples, this PR adds `MultiIndex.to_numpy` so that a numpy array of tuples is returned. 
Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Matthew Roeschke (https://github.com/mroeschke) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/13966 --- python/cudf/cudf/core/multiindex.py | 4 ++++ python/cudf/cudf/tests/test_multiindex.py | 11 +++++++++++ 2 files changed, 15 insertions(+) diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 5ab9af36175..eb953a54f6b 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -1151,6 +1151,10 @@ def from_tuples(cls, tuples, names=None): pdi = pd.MultiIndex.from_tuples(tuples, names=names) return cls.from_pandas(pdi) + @_cudf_nvtx_annotate + def to_numpy(self): + return self.values_host + @property # type: ignore @_cudf_nvtx_annotate def values_host(self): diff --git a/python/cudf/cudf/tests/test_multiindex.py b/python/cudf/cudf/tests/test_multiindex.py index a4099bb7f88..464b9623bad 100644 --- a/python/cudf/cudf/tests/test_multiindex.py +++ b/python/cudf/cudf/tests/test_multiindex.py @@ -1182,6 +1182,17 @@ def test_multiindex_values_host(): assert_eq(midx.values_host, pmidx.values) +def test_multiindex_to_numpy(): + midx = cudf.MultiIndex( + levels=[[1, 3, 4, 5], [1, 2, 5]], + codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]], + names=["x", "y"], + ) + pmidx = midx.to_pandas() + + assert_eq(midx.to_numpy(), pmidx.to_numpy()) + + @pytest.mark.parametrize( "gdi, fill_value, expected", [ From 89787f24b957408d051791ebe725d5eee30c4814 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Fri, 25 Aug 2023 14:44:12 -0500 Subject: [PATCH 6/7] Handle `Interval` scalars when passed in list-like inputs to `cudf.Index` (#13956) closes #13952 This PR fixes an issue with `IntervalColumn` construction where we can utilize the existing type inference to create a pandas Series and then construct an `IntervalColumn` out of it since pyarrow is unable to read this kind of input correctly. 
Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Matthew Roeschke (https://github.com/mroeschke) URL: https://github.com/rapidsai/cudf/pull/13956 --- python/cudf/cudf/core/column/column.py | 10 +++++++--- python/cudf/cudf/tests/test_interval.py | 13 +++++++++++++ 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 446f01ef419..eafcc18450d 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -2454,25 +2454,29 @@ def as_column( def _construct_array( arbitrary: Any, dtype: Optional[Dtype] -) -> Union[np.ndarray, cupy.ndarray]: +) -> Union[np.ndarray, cupy.ndarray, pd.api.extensions.ExtensionArray]: """ - Construct a CuPy or NumPy array from `arbitrary` + Construct a CuPy/NumPy/Pandas array from `arbitrary` """ try: dtype = dtype if dtype is None else cudf.dtype(dtype) arbitrary = cupy.asarray(arbitrary, dtype=dtype) except (TypeError, ValueError): native_dtype = dtype + inferred_dtype = None if ( dtype is None and not cudf._lib.scalar._is_null_host_scalar(arbitrary) - and infer_dtype(arbitrary, skipna=False) + and (inferred_dtype := infer_dtype(arbitrary, skipna=False)) in ( "mixed", "mixed-integer", ) ): native_dtype = "object" + if inferred_dtype == "interval": + # Only way to construct an Interval column. 
+ return pd.array(arbitrary) arbitrary = np.asarray( arbitrary, dtype=native_dtype diff --git a/python/cudf/cudf/tests/test_interval.py b/python/cudf/cudf/tests/test_interval.py index 18454172289..f2e8f585a69 100644 --- a/python/cudf/cudf/tests/test_interval.py +++ b/python/cudf/cudf/tests/test_interval.py @@ -136,6 +136,19 @@ def test_create_interval_df(data1, data2, data3, data4, closed): assert_eq(expect_three, got_three) +def test_create_interval_index_from_list(): + interval_list = [ + np.nan, + pd.Interval(2.0, 3.0, closed="right"), + pd.Interval(3.0, 4.0, closed="right"), + ] + + expected = pd.Index(interval_list) + actual = cudf.Index(interval_list) + + assert_eq(expected, actual) + + def test_interval_index_unique(): interval_list = [ np.nan, From b6d08cae87aa489706a1fc1eefde5c1efe3f3ebf Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 25 Aug 2023 15:31:41 -0500 Subject: [PATCH 7/7] Switch pylibcudf-enabled types to use enum class in Cython (#13931) This PR leverages Cython 3's support for C++'s scoped enumerations to more faithfully translate libcudf types into cuDF Cython and pylibcudf. Due to some Cython 3 limitations, there are a few workarounds in this PR. However, they are relatively minor and can be adjusted later as Cython improves. In the meantime, the change here is an improvement still worth merging, especially since it sets a template for how pylibcudf code should look as more of it emerges. Existing cuDF Cython is only updated to the minimum extent required for it to compile. Fully switching the old code to use enum class-style syntax isn't worthwhile since those internals should eventually be migrated to use pylibcudf in pure Python mode anyway. 
Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Ashwin Srinath (https://github.com/shwina) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/13931 --- docs/cudf/source/developer_guide/pylibcudf.md | 66 +++++++++++++++---- python/cudf/cudf/_lib/CMakeLists.txt | 1 + python/cudf/cudf/_lib/cpp/CMakeLists.txt | 23 +++++++ python/cudf/cudf/_lib/cpp/copying.pxd | 6 +- python/cudf/cudf/_lib/cpp/copying.pyx | 0 python/cudf/cudf/_lib/cpp/types.pxd | 65 +++++++++--------- python/cudf/cudf/_lib/cpp/types.pyx | 0 python/cudf/cudf/_lib/pylibcudf/__init__.pxd | 5 +- python/cudf/cudf/_lib/pylibcudf/column.pyx | 5 +- python/cudf/cudf/_lib/pylibcudf/copying.pxd | 20 +----- python/cudf/cudf/_lib/pylibcudf/copying.pyx | 20 +++--- python/cudf/cudf/_lib/pylibcudf/types.pxd | 55 +--------------- python/cudf/cudf/_lib/pylibcudf/types.pyx | 16 ++--- python/cudf/cudf/_lib/scalar.pyx | 56 ++++++++-------- .../strings/convert/convert_fixed_point.pyx | 12 ++-- python/cudf/cudf/_lib/types.pyx | 1 + 16 files changed, 177 insertions(+), 174 deletions(-) create mode 100644 python/cudf/cudf/_lib/cpp/CMakeLists.txt create mode 100644 python/cudf/cudf/_lib/cpp/copying.pyx create mode 100644 python/cudf/cudf/_lib/cpp/types.pyx diff --git a/docs/cudf/source/developer_guide/pylibcudf.md b/docs/cudf/source/developer_guide/pylibcudf.md index 4940e329653..1b321dbb1fe 100644 --- a/docs/cudf/source/developer_guide/pylibcudf.md +++ b/docs/cudf/source/developer_guide/pylibcudf.md @@ -86,7 +86,7 @@ cpdef Table gather( cpp_copying.gather( source_table.view(), gather_map.view(), - py_policy_to_c_policy(bounds_policy) + bounds_policy ) ) return Table.from_libcudf(move(c_result)) @@ -94,18 +94,62 @@ cpdef Table gather( There are a couple of notable points from the snippet above: - The object returned from libcudf is immediately converted to a pylibcudf type. 
-- `cudf::gather` accepts a `cudf::out_of_bounds_policy` enum parameter, which is mirrored by the `cdef `class OutOfBoundsPolicy` as mentioned in [the data structures example above](data-structures). +- `cudf::gather` accepts a `cudf::out_of_bounds_policy` enum parameter. `OutOfBoundsPolicy` is an alias for this type in pylibcudf that matches our Python naming conventions (CapsCase instead of snake\_case). ## Miscellaneous Notes -### Cython Scoped Enums and Casting -Cython does not support scoped enumerations. -It assumes that enums correspond to their underlying value types and will thus attempt operations that are invalid. -To fix this, many places in pylibcudf Cython code contain double casts that look like +### Cython Scoped Enums +Cython 3 introduced support for scoped enumerations. +However, this support has some bugs as well as some easy pitfalls. +Our usage of enums is intended to minimize the complexity of our code while also working around Cython's limitations. + +```{warning} +The guidance in this section may change often as Cython is updated and our understanding of best practices evolves. +``` + +- All pxd files that declare a C++ enum should use `cpdef enum class` declarations. + - Reason: This declaration makes the C++ enum available in Cython code while also transparently creating a Python enum. +- Any pxd file containing only C++ declarations must still have a corresponding pyx file if any of the declarations are scoped enums. + - Reason: The creation of the Python enum requires that Cython actually generate the necessary Python C API code, which will not happen if only a pxd file is present. +- If a C++ enum will be part of a pylibcudf module's public API, then it should be imported (not cimported) directly into the pyx file and aliased with a name that matches our Python class naming conventions (CapsCase) instead of our C++ naming convention (snake\_case). + - Reason: We want to expose the enum to both Python and Cython consumers of the module. 
As a side effect, this aliasing avoids [this Cython bug](https://github.com/cython/cython/issues/5609). + - Note: Once the above Cython bug is resolved, the enum should also be aliased into the pylibcudf pxd file when it is cimported so that Python and Cython usage will match. + +Here is an example of appropriate enum usage. + + ```cython -return ( - py_policy -) +# cpp/copying.pxd +cdef extern from "cudf/copying.hpp" namespace "cudf" nogil: + # cpdef here so that we export both a cdef enum class and a Python enum.Enum. + cpdef enum class out_of_bounds_policy(bool): + NULLIFY + DONT_CHECK + + +# cpp/copying.pyx +# This file is empty, but is required to compile the Python enum in cpp/copying.pxd + + +# pylibcudf/copying.pxd + +# cimport the enum using the exact name +# Once https://github.com/cython/cython/issues/5609 is resolved, +# this import should instead be +# from cudf._lib.cpp.copying cimport out_of_bounds_policy as OutOfBoundsPolicy +from cudf._lib.cpp.copying cimport out_of_bounds_policy + + +# pylibcudf/copying.pyx +# Access cpp.copying members that aren't part of this module's public API via +# this module alias +from cudf._lib.cpp cimport copying as cpp_copying +from cudf._lib.cpp.copying cimport out_of_bounds_policy + +# This import exposes the enum in the public API of this module. +# It requires a no-cython-lint tag because it will be unused: all typing of +# parameters etc will need to use the Cython name `out_of_bounds_policy` until +# the Cython bug is resolved. +from cudf._lib.cpp.copying import \ + out_of_bounds_policy as OutOfBoundsPolicy # no-cython-lint ``` -where `cpp_type` is some libcudf enum with a specified underlying type. -This double-cast will be removed when we migrate to Cython 3, which adds proper support for C++ scoped enumerations. 
diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt index 37544e1c7cd..06de6cc825f 100644 --- a/python/cudf/cudf/_lib/CMakeLists.txt +++ b/python/cudf/cudf/_lib/CMakeLists.txt @@ -106,6 +106,7 @@ foreach(target IN LISTS targets_using_arrow_headers) target_include_directories(${target} PRIVATE "${PYARROW_INCLUDE_DIR}") endforeach() +add_subdirectory(cpp) add_subdirectory(io) add_subdirectory(nvtext) add_subdirectory(pylibcudf) diff --git a/python/cudf/cudf/_lib/cpp/CMakeLists.txt b/python/cudf/cudf/_lib/cpp/CMakeLists.txt new file mode 100644 index 00000000000..a99aa58dfe8 --- /dev/null +++ b/python/cudf/cudf/_lib/cpp/CMakeLists.txt @@ -0,0 +1,23 @@ +# ============================================================================= +# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. 
+# ============================================================================= + +set(cython_sources copying.pyx types.pyx) + +set(linked_libraries cudf::cudf) + +rapids_cython_create_modules( + CXX + SOURCE_FILES "${cython_sources}" + LINKED_LIBRARIES "${linked_libraries}" ASSOCIATED_TARGETS cudf MODULE_PREFIX cpp +) diff --git a/python/cudf/cudf/_lib/cpp/copying.pxd b/python/cudf/cudf/_lib/cpp/copying.pxd index 8961675711f..20725c252fc 100644 --- a/python/cudf/cudf/_lib/cpp/copying.pxd +++ b/python/cudf/cudf/_lib/cpp/copying.pxd @@ -19,9 +19,9 @@ from cudf._lib.exception_handler cimport cudf_exception_handler ctypedef const scalar constscalar cdef extern from "cudf/copying.hpp" namespace "cudf" nogil: - ctypedef enum out_of_bounds_policy: - NULLIFY 'cudf::out_of_bounds_policy::NULLIFY' - DONT_CHECK 'cudf::out_of_bounds_policy::DONT_CHECK' + cpdef enum class out_of_bounds_policy(bool): + NULLIFY + DONT_CHECK cdef unique_ptr[table] gather ( const table_view& source_table, diff --git a/python/cudf/cudf/_lib/cpp/copying.pyx b/python/cudf/cudf/_lib/cpp/copying.pyx new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/cudf/cudf/_lib/cpp/types.pxd b/python/cudf/cudf/_lib/cpp/types.pxd index 11480d774ef..14bf8a83de0 100644 --- a/python/cudf/cudf/_lib/cpp/types.pxd +++ b/python/cudf/cudf/_lib/cpp/types.pxd @@ -4,6 +4,11 @@ from libc.stdint cimport int32_t, uint32_t cdef extern from "cudf/types.hpp" namespace "cudf" nogil: + # The declaration below is to work around + # https://github.com/cython/cython/issues/5637 + """ + #define __PYX_ENUM_CLASS_DECL enum + """ ctypedef int32_t size_type ctypedef uint32_t bitmask_type ctypedef uint32_t char_utf8 @@ -49,36 +54,36 @@ cdef extern from "cudf/types.hpp" namespace "cudf" nogil: ALL_EQUAL "cudf::nan_equality::ALL_EQUAL" NANS_UNEQUAL "cudf::nan_equality::UNEQUAL" - ctypedef enum type_id "cudf::type_id": - EMPTY "cudf::type_id::EMPTY" - INT8 "cudf::type_id::INT8" - INT16 "cudf::type_id::INT16" - INT32 
"cudf::type_id::INT32" - INT64 "cudf::type_id::INT64" - UINT8 "cudf::type_id::UINT8" - UINT16 "cudf::type_id::UINT16" - UINT32 "cudf::type_id::UINT32" - UINT64 "cudf::type_id::UINT64" - FLOAT32 "cudf::type_id::FLOAT32" - FLOAT64 "cudf::type_id::FLOAT64" - BOOL8 "cudf::type_id::BOOL8" - TIMESTAMP_DAYS "cudf::type_id::TIMESTAMP_DAYS" - TIMESTAMP_SECONDS "cudf::type_id::TIMESTAMP_SECONDS" - TIMESTAMP_MILLISECONDS "cudf::type_id::TIMESTAMP_MILLISECONDS" - TIMESTAMP_MICROSECONDS "cudf::type_id::TIMESTAMP_MICROSECONDS" - TIMESTAMP_NANOSECONDS "cudf::type_id::TIMESTAMP_NANOSECONDS" - DICTIONARY32 "cudf::type_id::DICTIONARY32" - STRING "cudf::type_id::STRING" - LIST "cudf::type_id::LIST" - STRUCT "cudf::type_id::STRUCT" - NUM_TYPE_IDS "cudf::type_id::NUM_TYPE_IDS" - DURATION_SECONDS "cudf::type_id::DURATION_SECONDS" - DURATION_MILLISECONDS "cudf::type_id::DURATION_MILLISECONDS" - DURATION_MICROSECONDS "cudf::type_id::DURATION_MICROSECONDS" - DURATION_NANOSECONDS "cudf::type_id::DURATION_NANOSECONDS" - DECIMAL32 "cudf::type_id::DECIMAL32" - DECIMAL64 "cudf::type_id::DECIMAL64" - DECIMAL128 "cudf::type_id::DECIMAL128" + cpdef enum class type_id(int32_t): + EMPTY + INT8 + INT16 + INT32 + INT64 + UINT8 + UINT16 + UINT32 + UINT64 + FLOAT32 + FLOAT64 + BOOL8 + TIMESTAMP_DAYS + TIMESTAMP_SECONDS + TIMESTAMP_MILLISECONDS + TIMESTAMP_MICROSECONDS + TIMESTAMP_NANOSECONDS + DICTIONARY32 + STRING + LIST + STRUCT + NUM_TYPE_IDS + DURATION_SECONDS + DURATION_MILLISECONDS + DURATION_MICROSECONDS + DURATION_NANOSECONDS + DECIMAL32 + DECIMAL64 + DECIMAL128 cdef cppclass data_type: data_type() except + diff --git a/python/cudf/cudf/_lib/cpp/types.pyx b/python/cudf/cudf/_lib/cpp/types.pyx new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd index b4f8bfad4fb..ba7822b0a54 100644 --- a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd @@ -5,13 
+5,14 @@ from . cimport copying from .column cimport Column from .gpumemoryview cimport gpumemoryview from .table cimport Table -from .types cimport DataType, TypeId +# TODO: cimport type_id once +# https://github.com/cython/cython/issues/5609 is resolved +from .types cimport DataType __all__ = [ "Column", "DataType", "Table", - "TypeId", "copying", "gpumemoryview", ] diff --git a/python/cudf/cudf/_lib/pylibcudf/column.pyx b/python/cudf/cudf/_lib/pylibcudf/column.pyx index d9b2ca98ead..40afc8aaa8a 100644 --- a/python/cudf/cudf/_lib/pylibcudf/column.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/column.pyx @@ -9,7 +9,7 @@ from cudf._lib.cpp.column.column cimport column, column_contents from cudf._lib.cpp.types cimport size_type from .gpumemoryview cimport gpumemoryview -from .types cimport DataType, TypeId +from .types cimport DataType, type_id from .utils cimport int_to_bitmask_ptr, int_to_void_ptr @@ -179,10 +179,11 @@ cdef class Column: cpdef list children(self): return self._children + cdef class ListColumnView: """Accessor for methods of a Column that are specific to lists.""" def __init__(self, Column col): - if col.type().id() != TypeId.LIST: + if col.type().id() != type_id.LIST: raise TypeError("Column is not a list type") self._column = col diff --git a/python/cudf/cudf/_lib/pylibcudf/copying.pxd b/python/cudf/cudf/_lib/pylibcudf/copying.pxd index 0ebffacfb9f..d57be650710 100644 --- a/python/cudf/cudf/_lib/pylibcudf/copying.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/copying.pxd @@ -2,30 +2,14 @@ from libcpp cimport bool as cbool -from cudf._lib.cpp cimport copying as cpp_copying +from cudf._lib.cpp.copying cimport out_of_bounds_policy from .column cimport Column from .table cimport Table -ctypedef cbool underlying_type_t_out_of_bounds_policy - - -# Enum representing possible enum policies. This is the Cython representation -# of libcudf's out_of_bounds_policy. 
-cpdef enum OutOfBoundsPolicy: - NULLIFY = cpp_copying.NULLIFY - DONT_CHECK = ( - cpp_copying.DONT_CHECK - ) - - -cdef cpp_copying.out_of_bounds_policy py_policy_to_c_policy( - OutOfBoundsPolicy py_policy -) nogil - cpdef Table gather( Table source_table, Column gather_map, - OutOfBoundsPolicy bounds_policy + out_of_bounds_policy bounds_policy ) diff --git a/python/cudf/cudf/_lib/pylibcudf/copying.pyx b/python/cudf/cudf/_lib/pylibcudf/copying.pyx index 7869a917983..a27b44b3107 100644 --- a/python/cudf/cudf/_lib/pylibcudf/copying.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/copying.pyx @@ -8,27 +8,23 @@ from libcpp.utility cimport move # we really want here would be # cimport libcudf... libcudf.copying.algo(...) from cudf._lib.cpp cimport copying as cpp_copying +from cudf._lib.cpp.copying cimport out_of_bounds_policy + +from cudf._lib.cpp.copying import \ + out_of_bounds_policy as OutOfBoundsPolicy # no-cython-lint + from cudf._lib.cpp.table.table cimport table from .column cimport Column from .table cimport Table -cdef inline cpp_copying.out_of_bounds_policy py_policy_to_c_policy( - OutOfBoundsPolicy py_policy -) nogil: - """Convert a Cython policy the corresponding libcudf policy type.""" - return ( - py_policy - ) - - # TODO: Is it OK to reference the corresponding libcudf algorithm in the # documentation? Otherwise there's a lot of room for duplication. cpdef Table gather( Table source_table, Column gather_map, - OutOfBoundsPolicy bounds_policy + out_of_bounds_policy bounds_policy ): """Select rows from source_table according to the provided gather_map. @@ -40,7 +36,7 @@ cpdef Table gather( The table object from which to pull data. gather_map : Column The list of row indices to pull out of the source table. - bounds_policy : OutOfBoundsPolicy + bounds_policy : out_of_bounds_policy Controls whether out of bounds indices are checked and nullified in the output or if indices are assumed to be in bounds. 
@@ -55,7 +51,7 @@ cpdef Table gather( cpp_copying.gather( source_table.view(), gather_map.view(), - py_policy_to_c_policy(bounds_policy) + bounds_policy ) ) return Table.from_libcudf(move(c_result)) diff --git a/python/cudf/cudf/_lib/pylibcudf/types.pxd b/python/cudf/cudf/_lib/pylibcudf/types.pxd index af0de6ba446..80baa484be7 100644 --- a/python/cudf/cudf/_lib/pylibcudf/types.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/types.pxd @@ -3,64 +3,13 @@ from libc.stdint cimport int32_t from libcpp cimport bool as cbool -from cudf._lib.cpp.types cimport data_type, interpolation, null_policy, type_id - -ctypedef int32_t underlying_type_t_type_id - - -# Enum representing possible data type ids. This is the Cython representation -# of libcudf's type_id. -cpdef enum TypeId: - EMPTY = type_id.EMPTY - INT8 = type_id.INT8 - INT16 = type_id.INT16 - INT32 = type_id.INT32 - INT64 = type_id.INT64 - UINT8 = type_id.UINT8 - UINT16 = type_id.UINT16 - UINT32 = type_id.UINT32 - UINT64 = type_id.UINT64 - FLOAT32 = type_id.FLOAT32 - FLOAT64 = type_id.FLOAT64 - BOOL8 = type_id.BOOL8 - TIMESTAMP_DAYS = type_id.TIMESTAMP_DAYS - TIMESTAMP_SECONDS = type_id.TIMESTAMP_SECONDS - TIMESTAMP_MILLISECONDS = ( - type_id.TIMESTAMP_MILLISECONDS - ) - TIMESTAMP_MICROSECONDS = ( - type_id.TIMESTAMP_MICROSECONDS - ) - TIMESTAMP_NANOSECONDS = ( - type_id.TIMESTAMP_NANOSECONDS - ) - DICTIONARY32 = type_id.DICTIONARY32 - STRING = type_id.STRING - LIST = type_id.LIST - STRUCT = type_id.STRUCT - NUM_TYPE_IDS = type_id.NUM_TYPE_IDS - DURATION_SECONDS = type_id.DURATION_SECONDS - DURATION_MILLISECONDS = ( - type_id.DURATION_MILLISECONDS - ) - DURATION_MICROSECONDS = ( - type_id.DURATION_MICROSECONDS - ) - DURATION_NANOSECONDS = ( - type_id.DURATION_NANOSECONDS - ) - DECIMAL32 = type_id.DECIMAL32 - DECIMAL64 = type_id.DECIMAL64 - DECIMAL128 = type_id.DECIMAL128 - - -cdef type_id py_type_to_c_type(TypeId py_type_id) nogil +from cudf._lib.cpp.types cimport data_type, type_id cdef class DataType: cdef data_type c_obj - 
cpdef TypeId id(self) + cpdef type_id id(self) cpdef int32_t scale(self) @staticmethod diff --git a/python/cudf/cudf/_lib/pylibcudf/types.pyx b/python/cudf/cudf/_lib/pylibcudf/types.pyx index 23d24182ac4..b1391723f0e 100644 --- a/python/cudf/cudf/_lib/pylibcudf/types.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/types.pyx @@ -2,11 +2,9 @@ from libc.stdint cimport int32_t -from cudf._lib.cpp.types cimport type_id +from cudf._lib.cpp.types cimport data_type, type_id - -cdef type_id py_type_to_c_type(TypeId py_type_id) nogil: - return ( py_type_id) +from cudf._lib.cpp.types import type_id as TypeId # no-cython-lint cdef class DataType: @@ -21,13 +19,13 @@ cdef class DataType: scale : int The scale associated with the data. Only used for decimal data types. """ - def __cinit__(self, TypeId id, int32_t scale=0): - self.c_obj = data_type(py_type_to_c_type(id), scale) + def __cinit__(self, type_id id, int32_t scale=0): + self.c_obj = data_type(id, scale) # TODO: Consider making both id and scale cached properties. - cpdef TypeId id(self): + cpdef type_id id(self): """Get the id associated with this data type.""" - return TypeId(self.c_obj.id()) + return self.c_obj.id() cpdef int32_t scale(self): """Get the scale associated with this data type.""" @@ -42,6 +40,6 @@ cdef class DataType: (even direct pylibcudf Cython users). """ # Spoof an empty data type then swap in the real one. 
- cdef DataType ret = DataType.__new__(DataType, TypeId.EMPTY) + cdef DataType ret = DataType.__new__(DataType, type_id.EMPTY) ret.c_obj = dt return ret diff --git a/python/cudf/cudf/_lib/scalar.pyx b/python/cudf/cudf/_lib/scalar.pyx index 39a1b0609cf..0407785b2d8 100644 --- a/python/cudf/cudf/_lib/scalar.pyx +++ b/python/cudf/cudf/_lib/scalar.pyx @@ -200,23 +200,23 @@ cdef class DeviceScalar: if dtype is not None: s._dtype = dtype elif cdtype.id() in { - libcudf_types.DECIMAL32, - libcudf_types.DECIMAL64, - libcudf_types.DECIMAL128, + libcudf_types.type_id.DECIMAL32, + libcudf_types.type_id.DECIMAL64, + libcudf_types.type_id.DECIMAL128, }: raise TypeError( "Must pass a dtype when constructing from a fixed-point scalar" ) - elif cdtype.id() == libcudf_types.STRUCT: + elif cdtype.id() == libcudf_types.type_id.STRUCT: struct_table_view = (s.get_raw_ptr())[0].view() s._dtype = StructDtype({ str(i): dtype_from_column_view(struct_table_view.column(i)) for i in range(struct_table_view.num_columns()) }) - elif cdtype.id() == libcudf_types.LIST: + elif cdtype.id() == libcudf_types.type_id.LIST: if ( s.get_raw_ptr() - )[0].view().type().id() == libcudf_types.LIST: + )[0].view().type().id() == libcudf_types.type_id.LIST: s._dtype = dtype_from_column_view( (s.get_raw_ptr())[0].view() ) @@ -442,27 +442,27 @@ cdef _get_np_scalar_from_numeric(unique_ptr[scalar]& s): cdef libcudf_types.data_type cdtype = s_ptr[0].type() - if cdtype.id() == libcudf_types.INT8: + if cdtype.id() == libcudf_types.type_id.INT8: return np.int8((s_ptr)[0].value()) - elif cdtype.id() == libcudf_types.INT16: + elif cdtype.id() == libcudf_types.type_id.INT16: return np.int16((s_ptr)[0].value()) - elif cdtype.id() == libcudf_types.INT32: + elif cdtype.id() == libcudf_types.type_id.INT32: return np.int32((s_ptr)[0].value()) - elif cdtype.id() == libcudf_types.INT64: + elif cdtype.id() == libcudf_types.type_id.INT64: return np.int64((s_ptr)[0].value()) - elif cdtype.id() == libcudf_types.UINT8: + elif 
cdtype.id() == libcudf_types.type_id.UINT8: return np.uint8((s_ptr)[0].value()) - elif cdtype.id() == libcudf_types.UINT16: + elif cdtype.id() == libcudf_types.type_id.UINT16: return np.uint16((s_ptr)[0].value()) - elif cdtype.id() == libcudf_types.UINT32: + elif cdtype.id() == libcudf_types.type_id.UINT32: return np.uint32((s_ptr)[0].value()) - elif cdtype.id() == libcudf_types.UINT64: + elif cdtype.id() == libcudf_types.type_id.UINT64: return np.uint64((s_ptr)[0].value()) - elif cdtype.id() == libcudf_types.FLOAT32: + elif cdtype.id() == libcudf_types.type_id.FLOAT32: return np.float32((s_ptr)[0].value()) - elif cdtype.id() == libcudf_types.FLOAT64: + elif cdtype.id() == libcudf_types.type_id.FLOAT64: return np.float64((s_ptr)[0].value()) - elif cdtype.id() == libcudf_types.BOOL8: + elif cdtype.id() == libcudf_types.type_id.BOOL8: return np.bool_((s_ptr)[0].value()) else: raise ValueError("Could not convert cudf::scalar to numpy scalar") @@ -475,15 +475,15 @@ cdef _get_py_decimal_from_fixed_point(unique_ptr[scalar]& s): cdef libcudf_types.data_type cdtype = s_ptr[0].type() - if cdtype.id() == libcudf_types.DECIMAL64: + if cdtype.id() == libcudf_types.type_id.DECIMAL64: rep_val = int((s_ptr)[0].value()) scale = int((s_ptr)[0].type().scale()) return decimal.Decimal(rep_val).scaleb(scale) - elif cdtype.id() == libcudf_types.DECIMAL32: + elif cdtype.id() == libcudf_types.type_id.DECIMAL32: rep_val = int((s_ptr)[0].value()) scale = int((s_ptr)[0].type().scale()) return decimal.Decimal(rep_val).scaleb(scale) - elif cdtype.id() == libcudf_types.DECIMAL128: + elif cdtype.id() == libcudf_types.type_id.DECIMAL128: rep_val = int((s_ptr)[0].value()) scale = int((s_ptr)[0].type().scale()) return decimal.Decimal(rep_val).scaleb(scale) @@ -499,28 +499,28 @@ cdef _get_np_scalar_from_timestamp64(unique_ptr[scalar]& s): cdef libcudf_types.data_type cdtype = s_ptr[0].type() - if cdtype.id() == libcudf_types.TIMESTAMP_SECONDS: + if cdtype.id() == 
libcudf_types.type_id.TIMESTAMP_SECONDS: return np.datetime64( ( s_ptr )[0].ticks_since_epoch_64(), datetime_unit_map[(cdtype.id())] ) - elif cdtype.id() == libcudf_types.TIMESTAMP_MILLISECONDS: + elif cdtype.id() == libcudf_types.type_id.TIMESTAMP_MILLISECONDS: return np.datetime64( ( s_ptr )[0].ticks_since_epoch_64(), datetime_unit_map[(cdtype.id())] ) - elif cdtype.id() == libcudf_types.TIMESTAMP_MICROSECONDS: + elif cdtype.id() == libcudf_types.type_id.TIMESTAMP_MICROSECONDS: return np.datetime64( ( s_ptr )[0].ticks_since_epoch_64(), datetime_unit_map[(cdtype.id())] ) - elif cdtype.id() == libcudf_types.TIMESTAMP_NANOSECONDS: + elif cdtype.id() == libcudf_types.type_id.TIMESTAMP_NANOSECONDS: return np.datetime64( ( s_ptr @@ -540,28 +540,28 @@ cdef _get_np_scalar_from_timedelta64(unique_ptr[scalar]& s): cdef libcudf_types.data_type cdtype = s_ptr[0].type() - if cdtype.id() == libcudf_types.DURATION_SECONDS: + if cdtype.id() == libcudf_types.type_id.DURATION_SECONDS: return np.timedelta64( ( s_ptr )[0].ticks(), duration_unit_map[(cdtype.id())] ) - elif cdtype.id() == libcudf_types.DURATION_MILLISECONDS: + elif cdtype.id() == libcudf_types.type_id.DURATION_MILLISECONDS: return np.timedelta64( ( s_ptr )[0].ticks(), duration_unit_map[(cdtype.id())] ) - elif cdtype.id() == libcudf_types.DURATION_MICROSECONDS: + elif cdtype.id() == libcudf_types.type_id.DURATION_MICROSECONDS: return np.timedelta64( ( s_ptr )[0].ticks(), duration_unit_map[(cdtype.id())] ) - elif cdtype.id() == libcudf_types.DURATION_NANOSECONDS: + elif cdtype.id() == libcudf_types.type_id.DURATION_NANOSECONDS: return np.timedelta64( ( s_ptr diff --git a/python/cudf/cudf/_lib/strings/convert/convert_fixed_point.pyx b/python/cudf/cudf/_lib/strings/convert/convert_fixed_point.pyx index 177cbffddb0..2085d5c2896 100644 --- a/python/cudf/cudf/_lib/strings/convert/convert_fixed_point.pyx +++ b/python/cudf/cudf/_lib/strings/convert/convert_fixed_point.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2022, NVIDIA 
CORPORATION. +# Copyright (c) 2021-2023, NVIDIA CORPORATION. import cudf @@ -15,7 +15,7 @@ from cudf._lib.cpp.strings.convert.convert_fixed_point cimport ( is_fixed_point as cpp_is_fixed_point, to_fixed_point as cpp_to_fixed_point, ) -from cudf._lib.cpp.types cimport DECIMAL32, DECIMAL64, DECIMAL128, data_type +from cudf._lib.cpp.types cimport data_type, type_id @acquire_spill_lock() @@ -61,11 +61,11 @@ def to_decimal(Column input_col, object out_type): cdef int scale = out_type.scale cdef data_type c_out_type if isinstance(out_type, cudf.Decimal32Dtype): - c_out_type = data_type(DECIMAL32, -scale) + c_out_type = data_type(type_id.DECIMAL32, -scale) elif isinstance(out_type, cudf.Decimal64Dtype): - c_out_type = data_type(DECIMAL64, -scale) + c_out_type = data_type(type_id.DECIMAL64, -scale) elif isinstance(out_type, cudf.Decimal128Dtype): - c_out_type = data_type(DECIMAL128, -scale) + c_out_type = data_type(type_id.DECIMAL128, -scale) else: raise TypeError("should be a decimal dtype") with nogil: @@ -100,7 +100,7 @@ def is_fixed_point(Column input_col, object dtype): cdef unique_ptr[column] c_result cdef column_view source_view = input_col.view() cdef int scale = dtype.scale - cdef data_type c_dtype = data_type(DECIMAL64, -scale) + cdef data_type c_dtype = data_type(type_id.DECIMAL64, -scale) with nogil: c_result = move(cpp_is_fixed_point( source_view, diff --git a/python/cudf/cudf/_lib/types.pyx b/python/cudf/cudf/_lib/types.pyx index 8594e37ac4a..929f8b447ab 100644 --- a/python/cudf/cudf/_lib/types.pyx +++ b/python/cudf/cudf/_lib/types.pyx @@ -236,6 +236,7 @@ cdef dtype_from_column_view(column_view cv): ] cdef libcudf_types.data_type dtype_to_data_type(dtype) except *: + cdef libcudf_types.type_id tid if cudf.api.types.is_list_dtype(dtype): tid = libcudf_types.type_id.LIST elif cudf.api.types.is_struct_dtype(dtype):