diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index f8c4f4b9143..d0457d2c641 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -149,7 +149,7 @@ repos: - id: ruff-format files: python/.*$ - repo: https://github.com/rapidsai/pre-commit-hooks - rev: v0.0.3 + rev: v0.2.0 hooks: - id: verify-copyright exclude: | @@ -158,6 +158,7 @@ repos: cpp/src/io/parquet/ipc/Message_generated[.]h$| cpp/src/io/parquet/ipc/Schema_generated[.]h$ ) + - id: verify-alpha-spec default_language_version: python: python3 diff --git a/ci/test_cudf_polars.sh b/ci/test_cudf_polars.sh index 669e049ab26..95fb4b431bf 100755 --- a/ci/test_cudf_polars.sh +++ b/ci/test_cudf_polars.sh @@ -28,10 +28,8 @@ rapids-logger "Install cudf wheel" # echo to expand wildcard before adding `[extra]` requires for pip python -m pip install $(echo ./dist/cudf*.whl)[test] -rapids-logger "Install polars (allow pre-release versions)" -python -m pip install 'polars>=1.0.0a0' - rapids-logger "Install cudf_polars" +python -m pip install 'polars>=1.0' python -m pip install --no-deps python/cudf_polars rapids-logger "Run cudf_polars tests" diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 946e2d1cd32..cc9238ab80a 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -26,7 +26,6 @@ dependencies: - cupy>=12.0.0 - cxx-compiler - cython>=3.0.3 -- dask-cuda==24.8.* - dask-cuda==24.8.*,>=0.0.0a0 - dlpack>=0.8,<1.0 - doxygen=1.9.1 @@ -44,10 +43,10 @@ dependencies: - libcufile=1.4.0.31 - libcurand-dev=10.3.0.86 - libcurand=10.3.0.86 -- libkvikio==24.8.* +- libkvikio==24.8.*,>=0.0.0a0 - libparquet==16.1.0.* - librdkafka>=1.9.0,<1.10.0a0 -- librmm==24.8.* +- librmm==24.8.*,>=0.0.0a0 - make - moto>=4.0.8 - msgpack-python diff --git a/conda/environments/all_cuda-122_arch-x86_64.yaml b/conda/environments/all_cuda-122_arch-x86_64.yaml index 
f069616ddbe..9fecd452248 100644 --- a/conda/environments/all_cuda-122_arch-x86_64.yaml +++ b/conda/environments/all_cuda-122_arch-x86_64.yaml @@ -27,7 +27,6 @@ dependencies: - cupy>=12.0.0 - cxx-compiler - cython>=3.0.3 -- dask-cuda==24.8.* - dask-cuda==24.8.*,>=0.0.0a0 - dlpack>=0.8,<1.0 - doxygen=1.9.1 @@ -43,10 +42,10 @@ dependencies: - libarrow==16.1.0.* - libcufile-dev - libcurand-dev -- libkvikio==24.8.* +- libkvikio==24.8.*,>=0.0.0a0 - libparquet==16.1.0.* - librdkafka>=1.9.0,<1.10.0a0 -- librmm==24.8.* +- librmm==24.8.*,>=0.0.0a0 - make - moto>=4.0.8 - msgpack-python @@ -66,7 +65,7 @@ dependencies: - pre-commit - pyarrow==16.1.0.* - pydata-sphinx-theme!=0.14.2 -- pynvjitlink +- pynvjitlink>=0.0.0a0 - pytest-benchmark - pytest-cases>=3.8.2 - pytest-cov diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 35cf90411f2..54070ab6f5a 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -365,6 +365,7 @@ add_library( src/interop/to_arrow_device.cu src/interop/from_arrow_device.cu src/interop/from_arrow_host.cu + src/interop/from_arrow_stream.cu src/interop/to_arrow_schema.cpp src/interop/detail/arrow_allocator.cpp src/io/avro/avro.cpp diff --git a/cpp/include/cudf/interop.hpp b/cpp/include/cudf/interop.hpp index 56ec62fa6e1..502ffb9ba4f 100644 --- a/cpp/include/cudf/interop.hpp +++ b/cpp/include/cudf/interop.hpp @@ -50,6 +50,8 @@ struct ArrowSchema; struct ArrowArray; +struct ArrowArrayStream; + namespace cudf { /** * @addtogroup interop_dlpack @@ -367,10 +369,11 @@ std::unique_ptr from_arrow( * @param mr Device memory resource used to allocate `cudf::table` * @return cudf table generated from given arrow data */ -std::unique_ptr from_arrow(ArrowSchema const* schema, - ArrowArray const* input, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); +std::unique_ptr from_arrow( + ArrowSchema const* schema, + ArrowArray const* input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::mr::device_memory_resource* mr = 
rmm::mr::get_current_device_resource()); /** * @brief Create `cudf::column` from a given ArrowArray and ArrowSchema input @@ -385,10 +388,11 @@ std::unique_ptr from_arrow(ArrowSchema const* schema, * @param mr Device memory resource used to allocate `cudf::column` * @return cudf column generated from given arrow data */ -std::unique_ptr from_arrow_column(ArrowSchema const* schema, - ArrowArray const* input, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); +std::unique_ptr from_arrow_column( + ArrowSchema const* schema, + ArrowArray const* input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @brief Create `cudf::table` from given ArrowDeviceArray input * @@ -414,6 +418,24 @@ std::unique_ptr from_arrow_host( rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Create `cudf::table` from given ArrowArrayStream input + * + * @throws std::invalid_argument if input is NULL + * + * The conversion WILL release the input ArrowArrayStream and its constituent + * arrays or schema since Arrow streams are not suitable for multiple reads. + * + * @param input `ArrowArrayStream` pointer to object that will produce ArrowArray data + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to perform cuda allocation + * @return cudf table generated from the given Arrow data + */ +std::unique_ptr
from_arrow_stream( + ArrowArrayStream* input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @brief Create `cudf::column` from given ArrowDeviceArray input * diff --git a/cpp/include/cudf/io/text/byte_range_info.hpp b/cpp/include/cudf/io/text/byte_range_info.hpp index 0086432d003..60ee867f058 100644 --- a/cpp/include/cudf/io/text/byte_range_info.hpp +++ b/cpp/include/cudf/io/text/byte_range_info.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -24,17 +24,22 @@ namespace cudf { namespace io { namespace text { +/** + * @addtogroup io_readers + * @{ + * @file + */ /** * @brief stores offset and size used to indicate a byte range */ class byte_range_info { private: - int64_t _offset; ///< offset in bytes - int64_t _size; ///< size in bytes + int64_t _offset{}; ///< offset in bytes + int64_t _size{}; ///< size in bytes public: - constexpr byte_range_info() noexcept : _offset(0), _size(0) {} + constexpr byte_range_info() = default; /** * @brief Constructs a byte_range_info object * @@ -104,6 +109,8 @@ std::vector create_byte_range_infos_consecutive(int64_t total_b */ byte_range_info create_byte_range_info_max(); +/** @} */ // end of group + } // namespace text } // namespace io } // namespace cudf diff --git a/cpp/include/cudf/io/text/data_chunk_source.hpp b/cpp/include/cudf/io/text/data_chunk_source.hpp index 28204c82780..13aff4b3b8f 100644 --- a/cpp/include/cudf/io/text/data_chunk_source.hpp +++ b/cpp/include/cudf/io/text/data_chunk_source.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -25,6 +25,12 @@ namespace cudf { namespace io { namespace text { +/** + * @addtogroup io_readers + * @{ + * @file + */ + /** * @brief A contract guaranteeing stream-ordered memory access to the underlying device data. * @@ -110,6 +116,8 @@ class data_chunk_source { [[nodiscard]] virtual std::unique_ptr create_reader() const = 0; }; +/** @} */ // end of group + } // namespace text } // namespace io } // namespace cudf diff --git a/cpp/include/cudf/io/text/multibyte_split.hpp b/cpp/include/cudf/io/text/multibyte_split.hpp index 7abae7c754b..e29ab78ae46 100644 --- a/cpp/include/cudf/io/text/multibyte_split.hpp +++ b/cpp/include/cudf/io/text/multibyte_split.hpp @@ -30,6 +30,11 @@ namespace cudf { namespace io { namespace text { +/** + * @addtogroup io_readers + * @{ + * @file + */ /** * @brief Parsing options for multibyte_split. @@ -79,6 +84,7 @@ struct parse_options { * @param source The source string * @param delimiter UTF-8 encoded string for which to find offsets in the source * @param options the parsing options to use (including byte range) + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Memory resource to use for the device memory allocation * @return The strings found by splitting the source by the delimiter within the relevant byte * range. @@ -87,17 +93,30 @@ std::unique_ptr multibyte_split( data_chunk_source const& source, std::string const& delimiter, parse_options options = {}, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); -std::unique_ptr multibyte_split( +/** + * @brief Splits the source text into a strings column using a multiple byte delimiter. 
+ * + * @deprecated Since 24.08 + * + * @param source The source input data encoded in UTF-8 + * @param delimiter UTF-8 encoded string for which to find offsets in the source + * @param byte_range The position and size within `source` to produce the column from + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Memory resource to use for the device memory allocation + * @return The strings found by splitting the source by the delimiter within the relevant byte + * range. + */ +[[deprecated]] std::unique_ptr multibyte_split( data_chunk_source const& source, std::string const& delimiter, std::optional byte_range, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); -std::unique_ptr multibyte_split(data_chunk_source const& source, - std::string const& delimiter, - rmm::device_async_resource_ref mr); +/** @} */ // end of group } // namespace text } // namespace io diff --git a/cpp/include/cudf/table/experimental/row_operators.cuh b/cpp/include/cudf/table/experimental/row_operators.cuh index e9b81a525fc..c181ac7d402 100644 --- a/cpp/include/cudf/table/experimental/row_operators.cuh +++ b/cpp/include/cudf/table/experimental/row_operators.cuh @@ -252,7 +252,7 @@ using optional_dremel_view = thrust::optional; * * @tparam has_nested_columns compile-time optimization for primitive types. * This template parameter is to be used by the developer by querying - * `cudf::detail::has_nested_columns(input)`. `true` compiles operator + * `cudf::has_nested_columns(input)`. `true` compiles operator * overloads for nested types, while `false` only compiles operator * overloads for primitive types. * @tparam Nullate A cudf::nullate type describing whether to check for nulls. @@ -1014,7 +1014,7 @@ class self_comparator { * * @tparam has_nested_columns compile-time optimization for primitive types. 
* This template parameter is to be used by the developer by querying - * `cudf::detail::has_nested_columns(input)`. `true` compiles operator + * `cudf::has_nested_columns(input)`. `true` compiles operator * overloads for nested types, while `false` only compiles operator * overloads for primitive types. * @tparam Nullate A cudf::nullate type describing whether to check for nulls. @@ -1186,7 +1186,7 @@ class two_table_comparator { * * @tparam has_nested_columns compile-time optimization for primitive types. * This template parameter is to be used by the developer by querying - * `cudf::detail::has_nested_columns(input)`. `true` compiles operator + * `cudf::has_nested_columns(input)`. `true` compiles operator * overloads for nested types, while `false` only compiles operator * overloads for primitive types. * @tparam Nullate A cudf::nullate type describing whether to check for nulls. @@ -1326,7 +1326,7 @@ struct nan_equal_physical_equality_comparator { * * @tparam has_nested_columns compile-time optimization for primitive types. * This template parameter is to be used by the developer by querying - * `cudf::detail::has_nested_columns(input)`. `true` compiles operator + * `cudf::has_nested_columns(input)`. `true` compiles operator * overloads for nested types, while `false` only compiles operator * overloads for primitive types. * @tparam Nullate A cudf::nullate type describing whether to check for nulls. @@ -1643,7 +1643,7 @@ class self_comparator { * * @tparam has_nested_columns compile-time optimization for primitive types. * This template parameter is to be used by the developer by querying - * `cudf::detail::has_nested_columns(input)`. `true` compiles operator + * `cudf::has_nested_columns(input)`. `true` compiles operator * overloads for nested types, while `false` only compiles operator * overloads for primitive types. * @tparam Nullate A cudf::nullate type describing whether to check for nulls. 
@@ -1757,7 +1757,7 @@ class two_table_comparator { * * @tparam has_nested_columns compile-time optimization for primitive types. * This template parameter is to be used by the developer by querying - * `cudf::detail::has_nested_columns(input)`. `true` compiles operator + * `cudf::has_nested_columns(input)`. `true` compiles operator * overloads for nested types, while `false` only compiles operator * overloads for primitive types. * @tparam Nullate A cudf::nullate type describing whether to check for nulls. diff --git a/cpp/include/cudf/table/table_view.hpp b/cpp/include/cudf/table/table_view.hpp index a71e0558dec..4a990f67ce4 100644 --- a/cpp/include/cudf/table/table_view.hpp +++ b/cpp/include/cudf/table/table_view.hpp @@ -17,6 +17,7 @@ #include #include +#include #include #include @@ -32,7 +33,7 @@ * passed by value. */ -namespace cudf { +namespace CUDF_EXPORT cudf { namespace detail { /** * @brief Base class for a table of `ColumnView`s @@ -123,7 +124,10 @@ class table_view_base { * @param column_index The index of the desired column * @return A reference to the desired column */ - [[nodiscard]] ColumnView const& column(size_type column_index) const; + [[nodiscard]] ColumnView const& column(size_type column_index) const + { + return _columns.at(column_index); + } /** * @brief Returns the number of columns @@ -174,8 +178,17 @@ class table_view_base { * @return Whether nested columns exist in the input table */ bool has_nested_columns(table_view const& table); + } // namespace detail +/** + * @brief Determine if any nested columns exist in a given table. + * + * @param table The input table + * @return Whether nested columns exist in the input table + */ +bool has_nested_columns(table_view const& table); + /** * @brief A set of cudf::column_view's of the same size. 
* @@ -374,4 +387,4 @@ extern template bool is_relationally_comparable(mutable_tabl mutable_table_view const& rhs); // @endcond } // namespace detail -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/src/copying/concatenate.cu b/cpp/src/copying/concatenate.cu index 47e74a5cb48..6acbafd24fb 100644 --- a/cpp/src/copying/concatenate.cu +++ b/cpp/src/copying/concatenate.cu @@ -463,10 +463,6 @@ void traverse_children::operator()(host_span */ void bounds_and_type_check(host_span cols, rmm::cuda_stream_view stream) { - CUDF_EXPECTS(cudf::all_have_same_types(cols.begin(), cols.end()), - "Type mismatch in columns to concatenate.", - cudf::data_type_error); - // total size of all concatenated rows size_t const total_row_count = std::accumulate(cols.begin(), cols.end(), std::size_t{}, [](size_t a, auto const& b) { @@ -476,6 +472,21 @@ void bounds_and_type_check(host_span cols, rmm::cuda_stream_v "Total number of concatenated rows exceeds the column size limit", std::overflow_error); + if (std::any_of(cols.begin(), cols.end(), [](column_view const& c) { + return c.type().id() == cudf::type_id::EMPTY; + })) { + CUDF_EXPECTS( + std::all_of(cols.begin(), + cols.end(), + [](column_view const& c) { return c.type().id() == cudf::type_id::EMPTY; }), + "Mismatch in columns to concatenate.", + cudf::data_type_error); + return; + } + CUDF_EXPECTS(cudf::all_have_same_types(cols.begin(), cols.end()), + "Type mismatch in columns to concatenate.", + cudf::data_type_error); + // traverse children cudf::type_dispatcher(cols.front().type(), traverse_children{}, cols, stream); } @@ -498,6 +509,15 @@ std::unique_ptr concatenate(host_span columns_to_conc return empty_like(columns_to_concat.front()); } + // For empty columns, we can just create an EMPTY column of the appropriate length. 
+ if (columns_to_concat.front().type().id() == cudf::type_id::EMPTY) { + auto length = std::accumulate( + columns_to_concat.begin(), columns_to_concat.end(), 0, [](auto a, auto const& b) { + return a + b.size(); + }); + return std::make_unique( + data_type(type_id::EMPTY), length, rmm::device_buffer{}, rmm::device_buffer{}, length); + } return type_dispatcher( columns_to_concat.front().type(), concatenate_dispatch{columns_to_concat, stream, mr}); } diff --git a/cpp/src/interop/arrow_utilities.cpp b/cpp/src/interop/arrow_utilities.cpp index dd9e9600a87..605d813ed1e 100644 --- a/cpp/src/interop/arrow_utilities.cpp +++ b/cpp/src/interop/arrow_utilities.cpp @@ -39,7 +39,8 @@ data_type arrow_to_cudf_type(ArrowSchemaView const* arrow_view) case NANOARROW_TYPE_FLOAT: return data_type(type_id::FLOAT32); case NANOARROW_TYPE_DOUBLE: return data_type(type_id::FLOAT64); case NANOARROW_TYPE_DATE32: return data_type(type_id::TIMESTAMP_DAYS); - case NANOARROW_TYPE_STRING: return data_type(type_id::STRING); + case NANOARROW_TYPE_STRING: + case NANOARROW_TYPE_LARGE_STRING: return data_type(type_id::STRING); case NANOARROW_TYPE_LIST: return data_type(type_id::LIST); case NANOARROW_TYPE_DICTIONARY: return data_type(type_id::DICTIONARY32); case NANOARROW_TYPE_STRUCT: return data_type(type_id::STRUCT); diff --git a/cpp/src/interop/from_arrow_device.cu b/cpp/src/interop/from_arrow_device.cu index 002a8ec1f14..73c1a474310 100644 --- a/cpp/src/interop/from_arrow_device.cu +++ b/cpp/src/interop/from_arrow_device.cu @@ -143,6 +143,9 @@ dispatch_tuple_t dispatch_from_arrow_device::operator()( rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { + CUDF_EXPECTS(schema->type != NANOARROW_TYPE_LARGE_STRING, + "Large strings are not yet supported in from_arrow_device", + cudf::data_type_error); if (input->length == 0) { return std::make_tuple( {type, diff --git a/cpp/src/interop/from_arrow_host.cu b/cpp/src/interop/from_arrow_host.cu index 854a1d68fdc..b7e07056686 100644 --- 
a/cpp/src/interop/from_arrow_host.cu +++ b/cpp/src/interop/from_arrow_host.cu @@ -188,8 +188,16 @@ std::unique_ptr dispatch_copy_from_arrow_host::operator()(offset_buffers[1])[input->length + input->offset]; + int64_t const char_data_length = [&]() { + if (schema->type == NANOARROW_TYPE_LARGE_STRING) { + return reinterpret_cast(offset_buffers[1])[input->length + input->offset]; + } else if (schema->type == NANOARROW_TYPE_STRING) { + return static_cast( + reinterpret_cast(offset_buffers[1])[input->length + input->offset]); + } else { + CUDF_FAIL("Unsupported string type", cudf::data_type_error); + } + }(); void const* char_buffers[2] = {nullptr, input->buffers[2]}; ArrowArray char_array = { .length = char_data_length, @@ -210,15 +218,27 @@ std::unique_ptr dispatch_copy_from_arrow_host::operator()operator()(&view, &offsets_array, data_type(type_id::INT32), true); + auto offsets_column = [&]() { + if (schema->type == NANOARROW_TYPE_LARGE_STRING) { + return this->operator()(&view, &offsets_array, data_type(type_id::INT64), true); + } else if (schema->type == NANOARROW_TYPE_STRING) { + return this->operator()(&view, &offsets_array, data_type(type_id::INT32), true); + } else { + CUDF_FAIL("Unsupported string type", cudf::data_type_error); + } + }(); NANOARROW_THROW_NOT_OK(ArrowSchemaViewInit(&view, char_data_schema.get(), nullptr)); - auto chars_column = this->operator()(&view, &char_array, data_type(type_id::INT8), true); + rmm::device_buffer chars(char_data_length, stream, mr); + CUDF_CUDA_TRY(cudaMemcpyAsync(chars.data(), + reinterpret_cast(char_array.buffers[1]), + chars.size(), + cudaMemcpyDefault, + stream.value())); auto const num_rows = offsets_column->size() - 1; auto out_col = make_strings_column(num_rows, std::move(offsets_column), - std::move(chars_column->release().data.release()[0]), + std::move(chars), input->null_count, std::move(*get_mask_buffer(input))); diff --git a/cpp/src/interop/from_arrow_stream.cu b/cpp/src/interop/from_arrow_stream.cu new file 
mode 100644 index 00000000000..0c85b561944 --- /dev/null +++ b/cpp/src/interop/from_arrow_stream.cu @@ -0,0 +1,143 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "arrow_utilities.hpp" + +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include + +#include +#include +#include +#include + +namespace cudf { +namespace detail { + +namespace { + +std::unique_ptr make_empty_column_from_schema(ArrowSchema const* schema, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + ArrowSchemaView schema_view; + NANOARROW_THROW_NOT_OK(ArrowSchemaViewInit(&schema_view, schema, nullptr)); + + auto const type{arrow_to_cudf_type(&schema_view)}; + switch (type.id()) { + case type_id::EMPTY: { + return std::make_unique( + data_type(type_id::EMPTY), 0, rmm::device_buffer{}, rmm::device_buffer{}, 0); + } + case type_id::LIST: { + return cudf::make_lists_column(0, + cudf::make_empty_column(data_type{type_id::INT32}), + make_empty_column_from_schema(schema->children[0], stream, mr), + 0, + {}, + stream, + mr); + } + case type_id::STRUCT: { + std::vector> child_columns; + child_columns.reserve(schema->n_children); + std::transform( + schema->children, + schema->children + schema->n_children, + std::back_inserter(child_columns), + [&](auto const& child) { return make_empty_column_from_schema(child, stream, mr); }); + return 
cudf::make_structs_column(0, std::move(child_columns), 0, {}, stream, mr); + } + default: { + return cudf::make_empty_column(type); + } + } +} + +} // namespace + +std::unique_ptr
from_arrow_stream(ArrowArrayStream* input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_EXPECTS(input != nullptr, "input ArrowArrayStream must not be NULL", std::invalid_argument); + + // Potential future optimization: Since the from_arrow API accepts an + // ArrowSchema we're allocating one here instead of using a view, which we + // could avoid with a different underlying implementation. + ArrowSchema schema; + NANOARROW_THROW_NOT_OK(ArrowArrayStreamGetSchema(input, &schema, nullptr)); + + std::vector> chunks; + ArrowArray chunk; + while (true) { + NANOARROW_THROW_NOT_OK(ArrowArrayStreamGetNext(input, &chunk, nullptr)); + if (chunk.release == nullptr) { break; } + chunks.push_back(from_arrow(&schema, &chunk, stream, mr)); + chunk.release(&chunk); + } + input->release(input); + + if (chunks.empty()) { + if (schema.n_children == 0) { + schema.release(&schema); + return std::make_unique(); + } + + // If there are no chunks but the schema has children, we need to construct a suitable empty + // table. + std::vector> columns; + columns.reserve(chunks.size()); + std::transform( + schema.children, + schema.children + schema.n_children, + std::back_inserter(columns), + [&](auto const& child) { return make_empty_column_from_schema(child, stream, mr); }); + schema.release(&schema); + return std::make_unique(std::move(columns)); + } + + schema.release(&schema); + + auto chunk_views = std::vector{}; + chunk_views.reserve(chunks.size()); + std::transform( + chunks.begin(), chunks.end(), std::back_inserter(chunk_views), [](auto const& chunk) { + return chunk->view(); + }); + return cudf::detail::concatenate(chunk_views, stream, mr); +} + +} // namespace detail + +std::unique_ptr
from_arrow_stream(ArrowArrayStream* input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::from_arrow_stream(input, stream, mr); +} +} // namespace cudf diff --git a/cpp/src/io/text/multibyte_split.cu b/cpp/src/io/text/multibyte_split.cu index 9c406369068..51dc0ca90af 100644 --- a/cpp/src/io/text/multibyte_split.cu +++ b/cpp/src/io/text/multibyte_split.cu @@ -565,35 +565,32 @@ std::unique_ptr multibyte_split(cudf::io::text::data_chunk_source } // namespace detail +// deprecated in 24.08 std::unique_ptr multibyte_split(cudf::io::text::data_chunk_source const& source, std::string const& delimiter, std::optional byte_range, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - return multibyte_split( - source, delimiter, parse_options{byte_range.value_or(create_byte_range_info_max())}, mr); + return multibyte_split(source, + delimiter, + parse_options{byte_range.value_or(create_byte_range_info_max())}, + stream, + mr); } std::unique_ptr multibyte_split(cudf::io::text::data_chunk_source const& source, std::string const& delimiter, parse_options options, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - auto stream = cudf::get_default_stream(); - auto result = detail::multibyte_split( source, delimiter, options.byte_range, options.strip_delimiters, stream, mr); return result; } -std::unique_ptr multibyte_split(cudf::io::text::data_chunk_source const& source, - std::string const& delimiter, - rmm::device_async_resource_ref mr) -{ - return multibyte_split(source, delimiter, parse_options{}, mr); -} - } // namespace text } // namespace io } // namespace cudf diff --git a/cpp/src/table/table_view.cpp b/cpp/src/table/table_view.cpp index 13832b0d9dc..8a5340dc20d 100644 --- a/cpp/src/table/table_view.cpp +++ b/cpp/src/table/table_view.cpp @@ -52,12 +52,6 @@ auto concatenate_column_views(std::vector const& views) return concat_cols; } -template -ColumnView const& 
table_view_base::column(size_type column_index) const -{ - return _columns.at(column_index); -} - // Explicit instantiation for a table of `column_view`s template class table_view_base; @@ -172,6 +166,7 @@ bool has_nested_columns(table_view const& table) return std::any_of( table.begin(), table.end(), [](column_view const& col) { return is_nested(col.type()); }); } - } // namespace detail + +bool has_nested_columns(table_view const& table) { return detail::has_nested_columns(table); } } // namespace cudf diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index eef09954647..0eab9ba61d8 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -273,6 +273,7 @@ ConfigureTest( interop/from_arrow_test.cpp interop/from_arrow_device_test.cpp interop/from_arrow_host_test.cpp + interop/from_arrow_stream_test.cpp interop/dlpack_test.cpp EXTRA_LIB nanoarrow @@ -691,6 +692,7 @@ ConfigureTest(STREAM_INTEROP_TEST streams/interop_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_JSONIO_TEST streams/io/json_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_LABELING_BINS_TEST streams/labeling_bins_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_LISTS_TEST streams/lists_test.cpp STREAM_MODE testing) +ConfigureTest(STREAM_MULTIBYTE_SPLIT_TEST streams/io/multibyte_split_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_NULL_MASK_TEST streams/null_mask_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_ORCIO_TEST streams/io/orc_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_PARQUETIO_TEST streams/io/parquet_test.cpp STREAM_MODE testing) diff --git a/cpp/tests/copying/concatenate_tests.cpp b/cpp/tests/copying/concatenate_tests.cpp index 054441788d0..18140c34abd 100644 --- a/cpp/tests/copying/concatenate_tests.cpp +++ b/cpp/tests/copying/concatenate_tests.cpp @@ -1667,3 +1667,63 @@ TEST_F(DictionaryConcatTest, ErrorsTest) std::vector empty; EXPECT_THROW(cudf::concatenate(empty), cudf::logic_error); } + +struct EmptyColumnTest : public 
cudf::test::BaseFixture {}; + +TEST_F(EmptyColumnTest, SimpleTest) +{ + std::vector columns; + constexpr auto num_copies = 10; + constexpr auto num_rows = 10; + for (auto i = 0; i < num_copies; ++i) { + columns.emplace_back(cudf::data_type(cudf::type_id::EMPTY), + num_rows, + rmm::device_buffer{}, + rmm::device_buffer{}, + 0); + } + + // Create views from columns + std::vector views; + for (auto& col : columns) { + views.push_back(col.view()); + } + auto result = cudf::concatenate(views); + + ASSERT_EQ(result->size(), num_copies * num_rows); + ASSERT_EQ(result->type().id(), cudf::type_id::EMPTY); +} + +struct TableOfEmptyColumnsTest : public cudf::test::BaseFixture {}; + +TEST_F(TableOfEmptyColumnsTest, SimpleTest) +{ + std::vector tables; + constexpr auto num_copies = 10; + constexpr auto num_rows = 10; + constexpr auto num_columns = 10; + for (auto i = 0; i < num_copies; ++i) { + std::vector> columns; + for (auto j = 0; j < num_columns; ++j) { + columns.push_back(std::make_unique(cudf::data_type(cudf::type_id::EMPTY), + num_rows, + rmm::device_buffer{}, + rmm::device_buffer{}, + 0)); + } + tables.emplace_back(std::move(columns)); + } + + // Create views from columns + std::vector views; + for (auto& tbl : tables) { + views.push_back(tbl.view()); + } + auto result = cudf::concatenate(views); + + ASSERT_EQ(result->num_rows(), num_copies * num_rows); + ASSERT_EQ(result->num_columns(), num_columns); + for (auto i = 0; i < num_columns; ++i) { + ASSERT_EQ(result->get_column(i).type().id(), cudf::type_id::EMPTY); + } +} diff --git a/cpp/tests/interop/from_arrow_stream_test.cpp b/cpp/tests/interop/from_arrow_stream_test.cpp new file mode 100644 index 00000000000..418ec057303 --- /dev/null +++ b/cpp/tests/interop/from_arrow_stream_test.cpp @@ -0,0 +1,121 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "nanoarrow_utils.hpp" + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +struct VectorOfArrays { + std::vector arrays; + nanoarrow::UniqueSchema schema; + size_t index{0}; + + static int get_schema(ArrowArrayStream* stream, ArrowSchema* out_schema) + { + auto private_data = static_cast(stream->private_data); + ArrowSchemaDeepCopy(private_data->schema.get(), out_schema); + return 0; + } + + static int get_next(ArrowArrayStream* stream, ArrowArray* out_array) + { + auto private_data = static_cast(stream->private_data); + if (private_data->index >= private_data->arrays.size()) { + out_array->release = nullptr; + return 0; + } + ArrowArrayMove(private_data->arrays[private_data->index++].get(), out_array); + return 0; + } + + static const char* get_last_error(ArrowArrayStream* stream) { return nullptr; } + + static void release(ArrowArrayStream* stream) + { + delete static_cast(stream->private_data); + } +}; + +struct FromArrowStreamTest : public cudf::test::BaseFixture {}; + +void makeStreamFromArrays(std::vector arrays, + nanoarrow::UniqueSchema schema, + ArrowArrayStream* out) +{ + auto* private_data = new VectorOfArrays{std::move(arrays), std::move(schema)}; + out->get_schema = VectorOfArrays::get_schema; + out->get_next = VectorOfArrays::get_next; + out->get_last_error = VectorOfArrays::get_last_error; + out->release = VectorOfArrays::release; + out->private_data = private_data; +} + 
+TEST_F(FromArrowStreamTest, BasicTest) +{ + constexpr auto num_copies = 3; + std::vector> tables; + // The schema is unique across all tables. + nanoarrow::UniqueSchema schema; + std::vector arrays; + for (auto i = 0; i < num_copies; ++i) { + auto [tbl, sch, arr] = get_nanoarrow_host_tables(0); + tables.push_back(std::move(tbl)); + arrays.push_back(std::move(arr)); + if (i == 0) { sch.move(schema.get()); } + } + std::vector table_views; + for (auto const& table : tables) { + table_views.push_back(table->view()); + } + auto expected = cudf::concatenate(table_views); + + ArrowArrayStream stream; + makeStreamFromArrays(std::move(arrays), std::move(schema), &stream); + auto result = cudf::from_arrow_stream(&stream); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected->view(), result->view()); +} + +TEST_F(FromArrowStreamTest, EmptyTest) +{ + auto [tbl, sch, arr] = get_nanoarrow_host_tables(0); + std::vector table_views{tbl->view()}; + auto expected = cudf::concatenate(table_views); + + ArrowArrayStream stream; + makeStreamFromArrays({}, std::move(sch), &stream); + auto result = cudf::from_arrow_stream(&stream); + cudf::have_same_types(expected->view(), result->view()); +} diff --git a/cpp/tests/interop/nanoarrow_utils.hpp b/cpp/tests/interop/nanoarrow_utils.hpp index 94c4372e74a..4147728b2a6 100644 --- a/cpp/tests/interop/nanoarrow_utils.hpp +++ b/cpp/tests/interop/nanoarrow_utils.hpp @@ -375,3 +375,6 @@ nanoarrow::UniqueArray get_nanoarrow_list_array(std::initializer_list data, std::tuple, nanoarrow::UniqueSchema, generated_test_data> get_nanoarrow_cudf_table(cudf::size_type length); + +std::tuple, nanoarrow::UniqueSchema, nanoarrow::UniqueArray> +get_nanoarrow_host_tables(cudf::size_type length); diff --git a/cpp/tests/io/text/multibyte_split_test.cpp b/cpp/tests/io/text/multibyte_split_test.cpp index 36338253c9b..408d54bd5ff 100644 --- a/cpp/tests/io/text/multibyte_split_test.cpp +++ b/cpp/tests/io/text/multibyte_split_test.cpp @@ -97,10 +97,9 @@ TEST_F(MultibyteSplitTest, 
DelimiterAtEndByteRange) auto expected = strings_column_wrapper{"abcdefg:"}; auto source = cudf::io::text::make_source(host_input); - auto out = cudf::io::text::multibyte_split( - *source, - delimiter, - cudf::io::text::byte_range_info{0, static_cast(host_input.size())}); + cudf::io::text::parse_options options{ + cudf::io::text::byte_range_info{0, static_cast(host_input.size())}}; + auto out = cudf::io::text::multibyte_split(*source, delimiter, options); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out); } @@ -113,10 +112,9 @@ TEST_F(MultibyteSplitTest, DelimiterAtEndByteRange2) auto expected = strings_column_wrapper{"abcdefg:"}; auto source = cudf::io::text::make_source(host_input); - auto out = cudf::io::text::multibyte_split( - *source, - delimiter, - cudf::io::text::byte_range_info{0, static_cast(host_input.size() - 1)}); + cudf::io::text::parse_options options{ + cudf::io::text::byte_range_info{0, static_cast(host_input.size() - 1)}}; + auto out = cudf::io::text::multibyte_split(*source, delimiter, options); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out); } @@ -277,9 +275,12 @@ TEST_F(MultibyteSplitTest, LargeInputMultipleRange) auto source = cudf::io::text::make_source(host_input); auto byte_ranges = cudf::io::text::create_byte_range_infos_consecutive(host_input.size(), 3); - auto out0 = cudf::io::text::multibyte_split(*source, delimiter, byte_ranges[0]); - auto out1 = cudf::io::text::multibyte_split(*source, delimiter, byte_ranges[1]); - auto out2 = cudf::io::text::multibyte_split(*source, delimiter, byte_ranges[2]); + auto out0 = cudf::io::text::multibyte_split( + *source, delimiter, cudf::io::text::parse_options{byte_ranges[0]}); + auto out1 = cudf::io::text::multibyte_split( + *source, delimiter, cudf::io::text::parse_options{byte_ranges[1]}); + auto out2 = cudf::io::text::multibyte_split( + *source, delimiter, cudf::io::text::parse_options{byte_ranges[2]}); auto out_views = std::vector({out0->view(), out1->view(), out2->view()}); auto out = 
cudf::concatenate(out_views); @@ -303,9 +304,12 @@ TEST_F(MultibyteSplitTest, LargeInputSparseMultipleRange) auto source = cudf::io::text::make_source(host_input); auto byte_ranges = cudf::io::text::create_byte_range_infos_consecutive(host_input.size(), 3); - auto out0 = cudf::io::text::multibyte_split(*source, delimiter, byte_ranges[0]); - auto out1 = cudf::io::text::multibyte_split(*source, delimiter, byte_ranges[1]); - auto out2 = cudf::io::text::multibyte_split(*source, delimiter, byte_ranges[2]); + auto out0 = cudf::io::text::multibyte_split( + *source, delimiter, cudf::io::text::parse_options{byte_ranges[0]}); + auto out1 = cudf::io::text::multibyte_split( + *source, delimiter, cudf::io::text::parse_options{byte_ranges[1]}); + auto out2 = cudf::io::text::multibyte_split( + *source, delimiter, cudf::io::text::parse_options{byte_ranges[2]}); auto out_views = std::vector({out0->view(), out1->view(), out2->view()}); auto out = cudf::concatenate(out_views); @@ -327,9 +331,12 @@ TEST_F(MultibyteSplitTest, LargeInputMultipleRangeSingleByte) auto source = cudf::io::text::make_source(host_input); auto byte_ranges = cudf::io::text::create_byte_range_infos_consecutive(host_input.size(), 3); - auto out0 = cudf::io::text::multibyte_split(*source, delimiter, byte_ranges[0]); - auto out1 = cudf::io::text::multibyte_split(*source, delimiter, byte_ranges[1]); - auto out2 = cudf::io::text::multibyte_split(*source, delimiter, byte_ranges[2]); + auto out0 = cudf::io::text::multibyte_split( + *source, delimiter, cudf::io::text::parse_options{byte_ranges[0]}); + auto out1 = cudf::io::text::multibyte_split( + *source, delimiter, cudf::io::text::parse_options{byte_ranges[1]}); + auto out2 = cudf::io::text::multibyte_split( + *source, delimiter, cudf::io::text::parse_options{byte_ranges[2]}); auto out_views = std::vector({out0->view(), out1->view(), out2->view()}); auto out = cudf::concatenate(out_views); @@ -352,9 +359,12 @@ TEST_F(MultibyteSplitTest, 
LargeInputSparseMultipleRangeSingleByte) auto source = cudf::io::text::make_source(host_input); auto byte_ranges = cudf::io::text::create_byte_range_infos_consecutive(host_input.size(), 3); - auto out0 = cudf::io::text::multibyte_split(*source, delimiter, byte_ranges[0]); - auto out1 = cudf::io::text::multibyte_split(*source, delimiter, byte_ranges[1]); - auto out2 = cudf::io::text::multibyte_split(*source, delimiter, byte_ranges[2]); + auto out0 = cudf::io::text::multibyte_split( + *source, delimiter, cudf::io::text::parse_options{byte_ranges[0]}); + auto out1 = cudf::io::text::multibyte_split( + *source, delimiter, cudf::io::text::parse_options{byte_ranges[1]}); + auto out2 = cudf::io::text::multibyte_split( + *source, delimiter, cudf::io::text::parse_options{byte_ranges[2]}); auto out_views = std::vector({out0->view(), out1->view(), out2->view()}); auto out = cudf::concatenate(out_views); @@ -383,9 +393,14 @@ TEST_F(MultibyteSplitTest, SmallInputAllPossibleRanges) SCOPED_TRACE(split1); for (int split2 = split1 + 1; split2 < size; split2++) { SCOPED_TRACE(split2); - auto out1 = multibyte_split(*source, delimiter, byte_range_info{0, split1}); - auto out2 = multibyte_split(*source, delimiter, byte_range_info{split1, split2 - split1}); - auto out3 = multibyte_split(*source, delimiter, byte_range_info{split2, size - split2}); + auto out1 = multibyte_split( + *source, delimiter, cudf::io::text::parse_options{byte_range_info{0, split1}}); + auto out2 = + multibyte_split(*source, + delimiter, + cudf::io::text::parse_options{byte_range_info{split1, split2 - split1}}); + auto out3 = multibyte_split( + *source, delimiter, cudf::io::text::parse_options{byte_range_info{split2, size - split2}}); auto out_views = std::vector({out1->view(), out2->view(), out3->view()}); auto out = cudf::concatenate(out_views); @@ -416,9 +431,14 @@ TEST_F(MultibyteSplitTest, SmallInputAllPossibleRangesSingleByte) SCOPED_TRACE(split1); for (int split2 = split1 + 1; split2 < size; split2++) { 
SCOPED_TRACE(split2); - auto out1 = multibyte_split(*source, delimiter, byte_range_info{0, split1}); - auto out2 = multibyte_split(*source, delimiter, byte_range_info{split1, split2 - split1}); - auto out3 = multibyte_split(*source, delimiter, byte_range_info{split2, size - split2}); + auto out1 = multibyte_split( + *source, delimiter, cudf::io::text::parse_options{byte_range_info{0, split1}}); + auto out2 = + multibyte_split(*source, + delimiter, + cudf::io::text::parse_options{byte_range_info{split1, split2 - split1}}); + auto out3 = multibyte_split( + *source, delimiter, cudf::io::text::parse_options{byte_range_info{split2, size - split2}}); auto out_views = std::vector({out1->view(), out2->view(), out3->view()}); auto out = cudf::concatenate(out_views); @@ -441,7 +461,8 @@ TEST_F(MultibyteSplitTest, SingletonRangeAtEnd) auto source = make_source(host_input); auto expected = strings_column_wrapper{}; - auto out = multibyte_split(*source, delimiter, byte_range_info{5, 1}); + auto out = + multibyte_split(*source, delimiter, cudf::io::text::parse_options{byte_range_info{5, 1}}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out, cudf::test::debug_output_level::ALL_ERRORS); } @@ -480,7 +501,8 @@ TEST_F(MultibyteSplitTest, EmptyRange) auto source = make_source(host_input); auto expected = strings_column_wrapper{}; - auto out = multibyte_split(*source, delimiter, byte_range_info{4, 0}); + auto out = + multibyte_split(*source, delimiter, cudf::io::text::parse_options{byte_range_info{4, 0}}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out, cudf::test::debug_output_level::ALL_ERRORS); } @@ -493,7 +515,8 @@ TEST_F(MultibyteSplitTest, EmptyRangeSingleByte) auto source = make_source(host_input); auto expected = strings_column_wrapper{}; - auto out = multibyte_split(*source, delimiter, byte_range_info{3, 0}); + auto out = + multibyte_split(*source, delimiter, cudf::io::text::parse_options{byte_range_info{3, 0}}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out, 
cudf::test::debug_output_level::ALL_ERRORS); } diff --git a/cpp/tests/streams/io/multibyte_split_test.cpp b/cpp/tests/streams/io/multibyte_split_test.cpp new file mode 100644 index 00000000000..b0eff1d3340 --- /dev/null +++ b/cpp/tests/streams/io/multibyte_split_test.cpp @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include +#include + +#include + +class MultibyteSplitTest : public cudf::test::BaseFixture {}; + +TEST_F(MultibyteSplitTest, Reader) +{ + auto delimiter = std::string(":"); + auto host_input = std::string("abc:def"); + auto source = cudf::io::text::make_source(host_input); + cudf::io::text::parse_options options{}; + auto result = + cudf::io::text::multibyte_split(*source, delimiter, options, cudf::test::get_default_stream()); +} diff --git a/cpp/tests/table/experimental_row_operator_tests.cu b/cpp/tests/table/experimental_row_operator_tests.cu index 896cc7a82d4..0d9e4e27f2c 100644 --- a/cpp/tests/table/experimental_row_operator_tests.cu +++ b/cpp/tests/table/experimental_row_operator_tests.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -109,15 +109,14 @@ TYPED_TEST(TypedTableViewTest, TestSortSameTableFromTwoTables) auto const lhs = cudf::table_view{{col1}}; auto const empty_rhs = cudf::table_view{{col2}}; - auto const stream = cudf::get_default_stream(); - auto const test_sort = [stream](auto const& preprocessed, - auto const& input, - auto const& comparator, - auto const& expected) { - auto const order = sorted_order( - preprocessed, input.num_rows(), cudf::detail::has_nested_columns(input), comparator, stream); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, order->view()); - }; + auto const stream = cudf::get_default_stream(); + auto const test_sort = + [stream]( + auto const& preprocessed, auto const& input, auto const& comparator, auto const& expected) { + auto const order = sorted_order( + preprocessed, input.num_rows(), cudf::has_nested_columns(input), comparator, stream); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, order->view()); + }; auto const test_sort_two_tables = [&](auto const& preprocessed_lhs, auto const& preprocessed_empty_rhs) { @@ -188,15 +187,14 @@ TYPED_TEST(TypedTableViewTest, TestSortSameTableFromTwoTablesWithListsOfStructs) auto const lhs = cudf::table_view{{*col1}}; auto const empty_rhs = cudf::table_view{{*col2}}; - auto const stream = cudf::get_default_stream(); - auto const test_sort = [stream](auto const& preprocessed, - auto const& input, - auto const& comparator, - auto const& expected) { - auto const order = sorted_order( - preprocessed, input.num_rows(), cudf::detail::has_nested_columns(input), comparator, stream); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, order->view()); - }; + auto const stream = cudf::get_default_stream(); + auto const test_sort = + [stream]( + auto const& preprocessed, auto const& input, auto const& comparator, auto const& expected) { + auto const order = sorted_order( + preprocessed, input.num_rows(), cudf::has_nested_columns(input), comparator, stream); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, order->view()); + }; auto const 
test_sort_two_tables = [&](auto const& preprocessed_lhs, auto const& preprocessed_empty_rhs) { diff --git a/cpp/tests/table/row_operator_tests_utilities.cu b/cpp/tests/table/row_operator_tests_utilities.cu index cfffa1cdd54..6127864987d 100644 --- a/cpp/tests/table/row_operator_tests_utilities.cu +++ b/cpp/tests/table/row_operator_tests_utilities.cu @@ -42,7 +42,7 @@ std::unique_ptr two_table_comparison(cudf::table_view lhs, auto output = cudf::make_numeric_column( cudf::data_type(cudf::type_id::BOOL8), lhs.num_rows(), cudf::mask_state::UNALLOCATED); - if (cudf::detail::has_nested_columns(lhs) || cudf::detail::has_nested_columns(rhs)) { + if (cudf::has_nested_columns(lhs) || cudf::has_nested_columns(rhs)) { thrust::transform(rmm::exec_policy(stream), lhs_it, lhs_it + lhs.num_rows(), @@ -129,7 +129,7 @@ std::unique_ptr two_table_equality(cudf::table_view lhs, auto output = cudf::make_numeric_column( cudf::data_type(cudf::type_id::BOOL8), lhs.num_rows(), cudf::mask_state::UNALLOCATED); - if (cudf::detail::has_nested_columns(lhs) or cudf::detail::has_nested_columns(rhs)) { + if (cudf::has_nested_columns(lhs) or cudf::has_nested_columns(rhs)) { auto const equal_comparator = table_comparator.equal_to(cudf::nullate::NO{}, cudf::null_equality::EQUAL, comparator); diff --git a/cpp/tests/table/row_operator_tests_utilities2.cu b/cpp/tests/table/row_operator_tests_utilities2.cu index 057d9ee1004..17d274eba13 100644 --- a/cpp/tests/table/row_operator_tests_utilities2.cu +++ b/cpp/tests/table/row_operator_tests_utilities2.cu @@ -41,7 +41,7 @@ std::unique_ptr self_comparison(cudf::table_view input, auto output = cudf::make_numeric_column( cudf::data_type(cudf::type_id::BOOL8), input.num_rows(), cudf::mask_state::UNALLOCATED); - if (cudf::detail::has_nested_columns(input)) { + if (cudf::has_nested_columns(input)) { thrust::transform(rmm::exec_policy(stream), thrust::make_counting_iterator(0), thrust::make_counting_iterator(input.num_rows()), diff --git a/dependencies.yaml 
b/dependencies.yaml index 38ec30a8033..e3f8a72e76c 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -287,8 +287,8 @@ dependencies: - output_types: conda packages: - fmt>=10.1.1,<11 - - librmm==24.8.* - - libkvikio==24.8.* + - librmm==24.8.*,>=0.0.0a0 + - libkvikio==24.8.*,>=0.0.0a0 - librdkafka>=1.9.0,<1.10.0a0 # Align nvcomp version with rapids-cmake - nvcomp==3.0.6 @@ -500,7 +500,7 @@ dependencies: - output_types: [conda] packages: - breathe>=4.35.0 - - dask-cuda==24.8.* + - dask-cuda==24.8.*,>=0.0.0a0 - *doxygen - make - myst-nb @@ -582,7 +582,7 @@ dependencies: matrices: - matrix: {cuda: "12.*"} packages: - - pynvjitlink + - pynvjitlink>=0.0.0a0 - matrix: {cuda: "11.*"} packages: - cubinlinker @@ -592,7 +592,7 @@ dependencies: - matrix: {cuda: "12.*"} packages: - rmm-cu12==24.8.*,>=0.0.0a0 - - pynvjitlink-cu12 + - pynvjitlink-cu12>=0.0.0a0 - matrix: {cuda: "11.*"} packages: - rmm-cu11==24.8.*,>=0.0.0a0 @@ -603,7 +603,7 @@ dependencies: common: - output_types: [conda, requirements, pyproject] packages: - - polars>=0.20.30 + - polars>=1.0 run_dask_cudf: common: - output_types: [conda, requirements, pyproject] diff --git a/docs/cudf/source/conf.py b/docs/cudf/source/conf.py index 108f12bc099..c3c14ac8cad 100644 --- a/docs/cudf/source/conf.py +++ b/docs/cudf/source/conf.py @@ -372,7 +372,7 @@ def _generate_namespaces(namespaces): _all_namespaces = _generate_namespaces( { # Note that io::datasource is actually a nested class - "cudf": {"io", "io::datasource", "strings", "ast", "ast::expression"}, + "cudf": {"io", "io::datasource", "strings", "ast", "ast::expression", "io::text"}, "numeric": {}, "nvtext": {}, } diff --git a/python/cudf/cudf/_lib/pylibcudf/interop.pyx b/python/cudf/cudf/_lib/pylibcudf/interop.pyx index 07e9d1ead11..adf7e1fd7e8 100644 --- a/python/cudf/cudf/_lib/pylibcudf/interop.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/interop.pyx @@ -1,5 +1,6 @@ # Copyright (c) 2023-2024, NVIDIA CORPORATION. 
+from cpython cimport pycapsule from cython.operator cimport dereference from libcpp.memory cimport shared_ptr, unique_ptr from libcpp.utility cimport move @@ -11,9 +12,15 @@ from functools import singledispatch from pyarrow import lib as pa +from cudf._lib.pylibcudf.libcudf.column.column cimport column from cudf._lib.pylibcudf.libcudf.interop cimport ( + ArrowArray, + ArrowArrayStream, + ArrowSchema, column_metadata, from_arrow as cpp_from_arrow, + from_arrow_column as cpp_from_arrow_column, + from_arrow_stream as cpp_from_arrow_stream, to_arrow as cpp_to_arrow, ) from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport ( @@ -124,11 +131,15 @@ def _from_arrow_datatype(pyarrow_object): def _from_arrow_table(pyarrow_object, *, DataType data_type=None): if data_type is not None: raise ValueError("data_type may not be passed for tables") - cdef shared_ptr[pa.CTable] arrow_table = pa.pyarrow_unwrap_table(pyarrow_object) + stream = pyarrow_object.__arrow_c_stream__() + cdef ArrowArrayStream* c_stream = ( + pycapsule.PyCapsule_GetPointer(stream, "arrow_array_stream") + ) cdef unique_ptr[table] c_result with nogil: - c_result = move(cpp_from_arrow(dereference(arrow_table))) + # The libcudf function here will release the stream. 
+ c_result = move(cpp_from_arrow_stream(c_stream)) return Table.from_libcudf(move(c_result)) @@ -190,8 +201,25 @@ def _from_arrow_scalar(pyarrow_object, *, DataType data_type=None): def _from_arrow_column(pyarrow_object, *, DataType data_type=None): if data_type is not None: raise ValueError("data_type may not be passed for arrays") - pa_table = pa.table([pyarrow_object], [""]) - return from_arrow(pa_table).columns()[0] + + schema, array = pyarrow_object.__arrow_c_array__() + cdef ArrowSchema* c_schema = ( + pycapsule.PyCapsule_GetPointer(schema, "arrow_schema") + ) + cdef ArrowArray* c_array = ( + pycapsule.PyCapsule_GetPointer(array, "arrow_array") + ) + + cdef unique_ptr[column] c_result + with nogil: + c_result = move(cpp_from_arrow_column(c_schema, c_array)) + + # The capsule destructors should release automatically for us, but we + # choose to do it explicitly here for clarity. + c_schema.release(c_schema) + c_array.release(c_array) + + return Column.from_libcudf(move(c_result)) @singledispatch diff --git a/python/cudf/cudf/_lib/pylibcudf/io/types.pyx b/python/cudf/cudf/_lib/pylibcudf/io/types.pyx index eb1017c0663..34faa9aae23 100644 --- a/python/cudf/cudf/_lib/pylibcudf/io/types.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/io/types.pyx @@ -260,7 +260,7 @@ cdef class SinkInfo: unique_ptr[data_sink](new iobase_data_sink(s.buffer)) ) data_sinks.push_back(self.sink_storage.back().get()) - elif isinstance(sinks[0], str): + elif initial_sink_cls is str: paths.reserve(len(sinks)) for s in sinks: paths.push_back( s.encode()) diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/interop.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/interop.pxd index 471b78505fb..2151da28d4b 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/interop.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/interop.pxd @@ -7,6 +7,7 @@ from pyarrow.lib cimport CScalar, CTable from cudf._lib.types import cudf_to_np_types, np_to_cudf_types +from cudf._lib.pylibcudf.libcudf.column.column cimport 
column from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar from cudf._lib.pylibcudf.libcudf.table.table cimport table from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view @@ -16,6 +17,19 @@ cdef extern from "dlpack/dlpack.h" nogil: ctypedef struct DLManagedTensor: void(*deleter)(DLManagedTensor*) except + + +# The Arrow structs are not namespaced. +cdef extern from "cudf/interop.hpp" nogil: + cdef struct ArrowSchema: + void (*release)(ArrowSchema*) noexcept nogil + + cdef struct ArrowArray: + void (*release)(ArrowArray*) noexcept nogil + + cdef struct ArrowArrayStream: + void (*release)(ArrowArrayStream*) noexcept nogil + + cdef extern from "cudf/interop.hpp" namespace "cudf" \ nogil: cdef unique_ptr[table] from_dlpack(const DLManagedTensor* tensor @@ -42,3 +56,9 @@ cdef extern from "cudf/interop.hpp" namespace "cudf" \ const scalar& input, column_metadata metadata, ) except + + + cdef unique_ptr[table] from_arrow_stream(ArrowArrayStream* input) except + + cdef unique_ptr[column] from_arrow_column( + const ArrowSchema* schema, + const ArrowArray* input + ) except + diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 5db6fd904a9..e7a2863da8c 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -959,6 +959,15 @@ def can_cast_safely(self, to_dtype: Dtype) -> bool: raise NotImplementedError() def astype(self, dtype: Dtype, copy: bool = False) -> ColumnBase: + if len(self) == 0: + dtype = cudf.dtype(dtype) + if self.dtype == dtype: + if copy: + return self.copy() + else: + return self + else: + return column_empty(0, dtype=dtype, masked=self.nullable) if copy: col = self.copy() else: diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index 121076b69ce..c10aceba9f4 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -280,8 +280,8 @@ def __contains__(self, 
item: ScalarLike) -> bool: return False elif ts.tzinfo is not None: ts = ts.tz_convert(None) - return ts.to_numpy().astype("int64") in self.as_numerical_column( - "int64" + return ts.to_numpy().astype("int64") in cast( + "cudf.core.column.NumericalColumn", self.astype("int64") ) @functools.cached_property @@ -503,9 +503,9 @@ def mean( self, skipna=None, min_count: int = 0, dtype=np.float64 ) -> ScalarLike: return pd.Timestamp( - self.as_numerical_column("int64").mean( - skipna=skipna, min_count=min_count, dtype=dtype - ), + cast( + "cudf.core.column.NumericalColumn", self.astype("int64") + ).mean(skipna=skipna, min_count=min_count, dtype=dtype), unit=self.time_unit, ).as_unit(self.time_unit) @@ -517,7 +517,7 @@ def std( ddof: int = 1, ) -> pd.Timedelta: return pd.Timedelta( - self.as_numerical_column("int64").std( + cast("cudf.core.column.NumericalColumn", self.astype("int64")).std( skipna=skipna, min_count=min_count, dtype=dtype, ddof=ddof ) * _unit_to_nanoseconds_conversion[self.time_unit], @@ -525,7 +525,9 @@ def std( def median(self, skipna: bool | None = None) -> pd.Timestamp: return pd.Timestamp( - self.as_numerical_column("int64").median(skipna=skipna), + cast( + "cudf.core.column.NumericalColumn", self.astype("int64") + ).median(skipna=skipna), unit=self.time_unit, ).as_unit(self.time_unit) @@ -534,18 +536,18 @@ def cov(self, other: DatetimeColumn) -> float: raise TypeError( f"cannot perform cov with types {self.dtype}, {other.dtype}" ) - return self.as_numerical_column("int64").cov( - other.as_numerical_column("int64") - ) + return cast( + "cudf.core.column.NumericalColumn", self.astype("int64") + ).cov(cast("cudf.core.column.NumericalColumn", other.astype("int64"))) def corr(self, other: DatetimeColumn) -> float: if not isinstance(other, DatetimeColumn): raise TypeError( f"cannot perform corr with types {self.dtype}, {other.dtype}" ) - return self.as_numerical_column("int64").corr( - other.as_numerical_column("int64") - ) + return cast( + 
"cudf.core.column.NumericalColumn", self.astype("int64") + ).corr(cast("cudf.core.column.NumericalColumn", other.astype("int64"))) def quantile( self, @@ -554,7 +556,7 @@ def quantile( exact: bool, return_scalar: bool, ) -> ColumnBase: - result = self.as_numerical_column("int64").quantile( + result = self.astype("int64").quantile( q=q, interpolation=interpolation, exact=exact, @@ -645,12 +647,12 @@ def indices_of( ) -> cudf.core.column.NumericalColumn: value = column.as_column( pd.to_datetime(value), dtype=self.dtype - ).as_numerical_column("int64") - return self.as_numerical_column("int64").indices_of(value) + ).astype("int64") + return self.astype("int64").indices_of(value) @property def is_unique(self) -> bool: - return self.as_numerical_column("int64").is_unique + return self.astype("int64").is_unique def isin(self, values: Sequence) -> ColumnBase: return cudf.core.tools.datetimes._isin_datetimelike(self, values) diff --git a/python/cudf/cudf/core/column/decimal.py b/python/cudf/cudf/core/column/decimal.py index d66908b5f94..3e238d65cff 100644 --- a/python/cudf/cudf/core/column/decimal.py +++ b/python/cudf/cudf/core/column/decimal.py @@ -157,7 +157,7 @@ def normalize_binop_value(self, other): "Decimal columns only support binary operations with " "integer numerical columns." 
) - other = other.as_decimal_column( + other = other.astype( self.dtype.__class__(self.dtype.__class__.MAX_PRECISION, 0) ) elif not isinstance(other, DecimalBaseColumn): diff --git a/python/cudf/cudf/core/column/interval.py b/python/cudf/cudf/core/column/interval.py index f24ca3fdad1..d09a1f66539 100644 --- a/python/cudf/cudf/core/column/interval.py +++ b/python/cudf/cudf/core/column/interval.py @@ -4,7 +4,7 @@ import cudf from cudf.core.column import StructColumn -from cudf.core.dtypes import CategoricalDtype, IntervalDtype +from cudf.core.dtypes import IntervalDtype class IntervalColumn(StructColumn): @@ -87,20 +87,16 @@ def copy(self, deep=True): def as_interval_column(self, dtype): if isinstance(dtype, IntervalDtype): - if isinstance(self.dtype, CategoricalDtype): - new_struct = self._get_decategorized_column() - return IntervalColumn.from_struct_column(new_struct) - else: - return IntervalColumn( - size=self.size, - dtype=dtype, - mask=self.mask, - offset=self.offset, - null_count=self.null_count, - children=tuple( - child.astype(dtype.subtype) for child in self.children - ), - ) + return IntervalColumn( + size=self.size, + dtype=dtype, + mask=self.mask, + offset=self.offset, + null_count=self.null_count, + children=tuple( + child.astype(dtype.subtype) for child in self.children + ), + ) else: raise ValueError("dtype must be IntervalDtype") diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index 8f41bcb6422..5a0171bbbdc 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -107,7 +107,9 @@ def __contains__(self, item: DatetimeLikeScalar) -> bool: # np.timedelta64 raises ValueError, hence `item` # cannot exist in `self`. 
return False - return item.view("int64") in self.as_numerical_column("int64") + return item.view("int64") in cast( + "cudf.core.column.NumericalColumn", self.astype("int64") + ) @property def values(self): @@ -132,9 +134,7 @@ def to_arrow(self) -> pa.Array: self.mask_array_view(mode="read").copy_to_host() ) data = pa.py_buffer( - self.as_numerical_column("int64") - .data_array_view(mode="read") - .copy_to_host() + self.astype("int64").data_array_view(mode="read").copy_to_host() ) pa_dtype = np_to_pa_dtype(self.dtype) return pa.Array.from_buffers( @@ -295,13 +295,17 @@ def as_timedelta_column( def mean(self, skipna=None, dtype: Dtype = np.float64) -> pd.Timedelta: return pd.Timedelta( - self.as_numerical_column("int64").mean(skipna=skipna, dtype=dtype), + cast( + "cudf.core.column.NumericalColumn", self.astype("int64") + ).mean(skipna=skipna, dtype=dtype), unit=self.time_unit, ).as_unit(self.time_unit) def median(self, skipna: bool | None = None) -> pd.Timedelta: return pd.Timedelta( - self.as_numerical_column("int64").median(skipna=skipna), + cast( + "cudf.core.column.NumericalColumn", self.astype("int64") + ).median(skipna=skipna), unit=self.time_unit, ).as_unit(self.time_unit) @@ -315,7 +319,7 @@ def quantile( exact: bool, return_scalar: bool, ) -> ColumnBase: - result = self.as_numerical_column("int64").quantile( + result = self.astype("int64").quantile( q=q, interpolation=interpolation, exact=exact, @@ -337,7 +341,7 @@ def sum( # Since sum isn't overridden in Numerical[Base]Column, mypy only # sees the signature from Reducible (which doesn't have the extra # parameters from ColumnBase._reduce) so we have to ignore this. 
- self.as_numerical_column("int64").sum( # type: ignore + self.astype("int64").sum( # type: ignore skipna=skipna, min_count=min_count, dtype=dtype ), unit=self.time_unit, @@ -351,7 +355,7 @@ def std( ddof: int = 1, ) -> pd.Timedelta: return pd.Timedelta( - self.as_numerical_column("int64").std( + cast("cudf.core.column.NumericalColumn", self.astype("int64")).std( skipna=skipna, min_count=min_count, ddof=ddof, dtype=dtype ), unit=self.time_unit, @@ -362,18 +366,18 @@ def cov(self, other: TimeDeltaColumn) -> float: raise TypeError( f"cannot perform cov with types {self.dtype}, {other.dtype}" ) - return self.as_numerical_column("int64").cov( - other.as_numerical_column("int64") - ) + return cast( + "cudf.core.column.NumericalColumn", self.astype("int64") + ).cov(cast("cudf.core.column.NumericalColumn", other.astype("int64"))) def corr(self, other: TimeDeltaColumn) -> float: if not isinstance(other, TimeDeltaColumn): raise TypeError( f"cannot perform corr with types {self.dtype}, {other.dtype}" ) - return self.as_numerical_column("int64").corr( - other.as_numerical_column("int64") - ) + return cast( + "cudf.core.column.NumericalColumn", self.astype("int64") + ).corr(cast("cudf.core.column.NumericalColumn", other.astype("int64"))) def components(self) -> dict[str, ColumnBase]: """ diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 4dfeb68b7ba..b249410c2e4 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -2404,7 +2404,7 @@ def scatter_by_map( if isinstance(map_index, cudf.core.column.StringColumn): cat_index = cast( cudf.core.column.CategoricalColumn, - map_index.as_categorical_column("category"), + map_index.astype("category"), ) map_index = cat_index.codes warnings.warn( diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index 034849d0e71..de715191c08 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -937,7 +937,7 @@ def 
to_pandas(self) -> pd.IntervalDtype: def __eq__(self, other): if isinstance(other, str): # This means equality isn't transitive but mimics pandas - return other == self.name + return other in (self.name, str(self)) return ( type(self) == type(other) and self.subtype == other.subtype diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 9bac75dc6ac..253d200f7d4 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -927,7 +927,7 @@ def from_arrow(cls, data: pa.Table) -> Self: # of column is 0 (i.e., empty) then we will have an # int8 column in result._data[name] returned by libcudf, # which needs to be type-casted to 'category' dtype. - result[name] = result[name].as_categorical_column("category") + result[name] = result[name].astype("category") elif ( pandas_dtypes.get(name) == "empty" and np_dtypes.get(name) == "object" @@ -936,7 +936,7 @@ def from_arrow(cls, data: pa.Table) -> Self: # is specified as 'empty' and np_dtypes as 'object', # hence handling this special case to type-cast the empty # float column to str column. 
- result[name] = result[name].as_string_column(cudf.dtype("str")) + result[name] = result[name].astype(cudf.dtype("str")) elif name in data.column_names and isinstance( data[name].type, ( diff --git a/python/cudf/cudf/core/indexing_utils.py b/python/cudf/cudf/core/indexing_utils.py index 73a1cd26367..a5fed02cbed 100644 --- a/python/cudf/cudf/core/indexing_utils.py +++ b/python/cudf/cudf/core/indexing_utils.py @@ -229,7 +229,7 @@ def parse_row_iloc_indexer(key: Any, n: int) -> IndexingSpec: else: key = cudf.core.column.as_column(key) if isinstance(key, cudf.core.column.CategoricalColumn): - key = key.as_numerical_column(key.codes.dtype) + key = key.astype(key.codes.dtype) if is_bool_dtype(key.dtype): return MaskIndexer(BooleanMask(key, n)) elif len(key) == 0: diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 97b6bbec2d4..4a60470fafa 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -3107,10 +3107,12 @@ def value_counts( # Pandas returns an IntervalIndex as the index of res # this condition makes sure we do too if bins is given if bins is not None and len(res) == len(res.index.categories): - int_index = IntervalColumn.as_interval_column( - res.index._column, res.index.categories.dtype + interval_col = IntervalColumn.from_struct_column( + res.index._column._get_decategorized_column() + ) + res.index = cudf.IntervalIndex._from_data( + {res.index.name: interval_col} ) - res.index = int_index res.name = result_name return res diff --git a/python/cudf/cudf/core/tools/numeric.py b/python/cudf/cudf/core/tools/numeric.py index 68b23f1e059..ef6b86a04a7 100644 --- a/python/cudf/cudf/core/tools/numeric.py +++ b/python/cudf/cudf/core/tools/numeric.py @@ -115,11 +115,11 @@ def to_numeric(arg, errors="raise", downcast=None): dtype = col.dtype if is_datetime_dtype(dtype) or is_timedelta_dtype(dtype): - col = col.as_numerical_column(cudf.dtype("int64")) + col = col.astype(cudf.dtype("int64")) elif 
isinstance(dtype, CategoricalDtype): cat_dtype = col.dtype.type if _is_non_decimal_numeric_dtype(cat_dtype): - col = col.as_numerical_column(cat_dtype) + col = col.astype(cat_dtype) else: try: col = _convert_str_col( @@ -146,8 +146,8 @@ def to_numeric(arg, errors="raise", downcast=None): raise ValueError("Unrecognized datatype") # str->float conversion may require lower precision - if col.dtype == cudf.dtype("f"): - col = col.as_numerical_column("d") + if col.dtype == cudf.dtype("float32"): + col = col.astype("float64") if downcast: if downcast == "float": @@ -205,7 +205,7 @@ def _convert_str_col(col, errors, _downcast=None): is_integer = libstrings.is_integer(col) if is_integer.all(): - return col.as_numerical_column(dtype=cudf.dtype("i8")) + return col.astype(dtype=cudf.dtype("i8")) col = _proc_inf_empty_strings(col) @@ -218,9 +218,9 @@ def _convert_str_col(col, errors, _downcast=None): "limited by float32 precision." ) ) - return col.as_numerical_column(dtype=cudf.dtype("f")) + return col.astype(dtype=cudf.dtype("float32")) else: - return col.as_numerical_column(dtype=cudf.dtype("d")) + return col.astype(dtype=cudf.dtype("float64")) else: if errors == "coerce": col = libcudf.string_casting.stod(col) diff --git a/python/cudf/cudf/tests/test_interval.py b/python/cudf/cudf/tests/test_interval.py index 1b395c09ba8..5eeea87d8e0 100644 --- a/python/cudf/cudf/tests/test_interval.py +++ b/python/cudf/cudf/tests/test_interval.py @@ -188,3 +188,9 @@ def test_from_pandas_intervaldtype(): result = cudf.from_pandas(dtype) expected = cudf.IntervalDtype("int64", closed="left") assert_eq(result, expected) + + +def test_intervaldtype_eq_string_with_attributes(): + dtype = cudf.IntervalDtype("int64", closed="left") + assert dtype == "interval" + assert dtype == "interval[int64, left]" diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index 467d0c46ae7..8ed78d804bf 100644 --- a/python/cudf/cudf/tests/test_series.py +++ 
b/python/cudf/cudf/tests/test_series.py @@ -2757,8 +2757,6 @@ def test_series_from_large_string(pa_type): assert_eq(expected, got) - assert pa_string_array.equals(got.to_arrow()) - @pytest.mark.parametrize( "scalar", @@ -2873,3 +2871,42 @@ def test_nunique_all_null(dropna): result = pd_ser.nunique(dropna=dropna) expected = cudf_ser.nunique(dropna=dropna) assert result == expected + + +@pytest.mark.parametrize( + "type1", + [ + "category", + "interval[int64, right]", + "int64", + "float64", + "str", + "datetime64[ns]", + "timedelta64[ns]", + ], +) +@pytest.mark.parametrize( + "type2", + [ + "category", + "interval[int64, right]", + "int64", + "float64", + "str", + "datetime64[ns]", + "timedelta64[ns]", + ], +) +@pytest.mark.parametrize( + "as_dtype", [lambda x: x, cudf.dtype], ids=["string", "object"] +) +@pytest.mark.parametrize("copy", [True, False]) +def test_empty_astype_always_castable(type1, type2, as_dtype, copy): + ser = cudf.Series([], dtype=as_dtype(type1)) + result = ser.astype(as_dtype(type2), copy=copy) + expected = cudf.Series([], dtype=as_dtype(type2)) + assert_eq(result, expected) + if not copy and cudf.dtype(type1) == cudf.dtype(type2): + assert ser._column is result._column + else: + assert ser._column is not result._column diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py index 16cfd9b9749..fe859c8d958 100644 --- a/python/cudf_polars/cudf_polars/dsl/expr.py +++ b/python/cudf_polars/cudf_polars/dsl/expr.py @@ -978,15 +978,15 @@ def collect_agg(self, *, depth: int) -> AggInfo: class Agg(Expr): __slots__ = ("name", "options", "op", "request", "children") _non_child = ("dtype", "name", "options") - children: tuple[Expr] + children: tuple[Expr, ...] 
def __init__( - self, dtype: plc.DataType, name: str, options: Any, value: Expr + self, dtype: plc.DataType, name: str, options: Any, *children: Expr ) -> None: super().__init__(dtype) self.name = name self.options = options - self.children = (value,) + self.children = children if name not in Agg._SUPPORTED: raise NotImplementedError( f"Unsupported aggregation {name=}" diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index abe26b14a90..9b3096becd4 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -15,6 +15,7 @@ import dataclasses import itertools +import json import types from functools import cache from typing import TYPE_CHECKING, Any, Callable, ClassVar @@ -180,8 +181,10 @@ def __post_init__(self): class Scan(IR): """Input from files.""" - typ: Any + typ: str """What type of file are we reading? Parquet, CSV, etc...""" + options: tuple[Any, ...] + """Type specific options, as json-encoded strings.""" paths: list[str] """List of paths to read from.""" file_options: Any @@ -211,17 +214,21 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: with_columns = options.with_columns row_index = options.row_index if self.typ == "csv": + opts, cloud_opts = map(json.loads, self.options) df = DataFrame.from_cudf( cudf.concat( [cudf.read_csv(p, usecols=with_columns) for p in self.paths] ) ) elif self.typ == "parquet": + opts, cloud_opts = map(json.loads, self.options) cdf = cudf.read_parquet(self.paths, columns=with_columns) assert isinstance(cdf, cudf.DataFrame) df = DataFrame.from_cudf(cdf) else: - assert_never(self.typ) + raise NotImplementedError( + f"Unhandled scan type: {self.typ}" + ) # pragma: no cover; post init trips first if row_index is not None: name, offset = row_index dtype = self.schema[name] diff --git a/python/cudf_polars/cudf_polars/dsl/translate.py b/python/cudf_polars/cudf_polars/dsl/translate.py index f4bf07ae1e0..4bb31b424ab 
100644 --- a/python/cudf_polars/cudf_polars/dsl/translate.py +++ b/python/cudf_polars/cudf_polars/dsl/translate.py @@ -87,9 +87,11 @@ def _( def _( node: pl_ir.Scan, visitor: NodeTraverser, schema: dict[str, plc.DataType] ) -> ir.IR: + typ, *options = node.scan_type return ir.Scan( schema, - node.scan_type, + typ, + tuple(options), node.paths, node.file_options, translate_named_expr(visitor, n=node.predicate) @@ -445,7 +447,17 @@ def _(node: pl_expr.Agg, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Ex dtype, node.name, node.options, - translate_expr(visitor, n=node.arguments), + *(translate_expr(visitor, n=n) for n in node.arguments), + ) + + +@_translate_expr.register +def _(node: pl_expr.Ternary, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Expr: + return expr.Ternary( + dtype, + translate_expr(visitor, n=node.predicate), + translate_expr(visitor, n=node.truthy), + translate_expr(visitor, n=node.falsy), ) diff --git a/python/cudf_polars/pyproject.toml b/python/cudf_polars/pyproject.toml index effa4861e0c..bf4673fcc50 100644 --- a/python/cudf_polars/pyproject.toml +++ b/python/cudf_polars/pyproject.toml @@ -20,7 +20,7 @@ license = { text = "Apache 2.0" } requires-python = ">=3.9" dependencies = [ "cudf==24.8.*,>=0.0.0a0", - "polars>=0.20.30", + "polars>=1.0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. classifiers = [ "Intended Audience :: Developers",