From 1cea1eaf6c1e87e65729897dd9bbedc4bdc5e7ab Mon Sep 17 00:00:00 2001 From: Kyle Edwards Date: Thu, 25 Jul 2024 16:26:34 -0400 Subject: [PATCH 01/12] Don't export bs_thread_pool (#16398) ## Description cudf does not currently export any headers that depend on bs_thread_pool, and having it as a dependency is currently causing problems for consumers. Avoid exporting it since it's not needed. ## Checklist - [ ] I am familiar with the [Contributing Guidelines](https://github.com/rapidsai/cudf/blob/HEAD/CONTRIBUTING.md). - [ ] New or existing tests cover these changes. - [ ] The documentation is up to date with these changes. --- cpp/cmake/thirdparty/get_thread_pool.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/cmake/thirdparty/get_thread_pool.cmake b/cpp/cmake/thirdparty/get_thread_pool.cmake index 235bf409058..777e16d9a4f 100644 --- a/cpp/cmake/thirdparty/get_thread_pool.cmake +++ b/cpp/cmake/thirdparty/get_thread_pool.cmake @@ -18,7 +18,7 @@ function(find_and_configure_thread_pool) include(${rapids-cmake-dir}/cpm/bs_thread_pool.cmake) # Find or install thread-pool - rapids_cpm_bs_thread_pool(BUILD_EXPORT_SET cudf-exports INSTALL_EXPORT_SET cudf-exports) + rapids_cpm_bs_thread_pool() endfunction() From cd762b4eb1fd55a0bc5079ed69bfc04426f10e60 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 26 Jul 2024 08:08:01 -1000 Subject: [PATCH 02/12] Gate ArrowStringArrayNumpySemantics cudf.pandas proxy behind version check (#16401) ## Description `ArrowStringArrayNumpySemantics` was newly added in 2.1: https://github.com/pandas-dev/pandas/blob/2.1.x/pandas/core/arrays/string_arrow.py#L488, so putting the proxy wrapper behind a version check for pandas 2.0.x compat ```ipython In [1]: %load_ext cudf.pandas In [2]: import pandas as pd In [3]: pd.__version__ Out[3]: '2.0.0' ``` ## Checklist - [ ] I am familiar with the [Contributing Guidelines](https://github.com/rapidsai/cudf/blob/HEAD/CONTRIBUTING.md). - [ ] New or existing tests cover these changes. - [ ] The documentation is up to date with these changes. --- python/cudf/cudf/pandas/_wrappers/pandas.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/python/cudf/cudf/pandas/_wrappers/pandas.py b/python/cudf/cudf/pandas/_wrappers/pandas.py index 59a243dd7c4..478108f36f1 100644 --- a/python/cudf/cudf/pandas/_wrappers/pandas.py +++ b/python/cudf/cudf/pandas/_wrappers/pandas.py @@ -26,6 +26,7 @@ ) import cudf +import cudf.core._compat from ..annotation import nvtx from ..fast_slow_proxy import ( @@ -556,13 +557,14 @@ def Index__setattr__(self, name, value): }, ) -ArrowStringArrayNumpySemantics = make_final_proxy_type( - "ArrowStringArrayNumpySemantics", - _Unusable, - pd.core.arrays.string_arrow.ArrowStringArrayNumpySemantics, - fast_to_slow=_Unusable(), - slow_to_fast=_Unusable(), -) +if cudf.core._compat.PANDAS_GE_210: + ArrowStringArrayNumpySemantics = make_final_proxy_type( + "ArrowStringArrayNumpySemantics", + _Unusable, + pd.core.arrays.string_arrow.ArrowStringArrayNumpySemantics, + fast_to_slow=_Unusable(), + slow_to_fast=_Unusable(), + ) ArrowStringArray = make_final_proxy_type( "ArrowStringArray", From 5dd3efba5b7e0c22dce87cf20aecb1b198677d2e Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Fri, 26 Jul 2024 16:47:49 -0400 Subject: [PATCH 03/12] Fix nightly memcheck error for empty STREAM_INTEROP_TEST (#16406) ## Description The `STREAM_INTEROP_TEST` code was commented out in #16379 so the `compute-sanitizer` returns an error for this test in the nightly cpp-memcheck tests. https://github.com/rapidsai/cudf/actions/runs/10107041505/job/27950193878#step:9:62177 This PR comments out the empty test so it is not built. The test will be re-enabled in a future release when the deprecated functions are replaced. ## Checklist - [x] I am familiar with the [Contributing Guidelines](https://github.com/rapidsai/cudf/blob/HEAD/CONTRIBUTING.md). - [x] New or existing tests cover these changes. - [x] The documentation is up to date with these changes. --- cpp/tests/CMakeLists.txt | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 88187623930..22827484f9a 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -689,7 +689,10 @@ ConfigureTest(STREAM_DICTIONARY_TEST streams/dictionary_test.cpp STREAM_MODE tes ConfigureTest(STREAM_FILLING_TEST streams/filling_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_GROUPBY_TEST streams/groupby_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_HASHING_TEST streams/hash_test.cpp STREAM_MODE testing) -ConfigureTest(STREAM_INTEROP_TEST streams/interop_test.cpp STREAM_MODE testing) +# Deprecation from 16297 and fixes in 16379 caused this test to be empty This will be reenabled once +# the deprecated APIs have been replaced in 24.10. +# +# ConfigureTest(STREAM_INTEROP_TEST streams/interop_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_JSONIO_TEST streams/io/json_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_LABELING_BINS_TEST streams/labeling_bins_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_LISTS_TEST streams/lists_test.cpp STREAM_MODE testing) From 473dec55abd1a3d9d540c541443f831d18ebb532 Mon Sep 17 00:00:00 2001 From: Jayjeet Chakraborty Date: Fri, 26 Jul 2024 14:45:12 -0700 Subject: [PATCH 04/12] Add query 10 to the TPC-H suite (#16392) Adds Q10 to the TPC-H benchmark suite Authors: - Jayjeet Chakraborty (https://github.com/JayjeetAtGithub) Approvers: - Mike Wilson (https://github.com/hyperbolic2346) - Yunsong Wang (https://github.com/PointKernel) URL: https://github.com/rapidsai/cudf/pull/16392 --- cpp/examples/tpch/CMakeLists.txt | 4 + cpp/examples/tpch/q1.cpp | 2 +- cpp/examples/tpch/q10.cpp | 166 +++++++++++++++++++++++++++++++ cpp/examples/tpch/q5.cpp | 20 ++-- cpp/examples/tpch/q6.cpp | 2 +- 5 files changed, 182 insertions(+), 12 deletions(-) create mode 100644 cpp/examples/tpch/q10.cpp diff --git a/cpp/examples/tpch/CMakeLists.txt b/cpp/examples/tpch/CMakeLists.txt index 1b91d07e148..373a6d72d56 100644 --- a/cpp/examples/tpch/CMakeLists.txt +++ b/cpp/examples/tpch/CMakeLists.txt @@ -30,3 +30,7 @@ target_compile_features(tpch_q6 PRIVATE cxx_std_17) add_executable(tpch_q9 q9.cpp) target_link_libraries(tpch_q9 PRIVATE cudf::cudf) target_compile_features(tpch_q9 PRIVATE cxx_std_17) + +add_executable(tpch_q10 q10.cpp) +target_link_libraries(tpch_q10 PRIVATE cudf::cudf) +target_compile_features(tpch_q10 PRIVATE cxx_std_17) diff --git a/cpp/examples/tpch/q1.cpp b/cpp/examples/tpch/q1.cpp index 1bdf039da4a..fe03320b888 100644 --- a/cpp/examples/tpch/q1.cpp +++ b/cpp/examples/tpch/q1.cpp @@ -124,7 +124,7 @@ int main(int argc, char const** argv) auto shipdate_upper = cudf::timestamp_scalar(days_since_epoch(1998, 9, 2), true); auto const shipdate_upper_literal = cudf::ast::literal(shipdate_upper); - auto lineitem_pred = std::make_unique( + auto const lineitem_pred = std::make_unique( cudf::ast::ast_operator::LESS_EQUAL, shipdate_ref, shipdate_upper_literal); // Read out the `lineitem` table from parquet file diff --git a/cpp/examples/tpch/q10.cpp b/cpp/examples/tpch/q10.cpp new file mode 100644 index 00000000000..94da46f6930 --- /dev/null +++ b/cpp/examples/tpch/q10.cpp @@ -0,0 +1,166 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../utilities/timer.hpp" +#include "utils.hpp" + +#include +#include +#include + +/** + * @file q10.cpp + * @brief Implement query 10 of the TPC-H benchmark. + * + * create view customer as select * from '/tables/scale-1/customer.parquet'; + * create view orders as select * from '/tables/scale-1/orders.parquet'; + * create view lineitem as select * from '/tables/scale-1/lineitem.parquet'; + * create view nation as select * from '/tables/scale-1/nation.parquet'; + * + * select + * c_custkey, + * c_name, + * sum(l_extendedprice * (1 - l_discount)) as revenue, + * c_acctbal, + * n_name, + * c_address, + * c_phone, + * c_comment + * from + * customer, + * orders, + * lineitem, + * nation + * where + * c_custkey = o_custkey + * and l_orderkey = o_orderkey + * and o_orderdate >= date '1993-10-01' + * and o_orderdate < date '1994-01-01' + * and l_returnflag = 'R' + * and c_nationkey = n_nationkey + * group by + * c_custkey, + * c_name, + * c_acctbal, + * c_phone, + * n_name, + * c_address, + * c_comment + * order by + * revenue desc; + */ + +/** + * @brief Calculate the revenue column + * + * @param extendedprice The extended price column + * @param discount The discount column + * @param stream The CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory. + */ +[[nodiscard]] std::unique_ptr calc_revenue( + cudf::column_view const& extendedprice, + cudf::column_view const& discount, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()) +{ + auto const one = cudf::numeric_scalar(1); + auto const one_minus_discount = + cudf::binary_operation(one, discount, cudf::binary_operator::SUB, discount.type(), stream, mr); + auto const revenue_type = cudf::data_type{cudf::type_id::FLOAT64}; + auto revenue = cudf::binary_operation(extendedprice, + one_minus_discount->view(), + cudf::binary_operator::MUL, + revenue_type, + stream, + mr); + return revenue; +} +int main(int argc, char const** argv) +{ + auto const args = parse_args(argc, argv); + + // Use a memory pool + auto resource = create_memory_resource(args.memory_resource_type); + rmm::mr::set_current_device_resource(resource.get()); + + cudf::examples::timer timer; + + // Define the column projection and filter predicate for the `orders` table + std::vector const orders_cols = {"o_custkey", "o_orderkey", "o_orderdate"}; + auto const o_orderdate_ref = cudf::ast::column_reference(std::distance( + orders_cols.begin(), std::find(orders_cols.begin(), orders_cols.end(), "o_orderdate"))); + auto o_orderdate_lower = + cudf::timestamp_scalar(days_since_epoch(1993, 10, 1), true); + auto const o_orderdate_lower_limit = cudf::ast::literal(o_orderdate_lower); + auto const o_orderdate_pred_lower = cudf::ast::operation( + cudf::ast::ast_operator::GREATER_EQUAL, o_orderdate_ref, o_orderdate_lower_limit); + auto o_orderdate_upper = + cudf::timestamp_scalar(days_since_epoch(1994, 1, 1), true); + auto const o_orderdate_upper_limit = cudf::ast::literal(o_orderdate_upper); + auto const o_orderdate_pred_upper = + cudf::ast::operation(cudf::ast::ast_operator::LESS, o_orderdate_ref, o_orderdate_upper_limit); + auto const orders_pred = std::make_unique( + cudf::ast::ast_operator::LOGICAL_AND, o_orderdate_pred_lower, o_orderdate_pred_upper); + + auto const l_returnflag_ref = cudf::ast::column_reference(3); + auto r_scalar = cudf::string_scalar("R"); + auto const r_literal = cudf::ast::literal(r_scalar); + auto const lineitem_pred = std::make_unique( + cudf::ast::ast_operator::EQUAL, l_returnflag_ref, r_literal); + + // Read out the tables from parquet files + // while pushing down the column projections and filter predicates + auto const customer = read_parquet( + args.dataset_dir + "/customer.parquet", + {"c_custkey", "c_name", "c_nationkey", "c_acctbal", "c_address", "c_phone", "c_comment"}); + auto const orders = + read_parquet(args.dataset_dir + "/orders.parquet", orders_cols, std::move(orders_pred)); + auto const lineitem = + read_parquet(args.dataset_dir + "/lineitem.parquet", + {"l_extendedprice", "l_discount", "l_orderkey", "l_returnflag"}, + std::move(lineitem_pred)); + auto const nation = read_parquet(args.dataset_dir + "/nation.parquet", {"n_name", "n_nationkey"}); + + // Perform the joins + auto const join_a = apply_inner_join(customer, nation, {"c_nationkey"}, {"n_nationkey"}); + auto const join_b = apply_inner_join(lineitem, orders, {"l_orderkey"}, {"o_orderkey"}); + auto const joined_table = apply_inner_join(join_a, join_b, {"c_custkey"}, {"o_custkey"}); + + // Calculate and append the `revenue` column + auto revenue = + calc_revenue(joined_table->column("l_extendedprice"), joined_table->column("l_discount")); + (*joined_table).append(revenue, "revenue"); + + // Perform the groupby operation + auto const groupedby_table = apply_groupby( + joined_table, + groupby_context_t{ + {"c_custkey", "c_name", "c_acctbal", "c_phone", "n_name", "c_address", "c_comment"}, + { + {"revenue", {{cudf::aggregation::Kind::SUM, "revenue"}}}, + }}); + + // Perform the order by operation + auto const orderedby_table = + apply_orderby(groupedby_table, {"revenue"}, {cudf::order::DESCENDING}); + + timer.print_elapsed_millis(); + + // Write query result to a parquet file + orderedby_table->to_parquet("q10.parquet"); + return 0; +} diff --git a/cpp/examples/tpch/q5.cpp b/cpp/examples/tpch/q5.cpp index e56850b94d6..89396a6c968 100644 --- a/cpp/examples/tpch/q5.cpp +++ b/cpp/examples/tpch/q5.cpp @@ -44,14 +44,14 @@ * region * where * c_custkey = o_custkey - * and l_orderkey = o_orderkey - * and l_suppkey = s_suppkey - * and c_nationkey = s_nationkey - * and s_nationkey = n_nationkey - * and n_regionkey = r_regionkey - * and r_name = 'ASIA' - * and o_orderdate >= date '1994-01-01' - * and o_orderdate < date '1995-01-01' + * and l_orderkey = o_orderkey + * and l_suppkey = s_suppkey + * and c_nationkey = s_nationkey + * and s_nationkey = n_nationkey + * and n_regionkey = r_regionkey + * and r_name = 'ASIA' + * and o_orderdate >= date '1994-01-01' + * and o_orderdate < date '1995-01-01' * group by * n_name * order by @@ -109,7 +109,7 @@ int main(int argc, char const** argv) auto const o_orderdate_upper_limit = cudf::ast::literal(o_orderdate_upper); auto const o_orderdate_pred_upper = cudf::ast::operation(cudf::ast::ast_operator::LESS, o_orderdate_ref, o_orderdate_upper_limit); - auto orders_pred = std::make_unique( + auto const orders_pred = std::make_unique( cudf::ast::ast_operator::LOGICAL_AND, o_orderdate_pred_lower, o_orderdate_pred_upper); // Define the column projection and filter predicate for the `region` table @@ -118,7 +118,7 @@ int main(int argc, char const** argv) region_cols.begin(), std::find(region_cols.begin(), region_cols.end(), "r_name"))); auto r_name_value = cudf::string_scalar("ASIA"); auto const r_name_literal = cudf::ast::literal(r_name_value); - auto region_pred = std::make_unique( + auto const region_pred = std::make_unique( cudf::ast::ast_operator::EQUAL, r_name_ref, r_name_literal); // Read out the tables from parquet files diff --git a/cpp/examples/tpch/q6.cpp b/cpp/examples/tpch/q6.cpp index f11b3d6ab3b..405b2ac73ca 100644 --- a/cpp/examples/tpch/q6.cpp +++ b/cpp/examples/tpch/q6.cpp @@ -84,7 +84,7 @@ int main(int argc, char const** argv) cudf::ast::ast_operator::GREATER_EQUAL, shipdate_ref, shipdate_lower_literal); auto const shipdate_pred_b = cudf::ast::operation(cudf::ast::ast_operator::LESS, shipdate_ref, shipdate_upper_literal); - auto lineitem_pred = std::make_unique( + auto const lineitem_pred = std::make_unique( cudf::ast::ast_operator::LOGICAL_AND, shipdate_pred_a, shipdate_pred_b); auto lineitem = read_parquet(args.dataset_dir + "/lineitem.parquet", lineitem_cols, std::move(lineitem_pred)); From 24997fda194d5b8af34048a8bf275830cabbff8c Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Fri, 26 Jul 2024 18:37:30 -0700 Subject: [PATCH 05/12] Deduplicate decimal32/decimal64 to decimal128 conversion function (#16236) Closes #16194 This PR deduplicates the `convert_data_to_decimal128` function from `to_arrow.cu`, `writer_impl.cu` and `to_arrow_device.cu` to a common location. Authors: - Muhammad Haseeb (https://github.com/mhaseeb123) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Vukasin Milovanovic (https://github.com/vuule) - Nghia Truong (https://github.com/ttnghia) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/16236 --- cpp/CMakeLists.txt | 1 + .../interop/decimal_conversion_utilities.cu | 70 +++++++++++++++++ .../interop/decimal_conversion_utilities.cuh | 44 +++++++++++ cpp/src/interop/to_arrow.cu | 8 +- cpp/src/interop/to_arrow_device.cu | 5 +- cpp/src/interop/to_arrow_host.cu | 40 +--------- cpp/src/io/parquet/writer_impl.cu | 60 ++++----------- cpp/tests/interop/to_arrow_device_test.cpp | 77 +++++++++++++++++++ 8 files changed, 220 insertions(+), 85 deletions(-) create mode 100644 cpp/src/interop/decimal_conversion_utilities.cu create mode 100644 cpp/src/interop/decimal_conversion_utilities.cuh diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 95c509efc5b..310bc99b279 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -365,6 +365,7 @@ add_library( src/interop/dlpack.cpp src/interop/from_arrow.cu src/interop/arrow_utilities.cpp + src/interop/decimal_conversion_utilities.cu src/interop/to_arrow.cu src/interop/to_arrow_device.cu src/interop/to_arrow_host.cu diff --git a/cpp/src/interop/decimal_conversion_utilities.cu b/cpp/src/interop/decimal_conversion_utilities.cu new file mode 100644 index 00000000000..2f81c754a30 --- /dev/null +++ b/cpp/src/interop/decimal_conversion_utilities.cu @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "decimal_conversion_utilities.cuh" + +#include +#include +#include + +#include + +#include + +#include + +namespace cudf { +namespace detail { + +template +std::unique_ptr convert_decimals_to_decimal128( + cudf::column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) +{ + static_assert(std::is_same_v or std::is_same_v, + "Only int32 and int64 decimal types can be converted to decimal128."); + + constexpr size_type BIT_WIDTH_RATIO = sizeof(__int128_t) / sizeof(DecimalType); + auto buf = std::make_unique(column.size() * sizeof(__int128_t), stream, mr); + + thrust::for_each(rmm::exec_policy_nosync(stream, mr), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(column.size()), + [in = column.begin(), + out = reinterpret_cast(buf->data()), + BIT_WIDTH_RATIO] __device__(auto in_idx) { + auto const out_idx = in_idx * BIT_WIDTH_RATIO; + // the lowest order bits are the value, the remainder + // simply matches the sign bit to satisfy the two's + // complement integer representation of negative numbers. + out[out_idx] = in[in_idx]; +#pragma unroll BIT_WIDTH_RATIO - 1 + for (auto i = 1; i < BIT_WIDTH_RATIO; ++i) { + out[out_idx + i] = in[in_idx] < 0 ? -1 : 0; + } + }); + + return buf; +} + +// Instantiate templates for int32_t and int64_t decimal types +template std::unique_ptr convert_decimals_to_decimal128( + cudf::column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); + +template std::unique_ptr convert_decimals_to_decimal128( + cudf::column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); + +} // namespace detail +} // namespace cudf diff --git a/cpp/src/interop/decimal_conversion_utilities.cuh b/cpp/src/interop/decimal_conversion_utilities.cuh new file mode 100644 index 00000000000..41263147404 --- /dev/null +++ b/cpp/src/interop/decimal_conversion_utilities.cuh @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +#include +#include + +#include + +namespace cudf::detail { + +/** + * @brief Convert decimal32 and decimal64 numeric data to decimal128 and return the device vector + * + * @tparam DecimalType to convert from + * + * @param column A view of the input columns + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource to use for device memory allocation + * + * @return A device vector containing the converted decimal128 data + */ +template +std::unique_ptr convert_decimals_to_decimal128( + cudf::column_view const& input, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); + +} // namespace cudf::detail diff --git a/cpp/src/interop/to_arrow.cu b/cpp/src/interop/to_arrow.cu index 6b163e3441e..3d41f856f4f 100644 --- a/cpp/src/interop/to_arrow.cu +++ b/cpp/src/interop/to_arrow.cu @@ -15,6 +15,7 @@ */ #include "arrow_utilities.hpp" +#include "decimal_conversion_utilities.cuh" #include "detail/arrow_allocator.hpp" #include @@ -158,8 +159,11 @@ std::shared_ptr unsupported_decimals_to_arrow(column_view input, arrow::MemoryPool* ar_mr, rmm::cuda_stream_view stream) { - auto buf = - detail::decimals_to_arrow(input, stream, rmm::mr::get_current_device_resource()); + auto buf = detail::convert_decimals_to_decimal128( + input, stream, rmm::mr::get_current_device_resource()); + + // Synchronize stream here to ensure the decimal128 buffer is ready. + stream.synchronize(); auto const buf_size_in_bytes = buf->size(); auto data_buffer = allocate_arrow_buffer(buf_size_in_bytes, ar_mr); diff --git a/cpp/src/interop/to_arrow_device.cu b/cpp/src/interop/to_arrow_device.cu index 2eb9b912054..cea7cdebcba 100644 --- a/cpp/src/interop/to_arrow_device.cu +++ b/cpp/src/interop/to_arrow_device.cu @@ -15,6 +15,7 @@ */ #include "arrow_utilities.hpp" +#include "decimal_conversion_utilities.cuh" #include #include @@ -141,7 +142,9 @@ int construct_decimals(cudf::column_view input, nanoarrow::UniqueArray tmp; NANOARROW_RETURN_NOT_OK(initialize_array(tmp.get(), NANOARROW_TYPE_DECIMAL128, input)); - auto buf = detail::decimals_to_arrow(input, stream, mr); + auto buf = detail::convert_decimals_to_decimal128(input, stream, mr); + // Synchronize stream here to ensure the decimal128 buffer is ready. + stream.synchronize(); NANOARROW_RETURN_NOT_OK(set_buffer(std::move(buf), fixed_width_data_buffer_idx, tmp.get())); ArrowArrayMove(tmp.get(), out); diff --git a/cpp/src/interop/to_arrow_host.cu b/cpp/src/interop/to_arrow_host.cu index c9e53ebaab7..193b3a3b5a2 100644 --- a/cpp/src/interop/to_arrow_host.cu +++ b/cpp/src/interop/to_arrow_host.cu @@ -15,6 +15,7 @@ */ #include "arrow_utilities.hpp" +#include "decimal_conversion_utilities.cuh" #include #include @@ -50,41 +51,6 @@ namespace cudf { namespace detail { -template -std::unique_ptr decimals_to_arrow(cudf::column_view input, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - constexpr size_type BIT_WIDTH_RATIO = sizeof(__int128_t) / sizeof(DeviceType); - auto buf = std::make_unique(input.size() * sizeof(__int128_t), stream, mr); - - auto count = thrust::counting_iterator(0); - thrust::for_each(rmm::exec_policy(stream, mr), - count, - count + input.size(), - [in = input.begin(), - out = reinterpret_cast(buf->data()), - BIT_WIDTH_RATIO] __device__(auto in_idx) { - auto const out_idx = in_idx * BIT_WIDTH_RATIO; - // the lowest order bits are the value, the remainder - // simply matches the sign bit to satisfy the two's - // complement integer representation of negative numbers. - out[out_idx] = in[in_idx]; -#pragma unroll BIT_WIDTH_RATIO - 1 - for (auto i = 1; i < BIT_WIDTH_RATIO; ++i) { - out[out_idx + i] = in[in_idx] < 0 ? -1 : 0; - } - }); - - return buf; -} - -template std::unique_ptr decimals_to_arrow( - cudf::column_view input, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); - -template std::unique_ptr decimals_to_arrow( - cudf::column_view input, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); - namespace { struct dispatch_to_arrow_host { @@ -156,7 +122,9 @@ struct dispatch_to_arrow_host { NANOARROW_RETURN_NOT_OK(initialize_array(tmp.get(), NANOARROW_TYPE_DECIMAL128, column)); NANOARROW_RETURN_NOT_OK(populate_validity_bitmap(ArrowArrayValidityBitmap(tmp.get()))); - auto buf = detail::decimals_to_arrow(column, stream, mr); + auto buf = detail::convert_decimals_to_decimal128(column, stream, mr); + // No need to synchronize stream here as populate_data_buffer uses the same stream to copy data + // to host. NANOARROW_RETURN_NOT_OK( populate_data_buffer(device_span<__int128_t const>( reinterpret_cast(buf->data()), column.size()), diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 2df71b77301..36a1d8377bf 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -22,6 +22,7 @@ #include "arrow_schema_writer.hpp" #include "compact_protocol_reader.hpp" #include "compact_protocol_writer.hpp" +#include "interop/decimal_conversion_utilities.cuh" #include "io/comp/nvcomp_adapter.hpp" #include "io/parquet/parquet.hpp" #include "io/parquet/parquet_gpu.hpp" @@ -1601,50 +1602,12 @@ size_t column_index_buffer_size(EncColumnChunk* ck, return ck->ck_stat_size * num_pages + column_index_truncate_length + padding + size_struct_size; } -/** - * @brief Convert decimal32 and decimal64 data to decimal128 and return the device vector - * - * @tparam DecimalType to convert from - * - * @param column A view of the input columns - * @param stream CUDA stream used for device memory operations and kernel launches - * - * @return A device vector containing the converted decimal128 data - */ -template -rmm::device_uvector<__int128_t> convert_data_to_decimal128(column_view const& column, - rmm::cuda_stream_view stream) -{ - size_type constexpr BIT_WIDTH_RATIO = sizeof(__int128_t) / sizeof(DecimalType); - - rmm::device_uvector<__int128_t> d128_buffer(column.size(), stream); - - thrust::for_each(rmm::exec_policy_nosync(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(column.size()), - [in = column.begin(), - out = reinterpret_cast(d128_buffer.data()), - BIT_WIDTH_RATIO] __device__(auto in_idx) { - auto const out_idx = in_idx * BIT_WIDTH_RATIO; - // The lowest order bits are the value, the remainder - // simply matches the sign bit to satisfy the two's - // complement integer representation of negative numbers. - out[out_idx] = in[in_idx]; -#pragma unroll BIT_WIDTH_RATIO - 1 - for (auto i = 1; i < BIT_WIDTH_RATIO; ++i) { - out[out_idx + i] = in[in_idx] < 0 ? -1 : 0; - } - }); - - return d128_buffer; -} - /** * @brief Function to convert decimal32 and decimal64 columns to decimal128 data, * update the input table metadata, and return a new vector of column views. * * @param[in,out] table_meta The table metadata - * @param[in,out] d128_vectors Vector containing the computed decimal128 data buffers. + * @param[in,out] d128_buffers Buffers containing the converted decimal128 data. * @param input The input table * @param stream CUDA stream used for device memory operations and kernel launches * @@ -1652,7 +1615,7 @@ rmm::device_uvector<__int128_t> convert_data_to_decimal128(column_view const& co */ std::vector convert_decimal_columns_and_metadata( table_input_metadata& table_meta, - std::vector>& d128_vectors, + std::vector>& d128_buffers, table_view const& table, rmm::cuda_stream_view stream) { @@ -1673,28 +1636,30 @@ std::vector convert_decimal_columns_and_metadata( switch (column.type().id()) { case type_id::DECIMAL32: // Convert data to decimal128 type - d128_vectors.emplace_back(convert_data_to_decimal128(column, stream)); + d128_buffers.emplace_back(cudf::detail::convert_decimals_to_decimal128( + column, stream, rmm::mr::get_current_device_resource())); // Update metadata metadata.set_decimal_precision(MAX_DECIMAL32_PRECISION); metadata.set_type_length(size_of(data_type{type_id::DECIMAL128, column.type().scale()})); // Create a new column view from the d128 data vector return {data_type{type_id::DECIMAL128, column.type().scale()}, column.size(), - d128_vectors.back().data(), + d128_buffers.back()->data(), column.null_mask(), column.null_count(), column.offset(), converted_children}; case type_id::DECIMAL64: // Convert data to decimal128 type - d128_vectors.emplace_back(convert_data_to_decimal128(column, stream)); + d128_buffers.emplace_back(cudf::detail::convert_decimals_to_decimal128( + column, stream, rmm::mr::get_current_device_resource())); // Update metadata metadata.set_decimal_precision(MAX_DECIMAL64_PRECISION); metadata.set_type_length(size_of(data_type{type_id::DECIMAL128, column.type().scale()})); // Create a new column view from the d128 data vector return {data_type{type_id::DECIMAL128, column.type().scale()}, column.size(), - d128_vectors.back().data(), + d128_buffers.back()->data(), column.null_mask(), column.null_count(), column.offset(), @@ -1722,6 +1687,9 @@ std::vector convert_decimal_columns_and_metadata( std::back_inserter(converted_column_views), [&](auto elem) { return convert_column(thrust::get<0>(elem), thrust::get<1>(elem)); }); + // Synchronize stream here to ensure all decimal128 buffers are ready. + stream.synchronize(); + return converted_column_views; } @@ -1780,13 +1748,13 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, rmm::cuda_stream_view stream) { // Container to store decimal128 converted data if needed - std::vector> d128_vectors; + std::vector> d128_buffers; // Convert decimal32/decimal64 data to decimal128 if writing arrow schema // and initialize LinkedColVector auto vec = table_to_linked_columns( (write_arrow_schema) - ? table_view({convert_decimal_columns_and_metadata(table_meta, d128_vectors, input, stream)}) + ? table_view({convert_decimal_columns_and_metadata(table_meta, d128_buffers, input, stream)}) : input); auto schema_tree = construct_parquet_schema_tree( diff --git a/cpp/tests/interop/to_arrow_device_test.cpp b/cpp/tests/interop/to_arrow_device_test.cpp index 77da4039103..51216a8512c 100644 --- a/cpp/tests/interop/to_arrow_device_test.cpp +++ b/cpp/tests/interop/to_arrow_device_test.cpp @@ -710,6 +710,83 @@ TEST_F(ToArrowDeviceTest, StructColumn) template using fp_wrapper = cudf::test::fixed_point_column_wrapper; +TEST_F(ToArrowDeviceTest, FixedPoint32Table) +{ + using namespace numeric; + + for (auto const scale : {6, 4, 2, 0, -1, -3, -5}) { + auto const expect_data = + std::vector{-1000, -1, -1, -1, 2400, 0, 0, 0, -3456, -1, -1, -1, + 4650, 0, 0, 0, 5154, 0, 0, 0, 6800, 0, 0, 0}; + auto col = fp_wrapper({-1000, 2400, -3456, 4650, 5154, 6800}, scale_type{scale}); + std::vector> table_cols; + table_cols.emplace_back(col.release()); + auto input = cudf::table(std::move(table_cols)); + + nanoarrow::UniqueSchema expected_schema; + ArrowSchemaInit(expected_schema.get()); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(expected_schema.get(), 1)); + ArrowSchemaInit(expected_schema->children[0]); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeDecimal(expected_schema->children[0], + NANOARROW_TYPE_DECIMAL128, + cudf::detail::max_precision(), + -scale)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(expected_schema->children[0], "a")); + expected_schema->children[0]->flags = 0; + + auto got_arrow_schema = + cudf::to_arrow_schema(input.view(), std::vector{{"a"}}); + compare_schemas(expected_schema.get(), got_arrow_schema.get()); + + auto result_dev_data = std::make_unique>( + expect_data.size(), cudf::get_default_stream()); + cudaMemcpy(result_dev_data->data(), + expect_data.data(), + sizeof(int32_t) * expect_data.size(), + cudaMemcpyHostToDevice); + + cudf::get_default_stream().synchronize(); + nanoarrow::UniqueArray expected_array; + NANOARROW_THROW_NOT_OK( + ArrowArrayInitFromSchema(expected_array.get(), expected_schema.get(), nullptr)); + expected_array->length = input.num_rows(); + + expected_array->children[0]->length = input.num_rows(); + NANOARROW_THROW_NOT_OK( + ArrowBufferSetAllocator(ArrowArrayBuffer(expected_array->children[0], 0), noop_alloc)); + ArrowArrayValidityBitmap(expected_array->children[0])->buffer.data = + const_cast(reinterpret_cast(input.view().column(0).null_mask())); + + auto data_ptr = reinterpret_cast(result_dev_data->data()); + NANOARROW_THROW_NOT_OK(ArrowBufferSetAllocator( + ArrowArrayBuffer(expected_array->children[0], 1), + ArrowBufferDeallocator( + [](ArrowBufferAllocator* alloc, uint8_t*, int64_t) { + auto buf = + reinterpret_cast>*>(alloc->private_data); + delete buf; + }, + new std::unique_ptr>(std::move(result_dev_data))))); + ArrowArrayBuffer(expected_array->children[0], 1)->data = data_ptr; + NANOARROW_THROW_NOT_OK( + ArrowArrayFinishBuilding(expected_array.get(), NANOARROW_VALIDATION_LEVEL_NONE, nullptr)); + + auto got_arrow_array = cudf::to_arrow_device(input.view()); + ASSERT_EQ(rmm::get_current_cuda_device().value(), got_arrow_array->device_id); + ASSERT_EQ(ARROW_DEVICE_CUDA, got_arrow_array->device_type); + ASSERT_CUDA_SUCCEEDED( + cudaEventSynchronize(*reinterpret_cast(got_arrow_array->sync_event))); + compare_arrays(expected_schema.get(), expected_array.get(), &got_arrow_array->array); + + got_arrow_array = cudf::to_arrow_device(std::move(input)); + ASSERT_EQ(rmm::get_current_cuda_device().value(), got_arrow_array->device_id); + ASSERT_EQ(ARROW_DEVICE_CUDA, got_arrow_array->device_type); + ASSERT_CUDA_SUCCEEDED( + cudaEventSynchronize(*reinterpret_cast(got_arrow_array->sync_event))); + compare_arrays(expected_schema.get(), expected_array.get(), &got_arrow_array->array); + } +} + TEST_F(ToArrowDeviceTest, FixedPoint64Table) { using namespace numeric; From a51964ed8b00c3c88d463e329af7ec8378642343 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Mon, 29 Jul 2024 08:42:27 -0500 Subject: [PATCH 06/12] Fix a `pandas-2.0` missing attribute error (#16416) `NumpyEADtype` is a 2.1.0+ change, this PR handles the missing attribute error in pandas-2.0 --- python/cudf/cudf/core/dtypes.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index de715191c08..27afec18b4e 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -17,10 +17,15 @@ from pandas.core.arrays.arrow.extension_types import ArrowIntervalType import cudf -from cudf.core._compat import PANDAS_LT_300 +from cudf.core._compat import PANDAS_GE_210, PANDAS_LT_300 from cudf.core.abc import Serializable from cudf.utils.docutils import doc_apply +if PANDAS_GE_210: + PANDAS_NUMPY_DTYPE = pd.core.dtypes.dtypes.NumpyEADtype +else: + PANDAS_NUMPY_DTYPE = pd.core.dtypes.dtypes.PandasDtype + if TYPE_CHECKING: from cudf._typing import Dtype from cudf.core.buffer import Buffer @@ -72,7 +77,7 @@ def dtype(arbitrary): return np.dtype("object") else: return dtype(pd_dtype.numpy_dtype) - elif isinstance(pd_dtype, pd.core.dtypes.dtypes.NumpyEADtype): + elif isinstance(pd_dtype, PANDAS_NUMPY_DTYPE): return dtype(pd_dtype.numpy_dtype) elif isinstance(pd_dtype, pd.CategoricalDtype): return cudf.CategoricalDtype.from_pandas(pd_dtype) From 18c1465b597284d8b558964cc0ca48de7da60a17 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 29 Jul 2024 06:06:07 -1000 Subject: [PATCH 07/12] Align ewm APIs with pandas 2.x (#16413) These all currently are not implemented and raise a `NotImplementedError` Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/16413 --- python/cudf/cudf/core/window/ewm.py | 52 ++++++++++++++++++++++++----- 1 file changed, 43 insertions(+), 9 deletions(-) diff --git a/python/cudf/cudf/core/window/ewm.py b/python/cudf/cudf/core/window/ewm.py index bb153d4b549..1203a840076 100644 --- a/python/cudf/cudf/core/window/ewm.py +++ b/python/cudf/cudf/core/window/ewm.py @@ -114,23 +114,57 @@ def __init__( self.adjust = adjust self.com = get_center_of_mass(com, span, halflife, alpha) - def mean(self): + def online(self, engine: str = "numba", engine_kwargs=None): + """ + Return an ``OnlineExponentialMovingWindow`` object to calculate + exponentially moving window aggregations in an online method. + + Currently not supported. + """ + raise NotImplementedError("online is currently not supported.") + + def mean( + self, numeric_only: bool = False, engine=None, engine_kwargs=None + ): """ Calculate the ewm (exponential weighted moment) mean. """ + if numeric_only is not False: + raise NotImplementedError( + "numeric_only is currently not supported." + ) + if engine is not None: + raise NotImplementedError( + "engine is non-functional and added for compatibility with pandas." + ) + if engine_kwargs is not None: + raise NotImplementedError( + "engine_kwargs is non-functional and added for compatibility with pandas." + ) return self._apply_agg("ewma") - def var(self, bias): - raise NotImplementedError("ewmvar not yet supported.") + def sum(self, numeric_only: bool = False, engine=None, engine_kwargs=None): + raise NotImplementedError("sum not yet supported.") - def std(self, bias): - raise NotImplementedError("ewmstd not yet supported.") + def var(self, bias: bool = False, numeric_only: bool = False): + raise NotImplementedError("var not yet supported.") - def corr(self, other): - raise NotImplementedError("ewmcorr not yet supported.") + def std(self, bias: bool = False, numeric_only: bool = False): + raise NotImplementedError("std not yet supported.") - def cov(self, other): - raise NotImplementedError("ewmcov not yet supported.") + def corr( + self, other, pairwise: bool | None = None, numeric_only: bool = False + ): + raise NotImplementedError("corr not yet supported.") + + def cov( + self, + other, + pairwise: bool | None = None, + bias: bool = False, + numeric_only: bool = False, + ): + raise NotImplementedError("cov not yet supported.") def _apply_agg_series(self, sr, agg_name): if not is_numeric_dtype(sr.dtype): From 58f47242fe04b1e25fd42e1e45e8c15417140777 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 29 Jul 2024 06:09:21 -1000 Subject: [PATCH 08/12] Align groupby APIs with pandas 2.x (#16403) The following breaking APIs are affected: * `apply` * `transform` * `describe` The rest of the APIs are non-breaking and generally will raise a `NotImplementedError` Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/16403 --- .../source/user_guide/api_docs/groupby.rst | 3 +- python/cudf/cudf/core/groupby/groupby.py | 629 ++++++++++++++---- python/cudf/cudf/core/resample.py | 6 +- python/cudf/cudf/tests/test_groupby.py | 25 + 4 files changed, 514 insertions(+), 149 deletions(-) diff --git a/docs/cudf/source/user_guide/api_docs/groupby.rst b/docs/cudf/source/user_guide/api_docs/groupby.rst index 80811efa33f..ca29087cbf9 100644 --- a/docs/cudf/source/user_guide/api_docs/groupby.rst +++ b/docs/cudf/source/user_guide/api_docs/groupby.rst @@ -68,7 +68,6 @@ Computations / descriptive stats GroupBy.std GroupBy.sum GroupBy.var - GroupBy.corr GroupBy.cov The following methods are available in both ``SeriesGroupBy`` and @@ -81,6 +80,7 @@ application to columns of a specific data type. :toctree: api/ DataFrameGroupBy.bfill + DataFrameGroupBy.corr DataFrameGroupBy.count DataFrameGroupBy.cumcount DataFrameGroupBy.cummax @@ -102,5 +102,6 @@ The following methods are available only for ``SeriesGroupBy`` objects. .. autosummary:: :toctree: api/ + SeriesGroupBy.corr SeriesGroupBy.nunique SeriesGroupBy.unique diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 1646c5042fd..3cfbd1d736a 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -8,7 +8,7 @@ import warnings from collections import abc from functools import cached_property -from typing import TYPE_CHECKING, Any, Iterable +from typing import TYPE_CHECKING, Any, Iterable, Literal import cupy as cp import numpy as np @@ -306,6 +306,18 @@ def __iter__(self): grouped_values[offsets[i] : offsets[i + 1]], ) + def __len__(self) -> int: + return self.ngroups + + @property + def ngroups(self) -> int: + _, offsets, _, _ = self._grouped() + return len(offsets) - 1 + + @property + def ndim(self) -> int: + return self.obj.ndim + @property def dtypes(self): """ @@ -457,10 +469,20 @@ def size(self): ) @_performance_tracking - def cumcount(self): + def cumcount(self, ascending: bool = True): """ Return the cumulative count of keys in each group. + + Parameters + ---------- + ascending : bool, default True + If False, number in reverse, from length of group - 1 to 0. + Currently not supported """ + if ascending is not True: + raise NotImplementedError( + "ascending is currently not implemented." + ) return ( cudf.Series( cudf.core.column.column_empty( @@ -527,7 +549,7 @@ def _groupby(self): ) @_performance_tracking - def agg(self, func): + def agg(self, func, *args, engine=None, engine_kwargs=None, **kwargs): """ Apply aggregation(s) to the groups. @@ -615,6 +637,22 @@ def agg(self, func): 1 1.5 1.75 2.0 2.0 2 3.0 3.00 1.0 1.0 """ + if engine is not None: + raise NotImplementedError( + "engine is non-functional and added for compatibility with pandas" + ) + if engine_kwargs is not None: + raise NotImplementedError( + "engine_kwargs is non-functional added for compatibility with pandas" + ) + if args: + raise NotImplementedError( + "Passing args to func is currently not supported." + ) + if kwargs: + raise NotImplementedError( + "Passing kwargs to func is currently not supported." + ) column_names, columns, normalized_aggs = self._normalize_aggs(func) orig_dtypes = tuple(c.dtype for c in columns) @@ -935,12 +973,13 @@ def tail(self, n: int = 5, *, preserve_order: bool = True): ) @_performance_tracking - def nth(self, n): + def nth(self, n, dropna: Literal["any", "all", None] = None): """ Return the nth row from each group. """ - - self.obj["__groupbynth_order__"] = range(0, len(self.obj)) + if dropna is not None: + raise NotImplementedError("dropna is not currently supported.") + self.obj["__groupbynth_order__"] = range(0, len(self.obj)) # type: ignore[index] # We perform another groupby here to have the grouping columns # be a part of dataframe columns. result = self.obj.groupby(self.grouping.keys).agg(lambda x: x.nth(n)) @@ -1423,13 +1462,13 @@ def _post_process_chunk_results( @_performance_tracking def apply( - self, function, *args, engine="auto", include_groups: bool = True + self, func, *args, engine="auto", include_groups: bool = True, **kwargs ): """Apply a python transformation function over the grouped chunk. Parameters ---------- - function : callable + func : callable The python transformation function that will be applied on the grouped chunk. args : tuple @@ -1452,6 +1491,9 @@ def apply( When True, will attempt to apply ``func`` to the groupings in the case that they are columns of the DataFrame. In the future, this will default to ``False``. + kwargs : dict + Optional keyword arguments to pass to the function. + Currently not supported Examples -------- @@ -1528,13 +1570,17 @@ def mult(df): dtype: int64 """ + if kwargs: + raise NotImplementedError( + "Passing kwargs to func is currently not supported." + ) if self.obj.empty: - if function in {"count", "size", "idxmin", "idxmax"}: + if func in {"count", "size", "idxmin", "idxmax"}: res = cudf.Series([], dtype="int64") else: res = self.obj.copy(deep=True) res.index = self.grouping.keys - if function in {"sum", "product"}: + if func in {"sum", "product"}: # For `sum` & `product`, boolean types # will need to result in `int64` type. for name, col in res._data.items(): @@ -1542,20 +1588,20 @@ def mult(df): res._data[name] = col.astype("int") return res - if not callable(function): - raise TypeError(f"type {type(function)} is not callable") + if not callable(func): + raise TypeError(f"type {type(func)} is not callable") group_names, offsets, group_keys, grouped_values = self._grouped( include_groups=include_groups ) if engine == "auto": - if _can_be_jitted(grouped_values, function, args): + if _can_be_jitted(grouped_values, func, args): engine = "jit" else: engine = "cudf" if engine == "jit": result = self._jit_groupby_apply( - function, + func, group_names, offsets, group_keys, @@ -1564,7 +1610,7 @@ def mult(df): ) elif engine == "cudf": result = self._iterative_groupby_apply( - function, + func, group_names, offsets, group_keys, @@ -1744,12 +1790,14 @@ def _broadcast(self, values: cudf.Series) -> cudf.Series: return values @_performance_tracking - def transform(self, function): + def transform( + self, func, *args, engine=None, engine_kwargs=None, **kwargs + ): """Apply an aggregation, then broadcast the result to the group size. Parameters ---------- - function: str or callable + func: str or callable Aggregation to apply to each group. Note that the set of operations currently supported by `transform` is identical to that supported by the `agg` method. @@ -1778,18 +1826,35 @@ def transform(self, function): -------- agg """ - if not (isinstance(function, str) or callable(function)): + if engine is not None: + raise NotImplementedError( + "engine is non-functional and added for compatibility with pandas" + ) + if engine_kwargs is not None: + raise NotImplementedError( + "engine_kwargs is non-functional added for compatibility with pandas" + ) + if args: + raise NotImplementedError( + "Passing args to func is currently not supported." + ) + if kwargs: + raise NotImplementedError( + "Passing kwargs to func is currently not supported." + ) + + if not (isinstance(func, str) or callable(func)): raise TypeError( "Aggregation must be a named aggregation or a callable" ) try: - result = self.agg(function) + result = self.agg(func) except TypeError as e: raise NotImplementedError( "Currently, `transform()` supports only aggregations." ) from e # If the aggregation is a scan, don't broadcast - if libgroupby._is_all_scan_aggregate([[function]]): + if libgroupby._is_all_scan_aggregate([[func]]): if len(result) != len(self.obj): raise AssertionError( "Unexpected result length for scan transform" @@ -1824,7 +1889,7 @@ def func(x): return self.agg(func) @_performance_tracking - def describe(self, include=None, exclude=None): + def describe(self, percentiles=None, include=None, exclude=None): """ Generate descriptive statistics that summarizes the central tendency, dispersion and shape of a dataset's distribution, excluding NaN values. @@ -1833,6 +1898,10 @@ def describe(self, include=None, exclude=None): Parameters ---------- + percentiles : list-like of numbers, optional + The percentiles to include in the output. + Currently not supported. + include: 'all', list-like of dtypes or None (default), optional list of data types to include in the result. Ignored for Series. @@ -1869,8 +1938,12 @@ def describe(self, include=None, exclude=None): 90 1 24.0 24.0 24.0 24.0 24.0 24.0 """ - if exclude is not None and include is not None: - raise NotImplementedError + if percentiles is not None: + raise NotImplementedError("percentiles is currently not supported") + if exclude is not None: + raise NotImplementedError("exclude is currently not supported") + if include is not None: + raise NotImplementedError("include is currently not supported") res = self.agg( [ @@ -1896,69 +1969,7 @@ def describe(self, include=None, exclude=None): return res @_performance_tracking - def corr(self, method="pearson", min_periods=1): - """ - Compute pairwise correlation of columns, excluding NA/null values. - - Parameters - ---------- - method: {"pearson", "kendall", "spearman"} or callable, - default "pearson". Currently only the pearson correlation - coefficient is supported. - - min_periods: int, optional - Minimum number of observations required per pair of columns - to have a valid result. - - Returns - ------- - DataFrame - Correlation matrix. - - Examples - -------- - >>> import cudf - >>> gdf = cudf.DataFrame({ - ... "id": ["a", "a", "a", "b", "b", "b", "c", "c", "c"], - ... "val1": [5, 4, 6, 4, 8, 7, 4, 5, 2], - ... "val2": [4, 5, 6, 1, 2, 9, 8, 5, 1], - ... "val3": [4, 5, 6, 1, 2, 9, 8, 5, 1]}) - >>> gdf - id val1 val2 val3 - 0 a 5 4 4 - 1 a 4 5 5 - 2 a 6 6 6 - 3 b 4 1 1 - 4 b 8 2 2 - 5 b 7 9 9 - 6 c 4 8 8 - 7 c 5 5 5 - 8 c 2 1 1 - >>> gdf.groupby("id").corr(method="pearson") - val1 val2 val3 - id - a val1 1.000000 0.500000 0.500000 - val2 0.500000 1.000000 1.000000 - val3 0.500000 1.000000 1.000000 - b val1 1.000000 0.385727 0.385727 - val2 0.385727 1.000000 1.000000 - val3 0.385727 1.000000 1.000000 - c val1 1.000000 0.714575 0.714575 - val2 0.714575 1.000000 1.000000 - val3 0.714575 1.000000 1.000000 - """ - - if method.lower() not in ("pearson",): - raise NotImplementedError( - "Only pearson correlation is currently supported" - ) - - return self._cov_or_corr( - lambda x: x.corr(method, min_periods), "Correlation" - ) - - @_performance_tracking - def cov(self, min_periods=0, ddof=1): + def cov(self, min_periods=0, ddof=1, numeric_only: bool = False): """ Compute the pairwise covariance among the columns of a DataFrame, excluding NA/null values. @@ -2042,6 +2053,10 @@ def cov(self, min_periods=0, ddof=1): val2 3.833333 12.333333 12.333333 val3 3.833333 12.333333 12.333333 """ + if numeric_only is not False: + raise NotImplementedError( + "numeric_only is currently not supported." + ) return self._cov_or_corr( lambda x: x.cov(min_periods, ddof), "Covariance" @@ -2137,7 +2152,13 @@ def _cov_or_corr(self, func, method_name): return res @_performance_tracking - def var(self, ddof=1): + def var( + self, + ddof=1, + engine=None, + engine_kwargs=None, + numeric_only: bool = False, + ): """Compute the column-wise variance of the values in each group. Parameters @@ -2146,6 +2167,18 @@ def var(self, ddof=1): The delta degrees of freedom. N - ddof is the divisor used to normalize the variance. """ + if engine is not None: + raise NotImplementedError( + "engine is non-functional and added for compatibility with pandas" + ) + if engine_kwargs is not None: + raise NotImplementedError( + "engine_kwargs is non-functional added for compatibility with pandas" + ) + if numeric_only is not False: + raise NotImplementedError( + "numeric_only is currently not supported." + ) def func(x): return getattr(x, "var")(ddof=ddof) @@ -2153,7 +2186,13 @@ def func(x): return self.agg(func) @_performance_tracking - def std(self, ddof=1): + def std( + self, + ddof=1, + engine=None, + engine_kwargs=None, + numeric_only: bool = False, + ): """Compute the column-wise std of the values in each group. Parameters @@ -2162,6 +2201,18 @@ def std(self, ddof=1): The delta degrees of freedom. N - ddof is the divisor used to normalize the standard deviation. """ + if engine is not None: + raise NotImplementedError( + "engine is non-functional and added for compatibility with pandas" + ) + if engine_kwargs is not None: + raise NotImplementedError( + "engine_kwargs is non-functional added for compatibility with pandas" + ) + if numeric_only is not False: + raise NotImplementedError( + "numeric_only is currently not supported." + ) def func(x): return getattr(x, "std")(ddof=ddof) @@ -2169,7 +2220,9 @@ def func(x): return self.agg(func) @_performance_tracking - def quantile(self, q=0.5, interpolation="linear"): + def quantile( + self, q=0.5, interpolation="linear", numeric_only: bool = False + ): """Compute the column-wise quantiles of the values in each group. Parameters @@ -2179,7 +2232,14 @@ def quantile(self, q=0.5, interpolation="linear"): interpolation : {"linear", "lower", "higher", "midpoint", "nearest"} The interpolation method to use when the desired quantile lies between two data points. Defaults to "linear". + numeric_only : bool, default False + Include only `float`, `int` or `boolean` data. + Currently not supported """ + if numeric_only is not False: + raise NotImplementedError( + "numeric_only is not currently supported." + ) def func(x): return getattr(x, "quantile")(q=q, interpolation=interpolation) @@ -2333,7 +2393,14 @@ def fillna( ) @_performance_tracking - def shift(self, periods=1, freq=None, axis=0, fill_value=None): + def shift( + self, + periods=1, + freq=None, + axis=0, + fill_value=None, + suffix: str | None = None, + ): """ Shift each group by ``periods`` positions. @@ -2355,6 +2422,10 @@ def shift(self, periods=1, freq=None, axis=0, fill_value=None): the list. The length of the list should match the number of columns shifted. Each value should match the data type of the column to fill. + suffix : str, optional + A string to add to each shifted column if there are multiple periods. + Ignored otherwise. + Currently not supported. Returns ------- @@ -2374,6 +2445,9 @@ def shift(self, periods=1, freq=None, axis=0, fill_value=None): if not axis == 0: raise NotImplementedError("Only axis=0 is supported.") + if suffix is not None: + raise NotImplementedError("shift is not currently supported.") + values = self.grouping.values if is_list_like(fill_value): if len(fill_value) != len(values._data): @@ -2473,6 +2547,142 @@ def pct_change( shifted = fill_grp.shift(periods=periods, freq=freq) return (filled / shifted) - 1 + def _mimic_pandas_order( + self, result: DataFrameOrSeries + ) -> DataFrameOrSeries: + """Given a groupby result from libcudf, reconstruct the row orders + matching that of pandas. This also adds appropriate indices. + """ + # TODO: copy metadata after this method is a common pattern, should + # merge in this method. + + # This function is used to reorder the results of scan-based + # groupbys which have the same output size as input size. + # However, if the grouping key has NAs and dropna=True, the + # result coming back from libcudf has null_count few rows than + # the input, so we must produce an ordering from the full + # input range. + _, _, (ordering,) = self._groupby.groups( + [as_column(range(0, len(self.obj)))] + ) + if self._dropna and any( + c.has_nulls(include_nan=True) > 0 + for c in self.grouping._key_columns + ): + # Scan aggregations with null/nan keys put nulls in the + # corresponding output rows in pandas, to do that here + # expand the result by reindexing. + ri = cudf.RangeIndex(0, len(self.obj)) + result.index = cudf.Index(ordering) + # This reorders and expands + result = result.reindex(ri) + else: + # Just reorder according to the groupings + result = result.take(ordering.argsort()) + # Now produce the actual index we first thought of + result.index = self.obj.index + return result + + def ohlc(self): + """ + Compute open, high, low and close values of a group, excluding missing values. + + Currently not implemented. + """ + raise NotImplementedError("ohlc is currently not implemented") + + @property + def plot(self): + """ + Make plots of a grouped Series or DataFrame. + + Currently not implemented. + """ + raise NotImplementedError("plot is currently not implemented") + + def resample(self, rule, *args, include_groups: bool = True, **kwargs): + """ + Provide resampling when using a TimeGrouper. + + Currently not implemented. + """ + raise NotImplementedError("resample is currently not implemented") + + def take(self, indices): + """ + Return the elements in the given *positional* indices in each group. + + Currently not implemented. + """ + raise NotImplementedError("take is currently not implemented") + + def filter(self, func, dropna: bool = True, *args, **kwargs): + """ + Filter elements from groups that don't satisfy a criterion. + + Currently not implemented. + """ + raise NotImplementedError("filter is currently not implemented") + + def expanding(self, *args, **kwargs): + """ + Return an expanding grouper, providing expanding + functionality per group. + + Currently not implemented. + """ + raise NotImplementedError("expanding is currently not implemented") + + def ewm(self, *args, **kwargs): + """ + Return an ewm grouper, providing ewm functionality per group. + + Currently not implemented. + """ + raise NotImplementedError("expanding is currently not implemented") + + def any(self, skipna: bool = True): + """ + Return True if any value in the group is truthful, else False. + + Currently not implemented. + """ + raise NotImplementedError("any is currently not implemented") + + def all(self, skipna: bool = True): + """ + Return True if all values in the group are truthful, else False. + + Currently not implemented. + """ + raise NotImplementedError("all is currently not implemented") + + +class DataFrameGroupBy(GroupBy, GetAttrGetItemMixin): + obj: "cudf.core.dataframe.DataFrame" + + _PROTECTED_KEYS = frozenset(("obj",)) + + def _reduce_numeric_only(self, op: str): + columns = list( + name + for name in self.obj._data.names + if ( + is_numeric_dtype(self.obj._data[name].dtype) + and name not in self.grouping.names + ) + ) + return self[columns].agg(op) + + def __getitem__(self, key): + return self.obj[key].groupby( + by=self.grouping.keys, + dropna=self._dropna, + sort=self._sort, + group_keys=self._group_keys, + as_index=self._as_index, + ) + def value_counts( self, subset=None, @@ -2637,68 +2847,112 @@ def value_counts( return result - def _mimic_pandas_order( - self, result: DataFrameOrSeries - ) -> DataFrameOrSeries: - """Given a groupby result from libcudf, reconstruct the row orders - matching that of pandas. This also adds appropriate indices. + @_performance_tracking + def corr( + self, method="pearson", min_periods=1, numeric_only: bool = False + ): """ - # TODO: copy metadata after this method is a common pattern, should - # merge in this method. + Compute pairwise correlation of columns, excluding NA/null values. - # This function is used to reorder the results of scan-based - # groupbys which have the same output size as input size. - # However, if the grouping key has NAs and dropna=True, the - # result coming back from libcudf has null_count few rows than - # the input, so we must produce an ordering from the full - # input range. - _, _, (ordering,) = self._groupby.groups( - [as_column(range(0, len(self.obj)))] - ) - if self._dropna and any( - c.has_nulls(include_nan=True) > 0 - for c in self.grouping._key_columns - ): - # Scan aggregations with null/nan keys put nulls in the - # corresponding output rows in pandas, to do that here - # expand the result by reindexing. - ri = cudf.RangeIndex(0, len(self.obj)) - result.index = cudf.Index(ordering) - # This reorders and expands - result = result.reindex(ri) - else: - # Just reorder according to the groupings - result = result.take(ordering.argsort()) - # Now produce the actual index we first thought of - result.index = self.obj.index - return result + Parameters + ---------- + method: {"pearson", "kendall", "spearman"} or callable, + default "pearson". Currently only the pearson correlation + coefficient is supported. + min_periods: int, optional + Minimum number of observations required per pair of columns + to have a valid result. -class DataFrameGroupBy(GroupBy, GetAttrGetItemMixin): - obj: "cudf.core.dataframe.DataFrame" + Returns + ------- + DataFrame + Correlation matrix. - _PROTECTED_KEYS = frozenset(("obj",)) + Examples + -------- + >>> import cudf + >>> gdf = cudf.DataFrame({ + ... "id": ["a", "a", "a", "b", "b", "b", "c", "c", "c"], + ... "val1": [5, 4, 6, 4, 8, 7, 4, 5, 2], + ... "val2": [4, 5, 6, 1, 2, 9, 8, 5, 1], + ... "val3": [4, 5, 6, 1, 2, 9, 8, 5, 1]}) + >>> gdf + id val1 val2 val3 + 0 a 5 4 4 + 1 a 4 5 5 + 2 a 6 6 6 + 3 b 4 1 1 + 4 b 8 2 2 + 5 b 7 9 9 + 6 c 4 8 8 + 7 c 5 5 5 + 8 c 2 1 1 + >>> gdf.groupby("id").corr(method="pearson") + val1 val2 val3 + id + a val1 1.000000 0.500000 0.500000 + val2 0.500000 1.000000 1.000000 + val3 0.500000 1.000000 1.000000 + b val1 1.000000 0.385727 0.385727 + val2 0.385727 1.000000 1.000000 + val3 0.385727 1.000000 1.000000 + c val1 1.000000 0.714575 0.714575 + val2 0.714575 1.000000 1.000000 + val3 0.714575 1.000000 1.000000 + """ - def _reduce_numeric_only(self, op: str): - columns = list( - name - for name in self.obj._data.names - if ( - is_numeric_dtype(self.obj._data[name].dtype) - and name not in self.grouping.names + if method != "pearson": + raise NotImplementedError( + "Only pearson correlation is currently supported" + ) + if numeric_only is not False: + raise NotImplementedError( + "numeric_only is currently not supported." ) - ) - return self[columns].agg(op) - def __getitem__(self, key): - return self.obj[key].groupby( - by=self.grouping.keys, - dropna=self._dropna, - sort=self._sort, - group_keys=self._group_keys, - as_index=self._as_index, + return self._cov_or_corr( + lambda x: x.corr(method, min_periods), "Correlation" ) + def hist( + self, + column=None, + by=None, + grid: bool = True, + xlabelsize: int | None = None, + xrot: float | None = None, + ylabelsize: int | None = None, + yrot: float | None = None, + ax=None, + sharex: bool = False, + sharey: bool = False, + figsize: tuple[float, float] | None = None, + layout: tuple[int, int] | None = None, + bins: int | abc.Sequence[int] = 10, + backend: str | None = None, + legend: bool = False, + **kwargs, + ): + raise NotImplementedError("hist is not currently implemented") + + def boxplot( + self, + subplots: bool = True, + column=None, + fontsize: int | None = None, + rot: int = 0, + grid: bool = True, + ax=None, + figsize: tuple[float, float] | None = None, + layout=None, + sharex: bool = False, + sharey: bool = True, + backend=None, + **kwargs, + ): + raise NotImplementedError("boxplot is not currently implemented") + DataFrameGroupBy.__doc__ = groupby_doc_template.format(ret="") @@ -2706,8 +2960,10 @@ def __getitem__(self, key): class SeriesGroupBy(GroupBy): obj: "cudf.core.series.Series" - def agg(self, func): - result = super().agg(func) + def agg(self, func, *args, engine=None, engine_kwargs=None, **kwargs): + result = super().agg( + func, *args, engine=engine, engine_kwargs=engine_kwargs, **kwargs + ) # downcast the result to a Series: if len(result._data): @@ -2722,14 +2978,95 @@ def agg(self, func): aggregate = agg - def apply(self, func, *args): - result = super().apply(func, *args) + def apply(self, func, *args, **kwargs): + result = super().apply(func, *args, **kwargs) # apply Series name to result result.name = self.obj.name return result + @property + def dtype(self) -> pd.Series: + raise NotImplementedError("dtype is currently not implemented.") + + def hist( + self, + by=None, + ax=None, + grid: bool = True, + xlabelsize: int | None = None, + xrot: float | None = None, + ylabelsize: int | None = None, + yrot: float | None = None, + figsize: tuple[float, float] | None = None, + bins: int | abc.Sequence[int] = 10, + backend: str | None = None, + legend: bool = False, + **kwargs, + ): + raise NotImplementedError("hist is currently not implemented.") + + @property + def is_monotonic_increasing(self) -> cudf.Series: + """ + Return whether each group's values are monotonically increasing. + + Currently not implemented + """ + raise NotImplementedError( + "is_monotonic_increasing is currently not implemented." + ) + + @property + def is_monotonic_decreasing(self) -> cudf.Series: + """ + Return whether each group's values are monotonically decreasing. + + Currently not implemented + """ + raise NotImplementedError( + "is_monotonic_decreasing is currently not implemented." + ) + + def nlargest( + self, n: int = 5, keep: Literal["first", "last", "all"] = "first" + ) -> cudf.Series: + """ + Return the largest n elements. + + Currently not implemented + """ + raise NotImplementedError("nlargest is currently not implemented.") + + def nsmallest( + self, n: int = 5, keep: Literal["first", "last", "all"] = "first" + ) -> cudf.Series: + """ + Return the smallest n elements. + + Currently not implemented + """ + raise NotImplementedError("nsmallest is currently not implemented.") + + def value_counts( + self, + normalize: bool = False, + sort: bool = True, + ascending: bool = False, + bins=None, + dropna: bool = True, + ) -> cudf.Series | cudf.DataFrame: + raise NotImplementedError("value_counts is currently not implemented.") + + def corr( + self, + other: cudf.Series, + method: str = "pearson", + min_periods: int | None = None, + ) -> cudf.Series: + raise NotImplementedError("corr is currently not implemented.") + SeriesGroupBy.__doc__ = groupby_doc_template.format(ret="") diff --git a/python/cudf/cudf/core/resample.py b/python/cudf/cudf/core/resample.py index 4e0c5bd86b9..715bbf89b15 100644 --- a/python/cudf/cudf/core/resample.py +++ b/python/cudf/cudf/core/resample.py @@ -43,8 +43,10 @@ def __init__(self, obj, by, axis=None, kind=None): by = _ResampleGrouping(obj, by) super().__init__(obj, by=by) - def agg(self, func): - result = super().agg(func) + def agg(self, func, *args, engine=None, engine_kwargs=None, **kwargs): + result = super().agg( + func, *args, engine=engine, engine_kwargs=engine_kwargs, **kwargs + ) if len(self.grouping.bin_labels) != len(result): index = cudf.core.index.Index( self.grouping.bin_labels, name=self.grouping.names[0] diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index 826a0e52f57..74f04c0584f 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -3885,3 +3885,28 @@ def test_group_by_raises_category_error(op): with pytest.raises(TypeError): df.groupby(df.a).agg(op) + + +def test_ngroups(): + pdf = pd.DataFrame({"a": [1, 1, 3], "b": range(3)}) + gdf = cudf.DataFrame.from_pandas(pdf) + + pgb = pdf.groupby("a") + ggb = gdf.groupby("a") + assert pgb.ngroups == ggb.ngroups + assert len(pgb) == len(ggb) + + +def test_ndim(): + pdf = pd.DataFrame({"a": [1, 1, 3], "b": range(3)}) + gdf = cudf.DataFrame.from_pandas(pdf) + + pgb = pdf.groupby("a") + ggb = gdf.groupby("a") + assert pgb.ndim == ggb.ndim + + pser = pd.Series(range(3)) + gser = cudf.Series.from_pandas(pser) + pgb = pser.groupby([0, 0, 1]) + ggb = gser.groupby(cudf.Series([0, 0, 1])) + assert pgb.ndim == ggb.ndim From 6e7624d6b31c93b0547590929ac63ed8e3a48d24 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Mon, 29 Jul 2024 14:06:51 -0400 Subject: [PATCH 09/12] Add stream parameter to reshape APIs (#16410) Adds `stream` parameter to reshape APIs: - `cudf::interleave_columns` - `cudf::tile` - `cudf::byte_cast` Found while working #15983 Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Bradley Dice (https://github.com/bdice) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/16410 --- cpp/include/cudf/detail/reshape.hpp | 4 --- cpp/include/cudf/reshape.hpp | 17 ++++++---- cpp/src/reshape/byte_cast.cu | 11 ++----- cpp/src/reshape/interleave_columns.cu | 3 +- cpp/src/reshape/tile.cu | 3 +- cpp/tests/CMakeLists.txt | 1 + cpp/tests/streams/reshape_test.cpp | 47 +++++++++++++++++++++++++++ 7 files changed, 65 insertions(+), 21 deletions(-) create mode 100644 cpp/tests/streams/reshape_test.cpp diff --git a/cpp/include/cudf/detail/reshape.hpp b/cpp/include/cudf/detail/reshape.hpp index 30f8b88b116..68a856373bf 100644 --- a/cpp/include/cudf/detail/reshape.hpp +++ b/cpp/include/cudf/detail/reshape.hpp @@ -28,8 +28,6 @@ namespace CUDF_EXPORT cudf { namespace detail { /** * @copydoc cudf::tile - * - * @param stream CUDA stream used for device memory operations and kernel launches */ std::unique_ptr tile(table_view const& input, size_type count, @@ -38,8 +36,6 @@ std::unique_ptr
tile(table_view const& input, /** * @copydoc cudf::interleave_columns - * - * @param stream CUDA stream used for device memory operations and kernel launches */ std::unique_ptr interleave_columns(table_view const& input, rmm::cuda_stream_view, diff --git a/cpp/include/cudf/reshape.hpp b/cpp/include/cudf/reshape.hpp index a0a7fe694bb..07aaf6488ad 100644 --- a/cpp/include/cudf/reshape.hpp +++ b/cpp/include/cudf/reshape.hpp @@ -47,13 +47,14 @@ namespace CUDF_EXPORT cudf { * @throws cudf::logic_error if input contains no columns. * @throws cudf::logic_error if input columns dtypes are not identical. * - * @param[in] input Table containing columns to interleave - * @param[in] mr Device memory resource used to allocate the returned column's device memory - * + * @param input Table containing columns to interleave + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory * @return The interleaved columns as a single column */ std::unique_ptr interleave_columns( table_view const& input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @@ -68,15 +69,17 @@ std::unique_ptr interleave_columns( * return = [[8, 4, 7, 8, 4, 7], [5, 2, 3, 5, 2, 3]] * ``` * - * @param[in] input Table containing rows to be repeated - * @param[in] count Number of times to tile "rows". Must be non-negative - * @param[in] mr Device memory resource used to allocate the returned table's device memory + * @param input Table containing rows to be repeated + * @param count Number of times to tile "rows". Must be non-negative + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned table's device memory * * @return The table containing the tiled "rows" */ std::unique_ptr
tile( table_view const& input, size_type count, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @@ -95,6 +98,7 @@ enum class flip_endianness : bool { NO, YES }; * * @param input_column Column to be converted to lists of bytes * @param endian_configuration Whether to retain or flip the endianness of the elements + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * * @return The column containing the lists of bytes @@ -102,6 +106,7 @@ enum class flip_endianness : bool { NO, YES }; std::unique_ptr byte_cast( column_view const& input_column, flip_endianness endian_configuration, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group diff --git a/cpp/src/reshape/byte_cast.cu b/cpp/src/reshape/byte_cast.cu index 3dfa0b65814..2a03a5504c1 100644 --- a/cpp/src/reshape/byte_cast.cu +++ b/cpp/src/reshape/byte_cast.cu @@ -167,11 +167,6 @@ struct byte_list_conversion_fn byte_cast(column_view const& input, flip_endianness endian_configuration, rmm::cuda_stream_view stream, @@ -183,15 +178,13 @@ std::unique_ptr byte_cast(column_view const& input, } // namespace detail -/** - * @copydoc cudf::byte_cast(column_view const&, flip_endianness, rmm::device_async_resource_ref) - */ std::unique_ptr byte_cast(column_view const& input, flip_endianness endian_configuration, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::byte_cast(input, endian_configuration, cudf::get_default_stream(), mr); + return detail::byte_cast(input, endian_configuration, stream, mr); } } // namespace cudf diff --git a/cpp/src/reshape/interleave_columns.cu b/cpp/src/reshape/interleave_columns.cu index 79124508b11..7473b6045af 100644 --- a/cpp/src/reshape/interleave_columns.cu +++ b/cpp/src/reshape/interleave_columns.cu @@ -264,10 +264,11 @@ std::unique_ptr interleave_columns(table_view const& input, } // namespace detail std::unique_ptr interleave_columns(table_view const& input, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::interleave_columns(input, cudf::get_default_stream(), mr); + return detail::interleave_columns(input, stream, mr); } } // namespace cudf diff --git a/cpp/src/reshape/tile.cu b/cpp/src/reshape/tile.cu index 29996aa2152..3d4fb73c000 100644 --- a/cpp/src/reshape/tile.cu +++ b/cpp/src/reshape/tile.cu @@ -64,10 +64,11 @@ std::unique_ptr
tile(table_view const& in, std::unique_ptr
tile(table_view const& in, size_type count, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::tile(in, count, cudf::get_default_stream(), mr); + return detail::tile(in, count, stream, mr); } } // namespace cudf diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 22827484f9a..4dffcb41ba2 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -704,6 +704,7 @@ ConfigureTest(STREAM_PARQUETIO_TEST streams/io/parquet_test.cpp STREAM_MODE test ConfigureTest(STREAM_POOL_TEST streams/pool_test.cu STREAM_MODE testing) ConfigureTest(STREAM_REDUCTION_TEST streams/reduction_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_REPLACE_TEST streams/replace_test.cpp STREAM_MODE testing) +ConfigureTest(STREAM_RESHAPE_TEST streams/reshape_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_ROLLING_TEST streams/rolling_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_SEARCH_TEST streams/search_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_SORTING_TEST streams/sorting_test.cpp STREAM_MODE testing) diff --git a/cpp/tests/streams/reshape_test.cpp b/cpp/tests/streams/reshape_test.cpp new file mode 100644 index 00000000000..d7c5da91bca --- /dev/null +++ b/cpp/tests/streams/reshape_test.cpp @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +#include +#include + +class ReshapeTest : public cudf::test::BaseFixture {}; + +TEST_F(ReshapeTest, InterleaveColumns) +{ + auto a = cudf::test::fixed_width_column_wrapper({0, 3, 6}); + auto b = cudf::test::fixed_width_column_wrapper({1, 4, 7}); + auto c = cudf::test::fixed_width_column_wrapper({2, 5, 8}); + cudf::table_view in(std::vector{a, b, c}); + cudf::interleave_columns(in, cudf::test::get_default_stream()); +} + +TEST_F(ReshapeTest, Tile) +{ + auto a = cudf::test::fixed_width_column_wrapper({-1, 0, 1}); + cudf::table_view in(std::vector{a}); + cudf::tile(in, 2, cudf::test::get_default_stream()); +} + +TEST_F(ReshapeTest, ByteCast) +{ + auto a = cudf::test::fixed_width_column_wrapper({0, 100, -100, 1000, 1000}); + cudf::byte_cast(a, cudf::flip_endianness::YES, cudf::test::get_default_stream()); + cudf::byte_cast(a, cudf::flip_endianness::NO, cudf::test::get_default_stream()); +} From 35796057b64e258713d4d89ba368837d30a1a9c5 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 29 Jul 2024 08:33:23 -1000 Subject: [PATCH 10/12] Align misc DataFrame and MultiIndex methods with pandas 2.x (#16402) The API changes in this PR are mostly adding implementations or adding missing keyword argument (although they might not be implemented). The APIs affected are: * `DataFrame.insert` * `DataFrame.melt` * `DataFrame.merge` * `DataFrame.quantile` * `DataFrame.cov` * `DataFrame.corr` * `DataFrame.median` * `DataFrame.rolling` * `DataFrame.resample` * `DataFrame.dropna` * `MultiIndex.from_tuple` * `MultiIndex.from_frame` * `MultiIndex.from_product` Authors: - Matthew Roeschke (https://github.com/mroeschke) - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/16402 --- python/cudf/cudf/core/dataframe.py | 106 +++++++++++++++++------- python/cudf/cudf/core/indexed_frame.py | 81 +++++++++++------- python/cudf/cudf/core/multiindex.py | 38 +++++++-- python/cudf/cudf/core/reshape.py | 3 + python/cudf/cudf/core/window/ewm.py | 23 +++-- python/cudf/cudf/core/window/rolling.py | 27 +++++- python/cudf/cudf/tests/test_dropna.py | 9 ++ 7 files changed, 211 insertions(+), 76 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 1d7136e61e3..6ea11fe9f64 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -3215,26 +3215,37 @@ def reset_index( ) @_performance_tracking - def insert(self, loc, name, value, nan_as_null=no_default): + def insert( + self, + loc, + column, + value, + allow_duplicates: bool = False, + nan_as_null=no_default, + ): """Add a column to DataFrame at the index specified by loc. Parameters ---------- loc : int location to insert by index, cannot be greater then num columns + 1 - name : number or string - name or label of column to be inserted + column : number or string + column or label of column to be inserted value : Series or array-like nan_as_null : bool, Default None If ``None``/``True``, converts ``np.nan`` values to ``null`` values. If ``False``, leaves ``np.nan`` values as is. """ + if allow_duplicates is not False: + raise NotImplementedError( + "allow_duplicates is currently not implemented." + ) if nan_as_null is no_default: nan_as_null = not cudf.get_option("mode.pandas_compatible") return self._insert( loc=loc, - name=name, + name=column, value=value, nan_as_null=nan_as_null, ignore_index=False, @@ -4097,7 +4108,15 @@ def transpose(self): T = property(transpose, doc=transpose.__doc__) @_performance_tracking - def melt(self, **kwargs): + def melt( + self, + id_vars=None, + value_vars=None, + var_name=None, + value_name="value", + col_level=None, + ignore_index: bool = True, + ): """Unpivots a DataFrame from wide format to long format, optionally leaving identifier variables set. @@ -4124,23 +4143,30 @@ def melt(self, **kwargs): """ from cudf.core.reshape import melt - return melt(self, **kwargs) + return melt( + self, + id_vars=id_vars, + value_vars=value_vars, + var_name=var_name, + value_name=value_name, + col_level=col_level, + ignore_index=ignore_index, + ) @_performance_tracking def merge( self, right, + how="inner", on=None, left_on=None, right_on=None, left_index=False, right_index=False, - how="inner", sort=False, - lsuffix=None, - rsuffix=None, - indicator=False, suffixes=("_x", "_y"), + indicator=False, + validate=None, ): """Merge GPU DataFrame objects by performing a database-style join operation by columns or indexes. @@ -4241,17 +4267,8 @@ def merge( raise NotImplementedError( "Only indicator=False is currently supported" ) - - if lsuffix or rsuffix: - raise ValueError( - "The lsuffix and rsuffix keywords have been replaced with the " - "``suffixes=`` keyword. " - "Please provide the following instead: \n\n" - " suffixes=('%s', '%s')" - % (lsuffix or "_x", rsuffix or "_y") - ) - else: - lsuffix, rsuffix = suffixes + if validate is not None: + raise NotImplementedError("validate is currently not supported.") lhs, rhs = self, right merge_cls = Merge @@ -5952,9 +5969,9 @@ def quantile( axis=0, numeric_only=True, interpolation=None, + method="single", columns=None, exact=True, - method="single", ): """ Return values at the given quantile. @@ -5980,14 +5997,14 @@ def quantile( * higher: `j`. * nearest: `i` or `j` whichever is nearest. * midpoint: (`i` + `j`) / 2. - columns : list of str - List of column names to include. - exact : boolean - Whether to use approximate or exact quantile algorithm. method : {'single', 'table'}, default `'single'` Whether to compute quantiles per-column ('single') or over all columns ('table'). When 'table', the only allowed interpolation methods are 'nearest', 'lower', and 'higher'. + columns : list of str + List of column names to include. + exact : boolean + Whether to use approximate or exact quantile algorithm. Returns ------- @@ -7309,25 +7326,47 @@ def unnamed_group_generator(): return result @_performance_tracking - def cov(self, **kwargs): + def cov(self, min_periods=None, ddof: int = 1, numeric_only: bool = False): """Compute the covariance matrix of a DataFrame. Parameters ---------- - **kwargs - Keyword arguments to be passed to cupy.cov + min_periods : int, optional + Minimum number of observations required per pair of columns to + have a valid result. + Currently not supported. + + ddof : int, default 1 + Delta degrees of freedom. The divisor used in calculations + is ``N - ddof``, where ``N`` represents the number of elements. + + numeric_only : bool, default False + Include only `float`, `int` or `boolean` data. + Currently not supported. Returns ------- cov : DataFrame """ - cov = cupy.cov(self.values, rowvar=False) + if min_periods is not None: + raise NotImplementedError( + "min_periods is currently not supported." + ) + + if numeric_only is not False: + raise NotImplementedError( + "numeric_only is currently not supported." + ) + + cov = cupy.cov(self.values, ddof=ddof, rowvar=False) cols = self._data.to_pandas_index() df = DataFrame(cupy.asfortranarray(cov)).set_index(cols) df._set_columns_like(self._data) return df - def corr(self, method="pearson", min_periods=None): + def corr( + self, method="pearson", min_periods=None, numeric_only: bool = False + ): """Compute the correlation matrix of a DataFrame. Parameters @@ -7357,6 +7396,11 @@ def corr(self, method="pearson", min_periods=None): if min_periods is not None: raise NotImplementedError("Unsupported argument 'min_periods'") + if numeric_only is not False: + raise NotImplementedError( + "numeric_only is currently not supported." + ) + corr = cupy.corrcoef(values, rowvar=False) cols = self._data.to_pandas_index() df = DataFrame(cupy.asfortranarray(corr)).set_index(cols) diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index e14f8923c25..0678ebfdd81 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -1495,9 +1495,7 @@ def mean(self, axis=0, skipna=True, numeric_only=False, **kwargs): **kwargs, ) - def median( - self, axis=None, skipna=True, level=None, numeric_only=None, **kwargs - ): + def median(self, axis=None, skipna=True, numeric_only=None, **kwargs): """ Return the median of the values for the requested axis. @@ -1857,7 +1855,16 @@ def mask( @_performance_tracking @copy_docstring(Rolling) def rolling( - self, window, min_periods=None, center=False, axis=0, win_type=None + self, + window, + min_periods=None, + center: bool = False, + win_type: str | None = None, + on=None, + axis=0, + closed: str | None = None, + step: int | None = None, + method: str = "single", ): return Rolling( self, @@ -1865,7 +1872,11 @@ def rolling( min_periods=min_periods, center=center, axis=axis, + on=on, win_type=win_type, + closed=closed, + step=step, + method=method, ) @copy_docstring(ExponentialMovingWindow) @@ -1880,6 +1891,7 @@ def ewm( ignore_na: bool = False, axis: int = 0, times: str | np.ndarray | None = None, + method: Literal["single", "table"] = "single", ): return ExponentialMovingWindow( self, @@ -1892,6 +1904,7 @@ def ewm( ignore_na=ignore_na, axis=axis, times=times, + method=method, ) @_performance_tracking @@ -3943,16 +3956,15 @@ def resample( self, rule, axis=0, - closed=None, - label=None, - convention="start", + closed: Literal["right", "left"] | None = None, + label: Literal["right", "left"] | None = None, + convention: Literal["start", "end", "s", "e"] = "start", kind=None, - loffset=None, - base=None, on=None, level=None, origin="start_day", offset=None, + group_keys: bool = False, ): """ Convert the frequency of ("resample") the given time series data. @@ -4090,26 +4102,27 @@ def resample( "deprecated and will be removed in a future version. ", FutureWarning, ) - if (axis, convention, kind, loffset, base, origin, offset) != ( - 0, - "start", - None, - None, - None, - "start_day", - None, - ): - raise NotImplementedError( - "The following arguments are not " - "currently supported by resample:\n\n" - "- axis\n" - "- convention\n" - "- kind\n" - "- loffset\n" - "- base\n" - "- origin\n" - "- offset" + raise NotImplementedError("kind is currently not supported.") + if axis != 0: + warnings.warn( + "The 'axis' keyword in is " + "deprecated and will be removed in a future version. ", + FutureWarning, ) + raise NotImplementedError("axis is currently not supported.") + if convention != "start": + warnings.warn( + "The 'convention' keyword in is " + "deprecated and will be removed in a future version. ", + FutureWarning, + ) + raise NotImplementedError("convention is currently not supported.") + if origin != "start_day": + raise NotImplementedError("origin is currently not supported.") + if offset is not None: + raise NotImplementedError("offset is currently not supported.") + if group_keys is not False: + raise NotImplementedError("group_keys is currently not supported.") by = cudf.Grouper( key=on, freq=rule, closed=closed, label=label, level=level ) @@ -4120,7 +4133,13 @@ def resample( ) def dropna( - self, axis=0, how="any", thresh=None, subset=None, inplace=False + self, + axis=0, + how="any", + thresh=None, + subset=None, + inplace=False, + ignore_index: bool = False, ): """ Drop rows (or columns) containing nulls from a Column. @@ -4144,6 +4163,8 @@ def dropna( columns, subset is a list of rows to consider. inplace : bool, default False If True, do operation inplace and return None. + ignore_index : bool, default ``False`` + If ``True``, the resulting axis will be labeled 0, 1, …, n - 1. Returns ------- @@ -4220,6 +4241,8 @@ def dropna( """ if axis == 0: result = self._drop_na_rows(how=how, subset=subset, thresh=thresh) + if ignore_index: + result.index = RangeIndex(len(result)) else: result = self._drop_na_columns( how=how, subset=subset, thresh=thresh diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index dfc596bf279..0e1fddd7ed5 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -524,8 +524,10 @@ def codes(self): col.values for col in self._codes ) - def get_slice_bound(self, label, side, kind=None): - raise NotImplementedError() + def get_slice_bound(self, label, side): + raise NotImplementedError( + "get_slice_bound is not currently implemented." + ) @property # type: ignore @_performance_tracking @@ -1108,7 +1110,7 @@ def _concat(cls, objs): @classmethod @_performance_tracking - def from_tuples(cls, tuples, names=None): + def from_tuples(cls, tuples, sortorder: int | None = None, names=None): """ Convert list of tuples to MultiIndex. @@ -1116,6 +1118,9 @@ def from_tuples(cls, tuples, names=None): ---------- tuples : list / sequence of tuple-likes Each tuple is the index of one row/column. + sortorder : int or None + Level of sortedness (must be lexicographically sorted by that + level). names : list / sequence of str, optional Names for the levels in the index. @@ -1142,7 +1147,9 @@ def from_tuples(cls, tuples, names=None): names=['number', 'color']) """ # Use Pandas for handling Python host objects - pdi = pd.MultiIndex.from_tuples(tuples, names=names) + pdi = pd.MultiIndex.from_tuples( + tuples, sortorder=sortorder, names=names + ) return cls.from_pandas(pdi) @_performance_tracking @@ -1215,7 +1222,12 @@ def values(self): @classmethod @_performance_tracking - def from_frame(cls, df: pd.DataFrame | cudf.DataFrame, names=None): + def from_frame( + cls, + df: pd.DataFrame | cudf.DataFrame, + sortorder: int | None = None, + names=None, + ): """ Make a MultiIndex from a DataFrame. @@ -1223,6 +1235,9 @@ def from_frame(cls, df: pd.DataFrame | cudf.DataFrame, names=None): ---------- df : DataFrame DataFrame to be converted to MultiIndex. + sortorder : int, optional + Level of sortedness (must be lexicographically sorted by that + level). names : list-like, optional If no names are provided, use the column names, or tuple of column names if the columns is a MultiIndex. If a sequence, overwrite @@ -1273,11 +1288,13 @@ def from_frame(cls, df: pd.DataFrame | cudf.DataFrame, names=None): else: source_data = df names = names if names is not None else source_data._column_names - return cls.from_arrays(source_data._columns, names=names) + return cls.from_arrays( + source_data._columns, sortorder=sortorder, names=names + ) @classmethod @_performance_tracking - def from_product(cls, arrays, names=None): + def from_product(cls, iterables, sortorder: int | None = None, names=None): """ Make a MultiIndex from the cartesian product of multiple iterables. @@ -1285,6 +1302,9 @@ def from_product(cls, arrays, names=None): ---------- iterables : list / sequence of iterables Each iterable has unique labels for each level of the index. + sortorder : int or None + Level of sortedness (must be lexicographically sorted by that + level). names : list / sequence of str, optional Names for the levels in the index. If not explicitly provided, names will be inferred from the @@ -1314,7 +1334,9 @@ def from_product(cls, arrays, names=None): names=['number', 'color']) """ # Use Pandas for handling Python host objects - pdi = pd.MultiIndex.from_product(arrays, names=names) + pdi = pd.MultiIndex.from_product( + iterables, sortorder=sortorder, names=names + ) return cls.from_pandas(pdi) @classmethod diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index a542c5f5969..e7248977b1d 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -502,6 +502,7 @@ def melt( var_name=None, value_name="value", col_level=None, + ignore_index: bool = True, ): """Unpivots a DataFrame from wide format to long format, optionally leaving identifier variables set. @@ -566,6 +567,8 @@ def melt( """ if col_level is not None: raise NotImplementedError("col_level != None is not supported yet.") + if ignore_index is not True: + raise NotImplementedError("ignore_index is currently not supported.") # Arg cleaning diff --git a/python/cudf/cudf/core/window/ewm.py b/python/cudf/cudf/core/window/ewm.py index 1203a840076..ef0f6958aeb 100644 --- a/python/cudf/cudf/core/window/ewm.py +++ b/python/cudf/cudf/core/window/ewm.py @@ -1,7 +1,9 @@ # Copyright (c) 2022-2024, NVIDIA CORPORATION. - from __future__ import annotations +import warnings +from typing import Literal + import numpy as np from cudf._lib.reduce import scan @@ -103,13 +105,24 @@ def __init__( ignore_na: bool = False, axis: int = 0, times: str | np.ndarray | None = None, + method: Literal["single", "table"] = "single", ): - if (min_periods, ignore_na, axis, times) != (0, False, 0, None): + if min_periods != 0: raise NotImplementedError( - "The parameters `min_periods`, `ignore_na`, " - "`axis`, and `times` are not yet supported." + "min_periods is currently not supported." ) - + if ignore_na is not False: + raise NotImplementedError("ignore_na is currently not supported.") + if axis != 0: + warnings.warn( + "axis is deprecated with will be removed in a future version. " + "Transpose the DataFrame first instead." + ) + raise NotImplementedError("axis is currently not supported.") + if times is not None: + raise NotImplementedError("times is currently not supported.") + if method != "single": + raise NotImplementedError("method is currently not supported.") self.obj = obj self.adjust = adjust self.com = get_center_of_mass(com, span, halflife, alpha) diff --git a/python/cudf/cudf/core/window/rolling.py b/python/cudf/cudf/core/window/rolling.py index 29391c68471..043a41145e5 100644 --- a/python/cudf/cudf/core/window/rolling.py +++ b/python/cudf/cudf/core/window/rolling.py @@ -1,4 +1,7 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION +from __future__ import annotations + +import warnings import numba import pandas as pd @@ -196,17 +199,26 @@ def __init__( obj, window, min_periods=None, - center=False, + center: bool = False, + win_type: str | None = None, + on=None, axis=0, - win_type=None, + closed: str | None = None, + step: int | None = None, + method: str = "single", ): self.obj = obj self.window = window self.min_periods = min_periods self.center = center self._normalize() - self.agg_params = {} + # for var & std only? + self.agg_params: dict[str, int] = {} if axis != 0: + warnings.warn( + "axis is deprecated with will be removed in a future version. " + "Transpose the DataFrame first instead." + ) raise NotImplementedError("axis != 0 is not supported yet.") self.axis = axis @@ -217,6 +229,15 @@ def __init__( ) self.win_type = win_type + if on is not None: + raise NotImplementedError("on is currently not supported") + if closed not in (None, "right"): + raise NotImplementedError("closed is currently not supported") + if step is not None: + raise NotImplementedError("step is currently not supported") + if method != "single": + raise NotImplementedError("method is currently not supported") + def __getitem__(self, arg): if isinstance(arg, tuple): arg = list(arg) diff --git a/python/cudf/cudf/tests/test_dropna.py b/python/cudf/cudf/tests/test_dropna.py index ed0cf0053ea..5b1ee0ffac6 100644 --- a/python/cudf/cudf/tests/test_dropna.py +++ b/python/cudf/cudf/tests/test_dropna.py @@ -284,3 +284,12 @@ def test_dropna_multiindex_2(data, how): got = gi.dropna(how) assert_eq(expect, got) + + +def test_ignore_index(): + pser = pd.Series([1, 2, np.nan], index=[2, 4, 1]) + gser = cudf.from_pandas(pser) + + result = pser.dropna(ignore_index=True) + expected = gser.dropna(ignore_index=True) + assert_eq(result, expected) From 743e16426c564d0ed0d7e3d9be5f67e4605c4f32 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Mon, 29 Jul 2024 14:19:43 -0500 Subject: [PATCH 11/12] update some branch references in GitHub Actions configs (#16397) Fixes some lingering references to `branch-24.08` in the `pr_issue_status_automation` CI workflow. This was missed when new branches were cut because that file ends in `.yml` and `update-version.sh` was only modifying files ending in `.yaml`. The corresponding `update-version.sh` changes were made in #16183 and are already on 24.10 thanks to forward mergers. https://github.com/rapidsai/cudf/blob/dc05a01f3fc0742c5fbbddd86a0f2007bfdc2050/ci/release/update-version.sh#L78 ## Notes for Reviewers I checked like this, and don't see any other missed references: ```shell git grep -E '24\.8|24\.08|0\.39' ``` Authors: - James Lamb (https://github.com/jameslamb) Approvers: - Kyle Edwards (https://github.com/KyleFromNVIDIA) URL: https://github.com/rapidsai/cudf/pull/16397 --- .github/workflows/pr_issue_status_automation.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/pr_issue_status_automation.yml b/.github/workflows/pr_issue_status_automation.yml index 8ca971dc28d..45e5191eb54 100644 --- a/.github/workflows/pr_issue_status_automation.yml +++ b/.github/workflows/pr_issue_status_automation.yml @@ -23,7 +23,7 @@ on: jobs: get-project-id: - uses: rapidsai/shared-workflows/.github/workflows/project-get-item-id.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/project-get-item-id.yaml@branch-24.10 if: github.event.pull_request.state == 'open' secrets: inherit permissions: @@ -34,7 +34,7 @@ jobs: update-status: # This job sets the PR and its linked issues to "In Progress" status - uses: rapidsai/shared-workflows/.github/workflows/project-get-set-single-select-field.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/project-get-set-single-select-field.yaml@branch-24.10 if: ${{ github.event.pull_request.state == 'open' && needs.get-project-id.outputs.ITEM_PROJECT_ID != '' }} needs: get-project-id with: @@ -50,7 +50,7 @@ jobs: update-sprint: # This job sets the PR and its linked issues to the current "Weekly Sprint" - uses: rapidsai/shared-workflows/.github/workflows/project-get-set-iteration-field.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/project-get-set-iteration-field.yaml@branch-24.10 if: ${{ github.event.pull_request.state == 'open' && needs.get-project-id.outputs.ITEM_PROJECT_ID != '' }} needs: get-project-id with: From f8eb63e499f94d583d715f5c1f5e6f234589be57 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 29 Jul 2024 12:39:19 -1000 Subject: [PATCH 12/12] Align Index APIs with pandas 2.x (#16361) Similar to https://github.com/rapidsai/cudf/pull/16310, the follow APIs have been modified to adjust/add parameters * `to_flat_index` * `isin` * `unique` * `transpose` Authors: - Matthew Roeschke (https://github.com/mroeschke) - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/16361 --- docs/cudf/source/conf.py | 5 ++++ python/cudf/cudf/core/_base_index.py | 25 ++++++++++++++++++-- python/cudf/cudf/core/index.py | 24 +++++++++++++++---- python/cudf/cudf/core/multiindex.py | 16 +++++++++++-- python/cudf/cudf/core/series.py | 8 ------- python/cudf/cudf/core/single_column_frame.py | 7 ++++++ python/cudf/cudf/tests/test_multiindex.py | 9 +++++++ 7 files changed, 78 insertions(+), 16 deletions(-) diff --git a/docs/cudf/source/conf.py b/docs/cudf/source/conf.py index f544536fb31..7421d9be298 100644 --- a/docs/cudf/source/conf.py +++ b/docs/cudf/source/conf.py @@ -561,6 +561,11 @@ def on_missing_reference(app, env, node, contnode): ("py:class", "ScalarLike"), ("py:class", "ParentType"), ("py:class", "ColumnLike"), + ("py:class", "ColumnLike"), + ("py:obj", "cudf.Index.transpose"), + ("py:obj", "cudf.Index.T"), + ("py:obj", "cudf.Index.to_flat_index"), + ("py:obj", "cudf.MultiIndex.to_flat_index"), # TODO: Remove this when we figure out why typing_extensions doesn't seem # to map types correctly for intersphinx ("py:class", "typing_extensions.Self"), diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index 8fad82c5c46..c91514202c5 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -868,6 +868,24 @@ def to_numpy(self): """Convert to a numpy array.""" raise NotImplementedError + def to_flat_index(self) -> Self: + """ + Identity method. + + This is implemented for compatibility with subclass implementations + when chaining. + + Returns + ------- + pd.Index + Caller. + + See Also + -------- + MultiIndex.to_flat_index : Subclass implementation. + """ + return self + def any(self): """ Return whether any elements is True in Index. @@ -945,7 +963,7 @@ def to_pandas(self, *, nullable: bool = False, arrow_type: bool = False): """ raise NotImplementedError - def isin(self, values): + def isin(self, values, level=None): """Return a boolean array where the index values are in values. Compute boolean array of whether each index value is found in @@ -956,6 +974,9 @@ def isin(self, values): ---------- values : set, list-like, Index Sought values. + level : str or int, optional + Name or position of the index level to use (if the index is a + `MultiIndex`). Returns ------- @@ -979,7 +1000,7 @@ def isin(self, values): # ColumnBase.isin). raise NotImplementedError - def unique(self): + def unique(self, level: int | None = None): """ Return unique values in the index. diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 1c48b8f4f2d..156cb973a9a 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -540,8 +540,12 @@ def memory_usage(self, deep: bool = False) -> int: ) return 0 - def unique(self) -> Self: + def unique(self, level: int | None = None) -> Self: # RangeIndex always has unique values + if level is not None and level > 0: + raise IndexError( + f"Too many levels: Index has only 1 level, not {level + 1}" + ) return self.copy() @_performance_tracking @@ -964,7 +968,11 @@ def _indices_of(self, value) -> cudf.core.column.NumericalColumn: i = [] return as_column(i, dtype=size_type_dtype) - def isin(self, values): + def isin(self, values, level=None): + if level is not None and level > 0: + raise IndexError( + f"Too many levels: Index has only 1 level, not {level + 1}" + ) if is_scalar(values): raise TypeError( "only list-like objects are allowed to be passed " @@ -1616,12 +1624,20 @@ def append(self, other): return self._concat(to_concat) - def unique(self): + def unique(self, level: int | None = None) -> Self: + if level is not None and level > 0: + raise IndexError( + f"Too many levels: Index has only 1 level, not {level + 1}" + ) return cudf.core.index._index_from_data( {self.name: self._values.unique()}, name=self.name ) - def isin(self, values): + def isin(self, values, level=None): + if level is not None and level > 0: + raise IndexError( + f"Too many levels: Index has only 1 level, not {level + 1}" + ) if is_scalar(values): raise TypeError( "only list-like objects are allowed to be passed " diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 0e1fddd7ed5..2788455aebf 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -1156,6 +1156,15 @@ def from_tuples(cls, tuples, sortorder: int | None = None, names=None): def to_numpy(self): return self.values_host + def to_flat_index(self): + """ + Convert a MultiIndex to an Index of Tuples containing the level values. + + This is not currently implemented + """ + # TODO: Could implement as Index of ListDtype? + raise NotImplementedError("to_flat_index is not currently supported.") + @property # type: ignore @_performance_tracking def values_host(self): @@ -1734,8 +1743,11 @@ def fillna(self, value): return super().fillna(value=value) @_performance_tracking - def unique(self): - return self.drop_duplicates(keep="first") + def unique(self, level: int | None = None) -> Self | cudf.Index: + if level is None: + return self.drop_duplicates(keep="first") + else: + return self.get_level_values(level).unique() @_performance_tracking def nunique(self, dropna: bool = True) -> int: diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 8277ccf68fc..10ac1fdfc1e 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -2775,14 +2775,6 @@ def cov(self, other, min_periods=None, ddof: int | None = None): f"{other.dtype}" ) - @_performance_tracking - def transpose(self): - """Return the transpose, which is by definition self.""" - - return self - - T = property(transpose, doc=transpose.__doc__) - @_performance_tracking def duplicated(self, keep="first"): """ diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py index b93528f9693..a5ff1223791 100644 --- a/python/cudf/cudf/core/single_column_frame.py +++ b/python/cudf/cudf/core/single_column_frame.py @@ -389,3 +389,10 @@ def where(self, cond, other=None, inplace=False): result = cudf._lib.copying.copy_if_else(input_col, other, cond) return _make_categorical_like(result, self_column) + + @_performance_tracking + def transpose(self): + """Return the transpose, which is by definition self.""" + return self + + T = property(transpose, doc=transpose.__doc__) diff --git a/python/cudf/cudf/tests/test_multiindex.py b/python/cudf/cudf/tests/test_multiindex.py index 2c00d48266c..b7314a36e73 100644 --- a/python/cudf/cudf/tests/test_multiindex.py +++ b/python/cudf/cudf/tests/test_multiindex.py @@ -2170,3 +2170,12 @@ def test_bool_raises(): lfunc_args_and_kwargs=[[cudf.MultiIndex.from_arrays([range(1)])]], rfunc_args_and_kwargs=[[pd.MultiIndex.from_arrays([range(1)])]], ) + + +def test_unique_level(): + pd_mi = pd.MultiIndex.from_arrays([[1, 1, 2], [3, 3, 2]]) + cudf_mi = cudf.MultiIndex.from_pandas(pd_mi) + + result = pd_mi.unique(level=1) + expected = cudf_mi.unique(level=1) + assert_eq(result, expected)