From 31b33b90430a4f2496fcf1a42778bcd8e070c87c Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Thu, 13 Jun 2024 08:58:02 +0100 Subject: [PATCH 01/25] Add tests of implemented StringFunctions (#16007) Additionally, assert that we raise during translation for an unhandled function. Authors: - Lawrence Mitchell (https://github.com/wence-) Approvers: - https://github.com/brandon-b-miller URL: https://github.com/rapidsai/cudf/pull/16007 --- python/cudf_polars/cudf_polars/dsl/expr.py | 4 +- .../tests/expressions/test_stringfunction.py | 41 +++++++++++++++++++ 2 files changed, 44 insertions(+), 1 deletion(-) create mode 100644 python/cudf_polars/tests/expressions/test_stringfunction.py diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py index 377a905aed6..298ef5ab070 100644 --- a/python/cudf_polars/cudf_polars/dsl/expr.py +++ b/python/cudf_polars/cudf_polars/dsl/expr.py @@ -691,7 +691,9 @@ def do_evaluate( ) ) else: - raise NotImplementedError(f"StringFunction {self.name}") + raise NotImplementedError( + f"StringFunction {self.name}" + ) # pragma: no cover; handled by init raising class Sort(Expr): diff --git a/python/cudf_polars/tests/expressions/test_stringfunction.py b/python/cudf_polars/tests/expressions/test_stringfunction.py new file mode 100644 index 00000000000..198f35d376b --- /dev/null +++ b/python/cudf_polars/tests/expressions/test_stringfunction.py @@ -0,0 +1,41 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + +import pytest + +import polars as pl + +from cudf_polars import translate_ir +from cudf_polars.testing.asserts import assert_gpu_result_equal + + +def test_supported_stringfunction_expression(): + ldf = pl.LazyFrame( + { + "a": ["a", "b", "cdefg", "h", "Wıth ünιcοde"], # noqa: RUF001 + "b": [0, 3, 1, -1, None], + } + ) + + query = ldf.select( + pl.col("a").str.starts_with("Z"), + pl.col("a").str.ends_with("h").alias("endswith_h"), + pl.col("a").str.to_lowercase().alias("lower"), + pl.col("a").str.to_uppercase().alias("upper"), + ) + assert_gpu_result_equal(query) + + +def test_unsupported_stringfunction(): + ldf = pl.LazyFrame( + { + "a": ["a", "b", "cdefg", "h", "Wıth ünιcοde"], # noqa: RUF001 + "b": [0, 3, 1, -1, None], + } + ) + + q = ldf.select(pl.col("a").str.count_matches("e", literal=True)) + + with pytest.raises(NotImplementedError): + _ = translate_ir(q._ldf.visit()) From 8bbc5121b2dec93d24337d399ff6616bbb971a06 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Thu, 13 Jun 2024 08:58:27 +0100 Subject: [PATCH 02/25] Add coverage selecting len from a dataframe (number of rows) (#16005) Fix bug (and report a polars issue) for the case that the dataframe is empty, and therefore we cannot ask a column for its length. 
Authors: - Lawrence Mitchell (https://github.com/wence-) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/16005 --- .../cudf_polars/containers/dataframe.py | 2 +- .../cudf_polars/tests/expressions/test_len.py | 26 +++++++++++++++++++ 2 files changed, 27 insertions(+), 1 deletion(-) create mode 100644 python/cudf_polars/tests/expressions/test_len.py diff --git a/python/cudf_polars/cudf_polars/containers/dataframe.py b/python/cudf_polars/cudf_polars/containers/dataframe.py index d1f7a9ed2cf..ec8d00c3123 100644 --- a/python/cudf_polars/cudf_polars/containers/dataframe.py +++ b/python/cudf_polars/cudf_polars/containers/dataframe.py @@ -70,7 +70,7 @@ def num_columns(self) -> int: @cached_property def num_rows(self) -> int: """Number of rows.""" - return self.table.num_rows() + return 0 if len(self.columns) == 0 else self.table.num_rows() @classmethod def from_cudf(cls, df: cudf.DataFrame) -> Self: diff --git a/python/cudf_polars/tests/expressions/test_len.py b/python/cudf_polars/tests/expressions/test_len.py new file mode 100644 index 00000000000..03b30928184 --- /dev/null +++ b/python/cudf_polars/tests/expressions/test_len.py @@ -0,0 +1,26 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + +import pytest + +import polars as pl + +from cudf_polars.testing.asserts import assert_gpu_result_equal + + +@pytest.mark.parametrize("dtype", [pl.UInt32, pl.Int32, None]) +@pytest.mark.parametrize("empty", [False, True]) +def test_len(dtype, empty): + if empty: + df = pl.LazyFrame({}) + else: + df = pl.LazyFrame({"a": [1, 2, 3]}) + + if dtype is None: + q = df.select(pl.len()) + else: + q = df.select(pl.len().cast(dtype)) + + # Workaround for https://github.com/pola-rs/polars/issues/16904 + assert_gpu_result_equal(q, collect_kwargs={"projection_pushdown": False}) From af09d3e60e4ac4c86602e4e47e58cdb47a02b22c Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Thu, 13 Jun 2024 08:58:46 +0100 Subject: [PATCH 03/25] Raise early on unhandled PythonScan node (#15992) Add test of the behaviour. Authors: - Lawrence Mitchell (https://github.com/wence-) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/15992 --- python/cudf_polars/cudf_polars/dsl/ir.py | 4 ++++ python/cudf_polars/tests/test_python_scan.py | 20 ++++++++++++++++++++ 2 files changed, 24 insertions(+) create mode 100644 python/cudf_polars/tests/test_python_scan.py diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index 46241ab8e71..9fb2468e4e9 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -165,6 +165,10 @@ class PythonScan(IR): predicate: expr.NamedExpr | None """Filter to apply to the constructed dataframe before returning it.""" + def __post_init__(self): + """Validate preconditions.""" + raise NotImplementedError("PythonScan not implemented") + @dataclasses.dataclass(slots=True) class Scan(IR): diff --git a/python/cudf_polars/tests/test_python_scan.py b/python/cudf_polars/tests/test_python_scan.py new file mode 100644 index 00000000000..c03474e3dc8 --- /dev/null +++ 
b/python/cudf_polars/tests/test_python_scan.py @@ -0,0 +1,20 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + +import pytest + +import polars as pl + +from cudf_polars import translate_ir + + +def test_python_scan(): + def source(with_columns, predicate, nrows): + return pl.DataFrame({"a": pl.Series([1, 2, 3], dtype=pl.Int8())}) + + q = pl.LazyFrame._scan_python_function({"a": pl.Int8}, source, pyarrow=False) + with pytest.raises(NotImplementedError): + _ = translate_ir(q._ldf.visit()) + + assert q.collect().equals(source(None, None, None)) From 246d017669cbeca3570106b4bb52a92f931ea2c1 Mon Sep 17 00:00:00 2001 From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com> Date: Thu, 13 Jun 2024 09:33:43 -0500 Subject: [PATCH 04/25] Plumb pylibcudf strings `contains_re` through cudf_polars (#15918) This PR adds cudf-polars code for evaluating the `StringFunction.Contains` expression node. 
Depends on https://github.com/rapidsai/cudf/pull/15880/ Authors: - https://github.com/brandon-b-miller - Lawrence Mitchell (https://github.com/wence-) Approvers: - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/15918 --- python/cudf_polars/cudf_polars/dsl/expr.py | 51 ++++++++++++++++++ python/cudf_polars/tests/test_string.py | 61 ++++++++++++++++++++++ 2 files changed, 112 insertions(+) create mode 100644 python/cudf_polars/tests/test_string.py diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py index 298ef5ab070..03c1db68dbd 100644 --- a/python/cudf_polars/cudf_polars/dsl/expr.py +++ b/python/cudf_polars/cudf_polars/dsl/expr.py @@ -644,13 +644,28 @@ def __init__( self.options = options self.name = name self.children = children + self._validate_input() + + def _validate_input(self): if self.name not in ( pl_expr.StringFunction.Lowercase, pl_expr.StringFunction.Uppercase, pl_expr.StringFunction.EndsWith, pl_expr.StringFunction.StartsWith, + pl_expr.StringFunction.Contains, ): raise NotImplementedError(f"String function {self.name}") + if self.name == pl_expr.StringFunction.Contains: + literal, strict = self.options + if not literal: + if not strict: + raise NotImplementedError( + "f{strict=} is not supported for regex contains" + ) + if not isinstance(self.children[1], Literal): + raise NotImplementedError( + "Regex contains only supports a scalar pattern" + ) def do_evaluate( self, @@ -660,6 +675,26 @@ def do_evaluate( mapping: Mapping[Expr, Column] | None = None, ) -> Column: """Evaluate this expression given a dataframe for context.""" + if self.name == pl_expr.StringFunction.Contains: + child, arg = self.children + column = child.evaluate(df, context=context, mapping=mapping) + + literal, _ = self.options + if literal: + pat = arg.evaluate(df, context=context, mapping=mapping) + pattern = ( + pat.obj_scalar + if pat.is_scalar and pat.obj.size() != column.obj.size() + else 
pat.obj + ) + return Column(plc.strings.find.contains(column.obj, pattern)) + else: + assert isinstance(arg, Literal) + prog = plc.strings.regex_program.RegexProgram.create( + arg.value.as_py(), + flags=plc.strings.regex_flags.RegexFlags.DEFAULT, + ) + return Column(plc.strings.contains.contains_re(column.obj, prog)) columns = [ child.evaluate(df, context=context, mapping=mapping) for child in self.children @@ -691,6 +726,22 @@ def do_evaluate( ) ) else: + columns = [ + child.evaluate(df, context=context, mapping=mapping) + for child in self.children + ] + if self.name == pl_expr.StringFunction.Lowercase: + (column,) = columns + return Column(plc.strings.case.to_lower(column.obj)) + elif self.name == pl_expr.StringFunction.Uppercase: + (column,) = columns + return Column(plc.strings.case.to_upper(column.obj)) + elif self.name == pl_expr.StringFunction.EndsWith: + column, suffix = columns + return Column(plc.strings.find.ends_with(column.obj, suffix.obj)) + elif self.name == pl_expr.StringFunction.StartsWith: + column, suffix = columns + return Column(plc.strings.find.starts_with(column.obj, suffix.obj)) raise NotImplementedError( f"StringFunction {self.name}" ) # pragma: no cover; handled by init raising diff --git a/python/cudf_polars/tests/test_string.py b/python/cudf_polars/tests/test_string.py new file mode 100644 index 00000000000..f1a080d040f --- /dev/null +++ b/python/cudf_polars/tests/test_string.py @@ -0,0 +1,61 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + +from functools import partial + +import pytest + +import polars as pl + +from cudf_polars.callback import execute_with_cudf +from cudf_polars.testing.asserts import assert_gpu_result_equal + + +@pytest.fixture +def ldf(): + return pl.DataFrame( + {"a": ["AbC", "de", "FGHI", "j", "kLm", "nOPq", None, "RsT", None, "uVw"]} + ).lazy() + + +@pytest.mark.parametrize( + "substr", + [ + "A", + "de", + ".*", + "^a", + "^A", + "[^a-z]", + "[a-z]{3,}", + "^[A-Z]{2,}", + "j|u", + ], +) +def test_contains_regex(ldf, substr): + query = ldf.select(pl.col("a").str.contains(substr)) + assert_gpu_result_equal(query) + + +@pytest.mark.parametrize( + "literal", ["A", "de", "FGHI", "j", "kLm", "nOPq", "RsT", "uVw"] +) +def test_contains_literal(ldf, literal): + query = ldf.select(pl.col("a").str.contains(pl.lit(literal), literal=True)) + assert_gpu_result_equal(query) + + +def test_contains_column(ldf): + query = ldf.select(pl.col("a").str.contains(pl.col("a"), literal=True)) + assert_gpu_result_equal(query) + + +@pytest.mark.parametrize("pat", ["["]) +def test_contains_invalid(ldf, pat): + query = ldf.select(pl.col("a").str.contains(pat)) + + with pytest.raises(pl.exceptions.ComputeError): + query.collect() + with pytest.raises(pl.exceptions.ComputeError): + query.collect(post_opt_callback=partial(execute_with_cudf, raise_on_fail=True)) From f651f12471edda51bf4c4071d74ff6720bd037fc Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Thu, 13 Jun 2024 16:05:44 +0100 Subject: [PATCH 05/25] Port start of datetime.hpp to pylibcudf (#15916) Start exposing datetime extraction functions. 
Authors: - Lawrence Mitchell (https://github.com/wence-) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/15916 --- .../api_docs/pylibcudf/datetime.rst | 6 ++++ .../user_guide/api_docs/pylibcudf/index.rst | 1 + .../cudf/cudf/_lib/pylibcudf/CMakeLists.txt | 1 + python/cudf/cudf/_lib/pylibcudf/__init__.pxd | 4 ++- python/cudf/cudf/_lib/pylibcudf/__init__.py | 4 ++- python/cudf/cudf/_lib/pylibcudf/datetime.pxd | 8 +++++ python/cudf/cudf/_lib/pylibcudf/datetime.pyx | 33 +++++++++++++++++++ .../_lib/pylibcudf/libcudf/CMakeLists.txt | 2 +- python/cudf/cudf/pylibcudf_tests/conftest.py | 5 +++ .../cudf/pylibcudf_tests/test_datetime.py | 30 +++++++++++++++++ .../cudf/cudf/pylibcudf_tests/test_round.py | 9 ++--- 11 files changed, 93 insertions(+), 10 deletions(-) create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/datetime.rst create mode 100644 python/cudf/cudf/_lib/pylibcudf/datetime.pxd create mode 100644 python/cudf/cudf/_lib/pylibcudf/datetime.pyx create mode 100644 python/cudf/cudf/pylibcudf_tests/test_datetime.py diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/datetime.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/datetime.rst new file mode 100644 index 00000000000..ebf5fab3052 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/datetime.rst @@ -0,0 +1,6 @@ +======== +datetime +======== + +.. automodule:: cudf._lib.pylibcudf.datetime + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst index 1e03fa80bb5..f98298ff052 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst @@ -14,6 +14,7 @@ This page provides API documentation for pylibcudf. 
column_factories concatenate copying + datetime filling gpumemoryview groupby diff --git a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt index ed396208f98..0a198f431a7 100644 --- a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt +++ b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt @@ -19,6 +19,7 @@ set(cython_sources column_factories.pyx concatenate.pyx copying.pyx + datetime.pyx filling.pyx gpumemoryview.pyx groupby.pyx diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd index a628ecdb038..5131df9a5cd 100644 --- a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd @@ -7,6 +7,7 @@ from . cimport ( column_factories, concatenate, copying, + datetime, filling, groupby, join, @@ -40,9 +41,10 @@ __all__ = [ "Table", "aggregation", "binaryop", + "column_factories", "concatenate", "copying", - "column_factories", + "datetime", "filling", "gpumemoryview", "groupby", diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.py b/python/cudf/cudf/_lib/pylibcudf/__init__.py index 46d0fe13cd1..43a9e2aca31 100644 --- a/python/cudf/cudf/_lib/pylibcudf/__init__.py +++ b/python/cudf/cudf/_lib/pylibcudf/__init__.py @@ -6,6 +6,7 @@ column_factories, concatenate, copying, + datetime, filling, groupby, interop, @@ -39,9 +40,10 @@ "TypeId", "aggregation", "binaryop", + "column_factories", "concatenate", "copying", - "column_factories", + "datetime", "filling", "gpumemoryview", "groupby", diff --git a/python/cudf/cudf/_lib/pylibcudf/datetime.pxd b/python/cudf/cudf/_lib/pylibcudf/datetime.pxd new file mode 100644 index 00000000000..2fce48cf1b4 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/datetime.pxd @@ -0,0 +1,8 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +from .column cimport Column + + +cpdef Column extract_year( + Column col +) diff --git a/python/cudf/cudf/_lib/pylibcudf/datetime.pyx b/python/cudf/cudf/_lib/pylibcudf/datetime.pyx new file mode 100644 index 00000000000..82351327de6 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/datetime.pyx @@ -0,0 +1,33 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move + +from cudf._lib.pylibcudf.libcudf.column.column cimport column +from cudf._lib.pylibcudf.libcudf.datetime cimport ( + extract_year as cpp_extract_year, +) + +from .column cimport Column + + +cpdef Column extract_year( + Column values +): + """ + Extract the year from a datetime column. + + Parameters + ---------- + values : Column + The column to extract the year from. + + Returns + ------- + Column + Column with the extracted years. + """ + cdef unique_ptr[column] result + + with nogil: + result = move(cpp_extract_year(values.view())) + return Column.from_libcudf(move(result)) diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/libcudf/CMakeLists.txt index ac56d42dda8..6c66d01ca57 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/CMakeLists.txt +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/CMakeLists.txt @@ -12,7 +12,7 @@ # the License. 
# ============================================================================= -set(cython_sources aggregation.pyx binaryop.pyx copying.pyx replace.pyx reduce.pxd round.pyx +set(cython_sources aggregation.pyx binaryop.pyx copying.pyx reduce.pyx replace.pyx round.pyx stream_compaction.pyx types.pyx unary.pyx ) diff --git a/python/cudf/cudf/pylibcudf_tests/conftest.py b/python/cudf/cudf/pylibcudf_tests/conftest.py index f3c6584ef8c..b169bbdee5b 100644 --- a/python/cudf/cudf/pylibcudf_tests/conftest.py +++ b/python/cudf/cudf/pylibcudf_tests/conftest.py @@ -58,3 +58,8 @@ def interp_opt(request): ) def sorted_opt(request): return request.param + + +@pytest.fixture(scope="session", params=[False, True]) +def has_nulls(request): + return request.param diff --git a/python/cudf/cudf/pylibcudf_tests/test_datetime.py b/python/cudf/cudf/pylibcudf_tests/test_datetime.py new file mode 100644 index 00000000000..75af0fa6ca1 --- /dev/null +++ b/python/cudf/cudf/pylibcudf_tests/test_datetime.py @@ -0,0 +1,30 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +import datetime + +import pyarrow as pa +import pytest +from utils import assert_column_eq + +import cudf._lib.pylibcudf as plc + + +@pytest.fixture +def column(has_nulls): + values = [ + datetime.date(1999, 1, 1), + datetime.date(2024, 10, 12), + datetime.date(1, 1, 1), + datetime.date(9999, 1, 1), + ] + if has_nulls: + values[2] = None + return plc.interop.from_arrow(pa.array(values, type=pa.date32())) + + +def test_extract_year(column): + got = plc.datetime.extract_year(column) + # libcudf produces an int16, arrow produces an int64 + expect = pa.compute.year(plc.interop.to_arrow(column)).cast(pa.int16()) + + assert_column_eq(expect, got) diff --git a/python/cudf/cudf/pylibcudf_tests/test_round.py b/python/cudf/cudf/pylibcudf_tests/test_round.py index a234860477f..991e6ed310d 100644 --- a/python/cudf/cudf/pylibcudf_tests/test_round.py +++ b/python/cudf/cudf/pylibcudf_tests/test_round.py @@ -7,16 +7,11 @@ import cudf._lib.pylibcudf as plc -@pytest.fixture(params=[False, True]) -def nullable(request): - return request.param - - @pytest.fixture(params=["float32", "float64"]) -def column(request, nullable): +def column(request, has_nulls): values = [2.5, 2.49, 1.6, 8, -1.5, -1.7, -0.5, 0.5] typ = {"float32": pa.float32(), "float64": pa.float64()}[request.param] - if nullable: + if has_nulls: values[2] = None return plc.interop.from_arrow(pa.array(values, type=typ)) From cb564da1204f0da7eaeb8a0e636a0f23c97c314f Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 13 Jun 2024 05:11:37 -1000 Subject: [PATCH 06/25] Move some misc Frame methods to appropriate locations (#15963) * Move `Frame._is_sorted` to `MultiIndex._is_sorted` (the only class that uses this method) * Move `_apply_inverse_column` helper function to define `Column.__invert__` Authors: - Matthew Roeschke (https://github.com/mroeschke) - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Lawrence Mitchell (https://github.com/wence-) URL: 
https://github.com/rapidsai/cudf/pull/15963 --- python/cudf/cudf/core/column/column.py | 5 ++ python/cudf/cudf/core/column/numerical.py | 8 +++ python/cudf/cudf/core/frame.py | 61 +---------------------- python/cudf/cudf/core/multiindex.py | 49 +++++++++++++++++- 4 files changed, 62 insertions(+), 61 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 001e8996c19..75fc31ddbce 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -1118,6 +1118,11 @@ def __cuda_array_interface__(self) -> abc.Mapping[str, Any]: def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): return _array_ufunc(self, ufunc, method, inputs, kwargs) + def __invert__(self): + raise TypeError( + f"Operation `~` not supported on {self.dtype.type.__name__}" + ) + def searchsorted( self, value, diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index 6fb4f17b76d..1952d7eeb71 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -194,6 +194,14 @@ def unary_operator(self, unaryop: Union[str, Callable]) -> ColumnBase: unaryop = pylibcudf.unary.UnaryOperator[unaryop] return libcudf.unary.unary_operation(self, unaryop) + def __invert__(self): + if self.dtype.kind in "ui": + return self.unary_operator("invert") + elif self.dtype.kind == "b": + return self.unary_operator("not") + else: + return super().__invert__() + def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: int_float_dtype_mapping = { np.int8: np.float32, diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index af8886a44a6..01b56f1edc4 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -32,7 +32,7 @@ import cudf from cudf import _lib as libcudf from cudf._typing import Dtype -from cudf.api.types import is_bool_dtype, is_dtype_equal, is_scalar +from 
cudf.api.types import is_dtype_equal, is_scalar from cudf.core.buffer import acquire_spill_lock from cudf.core.column import ( ColumnBase, @@ -1455,51 +1455,6 @@ def _get_sorted_inds( stable=True, ) - @_cudf_nvtx_annotate - def _is_sorted(self, ascending=None, null_position=None): - """ - Returns a boolean indicating whether the data of the Frame are sorted - based on the parameters given. Does not account for the index. - - Parameters - ---------- - self : Frame - Frame whose columns are to be checked for sort order - ascending : None or list-like of booleans - None or list-like of boolean values indicating expected sort order - of each column. If list-like, size of list-like must be - len(columns). If None, all columns expected sort order is set to - ascending. False (0) - ascending, True (1) - descending. - null_position : None or list-like of booleans - None or list-like of boolean values indicating desired order of - nulls compared to other elements. If list-like, size of list-like - must be len(columns). If None, null order is set to before. False - (0) - before, True (1) - after. - - Returns - ------- - returns : boolean - Returns True, if sorted as expected by ``ascending`` and - ``null_position``, False otherwise. - """ - if ascending is not None and not cudf.api.types.is_list_like( - ascending - ): - raise TypeError( - f"Expected a list-like or None for `ascending`, got " - f"{type(ascending)}" - ) - if null_position is not None and not cudf.api.types.is_list_like( - null_position - ): - raise TypeError( - f"Expected a list-like or None for `null_position`, got " - f"{type(null_position)}" - ) - return libcudf.sort.is_sorted( - [*self._columns], ascending=ascending, null_position=null_position - ) - @_cudf_nvtx_annotate def _split(self, splits): """Split a frame with split points in ``splits``. 
Returns a list of @@ -1920,7 +1875,7 @@ def __invert__(self): """Bitwise invert (~) for integral dtypes, logical NOT for bools.""" return self._from_data_like_self( self._data._from_columns_like_self( - (_apply_inverse_column(col) for col in self._data.columns) + (~col for col in self._data.columns) ) ) @@ -1970,15 +1925,3 @@ def __dask_tokenize__(self): str(dict(self._dtypes)), normalize_token(self.to_pandas()), ] - - -def _apply_inverse_column(col: ColumnBase) -> ColumnBase: - """Bitwise invert (~) for integral dtypes, logical NOT for bools.""" - if np.issubdtype(col.dtype, np.integer): - return col.unary_operator("invert") - elif is_bool_dtype(col.dtype): - return col.unary_operator("not") - else: - raise TypeError( - f"Operation `~` not supported on {col.dtype.type.__name__}" - ) diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 11b4b9154a2..6d3520e33cf 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -1636,9 +1636,54 @@ def is_unique(self): def dtype(self): return np.dtype("O") + @_cudf_nvtx_annotate + def _is_sorted(self, ascending=None, null_position=None) -> bool: + """ + Returns a boolean indicating whether the data of the MultiIndex are sorted + based on the parameters given. Does not account for the index. + + Parameters + ---------- + self : MultiIndex + MultiIndex whose columns are to be checked for sort order + ascending : None or list-like of booleans + None or list-like of boolean values indicating expected sort order + of each column. If list-like, size of list-like must be + len(columns). If None, all columns expected sort order is set to + ascending. False (0) - ascending, True (1) - descending. + null_position : None or list-like of booleans + None or list-like of boolean values indicating desired order of + nulls compared to other elements. If list-like, size of list-like + must be len(columns). If None, null order is set to before. 
False + (0) - before, True (1) - after. + + Returns + ------- + returns : boolean + Returns True, if sorted as expected by ``ascending`` and + ``null_position``, False otherwise. + """ + if ascending is not None and not cudf.api.types.is_list_like( + ascending + ): + raise TypeError( + f"Expected a list-like or None for `ascending`, got " + f"{type(ascending)}" + ) + if null_position is not None and not cudf.api.types.is_list_like( + null_position + ): + raise TypeError( + f"Expected a list-like or None for `null_position`, got " + f"{type(null_position)}" + ) + return libcudf.sort.is_sorted( + [*self._columns], ascending=ascending, null_position=null_position + ) + @cached_property # type: ignore @_cudf_nvtx_annotate - def is_monotonic_increasing(self): + def is_monotonic_increasing(self) -> bool: """ Return if the index is monotonic increasing (only equal or increasing) values. @@ -1647,7 +1692,7 @@ def is_monotonic_increasing(self): @cached_property # type: ignore @_cudf_nvtx_annotate - def is_monotonic_decreasing(self): + def is_monotonic_decreasing(self) -> bool: """ Return if the index is monotonic decreasing (only equal or decreasing) values. From 3cb3df3255efaec4a5ebb6cb7606067f753e3554 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Thu, 13 Jun 2024 11:54:55 -0500 Subject: [PATCH 07/25] Add ability to enable rmm pool on `cudf.pandas` import (#15628) This PR enables allocating of rmm memory pool on `cudf.pandas` import using the following environment variables: ``` export CUDF_PANDAS_RMM_MODE="pool" ``` Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Mark Harris (https://github.com/harrism) - Mads R. B. 
Kristensen (https://github.com/madsbk) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/15628 --- python/cudf/cudf/pandas/__init__.py | 43 +++++++++++++++++++ .../cudf_pandas_tests/test_cudf_pandas.py | 28 ++++++++++++ 2 files changed, 71 insertions(+) diff --git a/python/cudf/cudf/pandas/__init__.py b/python/cudf/cudf/pandas/__init__.py index 5b3785531d3..59a88f85dda 100644 --- a/python/cudf/cudf/pandas/__init__.py +++ b/python/cudf/cudf/pandas/__init__.py @@ -2,6 +2,9 @@ # All rights reserved. # SPDX-License-Identifier: Apache-2.0 + +import warnings + from .fast_slow_proxy import is_proxy_object from .magics import load_ipython_extension from .profiler import Profiler @@ -19,6 +22,46 @@ def install(): loader = ModuleAccelerator.install("pandas", "cudf", "pandas") global LOADED LOADED = loader is not None + import os + + if (rmm_mode := os.getenv("CUDF_PANDAS_RMM_MODE", None)) is not None: + import rmm.mr + from rmm.mr import available_device_memory + + # Check if a non-default memory resource is set + current_mr = rmm.mr.get_current_device_resource() + if not isinstance(current_mr, rmm.mr.CudaMemoryResource): + warnings.warn( + f"cudf.pandas detected an already configured memory resource, ignoring 'CUDF_PANDAS_RMM_MODE'={str(rmm_mode)}", + UserWarning, + ) + free_memory, _ = available_device_memory() + free_memory = int(round(float(free_memory) * 0.80 / 256) * 256) + + if rmm_mode == "cuda": + mr = rmm.mr.CudaMemoryResource() + rmm.mr.set_current_device_resource(mr) + elif rmm_mode == "pool": + rmm.mr.set_current_device_resource( + rmm.mr.PoolMemoryResource( + rmm.mr.get_current_device_resource(), + initial_pool_size=free_memory, + ) + ) + elif rmm_mode == "async": + mr = rmm.mr.CudaAsyncMemoryResource(initial_pool_size=free_memory) + rmm.mr.set_current_device_resource(mr) + elif rmm_mode == "managed": + mr = rmm.mr.ManagedMemoryResource() + rmm.mr.set_current_device_resource(mr) + elif rmm_mode == "managed_pool": + 
rmm.reinitialize( + managed_memory=True, + pool_allocator=True, + initial_pool_size=free_memory, + ) + else: + raise TypeError(f"Unsupported rmm mode: {rmm_mode}") def pytest_load_initial_conftests(early_config, parser, args): diff --git a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py index 515a4714a5a..c251e4a197e 100644 --- a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py +++ b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py @@ -9,6 +9,7 @@ import os import pathlib import pickle +import subprocess import tempfile import types from io import BytesIO, StringIO @@ -1425,6 +1426,33 @@ def test_holidays_within_dates(holiday, start, expected): ) == [utc.localize(dt) for dt in expected] +@pytest.mark.parametrize( + "env_value", + ["", "cuda", "pool", "async", "managed", "managed_pool", "abc"], +) +def test_rmm_option_on_import(env_value): + data_directory = os.path.dirname(os.path.abspath(__file__)) + # Create a copy of the current environment variables + env = os.environ.copy() + env["CUDF_PANDAS_RMM_MODE"] = env_value + + sp_completed = subprocess.run( + [ + "python", + "-m", + "cudf.pandas", + data_directory + "/data/profile_basic.py", + ], + capture_output=True, + text=True, + env=env, + ) + if env_value in {"cuda", "pool", "async", "managed", "managed_pool"}: + assert sp_completed.returncode == 0 + else: + assert sp_completed.returncode == 1 + + def test_cudf_pandas_debugging_different_results(monkeypatch): cudf_mean = cudf.Series.mean From 3f8f2149129f97947223611e2709d235e889389b Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Thu, 13 Jun 2024 17:04:45 -0500 Subject: [PATCH 08/25] Refactor rmm usage in `cudf.pandas` (#16021) This PR addresses review comments made by @bdice here: https://github.com/rapidsai/cudf/pull/15628#pullrequestreview-2116067037 Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Bradley Dice (https://github.com/bdice) URL: 
https://github.com/rapidsai/cudf/pull/16021 --- python/cudf/cudf/pandas/__init__.py | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/python/cudf/cudf/pandas/__init__.py b/python/cudf/cudf/pandas/__init__.py index 59a88f85dda..ff445a63f74 100644 --- a/python/cudf/cudf/pandas/__init__.py +++ b/python/cudf/cudf/pandas/__init__.py @@ -2,9 +2,11 @@ # All rights reserved. # SPDX-License-Identifier: Apache-2.0 - +import os import warnings +import rmm.mr + from .fast_slow_proxy import is_proxy_object from .magics import load_ipython_extension from .profiler import Profiler @@ -22,12 +24,8 @@ def install(): loader = ModuleAccelerator.install("pandas", "cudf", "pandas") global LOADED LOADED = loader is not None - import os if (rmm_mode := os.getenv("CUDF_PANDAS_RMM_MODE", None)) is not None: - import rmm.mr - from rmm.mr import available_device_memory - # Check if a non-default memory resource is set current_mr = rmm.mr.get_current_device_resource() if not isinstance(current_mr, rmm.mr.CudaMemoryResource): @@ -35,7 +33,7 @@ def install(): f"cudf.pandas detected an already configured memory resource, ignoring 'CUDF_PANDAS_RMM_MODE'={str(rmm_mode)}", UserWarning, ) - free_memory, _ = available_device_memory() + free_memory, _ = rmm.mr.available_device_memory() free_memory = int(round(float(free_memory) * 0.80 / 256) * 256) if rmm_mode == "cuda": @@ -55,13 +53,13 @@ def install(): mr = rmm.mr.ManagedMemoryResource() rmm.mr.set_current_device_resource(mr) elif rmm_mode == "managed_pool": - rmm.reinitialize( - managed_memory=True, - pool_allocator=True, + mr = rmm.mr.PoolMemoryResource( + rmm.mr.ManagedMemoryResource(), initial_pool_size=free_memory, ) + rmm.mr.set_current_device_resource(mr) else: - raise TypeError(f"Unsupported rmm mode: {rmm_mode}") + raise ValueError(f"Unsupported rmm mode: {rmm_mode}") def pytest_load_initial_conftests(early_config, parser, args): From 31d909b0af9bcf9cf804ca1c3893ea71fbd5d765 Mon Sep 17 00:00:00 2001 
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 13 Jun 2024 13:27:05 -1000 Subject: [PATCH 09/25] Support IntervalDtype in cudf.from_pandas (#16014) Noticed while running the pandas test suite against `cudf.pandas` Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Bradley Dice (https://github.com/bdice) - Lawrence Mitchell (https://github.com/wence-) - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/16014 --- python/cudf/cudf/core/dataframe.py | 6 +++--- python/cudf/cudf/tests/test_interval.py | 7 +++++++ 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index e1b6cc45dd3..7438b0237d5 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -8072,11 +8072,11 @@ def from_pandas(obj, nan_as_null=no_default): return cudf.Index.from_pandas(obj, nan_as_null=nan_as_null) elif isinstance(obj, pd.CategoricalDtype): return cudf.CategoricalDtype.from_pandas(obj) + elif isinstance(obj, pd.IntervalDtype): + return cudf.IntervalDtype.from_pandas(obj) else: raise TypeError( - "from_pandas only accepts Pandas Dataframes, Series, " - "Index, RangeIndex and MultiIndex objects. 
" - "Got %s" % type(obj) + f"from_pandas unsupported for object of type {type(obj).__name__}" ) diff --git a/python/cudf/cudf/tests/test_interval.py b/python/cudf/cudf/tests/test_interval.py index 7b923af1f75..013f4439ad5 100644 --- a/python/cudf/cudf/tests/test_interval.py +++ b/python/cudf/cudf/tests/test_interval.py @@ -181,3 +181,10 @@ def test_interval_with_datetime(tz, box): else: with pytest.raises(NotImplementedError): cudf.from_pandas(pobj) + + +def test_from_pandas_intervaldtype(): + dtype = pd.IntervalDtype("int64", closed="left") + result = cudf.from_pandas(dtype) + expected = cudf.IntervalDtype("int64", closed="left") + assert_eq(result, expected) From 987879ca4bdcae0d959266fd39196123007fa45e Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Thu, 13 Jun 2024 19:27:11 -0700 Subject: [PATCH 10/25] Fix the pool size alignment issue (#16024) This PR fixes a pool size alignment bug. Authors: - Yunsong Wang (https://github.com/PointKernel) Approvers: - Mark Harris (https://github.com/harrism) - Vukasin Milovanovic (https://github.com/vuule) - David Wendt (https://github.com/davidwendt) URL: https://github.com/rapidsai/cudf/pull/16024 --- cpp/src/utilities/pinned_memory.cpp | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/cpp/src/utilities/pinned_memory.cpp b/cpp/src/utilities/pinned_memory.cpp index 5d2e3ac332a..e90b7969b4d 100644 --- a/cpp/src/utilities/pinned_memory.cpp +++ b/cpp/src/utilities/pinned_memory.cpp @@ -43,9 +43,11 @@ class fixed_pinned_pool_memory_resource { public: fixed_pinned_pool_memory_resource(size_t size) - : pool_size_{size}, pool_{new host_pooled_mr(upstream_mr_, size, size)} + : // rmm requires the pool size to be a multiple of 256 bytes + pool_size_{rmm::align_up(size, rmm::CUDA_ALLOCATION_ALIGNMENT)}, + pool_{new host_pooled_mr(upstream_mr_, pool_size_, pool_size_)} { - if (pool_size_ == 0) { return; } + CUDF_LOG_INFO("Pinned pool size = {}", pool_size_); // Allocate full size from the pinned pool to 
figure out the beginning and end address pool_begin_ = pool_->allocate_async(pool_size_, stream_); @@ -145,12 +147,8 @@ CUDF_EXPORT rmm::host_device_async_resource_ref& make_default_pinned_mr( return std::min(total / 200, size_t{100} * 1024 * 1024); }(); - // rmm requires the pool size to be a multiple of 256 bytes - auto const aligned_size = rmm::align_up(size, rmm::RMM_DEFAULT_HOST_ALIGNMENT); - CUDF_LOG_INFO("Pinned pool size = {}", aligned_size); - // make the pool with max size equal to the initial size - return fixed_pinned_pool_memory_resource{aligned_size}; + return fixed_pinned_pool_memory_resource{size}; }(); static rmm::host_device_async_resource_ref mr_ref{mr}; From 829b3a959cc5f0d41fe51dca9a4335dba0da69a5 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Thu, 13 Jun 2024 20:40:56 -0700 Subject: [PATCH 11/25] Fix the int32 overflow when computing page fragment sizes for large string columns (#16028) This PR fixes the possible `int32` overflow when computing page fragment sizes for large (2B+ char) string columns. Authors: - Muhammad Haseeb (https://github.com/mhaseeb123) Approvers: - Vukasin Milovanovic (https://github.com/vuule) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/16028 --- cpp/src/io/parquet/writer_impl.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 6d466748c17..ca15b532d07 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -1763,10 +1763,10 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, // for multiple fragments per page to smooth things out. using 2 was too // unbalanced in final page sizes, so using 4 which seems to be a good // compromise at smoothing things out without getting fragment sizes too small. 
- auto frag_size_fn = [&](auto const& col, size_type col_size) { + auto frag_size_fn = [&](auto const& col, size_t col_size) { int const target_frags_per_page = is_col_fixed_width(col) ? 1 : 4; auto const avg_len = - target_frags_per_page * util::div_rounding_up_safe(col_size, input.num_rows()); + target_frags_per_page * util::div_rounding_up_safe(col_size, input.num_rows()); if (avg_len > 0) { auto const frag_size = util::div_rounding_up_safe(max_page_size_bytes, avg_len); return std::min(max_page_fragment_size, frag_size); From 34227d3cb687d465f1d4a5f12cbb37a47b97866e Mon Sep 17 00:00:00 2001 From: Zach Puller Date: Thu, 13 Jun 2024 23:45:35 -0700 Subject: [PATCH 12/25] orc multithreaded benchmark (#16009) Addresses: https://github.com/rapidsai/cudf/issues/15973 Adds multithreaded benchmarks for the ORC reader. Based off of the parquet equivalent in https://github.com/rapidsai/cudf/pull/15585 ``` # Benchmark Results ## orc_multithreaded_read_decode_mixed ### [0] NVIDIA RTX 5880 Ada Generation | cardinality | total_data_size | num_threads | num_cols | run_length | Samples | CPU Time | Noise | GPU Time | Noise | bytes_per_second | peak_memory_usage | encoded_file_size | |-------------|-----------------|-------------|----------|------------|---------|-----------|-------|-----------|-------|------------------|-------------------|-------------------| | 1000 | 536870912 | 1 | 4 | 8 | 338x | 44.348 ms | 1.18% | 44.343 ms | 1.18% | 12107185968 | 939.341 MiB | 39.557 MiB | | 1000 | 1073741824 | 1 | 4 | 8 | 80x | 77.634 ms | 0.65% | 77.629 ms | 0.65% | 13831742649 | 1.834 GiB | 79.072 MiB | | 1000 | 536870912 | 2 | 4 | 8 | 341x | 43.921 ms | 1.20% | 43.916 ms | 1.20% | 12224889363 | 825.333 MiB | 39.568 MiB | | 1000 | 1073741824 | 2 | 4 | 8 | 80x | 75.418 ms | 0.70% | 75.414 ms | 0.70% | 14237999015 | 1.611 GiB | 79.113 MiB | | 1000 | 536870912 | 4 | 4 | 8 | 80x | 42.682 ms | 1.18% | 42.678 ms | 1.18% | 12579566132 | 883.436 MiB | 39.587 MiB | | 1000 | 1073741824 | 4 | 4 | 
8 | 9x | 74.056 ms | 0.48% | 74.052 ms | 0.48% | 14499873867 | 1.724 GiB | 79.136 MiB | | 1000 | 536870912 | 8 | 4 | 8 | 25x | 42.198 ms | 0.50% | 42.194 ms | 0.49% | 12723960975 | 940.562 MiB | 39.600 MiB | | 1000 | 1073741824 | 8 | 4 | 8 | 8x | 73.933 ms | 0.49% | 73.929 ms | 0.49% | 14524042443 | 1.781 GiB | 79.175 MiB | ## orc_multithreaded_read_decode_fixed_width ### [0] NVIDIA RTX 5880 Ada Generation | cardinality | total_data_size | num_threads | num_cols | run_length | Samples | CPU Time | Noise | GPU Time | Noise | bytes_per_second | peak_memory_usage | encoded_file_size | |-------------|-----------------|-------------|----------|------------|---------|-----------|-------|-----------|-------|------------------|-------------------|-------------------| | 1000 | 536870912 | 1 | 4 | 8 | 13x | 40.149 ms | 0.04% | 40.144 ms | 0.04% | 13373482726 | 643.390 MiB | 59.821 MiB | | 1000 | 1073741824 | 1 | 4 | 8 | 211x | 71.216 ms | 0.67% | 71.211 ms | 0.67% | 15078297784 | 1.257 GiB | 119.650 MiB | | 1000 | 536870912 | 2 | 4 | 8 | 378x | 39.662 ms | 1.31% | 39.658 ms | 1.31% | 13537590893 | 643.392 MiB | 59.833 MiB | | 1000 | 1073741824 | 2 | 4 | 8 | 209x | 71.693 ms | 0.71% | 71.688 ms | 0.71% | 14978085376 | 1.257 GiB | 119.642 MiB | | 1000 | 536870912 | 4 | 4 | 8 | 377x | 39.731 ms | 1.30% | 39.726 ms | 1.30% | 13514305239 | 643.394 MiB | 59.856 MiB | | 1000 | 1073741824 | 4 | 4 | 8 | 8x | 70.766 ms | 0.08% | 70.761 ms | 0.08% | 15174115364 | 1.030 GiB | 119.665 MiB | | 1000 | 536870912 | 8 | 4 | 8 | 379x | 39.486 ms | 1.27% | 39.482 ms | 1.27% | 13597888468 | 647.399 MiB | 59.928 MiB | | 1000 | 1073741824 | 8 | 4 | 8 | 207x | 72.686 ms | 2.04% | 72.681 ms | 2.04% | 14773317833 | 1.143 GiB | 119.711 MiB | ## orc_multithreaded_read_decode_string ### [0] NVIDIA RTX 5880 Ada Generation | cardinality | total_data_size | num_threads | num_cols | run_length | Samples | CPU Time | Noise | GPU Time | Noise | bytes_per_second | peak_memory_usage | encoded_file_size | 
|-------------|-----------------|-------------|----------|------------|---------|-----------|-------|-----------|-------|------------------|-------------------|-------------------| | 1000 | 536870912 | 1 | 4 | 8 | 80x | 22.933 ms | 2.13% | 22.928 ms | 2.13% | 23415352877 | 661.948 MiB | 10.879 MiB | | 1000 | 1073741824 | 1 | 4 | 8 | 160x | 34.167 ms | 1.41% | 34.162 ms | 1.41% | 31430436877 | 1.293 GiB | 21.757 MiB | | 1000 | 536870912 | 2 | 4 | 8 | 560x | 22.533 ms | 2.18% | 22.528 ms | 2.18% | 23830839172 | 609.407 MiB | 10.941 MiB | | 1000 | 1073741824 | 2 | 4 | 8 | 80x | 34.311 ms | 1.54% | 34.307 ms | 1.54% | 31298288990 | 1.188 GiB | 21.758 MiB | | 1000 | 536870912 | 4 | 4 | 8 | 23x | 22.179 ms | 0.11% | 22.175 ms | 0.11% | 24211151047 | 624.177 MiB | 10.947 MiB | | 1000 | 1073741824 | 4 | 4 | 8 | 15x | 33.793 ms | 0.08% | 33.789 ms | 0.08% | 31777989791 | 1.190 GiB | 21.881 MiB | | 1000 | 536870912 | 8 | 4 | 8 | 679x | 22.006 ms | 1.74% | 22.002 ms | 1.74% | 24401381631 | 624.524 MiB | 10.951 MiB | | 1000 | 1073741824 | 8 | 4 | 8 | 160x | 33.320 ms | 1.57% | 33.316 ms | 1.57% | 32229227026 | 1.207 GiB | 21.894 MiB | ## orc_multithreaded_read_decode_list ### [0] NVIDIA RTX 5880 Ada Generation | cardinality | total_data_size | num_threads | num_cols | run_length | Samples | CPU Time | Noise | GPU Time | Noise | bytes_per_second | peak_memory_usage | encoded_file_size | |-------------|-----------------|-------------|----------|------------|---------|------------|--------|------------|--------|------------------|-------------------|-------------------| | 1000 | 536870912 | 1 | 4 | 8 | 96x | 74.437 ms | 0.68% | 74.433 ms | 0.68% | 7212831148 | 600.751 MiB | 60.245 MiB | | 1000 | 1073741824 | 1 | 4 | 8 | 7x | 80.994 ms | 0.49% | 80.990 ms | 0.49% | 13257745936 | 1.173 GiB | 120.549 MiB | | 1000 | 536870912 | 2 | 4 | 8 | 80x | 79.234 ms | 4.57% | 79.229 ms | 4.57% | 6776190522 | 600.950 MiB | 60.250 MiB | | 1000 | 1073741824 | 2 | 4 | 8 | 166x | 90.437 ms | 17.19% 
| 90.432 ms | 17.19% | 11873413959 | 1.173 GiB | 120.489 MiB | | 1000 | 536870912 | 4 | 4 | 8 | 80x | 78.613 ms | 2.98% | 78.608 ms | 2.98% | 6829702014 | 602.764 MiB | 60.323 MiB | | 1000 | 1073741824 | 4 | 4 | 8 | 127x | 118.629 ms | 22.67% | 118.624 ms | 22.67% | 9051644873 | 1.174 GiB | 120.499 MiB | | 1000 | 536870912 | 8 | 4 | 8 | 112x | 133.950 ms | 4.45% | 133.945 ms | 4.45% | 4008135293 | 603.471 MiB | 60.353 MiB | | 1000 | 1073741824 | 8 | 4 | 8 | 90x | 167.850 ms | 15.93% | 167.844 ms | 15.93% | 6397248426 | 1.177 GiB | 120.646 MiB | ## orc_multithreaded_read_decode_chunked_mixed ### [0] NVIDIA RTX 5880 Ada Generation | cardinality | total_data_size | num_threads | num_cols | run_length | input_limit | output_limit | Samples | CPU Time | Noise | GPU Time | Noise | bytes_per_second | peak_memory_usage | encoded_file_size | |-------------|-----------------|-------------|----------|------------|-------------|--------------|---------|-----------|-------|-----------|-------|------------------|-------------------|-------------------| | 1000 | 536870912 | 1 | 4 | 8 | 671088640 | 671088640 | 333x | 45.009 ms | 1.10% | 45.005 ms | 1.10% | 11929261073 | 939.341 MiB | 39.557 MiB | | 1000 | 1073741824 | 1 | 4 | 8 | 671088640 | 671088640 | 96x | 81.524 ms | 0.61% | 81.519 ms | 0.61% | 13171640865 | 1.834 GiB | 79.072 MiB | | 1000 | 536870912 | 2 | 4 | 8 | 671088640 | 671088640 | 339x | 44.183 ms | 0.96% | 44.179 ms | 0.96% | 12152252271 | 825.333 MiB | 39.568 MiB | | 1000 | 1073741824 | 2 | 4 | 8 | 671088640 | 671088640 | 7x | 79.051 ms | 0.02% | 79.046 ms | 0.02% | 13583676002 | 1.611 GiB | 79.113 MiB | | 1000 | 536870912 | 4 | 4 | 8 | 671088640 | 671088640 | 12x | 43.276 ms | 0.09% | 43.272 ms | 0.09% | 12407024794 | 883.436 MiB | 39.587 MiB | | 1000 | 1073741824 | 4 | 4 | 8 | 671088640 | 671088640 | 19x | 78.019 ms | 0.49% | 78.014 ms | 0.49% | 13763433041 | 1.724 GiB | 79.136 MiB | | 1000 | 536870912 | 8 | 4 | 8 | 671088640 | 671088640 | 80x | 42.803 ms | 1.22% | 
42.799 ms | 1.22% | 12543864010 | 911.993 MiB | 39.600 MiB | | 1000 | 1073741824 | 8 | 4 | 8 | 671088640 | 671088640 | 193x | 77.856 ms | 0.59% | 77.852 ms | 0.59% | 13792063986 | 1.837 GiB | 79.175 MiB | ## orc_multithreaded_read_decode_chunked_fixed_width ### [0] NVIDIA RTX 5880 Ada Generation | cardinality | total_data_size | num_threads | num_cols | run_length | input_limit | output_limit | Samples | CPU Time | Noise | GPU Time | Noise | bytes_per_second | peak_memory_usage | encoded_file_size | |-------------|-----------------|-------------|----------|------------|-------------|--------------|---------|-----------|-------|-----------|-------|------------------|-------------------|-------------------| | 1000 | 536870912 | 1 | 4 | 8 | 671088640 | 671088640 | 112x | 40.497 ms | 1.23% | 40.493 ms | 1.23% | 13258480947 | 643.390 MiB | 59.821 MiB | | 1000 | 1073741824 | 1 | 4 | 8 | 671088640 | 671088640 | 7x | 75.440 ms | 0.09% | 75.435 ms | 0.09% | 14234033611 | 1.648 GiB | 119.651 MiB | | 1000 | 536870912 | 2 | 4 | 8 | 671088640 | 671088640 | 80x | 39.793 ms | 1.36% | 39.789 ms | 1.36% | 13493067216 | 643.392 MiB | 59.833 MiB | | 1000 | 1073741824 | 2 | 4 | 8 | 671088640 | 671088640 | 69x | 74.499 ms | 0.50% | 74.494 ms | 0.50% | 14413864845 | 1.336 GiB | 119.642 MiB | | 1000 | 536870912 | 4 | 4 | 8 | 671088640 | 671088640 | 381x | 39.273 ms | 1.11% | 39.269 ms | 1.11% | 13671742653 | 643.394 MiB | 59.856 MiB | | 1000 | 1073741824 | 4 | 4 | 8 | 671088640 | 671088640 | 204x | 73.755 ms | 0.60% | 73.751 ms | 0.60% | 14559012350 | 1.648 GiB | 119.665 MiB | | 1000 | 536870912 | 8 | 4 | 8 | 671088640 | 671088640 | 80x | 39.490 ms | 1.31% | 39.486 ms | 1.31% | 13596333864 | 631.980 MiB | 59.928 MiB | | 1000 | 1073741824 | 8 | 4 | 8 | 671088640 | 671088640 | 203x | 73.907 ms | 1.34% | 73.903 ms | 1.34% | 14529071322 | 1.454 GiB | 119.711 MiB | ## orc_multithreaded_read_decode_chunked_string ### [0] NVIDIA RTX 5880 Ada Generation | cardinality | total_data_size | 
num_threads | num_cols | run_length | input_limit | output_limit | Samples | CPU Time | Noise | GPU Time | Noise | bytes_per_second | peak_memory_usage | encoded_file_size | |-------------|-----------------|-------------|----------|------------|-------------|--------------|---------|-----------|-------|-----------|-------|------------------|-------------------|-------------------| | 1000 | 536870912 | 1 | 4 | 8 | 671088640 | 671088640 | 80x | 23.022 ms | 1.96% | 23.017 ms | 1.96% | 23324556592 | 661.948 MiB | 10.879 MiB | | 1000 | 1073741824 | 1 | 4 | 8 | 671088640 | 671088640 | 80x | 37.687 ms | 1.37% | 37.682 ms | 1.37% | 28494755419 | 1.659 GiB | 21.757 MiB | | 1000 | 536870912 | 2 | 4 | 8 | 671088640 | 671088640 | 80x | 22.703 ms | 2.30% | 22.699 ms | 2.30% | 23652118769 | 609.407 MiB | 10.941 MiB | | 1000 | 1073741824 | 2 | 4 | 8 | 671088640 | 671088640 | 80x | 37.581 ms | 1.42% | 37.577 ms | 1.42% | 28574723179 | 1.658 GiB | 21.758 MiB | | 1000 | 536870912 | 4 | 4 | 8 | 671088640 | 671088640 | 544x | 22.296 ms | 1.56% | 22.293 ms | 1.56% | 24082840350 | 631.319 MiB | 10.947 MiB | | 1000 | 1073741824 | 4 | 4 | 8 | 671088640 | 671088640 | 14x | 36.990 ms | 0.14% | 36.985 ms | 0.14% | 29031484389 | 1.554 GiB | 21.881 MiB | | 1000 | 536870912 | 8 | 4 | 8 | 671088640 | 671088640 | 676x | 22.114 ms | 1.22% | 22.110 ms | 1.22% | 24281965280 | 627.616 MiB | 10.951 MiB | | 1000 | 1073741824 | 8 | 4 | 8 | 671088640 | 671088640 | 80x | 37.409 ms | 1.40% | 37.405 ms | 1.40% | 28706077426 | 1.562 GiB | 21.894 MiB | ## orc_multithreaded_read_decode_chunked_list ### [0] NVIDIA RTX 5880 Ada Generation | cardinality | total_data_size | num_threads | num_cols | run_length | input_limit | output_limit | Samples | CPU Time | Noise | GPU Time | Noise | bytes_per_second | peak_memory_usage | encoded_file_size | 
|-------------|-----------------|-------------|----------|------------|-------------|--------------|---------|------------|--------|------------|--------|------------------|-------------------|-------------------| | 1000 | 536870912 | 1 | 4 | 8 | 671088640 | 671088640 | 80x | 74.780 ms | 0.67% | 74.776 ms | 0.67% | 7179747067 | 600.751 MiB | 60.245 MiB | | 1000 | 1073741824 | 1 | 4 | 8 | 671088640 | 671088640 | 175x | 86.040 ms | 0.56% | 86.035 ms | 0.56% | 12480222210 | 1.576 GiB | 120.549 MiB | | 1000 | 536870912 | 2 | 4 | 8 | 671088640 | 671088640 | 186x | 80.668 ms | 4.14% | 80.664 ms | 4.14% | 6655685080 | 600.951 MiB | 60.250 MiB | | 1000 | 1073741824 | 2 | 4 | 8 | 671088640 | 671088640 | 143x | 105.217 ms | 21.56% | 105.212 ms | 21.56% | 10205531345 | 1.576 GiB | 120.489 MiB | | 1000 | 536870912 | 4 | 4 | 8 | 671088640 | 671088640 | 128x | 80.087 ms | 3.05% | 80.082 ms | 3.05% | 6704042147 | 602.764 MiB | 60.323 MiB | | 1000 | 1073741824 | 4 | 4 | 8 | 671088640 | 671088640 | 135x | 111.556 ms | 21.88% | 111.551 ms | 21.88% | 9625546746 | 1.489 GiB | 120.499 MiB | | 1000 | 536870912 | 8 | 4 | 8 | 671088640 | 671088640 | 112x | 134.677 ms | 4.14% | 134.672 ms | 4.14% | 3986513604 | 603.471 MiB | 60.353 MiB | | 1000 | 1073741824 | 8 | 4 | 8 | 671088640 | 671088640 | 80x | 178.735 ms | 14.17% | 178.730 ms | 14.17% | 6007630497 | 1.520 GiB | 120.646 MiB | ``` Authors: - Zach Puller (https://github.com/zpuller) - Vukasin Milovanovic (https://github.com/vuule) - MithunR (https://github.com/mythrocks) Approvers: - Yunsong Wang (https://github.com/PointKernel) - MithunR (https://github.com/mythrocks) URL: https://github.com/rapidsai/cudf/pull/16009 --- cpp/benchmarks/CMakeLists.txt | 5 + .../io/orc/orc_reader_multithreaded.cpp | 335 ++++++++++++++++++ 2 files changed, 340 insertions(+) create mode 100644 cpp/benchmarks/io/orc/orc_reader_multithreaded.cpp diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 49504e53424..8a48126e195 100644 
--- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -267,6 +267,11 @@ ConfigureNVBench(PARQUET_MULTITHREAD_READER_NVBENCH io/parquet/parquet_reader_mu # * orc reader benchmark -------------------------------------------------------------------------- ConfigureNVBench(ORC_READER_NVBENCH io/orc/orc_reader_input.cpp io/orc/orc_reader_options.cpp) +# ################################################################################################## +# * orc multithreaded benchmark +# -------------------------------------------------------------------------- +ConfigureNVBench(ORC_MULTITHREADED_NVBENCH io/orc/orc_reader_multithreaded.cpp) + # ################################################################################################## # * csv reader benchmark -------------------------------------------------------------------------- ConfigureNVBench(CSV_READER_NVBENCH io/csv/csv_reader_input.cpp io/csv/csv_reader_options.cpp) diff --git a/cpp/benchmarks/io/orc/orc_reader_multithreaded.cpp b/cpp/benchmarks/io/orc/orc_reader_multithreaded.cpp new file mode 100644 index 00000000000..ffbbc6f8464 --- /dev/null +++ b/cpp/benchmarks/io/orc/orc_reader_multithreaded.cpp @@ -0,0 +1,335 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include + +#include + +size_t get_num_read_threads(nvbench::state const& state) { return state.get_int64("num_threads"); } + +size_t get_read_size(nvbench::state const& state) +{ + auto const num_reads = get_num_read_threads(state); + return state.get_int64("total_data_size") / num_reads; +} + +std::string get_label(std::string const& test_name, nvbench::state const& state) +{ + auto const num_cols = state.get_int64("num_cols"); + size_t const read_size_mb = get_read_size(state) / (1024 * 1024); + return {test_name + ", " + std::to_string(num_cols) + " columns, " + + std::to_string(get_num_read_threads(state)) + " threads " + " (" + + std::to_string(read_size_mb) + " MB each)"}; +} + +std::tuple, size_t, size_t> write_file_data( + nvbench::state& state, std::vector const& d_types) +{ + auto const cardinality = state.get_int64("cardinality"); + auto const run_length = state.get_int64("run_length"); + auto const num_cols = state.get_int64("num_cols"); + size_t const num_files = get_num_read_threads(state); + size_t const per_file_data_size = get_read_size(state); + + std::vector source_sink_vector; + + size_t total_file_size = 0; + + for (size_t i = 0; i < num_files; ++i) { + cuio_source_sink_pair source_sink{io_type::HOST_BUFFER}; + + auto const tbl = create_random_table( + cycle_dtypes(d_types, num_cols), + table_size_bytes{per_file_data_size}, + data_profile_builder().cardinality(cardinality).avg_run_length(run_length)); + auto const view = tbl->view(); + + cudf::io::orc_writer_options const write_opts = + cudf::io::orc_writer_options::builder(source_sink.make_sink_info(), view) + .compression(cudf::io::compression_type::SNAPPY); + + cudf::io::write_orc(write_opts); + total_file_size += source_sink.size(); + + source_sink_vector.push_back(std::move(source_sink)); + } + + return {std::move(source_sink_vector), total_file_size, num_files}; +} + +void 
BM_orc_multithreaded_read_common(nvbench::state& state, + std::vector const& d_types, + std::string const& label) +{ + auto const data_size = state.get_int64("total_data_size"); + auto const num_threads = state.get_int64("num_threads"); + + auto streams = cudf::detail::fork_streams(cudf::get_default_stream(), num_threads); + cudf::detail::thread_pool threads(num_threads); + + auto [source_sink_vector, total_file_size, num_files] = write_file_data(state, d_types); + std::vector source_info_vector; + std::transform(source_sink_vector.begin(), + source_sink_vector.end(), + std::back_inserter(source_info_vector), + [](auto& source_sink) { return source_sink.make_source_info(); }); + + auto mem_stats_logger = cudf::memory_stats_logger(); + + { + cudf::scoped_range range{("(read) " + label).c_str()}; + state.exec(nvbench::exec_tag::sync | nvbench::exec_tag::timer, + [&](nvbench::launch& launch, auto& timer) { + auto read_func = [&](int index) { + auto const stream = streams[index % num_threads]; + cudf::io::orc_reader_options read_opts = + cudf::io::orc_reader_options::builder(source_info_vector[index]); + cudf::io::read_orc(read_opts, stream, rmm::mr::get_current_device_resource()); + }; + + threads.paused = true; + for (size_t i = 0; i < num_files; ++i) { + threads.submit(read_func, i); + } + timer.start(); + threads.paused = false; + threads.wait_for_tasks(); + cudf::detail::join_streams(streams, cudf::get_default_stream()); + timer.stop(); + }); + } + + auto const time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value"); + state.add_element_count(static_cast(data_size) / time, "bytes_per_second"); + state.add_buffer_size( + mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage"); + state.add_buffer_size(total_file_size, "encoded_file_size", "encoded_file_size"); +} + +void BM_orc_multithreaded_read_mixed(nvbench::state& state) +{ + auto label = get_label("mixed", state); + cudf::scoped_range range{label.c_str()}; + 
BM_orc_multithreaded_read_common( + state, {cudf::type_id::INT32, cudf::type_id::DECIMAL64, cudf::type_id::STRING}, label); +} + +void BM_orc_multithreaded_read_fixed_width(nvbench::state& state) +{ + auto label = get_label("fixed width", state); + cudf::scoped_range range{label.c_str()}; + BM_orc_multithreaded_read_common(state, {cudf::type_id::INT32}, label); +} + +void BM_orc_multithreaded_read_string(nvbench::state& state) +{ + auto label = get_label("string", state); + cudf::scoped_range range{label.c_str()}; + BM_orc_multithreaded_read_common(state, {cudf::type_id::STRING}, label); +} + +void BM_orc_multithreaded_read_list(nvbench::state& state) +{ + auto label = get_label("list", state); + cudf::scoped_range range{label.c_str()}; + BM_orc_multithreaded_read_common(state, {cudf::type_id::LIST}, label); +} + +void BM_orc_multithreaded_read_chunked_common(nvbench::state& state, + std::vector const& d_types, + std::string const& label) +{ + size_t const data_size = state.get_int64("total_data_size"); + auto const num_threads = state.get_int64("num_threads"); + size_t const input_limit = state.get_int64("input_limit"); + size_t const output_limit = state.get_int64("output_limit"); + + auto streams = cudf::detail::fork_streams(cudf::get_default_stream(), num_threads); + cudf::detail::thread_pool threads(num_threads); + auto [source_sink_vector, total_file_size, num_files] = write_file_data(state, d_types); + std::vector source_info_vector; + std::transform(source_sink_vector.begin(), + source_sink_vector.end(), + std::back_inserter(source_info_vector), + [](auto& source_sink) { return source_sink.make_source_info(); }); + + auto mem_stats_logger = cudf::memory_stats_logger(); + + { + cudf::scoped_range range{("(read) " + label).c_str()}; + std::vector chunks; + state.exec(nvbench::exec_tag::sync | nvbench::exec_tag::timer, + [&](nvbench::launch& launch, auto& timer) { + auto read_func = [&](int index) { + auto const stream = streams[index % num_threads]; + 
cudf::io::orc_reader_options read_opts = + cudf::io::orc_reader_options::builder(source_info_vector[index]); + // divide chunk limits by number of threads so the number of chunks produced is the + // same for all cases. this seems better than the alternative, which is to keep the + // limits the same. if we do that, as the number of threads goes up, the number of + // chunks goes down - so are actually benchmarking the same thing in that case? + auto reader = cudf::io::chunked_orc_reader( + output_limit / num_threads, input_limit / num_threads, read_opts, stream); + + // read all the chunks + do { + auto table = reader.read_chunk(); + } while (reader.has_next()); + }; + + threads.paused = true; + for (size_t i = 0; i < num_files; ++i) { + threads.submit(read_func, i); + } + timer.start(); + threads.paused = false; + threads.wait_for_tasks(); + cudf::detail::join_streams(streams, cudf::get_default_stream()); + timer.stop(); + }); + } + + auto const time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value"); + state.add_element_count(static_cast(data_size) / time, "bytes_per_second"); + state.add_buffer_size( + mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage"); + state.add_buffer_size(total_file_size, "encoded_file_size", "encoded_file_size"); +} + +void BM_orc_multithreaded_read_chunked_mixed(nvbench::state& state) +{ + auto label = get_label("mixed", state); + cudf::scoped_range range{label.c_str()}; + BM_orc_multithreaded_read_chunked_common( + state, {cudf::type_id::INT32, cudf::type_id::DECIMAL64, cudf::type_id::STRING}, label); +} + +void BM_orc_multithreaded_read_chunked_fixed_width(nvbench::state& state) +{ + auto label = get_label("fixed width", state); + cudf::scoped_range range{label.c_str()}; + BM_orc_multithreaded_read_chunked_common(state, {cudf::type_id::INT32}, label); +} + +void BM_orc_multithreaded_read_chunked_string(nvbench::state& state) +{ + auto label = get_label("string", state); + cudf::scoped_range 
range{label.c_str()}; + BM_orc_multithreaded_read_chunked_common(state, {cudf::type_id::STRING}, label); +} + +void BM_orc_multithreaded_read_chunked_list(nvbench::state& state) +{ + auto label = get_label("list", state); + cudf::scoped_range range{label.c_str()}; + BM_orc_multithreaded_read_chunked_common(state, {cudf::type_id::LIST}, label); +} +auto const thread_range = std::vector{1, 2, 4, 8}; +auto const total_data_size = std::vector{512 * 1024 * 1024, 1024 * 1024 * 1024}; + +// mixed data types: fixed width and strings +NVBENCH_BENCH(BM_orc_multithreaded_read_mixed) + .set_name("orc_multithreaded_read_decode_mixed") + .set_min_samples(4) + .add_int64_axis("cardinality", {1000}) + .add_int64_axis("total_data_size", total_data_size) + .add_int64_axis("num_threads", thread_range) + .add_int64_axis("num_cols", {4}) + .add_int64_axis("run_length", {8}); + +NVBENCH_BENCH(BM_orc_multithreaded_read_fixed_width) + .set_name("orc_multithreaded_read_decode_fixed_width") + .set_min_samples(4) + .add_int64_axis("cardinality", {1000}) + .add_int64_axis("total_data_size", total_data_size) + .add_int64_axis("num_threads", thread_range) + .add_int64_axis("num_cols", {4}) + .add_int64_axis("run_length", {8}); + +NVBENCH_BENCH(BM_orc_multithreaded_read_string) + .set_name("orc_multithreaded_read_decode_string") + .set_min_samples(4) + .add_int64_axis("cardinality", {1000}) + .add_int64_axis("total_data_size", total_data_size) + .add_int64_axis("num_threads", thread_range) + .add_int64_axis("num_cols", {4}) + .add_int64_axis("run_length", {8}); + +NVBENCH_BENCH(BM_orc_multithreaded_read_list) + .set_name("orc_multithreaded_read_decode_list") + .set_min_samples(4) + .add_int64_axis("cardinality", {1000}) + .add_int64_axis("total_data_size", total_data_size) + .add_int64_axis("num_threads", thread_range) + .add_int64_axis("num_cols", {4}) + .add_int64_axis("run_length", {8}); + +// mixed data types: fixed width, strings +NVBENCH_BENCH(BM_orc_multithreaded_read_chunked_mixed) + 
.set_name("orc_multithreaded_read_decode_chunked_mixed") + .set_min_samples(4) + .add_int64_axis("cardinality", {1000}) + .add_int64_axis("total_data_size", total_data_size) + .add_int64_axis("num_threads", thread_range) + .add_int64_axis("num_cols", {4}) + .add_int64_axis("run_length", {8}) + .add_int64_axis("input_limit", {640 * 1024 * 1024}) + .add_int64_axis("output_limit", {640 * 1024 * 1024}); + +NVBENCH_BENCH(BM_orc_multithreaded_read_chunked_fixed_width) + .set_name("orc_multithreaded_read_decode_chunked_fixed_width") + .set_min_samples(4) + .add_int64_axis("cardinality", {1000}) + .add_int64_axis("total_data_size", total_data_size) + .add_int64_axis("num_threads", thread_range) + .add_int64_axis("num_cols", {4}) + .add_int64_axis("run_length", {8}) + .add_int64_axis("input_limit", {640 * 1024 * 1024}) + .add_int64_axis("output_limit", {640 * 1024 * 1024}); + +NVBENCH_BENCH(BM_orc_multithreaded_read_chunked_string) + .set_name("orc_multithreaded_read_decode_chunked_string") + .set_min_samples(4) + .add_int64_axis("cardinality", {1000}) + .add_int64_axis("total_data_size", total_data_size) + .add_int64_axis("num_threads", thread_range) + .add_int64_axis("num_cols", {4}) + .add_int64_axis("run_length", {8}) + .add_int64_axis("input_limit", {640 * 1024 * 1024}) + .add_int64_axis("output_limit", {640 * 1024 * 1024}); + +NVBENCH_BENCH(BM_orc_multithreaded_read_chunked_list) + .set_name("orc_multithreaded_read_decode_chunked_list") + .set_min_samples(4) + .add_int64_axis("cardinality", {1000}) + .add_int64_axis("total_data_size", total_data_size) + .add_int64_axis("num_threads", thread_range) + .add_int64_axis("num_cols", {4}) + .add_int64_axis("run_length", {8}) + .add_int64_axis("input_limit", {640 * 1024 * 1024}) + .add_int64_axis("output_limit", {640 * 1024 * 1024}); From 24fe359425b080594b05bab040699a1468483474 Mon Sep 17 00:00:00 2001 From: Robert Maynard Date: Fri, 14 Jun 2024 09:35:13 -0400 Subject: [PATCH 13/25] Remove CCCL 2.2 patches as we now always 
use 2.5+ (#15969) Now that https://github.com/rapidsai/rapids-cmake/pull/607 has been merged we can drop support for patching CCCL 2.2 Authors: - Robert Maynard (https://github.com/robertmaynard) Approvers: - Bradley Dice (https://github.com/bdice) - Paul Taylor (https://github.com/trxcllnt) URL: https://github.com/rapidsai/cudf/pull/15969 --- .../thirdparty/patches/cccl_override.json | 35 -------------- .../patches/revert_pr_211_cccl_2.5.0.diff | 47 ------------------- .../thrust_disable_64bit_dispatching.diff | 38 +++++++-------- ..._disable_64bit_dispatching_cccl_2.5.0.diff | 25 ---------- .../thrust_faster_scan_compile_times.diff | 30 ++++++------ ..._faster_scan_compile_times_cccl_2.5.0.diff | 39 --------------- .../thrust_faster_sort_compile_times.diff | 32 ++++++------- ..._faster_sort_compile_times_cccl_2.5.0.diff | 39 --------------- 8 files changed, 50 insertions(+), 235 deletions(-) delete mode 100644 cpp/cmake/thirdparty/patches/revert_pr_211_cccl_2.5.0.diff delete mode 100644 cpp/cmake/thirdparty/patches/thrust_disable_64bit_dispatching_cccl_2.5.0.diff delete mode 100644 cpp/cmake/thirdparty/patches/thrust_faster_scan_compile_times_cccl_2.5.0.diff delete mode 100644 cpp/cmake/thirdparty/patches/thrust_faster_sort_compile_times_cccl_2.5.0.diff diff --git a/cpp/cmake/thirdparty/patches/cccl_override.json b/cpp/cmake/thirdparty/patches/cccl_override.json index 059f713e7a5..e61102dffac 100644 --- a/cpp/cmake/thirdparty/patches/cccl_override.json +++ b/cpp/cmake/thirdparty/patches/cccl_override.json @@ -3,60 +3,25 @@ "packages" : { "CCCL" : { "patches" : [ - { - "file" : "cccl/bug_fixes.diff", - "issue" : "CCCL installs header-search.cmake files in nondeterministic order and has a typo in checking target creation that leads to duplicates", - "fixed_in" : "2.3" - }, - { - "file" : "cccl/hide_kernels.diff", - "issue" : "Mark all cub and thrust kernels with hidden visibility [https://github.com/nvidia/cccl/pulls/443]", - "fixed_in" : "2.3" - }, { "file" : 
"cccl/revert_pr_211.diff", "issue" : "thrust::copy introduced a change in behavior that causes failures with cudaErrorInvalidValue.", "fixed_in" : "" }, - { - "file" : "${current_json_dir}/revert_pr_211_cccl_2.5.0.diff", - "issue" : "thrust::copy introduced a change in behavior that causes failures with cudaErrorInvalidValue.", - "fixed_in" : "" - }, - { - "file": "cccl/kernel_pointer_hiding.diff", - "issue": "Hide APIs that accept kernel pointers [https://github.com/NVIDIA/cccl/pull/1395]", - "fixed_in": "2.4" - }, { "file" : "${current_json_dir}/thrust_disable_64bit_dispatching.diff", "issue" : "Remove 64bit dispatching as not needed by libcudf and results in compiling twice as many kernels [https://github.com/rapidsai/cudf/pull/11437]", "fixed_in" : "" }, - { - "file" : "${current_json_dir}/thrust_disable_64bit_dispatching_cccl_2.5.0.diff", - "issue" : "Remove 64bit dispatching as not needed by libcudf and results in compiling twice as many kernels [https://github.com/rapidsai/cudf/pull/11437]", - "fixed_in" : "" - }, { "file" : "${current_json_dir}/thrust_faster_sort_compile_times.diff", "issue" : "Improve Thrust sort compile times by not unrolling loops for inlined comparators [https://github.com/rapidsai/cudf/pull/10577]", "fixed_in" : "" }, - { - "file" : "${current_json_dir}/thrust_faster_sort_compile_times_cccl_2.5.0.diff", - "issue" : "Improve Thrust sort compile times by not unrolling loops for inlined comparators [https://github.com/rapidsai/cudf/pull/10577]", - "fixed_in" : "" - }, { "file" : "${current_json_dir}/thrust_faster_scan_compile_times.diff", "issue" : "Improve Thrust scan compile times by reducing the number of kernels generated [https://github.com/rapidsai/cudf/pull/8183]", "fixed_in" : "" - }, - { - "file" : "${current_json_dir}/thrust_faster_scan_compile_times_cccl_2.5.0.diff", - "issue" : "Improve Thrust scan compile times by reducing the number of kernels generated [https://github.com/rapidsai/cudf/pull/8183]", - "fixed_in" : "" } ] } 
diff --git a/cpp/cmake/thirdparty/patches/revert_pr_211_cccl_2.5.0.diff b/cpp/cmake/thirdparty/patches/revert_pr_211_cccl_2.5.0.diff deleted file mode 100644 index 27ff16744f5..00000000000 --- a/cpp/cmake/thirdparty/patches/revert_pr_211_cccl_2.5.0.diff +++ /dev/null @@ -1,47 +0,0 @@ -diff --git a/thrust/thrust/system/cuda/detail/internal/copy_device_to_device.h b/thrust/thrust/system/cuda/detail/internal/copy_device_to_device.h -index 046eb83c0..8047c9701 100644 ---- a/thrust/thrust/system/cuda/detail/internal/copy_device_to_device.h -+++ b/thrust/thrust/system/cuda/detail/internal/copy_device_to_device.h -@@ -53,41 +53,15 @@ namespace cuda_cub - - namespace __copy - { --template --OutputIt THRUST_RUNTIME_FUNCTION device_to_device( -- execution_policy& policy, InputIt first, InputIt last, OutputIt result, thrust::detail::true_type) --{ -- typedef typename thrust::iterator_traits::value_type InputTy; -- const auto n = thrust::distance(first, last); -- if (n > 0) -- { -- cudaError status; -- status = trivial_copy_device_to_device( -- policy, -- reinterpret_cast(thrust::raw_pointer_cast(&*result)), -- reinterpret_cast(thrust::raw_pointer_cast(&*first)), -- n); -- cuda_cub::throw_on_error(status, "__copy:: D->D: failed"); -- } -- -- return result + n; --} - - template - OutputIt THRUST_RUNTIME_FUNCTION device_to_device( -- execution_policy& policy, InputIt first, InputIt last, OutputIt result, thrust::detail::false_type) -+ execution_policy& policy, InputIt first, InputIt last, OutputIt result) - { - typedef typename thrust::iterator_traits::value_type InputTy; - return cuda_cub::transform(policy, first, last, result, thrust::identity()); - } - --template --OutputIt THRUST_RUNTIME_FUNCTION --device_to_device(execution_policy& policy, InputIt first, InputIt last, OutputIt result) --{ -- return device_to_device( -- policy, first, last, result, typename is_indirectly_trivially_relocatable_to::type()); --} - } // namespace __copy - - } // namespace cuda_cub diff --git 
a/cpp/cmake/thirdparty/patches/thrust_disable_64bit_dispatching.diff b/cpp/cmake/thirdparty/patches/thrust_disable_64bit_dispatching.diff index d3f1a26781f..6ae1e1c917b 100644 --- a/cpp/cmake/thirdparty/patches/thrust_disable_64bit_dispatching.diff +++ b/cpp/cmake/thirdparty/patches/thrust_disable_64bit_dispatching.diff @@ -1,25 +1,25 @@ diff --git a/thrust/thrust/system/cuda/detail/dispatch.h b/thrust/thrust/system/cuda/detail/dispatch.h -index d0e3f94ec..5c32a9c60 100644 +index 2a3cc4e33..8fb337b26 100644 --- a/thrust/thrust/system/cuda/detail/dispatch.h +++ b/thrust/thrust/system/cuda/detail/dispatch.h -@@ -32,8 +32,7 @@ - status = call arguments; \ - } \ - else { \ -- auto THRUST_PP_CAT2(count, _fixed) = static_cast(count); \ -- status = call arguments; \ -+ throw std::runtime_error("THRUST_INDEX_TYPE_DISPATCH 64-bit count is unsupported in libcudf"); \ - } - +@@ -44,8 +44,7 @@ + } \ + else \ + { \ +- auto THRUST_PP_CAT2(count, _fixed) = static_cast(count); \ +- status = call arguments; \ ++ throw std::runtime_error("THRUST_INDEX_TYPE_DISPATCH 64-bit count is unsupported in libcudf"); \ + } + /** -@@ -52,9 +51,7 @@ - status = call arguments; \ - } \ - else { \ -- auto THRUST_PP_CAT2(count1, _fixed) = static_cast(count1); \ -- auto THRUST_PP_CAT2(count2, _fixed) = static_cast(count2); \ -- status = call arguments; \ -+ throw std::runtime_error("THRUST_DOUBLE_INDEX_TYPE_DISPATCH 64-bit count is unsupported in libcudf"); \ - } +@@ -66,9 +65,7 @@ + } \ + else \ + { \ +- auto THRUST_PP_CAT2(count1, _fixed) = static_cast(count1); \ +- auto THRUST_PP_CAT2(count2, _fixed) = static_cast(count2); \ +- status = call arguments; \ ++ throw std::runtime_error("THRUST_DOUBLE_INDEX_TYPE_DISPATCH 64-bit count is unsupported in libcudf"); \ + } /** * Dispatch between 32-bit and 64-bit index based versions of the same algorithm diff --git a/cpp/cmake/thirdparty/patches/thrust_disable_64bit_dispatching_cccl_2.5.0.diff 
b/cpp/cmake/thirdparty/patches/thrust_disable_64bit_dispatching_cccl_2.5.0.diff deleted file mode 100644 index 6ae1e1c917b..00000000000 --- a/cpp/cmake/thirdparty/patches/thrust_disable_64bit_dispatching_cccl_2.5.0.diff +++ /dev/null @@ -1,25 +0,0 @@ -diff --git a/thrust/thrust/system/cuda/detail/dispatch.h b/thrust/thrust/system/cuda/detail/dispatch.h -index 2a3cc4e33..8fb337b26 100644 ---- a/thrust/thrust/system/cuda/detail/dispatch.h -+++ b/thrust/thrust/system/cuda/detail/dispatch.h -@@ -44,8 +44,7 @@ - } \ - else \ - { \ -- auto THRUST_PP_CAT2(count, _fixed) = static_cast(count); \ -- status = call arguments; \ -+ throw std::runtime_error("THRUST_INDEX_TYPE_DISPATCH 64-bit count is unsupported in libcudf"); \ - } - - /** -@@ -66,9 +65,7 @@ - } \ - else \ - { \ -- auto THRUST_PP_CAT2(count1, _fixed) = static_cast(count1); \ -- auto THRUST_PP_CAT2(count2, _fixed) = static_cast(count2); \ -- status = call arguments; \ -+ throw std::runtime_error("THRUST_DOUBLE_INDEX_TYPE_DISPATCH 64-bit count is unsupported in libcudf"); \ - } - /** - * Dispatch between 32-bit and 64-bit index based versions of the same algorithm diff --git a/cpp/cmake/thirdparty/patches/thrust_faster_scan_compile_times.diff b/cpp/cmake/thirdparty/patches/thrust_faster_scan_compile_times.diff index a606e21b92d..fee46046194 100644 --- a/cpp/cmake/thirdparty/patches/thrust_faster_scan_compile_times.diff +++ b/cpp/cmake/thirdparty/patches/thrust_faster_scan_compile_times.diff @@ -1,23 +1,23 @@ diff --git a/cub/cub/device/dispatch/dispatch_radix_sort.cuh b/cub/cub/device/dispatch/dispatch_radix_sort.cuh -index 84b6ccffd..25a237f93 100644 +index 0606485bb..dbb99ff13 100644 --- a/cub/cub/device/dispatch/dispatch_radix_sort.cuh +++ b/cub/cub/device/dispatch/dispatch_radix_sort.cuh -@@ -808,7 +808,7 @@ struct DeviceRadixSortPolicy - - - /// SM60 (GP100) -- struct Policy600 : ChainedPolicy<600, Policy600, Policy500> -+ struct Policy600 : ChainedPolicy<600, Policy600, Policy600> +@@ -1085,7 +1085,7 @@ 
struct DeviceRadixSortPolicy + }; + + /// SM60 (GP100) +- struct Policy600 : ChainedPolicy<600, Policy600, Policy500> ++ struct Policy600 : ChainedPolicy<600, Policy600, Policy600> + { + enum { - enum { - PRIMARY_RADIX_BITS = (sizeof(KeyT) > 1) ? 7 : 5, // 6.9B 32b keys/s (Quadro P100) diff --git a/cub/cub/device/dispatch/dispatch_reduce.cuh b/cub/cub/device/dispatch/dispatch_reduce.cuh -index 994adc095..d3e6719a7 100644 +index f39613adb..75bd16ff9 100644 --- a/cub/cub/device/dispatch/dispatch_reduce.cuh +++ b/cub/cub/device/dispatch/dispatch_reduce.cuh -@@ -479,7 +479,7 @@ struct DeviceReducePolicy +@@ -488,7 +488,7 @@ struct DeviceReducePolicy }; - + /// SM60 - struct Policy600 : ChainedPolicy<600, Policy600, Policy350> + struct Policy600 : ChainedPolicy<600, Policy600, Policy600> @@ -25,15 +25,15 @@ index 994adc095..d3e6719a7 100644 static constexpr int threads_per_block = 256; static constexpr int items_per_thread = 16; diff --git a/cub/cub/device/dispatch/tuning/tuning_scan.cuh b/cub/cub/device/dispatch/tuning/tuning_scan.cuh -index 0ea5c41ad..1bcd8a111 100644 +index 419908c4e..6ab0840e1 100644 --- a/cub/cub/device/dispatch/tuning/tuning_scan.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_scan.cuh -@@ -303,7 +303,7 @@ struct DeviceScanPolicy +@@ -339,7 +339,7 @@ struct DeviceScanPolicy /// SM600 struct Policy600 : DefaultTuning - , ChainedPolicy<600, Policy600, Policy520> + , ChainedPolicy<600, Policy600, Policy600> {}; - + /// SM800 diff --git a/cpp/cmake/thirdparty/patches/thrust_faster_scan_compile_times_cccl_2.5.0.diff b/cpp/cmake/thirdparty/patches/thrust_faster_scan_compile_times_cccl_2.5.0.diff deleted file mode 100644 index fee46046194..00000000000 --- a/cpp/cmake/thirdparty/patches/thrust_faster_scan_compile_times_cccl_2.5.0.diff +++ /dev/null @@ -1,39 +0,0 @@ -diff --git a/cub/cub/device/dispatch/dispatch_radix_sort.cuh b/cub/cub/device/dispatch/dispatch_radix_sort.cuh -index 0606485bb..dbb99ff13 100644 ---- 
a/cub/cub/device/dispatch/dispatch_radix_sort.cuh -+++ b/cub/cub/device/dispatch/dispatch_radix_sort.cuh -@@ -1085,7 +1085,7 @@ struct DeviceRadixSortPolicy - }; - - /// SM60 (GP100) -- struct Policy600 : ChainedPolicy<600, Policy600, Policy500> -+ struct Policy600 : ChainedPolicy<600, Policy600, Policy600> - { - enum - { -diff --git a/cub/cub/device/dispatch/dispatch_reduce.cuh b/cub/cub/device/dispatch/dispatch_reduce.cuh -index f39613adb..75bd16ff9 100644 ---- a/cub/cub/device/dispatch/dispatch_reduce.cuh -+++ b/cub/cub/device/dispatch/dispatch_reduce.cuh -@@ -488,7 +488,7 @@ struct DeviceReducePolicy - }; - - /// SM60 -- struct Policy600 : ChainedPolicy<600, Policy600, Policy350> -+ struct Policy600 : ChainedPolicy<600, Policy600, Policy600> - { - static constexpr int threads_per_block = 256; - static constexpr int items_per_thread = 16; -diff --git a/cub/cub/device/dispatch/tuning/tuning_scan.cuh b/cub/cub/device/dispatch/tuning/tuning_scan.cuh -index 419908c4e..6ab0840e1 100644 ---- a/cub/cub/device/dispatch/tuning/tuning_scan.cuh -+++ b/cub/cub/device/dispatch/tuning/tuning_scan.cuh -@@ -339,7 +339,7 @@ struct DeviceScanPolicy - /// SM600 - struct Policy600 - : DefaultTuning -- , ChainedPolicy<600, Policy600, Policy520> -+ , ChainedPolicy<600, Policy600, Policy600> - {}; - - /// SM800 diff --git a/cpp/cmake/thirdparty/patches/thrust_faster_sort_compile_times.diff b/cpp/cmake/thirdparty/patches/thrust_faster_sort_compile_times.diff index c34b6433d10..cb0cc55f4d2 100644 --- a/cpp/cmake/thirdparty/patches/thrust_faster_sort_compile_times.diff +++ b/cpp/cmake/thirdparty/patches/thrust_faster_sort_compile_times.diff @@ -1,39 +1,39 @@ diff --git a/cub/cub/block/block_merge_sort.cuh b/cub/cub/block/block_merge_sort.cuh -index dc07ef6c2..a066c14da 100644 +index eb76ebb0b..c6c529a50 100644 --- a/cub/cub/block/block_merge_sort.cuh +++ b/cub/cub/block/block_merge_sort.cuh -@@ -91,7 +91,7 @@ __device__ __forceinline__ void SerialMerge(KeyT *keys_shared, +@@ -95,7 +95,7 
@@ _CCCL_DEVICE _CCCL_FORCEINLINE void SerialMerge( KeyT key1 = keys_shared[keys1_beg]; KeyT key2 = keys_shared[keys2_beg]; - + -#pragma unroll +#pragma unroll 1 for (int item = 0; item < ITEMS_PER_THREAD; ++item) { - bool p = (keys2_beg < keys2_end) && -@@ -383,7 +383,7 @@ public: + bool p = (keys2_beg < keys2_end) && ((keys1_beg >= keys1_end) || compare_op(key2, key1)); +@@ -376,7 +376,7 @@ public: // KeyT max_key = oob_default; - -- #pragma unroll -+ #pragma unroll 1 + +-#pragma unroll ++#pragma unroll 1 for (int item = 1; item < ITEMS_PER_THREAD; ++item) { if (ITEMS_PER_THREAD * linear_tid + item < valid_items) diff --git a/cub/cub/thread/thread_sort.cuh b/cub/cub/thread/thread_sort.cuh -index 5d4867896..b42fb5f00 100644 +index 7d9e8622f..da5627306 100644 --- a/cub/cub/thread/thread_sort.cuh +++ b/cub/cub/thread/thread_sort.cuh -@@ -83,10 +83,10 @@ StableOddEvenSort(KeyT (&keys)[ITEMS_PER_THREAD], +@@ -87,10 +87,10 @@ StableOddEvenSort(KeyT (&keys)[ITEMS_PER_THREAD], ValueT (&items)[ITEMS_PER_THRE { - constexpr bool KEYS_ONLY = std::is_same::value; - -- #pragma unroll -+ #pragma unroll 1 + constexpr bool KEYS_ONLY = ::cuda::std::is_same::value; + +-#pragma unroll ++#pragma unroll 1 for (int i = 0; i < ITEMS_PER_THREAD; ++i) { -- #pragma unroll -+ #pragma unroll 1 +-#pragma unroll ++#pragma unroll 1 for (int j = 1 & i; j < ITEMS_PER_THREAD - 1; j += 2) { if (compare_op(keys[j + 1], keys[j])) diff --git a/cpp/cmake/thirdparty/patches/thrust_faster_sort_compile_times_cccl_2.5.0.diff b/cpp/cmake/thirdparty/patches/thrust_faster_sort_compile_times_cccl_2.5.0.diff deleted file mode 100644 index cb0cc55f4d2..00000000000 --- a/cpp/cmake/thirdparty/patches/thrust_faster_sort_compile_times_cccl_2.5.0.diff +++ /dev/null @@ -1,39 +0,0 @@ -diff --git a/cub/cub/block/block_merge_sort.cuh b/cub/cub/block/block_merge_sort.cuh -index eb76ebb0b..c6c529a50 100644 ---- a/cub/cub/block/block_merge_sort.cuh -+++ b/cub/cub/block/block_merge_sort.cuh -@@ -95,7 +95,7 @@ _CCCL_DEVICE 
_CCCL_FORCEINLINE void SerialMerge( - KeyT key1 = keys_shared[keys1_beg]; - KeyT key2 = keys_shared[keys2_beg]; - --#pragma unroll -+#pragma unroll 1 - for (int item = 0; item < ITEMS_PER_THREAD; ++item) - { - bool p = (keys2_beg < keys2_end) && ((keys1_beg >= keys1_end) || compare_op(key2, key1)); -@@ -376,7 +376,7 @@ public: - // - KeyT max_key = oob_default; - --#pragma unroll -+#pragma unroll 1 - for (int item = 1; item < ITEMS_PER_THREAD; ++item) - { - if (ITEMS_PER_THREAD * linear_tid + item < valid_items) -diff --git a/cub/cub/thread/thread_sort.cuh b/cub/cub/thread/thread_sort.cuh -index 7d9e8622f..da5627306 100644 ---- a/cub/cub/thread/thread_sort.cuh -+++ b/cub/cub/thread/thread_sort.cuh -@@ -87,10 +87,10 @@ StableOddEvenSort(KeyT (&keys)[ITEMS_PER_THREAD], ValueT (&items)[ITEMS_PER_THRE - { - constexpr bool KEYS_ONLY = ::cuda::std::is_same::value; - --#pragma unroll -+#pragma unroll 1 - for (int i = 0; i < ITEMS_PER_THREAD; ++i) - { --#pragma unroll -+#pragma unroll 1 - for (int j = 1 & i; j < ITEMS_PER_THREAD - 1; j += 2) - { - if (compare_op(keys[j + 1], keys[j])) From 374ee13adaf18503ee671b652f76a3ccb9dc118b Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Fri, 14 Jun 2024 15:28:53 +0100 Subject: [PATCH 14/25] Fix exclude regex in pre-commit clang-format hook (#16030) The clang-tidy changes in #15894 introduce a new exclude regex list to the pre-commit clang-format hook. However, it was a single character too long, ending with a |. Consequently, the exclude regex matched the empty string, and hence excluded every C++ file. Fix this, and apply formatting changes to the files that were modified in the interim and were not clang-format compatible. 
Authors: - Lawrence Mitchell (https://github.com/wence-) Approvers: - David Wendt (https://github.com/davidwendt) - Kyle Edwards (https://github.com/KyleFromNVIDIA) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/16030 --- .pre-commit-config.yaml | 2 +- .../io/orc/orc_reader_multithreaded.cpp | 107 +++++++++--------- cpp/tests/interop/from_arrow_test.cpp | 5 +- 3 files changed, 58 insertions(+), 56 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index cc08b832e69..f8c4f4b9143 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -60,7 +60,7 @@ repos: (?x)^( ^cpp/src/io/parquet/ipc/Schema_generated.h| ^cpp/src/io/parquet/ipc/Message_generated.h| - ^cpp/include/cudf_test/cxxopts.hpp| + ^cpp/include/cudf_test/cxxopts.hpp ) - repo: https://github.com/sirosen/texthooks rev: 0.6.6 diff --git a/cpp/benchmarks/io/orc/orc_reader_multithreaded.cpp b/cpp/benchmarks/io/orc/orc_reader_multithreaded.cpp index ffbbc6f8464..aa0ee39a179 100644 --- a/cpp/benchmarks/io/orc/orc_reader_multithreaded.cpp +++ b/cpp/benchmarks/io/orc/orc_reader_multithreaded.cpp @@ -50,11 +50,11 @@ std::string get_label(std::string const& test_name, nvbench::state const& state) std::tuple, size_t, size_t> write_file_data( nvbench::state& state, std::vector const& d_types) { - auto const cardinality = state.get_int64("cardinality"); - auto const run_length = state.get_int64("run_length"); - auto const num_cols = state.get_int64("num_cols"); - size_t const num_files = get_num_read_threads(state); - size_t const per_file_data_size = get_read_size(state); + auto const cardinality = state.get_int64("cardinality"); + auto const run_length = state.get_int64("run_length"); + auto const num_cols = state.get_int64("num_cols"); + size_t const num_files = get_num_read_threads(state); + size_t const per_file_data_size = get_read_size(state); std::vector source_sink_vector; @@ -86,7 +86,7 @@ void 
BM_orc_multithreaded_read_common(nvbench::state& state, std::vector const& d_types, std::string const& label) { - auto const data_size = state.get_int64("total_data_size"); + auto const data_size = state.get_int64("total_data_size"); auto const num_threads = state.get_int64("num_threads"); auto streams = cudf::detail::fork_streams(cudf::get_default_stream(), num_threads); @@ -104,24 +104,24 @@ void BM_orc_multithreaded_read_common(nvbench::state& state, { cudf::scoped_range range{("(read) " + label).c_str()}; state.exec(nvbench::exec_tag::sync | nvbench::exec_tag::timer, - [&](nvbench::launch& launch, auto& timer) { - auto read_func = [&](int index) { - auto const stream = streams[index % num_threads]; - cudf::io::orc_reader_options read_opts = - cudf::io::orc_reader_options::builder(source_info_vector[index]); - cudf::io::read_orc(read_opts, stream, rmm::mr::get_current_device_resource()); - }; - - threads.paused = true; - for (size_t i = 0; i < num_files; ++i) { - threads.submit(read_func, i); - } - timer.start(); - threads.paused = false; - threads.wait_for_tasks(); - cudf::detail::join_streams(streams, cudf::get_default_stream()); - timer.stop(); - }); + [&](nvbench::launch& launch, auto& timer) { + auto read_func = [&](int index) { + auto const stream = streams[index % num_threads]; + cudf::io::orc_reader_options read_opts = + cudf::io::orc_reader_options::builder(source_info_vector[index]); + cudf::io::read_orc(read_opts, stream, rmm::mr::get_current_device_resource()); + }; + + threads.paused = true; + for (size_t i = 0; i < num_files; ++i) { + threads.submit(read_func, i); + } + timer.start(); + threads.paused = false; + threads.wait_for_tasks(); + cudf::detail::join_streams(streams, cudf::get_default_stream()); + timer.stop(); + }); } auto const time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value"); @@ -184,34 +184,35 @@ void BM_orc_multithreaded_read_chunked_common(nvbench::state& state, cudf::scoped_range range{("(read) " + 
label).c_str()}; std::vector chunks; state.exec(nvbench::exec_tag::sync | nvbench::exec_tag::timer, - [&](nvbench::launch& launch, auto& timer) { - auto read_func = [&](int index) { - auto const stream = streams[index % num_threads]; - cudf::io::orc_reader_options read_opts = - cudf::io::orc_reader_options::builder(source_info_vector[index]); - // divide chunk limits by number of threads so the number of chunks produced is the - // same for all cases. this seems better than the alternative, which is to keep the - // limits the same. if we do that, as the number of threads goes up, the number of - // chunks goes down - so are actually benchmarking the same thing in that case? - auto reader = cudf::io::chunked_orc_reader( - output_limit / num_threads, input_limit / num_threads, read_opts, stream); - - // read all the chunks - do { - auto table = reader.read_chunk(); - } while (reader.has_next()); - }; - - threads.paused = true; - for (size_t i = 0; i < num_files; ++i) { - threads.submit(read_func, i); - } - timer.start(); - threads.paused = false; - threads.wait_for_tasks(); - cudf::detail::join_streams(streams, cudf::get_default_stream()); - timer.stop(); - }); + [&](nvbench::launch& launch, auto& timer) { + auto read_func = [&](int index) { + auto const stream = streams[index % num_threads]; + cudf::io::orc_reader_options read_opts = + cudf::io::orc_reader_options::builder(source_info_vector[index]); + // divide chunk limits by number of threads so the number of chunks produced is + // the same for all cases. this seems better than the alternative, which is to + // keep the limits the same. if we do that, as the number of threads goes up, the + // number of chunks goes down - so are actually benchmarking the same thing in + // that case? 
+ auto reader = cudf::io::chunked_orc_reader( + output_limit / num_threads, input_limit / num_threads, read_opts, stream); + + // read all the chunks + do { + auto table = reader.read_chunk(); + } while (reader.has_next()); + }; + + threads.paused = true; + for (size_t i = 0; i < num_files; ++i) { + threads.submit(read_func, i); + } + timer.start(); + threads.paused = false; + threads.wait_for_tasks(); + cudf::detail::join_streams(streams, cudf::get_default_stream()); + timer.stop(); + }); } auto const time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value"); @@ -249,7 +250,7 @@ void BM_orc_multithreaded_read_chunked_list(nvbench::state& state) cudf::scoped_range range{label.c_str()}; BM_orc_multithreaded_read_chunked_common(state, {cudf::type_id::LIST}, label); } -auto const thread_range = std::vector{1, 2, 4, 8}; +auto const thread_range = std::vector{1, 2, 4, 8}; auto const total_data_size = std::vector{512 * 1024 * 1024, 1024 * 1024 * 1024}; // mixed data types: fixed width and strings diff --git a/cpp/tests/interop/from_arrow_test.cpp b/cpp/tests/interop/from_arrow_test.cpp index af20a5c772f..6eaa1a07e08 100644 --- a/cpp/tests/interop/from_arrow_test.cpp +++ b/cpp/tests/interop/from_arrow_test.cpp @@ -50,7 +50,8 @@ std::unique_ptr get_cudf_table() {true, false, true, true, true}); columns.emplace_back(std::move(cudf::dictionary::encode(col4))); columns.emplace_back(cudf::test::fixed_width_column_wrapper( - {true, false, true, false, true}, {true, false, true, true, false}).release()); + {true, false, true, false, true}, {true, false, true, true, false}) + .release()); columns.emplace_back(cudf::test::strings_column_wrapper( { "", @@ -338,7 +339,7 @@ TEST_F(FromArrowTest, ChunkedArray) std::vector>{dict_array1, dict_array2}); auto boolean_array = get_arrow_array({true, false, true, false, true}, {true, false, true, true, false}); - auto boolean_chunked_array = std::make_shared(boolean_array); + auto boolean_chunked_array = 
std::make_shared(boolean_array); auto large_string_chunked_array = std::make_shared( std::vector>{large_string_array_1}); From 2297f9a61e2f4153ab2e8a0631f7cfe7971ead14 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Fri, 14 Jun 2024 17:43:17 +0100 Subject: [PATCH 15/25] Fix initialization error in to_arrow for empty string views (#16033) When converting an empty string view to arrow, we don't bother with copies from device, but rather create the arrow arrays directly. The offset buffer is therefore a singleton int32 array with zero in it. Previously, the initialization of this array was incorrect, since mutable_data() returns a uint8_t pointer, and so setting the single element could leave 24 of the 32 bits uninitialized. Fix this by using memset instead to zero out the full buffer. Authors: - Lawrence Mitchell (https://github.com/wence-) Approvers: - David Wendt (https://github.com/davidwendt) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/16033 --- cpp/src/interop/to_arrow.cu | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/src/interop/to_arrow.cu b/cpp/src/interop/to_arrow.cu index 47aee982c32..2b3aa2f08f1 100644 --- a/cpp/src/interop/to_arrow.cu +++ b/cpp/src/interop/to_arrow.cu @@ -292,9 +292,9 @@ std::shared_ptr dispatch_to_arrow::operator()( auto child_arrays = fetch_child_array(input_view, {{}, {}}, ar_mr, stream); if (child_arrays.empty()) { // Empty string will have only one value in offset of 4 bytes - auto tmp_offset_buffer = allocate_arrow_buffer(4, ar_mr); - auto tmp_data_buffer = allocate_arrow_buffer(0, ar_mr); - tmp_offset_buffer->mutable_data()[0] = 0; + auto tmp_offset_buffer = allocate_arrow_buffer(sizeof(int32_t), ar_mr); + auto tmp_data_buffer = allocate_arrow_buffer(0, ar_mr); + memset(tmp_offset_buffer->mutable_data(), 0, sizeof(int32_t)); return std::make_shared( 0, std::move(tmp_offset_buffer), std::move(tmp_data_buffer)); From 5facc8cde15cc8301adb0c06fc682f558828fbc8 
Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 14 Jun 2024 07:12:09 -1000 Subject: [PATCH 16/25] Enable ruff TCH: typing imports under if TYPE_CHECKING (#16015) Reduces some unnecessary imports for running cudf and nicely delineates which imports are meant for typing purposes Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/16015 --- docs/cudf/source/conf.py | 6 ++++ pyproject.toml | 2 +- python/cudf/cudf/_typing.py | 3 +- python/cudf/cudf/core/_base_index.py | 9 ++++-- python/cudf/cudf/core/buffer/spill_manager.py | 6 ++-- python/cudf/cudf/core/column/categorical.py | 20 +++++++++---- python/cudf/cudf/core/column/column.py | 8 +++-- python/cudf/cudf/core/column/datetime.py | 16 +++++----- python/cudf/cudf/core/column/decimal.py | 6 ++-- python/cudf/cudf/core/column/lists.py | 6 ++-- python/cudf/cudf/core/column/numerical.py | 29 +++++++++++++------ .../cudf/cudf/core/column/numerical_base.py | 6 ++-- python/cudf/cudf/core/column/string.py | 9 +++--- python/cudf/cudf/core/column/struct.py | 5 +++- python/cudf/cudf/core/column/timedelta.py | 6 ++-- python/cudf/cudf/core/dataframe.py | 5 +++- python/cudf/cudf/core/dtypes.py | 6 ++-- python/cudf/cudf/core/frame.py | 10 +++++-- python/cudf/cudf/core/index.py | 5 +++- python/cudf/cudf/core/indexed_frame.py | 15 ++++++---- python/cudf/cudf/core/multiindex.py | 9 ++++-- python/cudf/cudf/core/series.py | 15 ++++++---- python/cudf/cudf/core/single_column_frame.py | 13 +++++---- 23 files changed, 143 insertions(+), 72 deletions(-) diff --git a/docs/cudf/source/conf.py b/docs/cudf/source/conf.py index e9c760e288e..108f12bc099 100644 --- a/docs/cudf/source/conf.py +++ b/docs/cudf/source/conf.py @@ -554,6 +554,12 @@ def on_missing_reference(app, env, node, contnode): nitpick_ignore = [ ("py:class", "SeriesOrIndex"), ("py:class", "Dtype"), + # The following are 
erroneously warned due to + # https://github.com/sphinx-doc/sphinx/issues/11225 + ("py:class", "pa.Array"), + ("py:class", "ScalarLike"), + ("py:class", "ParentType"), + ("py:class", "ColumnLike"), # TODO: Remove this when we figure out why typing_extensions doesn't seem # to map types correctly for intersphinx ("py:class", "typing_extensions.Self"), diff --git a/pyproject.toml b/pyproject.toml index d343b237ee7..c602240a0b7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,7 +26,7 @@ quiet-level = 3 line-length = 79 [tool.ruff.lint] -select = ["E", "F", "W", "D201", "D204", "D206", "D207", "D208", "D209", "D210", "D211", "D214", "D215", "D300", "D301", "D403", "D405", "D406", "D407", "D408", "D409", "D410", "D411", "D412", "D414", "D418"] +select = ["E", "F", "W", "D201", "D204", "D206", "D207", "D208", "D209", "D210", "D211", "D214", "D215", "D300", "D301", "D403", "D405", "D406", "D407", "D408", "D409", "D410", "D411", "D412", "D414", "D418", "TCH"] ignore = [ # whitespace before : "E203", diff --git a/python/cudf/cudf/_typing.py b/python/cudf/cudf/_typing.py index 206173919e1..34c96cc8cb3 100644 --- a/python/cudf/cudf/_typing.py +++ b/python/cudf/cudf/_typing.py @@ -5,9 +5,10 @@ import numpy as np from pandas import Period, Timedelta, Timestamp -from pandas.api.extensions import ExtensionDtype if TYPE_CHECKING: + from pandas.api.extensions import ExtensionDtype + import cudf # Backwards compat: mypy >= 0.790 rejects Type[NotImplemented], but diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index 5d0f7c4ede4..b29fc475b29 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -4,9 +4,8 @@ import pickle import warnings -from collections.abc import Generator from functools import cached_property -from typing import Any, Literal, Set, Tuple +from typing import TYPE_CHECKING, Any, Literal, Set, Tuple import pandas as pd from typing_extensions import Self @@ -31,12 +30,16 @@ ) from 
cudf.core.abc import Serializable from cudf.core.column import ColumnBase, column -from cudf.core.column_accessor import ColumnAccessor from cudf.errors import MixedTypeError from cudf.utils import ioutils from cudf.utils.dtypes import can_convert_to_column, is_mixed_with_object_dtype from cudf.utils.utils import _is_same_name +if TYPE_CHECKING: + from collections.abc import Generator + + from cudf.core.column_accessor import ColumnAccessor + class BaseIndex(Serializable): """Base class for all cudf Index types.""" diff --git a/python/cudf/cudf/core/buffer/spill_manager.py b/python/cudf/cudf/core/buffer/spill_manager.py index cd81149bdb8..7bcf97302aa 100644 --- a/python/cudf/cudf/core/buffer/spill_manager.py +++ b/python/cudf/cudf/core/buffer/spill_manager.py @@ -13,15 +13,17 @@ from contextlib import contextmanager from dataclasses import dataclass from functools import partial -from typing import Dict, List, Optional, Tuple +from typing import TYPE_CHECKING, Dict, List, Optional, Tuple import rmm.mr -from cudf.core.buffer.spillable_buffer import SpillableBufferOwner from cudf.options import get_option from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate from cudf.utils.string import format_bytes +if TYPE_CHECKING: + from cudf.core.buffer.spillable_buffer import SpillableBufferOwner + _spill_cudf_nvtx_annotate = partial( _cudf_nvtx_annotate, domain="cudf_python-spill" ) diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index de20b2ace1d..97c2ce5cf1f 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -3,21 +3,17 @@ from __future__ import annotations import warnings -from collections import abc from functools import cached_property from typing import TYPE_CHECKING, Any, Mapping, Optional, Sequence, Tuple, cast import numpy as np import pandas as pd import pyarrow as pa -from numba import cuda from typing_extensions import Self import cudf from cudf 
import _lib as libcudf from cudf._lib.transform import bools_to_mask -from cudf._typing import ColumnBinaryOperand, ColumnLike, Dtype, ScalarLike -from cudf.core.buffer import Buffer from cudf.core.column import column from cudf.core.column.methods import ColumnMethods from cudf.core.dtypes import CategoricalDtype, IntervalDtype @@ -29,7 +25,19 @@ ) if TYPE_CHECKING: - from cudf._typing import SeriesOrIndex, SeriesOrSingleColumnIndex + from collections import abc + + import numba.cuda + + from cudf._typing import ( + ColumnBinaryOperand, + ColumnLike, + Dtype, + ScalarLike, + SeriesOrIndex, + SeriesOrSingleColumnIndex, + ) + from cudf.core.buffer import Buffer from cudf.core.column import ( ColumnBase, DatetimeColumn, @@ -868,7 +876,7 @@ def clip(self, lo: ScalarLike, hi: ScalarLike) -> "column.ColumnBase": def data_array_view( self, *, mode="write" - ) -> cuda.devicearray.DeviceNDArray: + ) -> numba.cuda.devicearray.DeviceNDArray: return self.codes.data_array_view(mode=mode) def unique(self) -> CategoricalColumn: diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 75fc31ddbce..dc937dc0469 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -2,13 +2,13 @@ from __future__ import annotations -import builtins import pickle from collections import abc from functools import cached_property from itertools import chain from types import SimpleNamespace from typing import ( + TYPE_CHECKING, Any, Dict, List, @@ -49,7 +49,6 @@ ) from cudf._lib.transform import bools_to_mask from cudf._lib.types import size_type_dtype -from cudf._typing import ColumnLike, Dtype, ScalarLike from cudf.api.types import ( _is_non_decimal_numeric_dtype, _is_pandas_nullable_extension_dtype, @@ -89,6 +88,11 @@ ) from cudf.utils.utils import _array_ufunc, mask_dtype +if TYPE_CHECKING: + import builtins + + from cudf._typing import ColumnLike, Dtype, ScalarLike + if PANDAS_GE_210: NumpyExtensionArray = 
pd.arrays.NumpyExtensionArray else: diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index 057169aa7e1..e24d85bfedf 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -19,22 +19,22 @@ from cudf import _lib as libcudf from cudf._lib.labeling import label_bins from cudf._lib.search import search_sorted -from cudf._typing import ( - ColumnBinaryOperand, - DatetimeLikeScalar, - Dtype, - DtypeObj, - ScalarLike, -) from cudf.api.types import is_datetime64_dtype, is_scalar, is_timedelta64_dtype from cudf.core._compat import PANDAS_GE_220 -from cudf.core.buffer import Buffer from cudf.core.column import ColumnBase, as_column, column, string from cudf.core.column.timedelta import _unit_to_nanoseconds_conversion from cudf.utils.dtypes import _get_base_dtype from cudf.utils.utils import _all_bools_with_nulls if TYPE_CHECKING: + from cudf._typing import ( + ColumnBinaryOperand, + DatetimeLikeScalar, + Dtype, + DtypeObj, + ScalarLike, + ) + from cudf.core.buffer import Buffer from cudf.core.column.numerical import NumericalColumn if PANDAS_GE_220: diff --git a/python/cudf/cudf/core/column/decimal.py b/python/cudf/cudf/core/column/decimal.py index 3a0f6649e21..9c1bedc9926 100644 --- a/python/cudf/cudf/core/column/decimal.py +++ b/python/cudf/cudf/core/column/decimal.py @@ -4,7 +4,7 @@ import warnings from decimal import Decimal -from typing import Any, Optional, Sequence, Union, cast +from typing import TYPE_CHECKING, Any, Optional, Sequence, Union, cast import cupy as cp import numpy as np @@ -16,7 +16,6 @@ from cudf._lib.strings.convert.convert_fixed_point import ( from_decimal as cpp_from_decimal, ) -from cudf._typing import ColumnBinaryOperand, Dtype from cudf.api.types import is_integer_dtype, is_scalar from cudf.core.buffer import as_buffer from cudf.core.column import ColumnBase @@ -31,6 +30,9 @@ from .numerical_base import NumericalBaseColumn +if TYPE_CHECKING: + from 
cudf._typing import ColumnBinaryOperand, Dtype + class DecimalBaseColumn(NumericalBaseColumn): """Base column for decimal32, decimal64 or decimal128 columns""" diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index 8f8ee46c796..080ba949d62 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -3,7 +3,7 @@ from __future__ import annotations from functools import cached_property -from typing import List, Optional, Sequence, Tuple, Union +from typing import TYPE_CHECKING, List, Optional, Sequence, Tuple, Union import numpy as np import pandas as pd @@ -26,13 +26,15 @@ ) from cudf._lib.strings.convert.convert_lists import format_list_column from cudf._lib.types import size_type_dtype -from cudf._typing import ColumnBinaryOperand, ColumnLike, Dtype, ScalarLike from cudf.api.types import _is_non_decimal_numeric_dtype, is_scalar from cudf.core.column import ColumnBase, as_column, column from cudf.core.column.methods import ColumnMethods, ParentType from cudf.core.dtypes import ListDtype from cudf.core.missing import NA +if TYPE_CHECKING: + from cudf._typing import ColumnBinaryOperand, ColumnLike, Dtype, ScalarLike + class ListColumn(ColumnBase): dtype: ListDtype diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index 1952d7eeb71..6af67e02bb4 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -3,7 +3,16 @@ from __future__ import annotations import functools -from typing import Any, Callable, Optional, Sequence, Tuple, Union, cast +from typing import ( + TYPE_CHECKING, + Any, + Callable, + Optional, + Sequence, + Tuple, + Union, + cast, +) import cupy as cp import numpy as np @@ -14,13 +23,6 @@ from cudf import _lib as libcudf from cudf._lib import pylibcudf from cudf._lib.types import size_type_dtype -from cudf._typing import ( - ColumnBinaryOperand, - ColumnLike, - Dtype, - DtypeObj, - 
ScalarLike, -) from cudf.api.types import ( is_bool_dtype, is_float_dtype, @@ -28,7 +30,6 @@ is_integer_dtype, is_scalar, ) -from cudf.core.buffer import Buffer from cudf.core.column import ( ColumnBase, as_column, @@ -48,6 +49,16 @@ from .numerical_base import NumericalBaseColumn +if TYPE_CHECKING: + from cudf._typing import ( + ColumnBinaryOperand, + ColumnLike, + Dtype, + DtypeObj, + ScalarLike, + ) + from cudf.core.buffer import Buffer + _unaryop_map = { "ASIN": "ARCSIN", "ACOS": "ARCCOS", diff --git a/python/cudf/cudf/core/column/numerical_base.py b/python/cudf/cudf/core/column/numerical_base.py index d38ec9cf30f..bd48054a951 100644 --- a/python/cudf/cudf/core/column/numerical_base.py +++ b/python/cudf/cudf/core/column/numerical_base.py @@ -3,17 +3,19 @@ from __future__ import annotations -from typing import Optional, cast +from typing import TYPE_CHECKING, Optional, cast import numpy as np import cudf from cudf import _lib as libcudf -from cudf._typing import ScalarLike from cudf.core.column import ColumnBase from cudf.core.missing import NA from cudf.core.mixins import Scannable +if TYPE_CHECKING: + from cudf._typing import ScalarLike + class NumericalBaseColumn(ColumnBase, Scannable): """A column composed of numerical data. 
diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index ad7dbe5e52e..87df2d2f1f1 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -16,11 +16,9 @@ overload, ) -import cupy import numpy as np import pandas as pd import pyarrow as pa -from numba import cuda from typing_extensions import Self import cudf @@ -30,7 +28,6 @@ from cudf._lib.column import Column from cudf._lib.types import size_type_dtype from cudf.api.types import is_integer, is_scalar, is_string_dtype -from cudf.core.buffer import Buffer from cudf.core.column import column, datetime from cudf.core.column.column import ColumnBase from cudf.core.column.methods import ColumnMethods @@ -46,6 +43,9 @@ def str_to_boolean(column: StringColumn): if TYPE_CHECKING: + import cupy + import numba.cuda + from cudf._typing import ( ColumnBinaryOperand, ColumnLike, @@ -53,6 +53,7 @@ def str_to_boolean(column: StringColumn): ScalarLike, SeriesOrIndex, ) + from cudf.core.buffer import Buffer _str_to_numeric_typecast_functions = { @@ -5598,7 +5599,7 @@ def any(self, skipna: bool = True) -> bool: def data_array_view( self, *, mode="write" - ) -> cuda.devicearray.DeviceNDArray: + ) -> numba.cuda.devicearray.DeviceNDArray: raise ValueError("Cannot get an array view of a StringColumn") @property diff --git a/python/cudf/cudf/core/column/struct.py b/python/cudf/cudf/core/column/struct.py index 6dd35570b95..c2ce787eeae 100644 --- a/python/cudf/cudf/core/column/struct.py +++ b/python/cudf/cudf/core/column/struct.py @@ -2,17 +2,20 @@ from __future__ import annotations from functools import cached_property +from typing import TYPE_CHECKING import pandas as pd import pyarrow as pa import cudf -from cudf._typing import Dtype from cudf.core.column import ColumnBase from cudf.core.column.methods import ColumnMethods from cudf.core.dtypes import StructDtype from cudf.core.missing import NA +if TYPE_CHECKING: + from cudf._typing import Dtype + 
class StructColumn(ColumnBase): """ diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index c6af052b56f..0af847f38af 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -4,7 +4,7 @@ import datetime import functools -from typing import Any, Optional, Sequence, cast +from typing import TYPE_CHECKING, Any, Optional, Sequence, cast import numpy as np import pandas as pd @@ -13,13 +13,15 @@ import cudf from cudf import _lib as libcudf -from cudf._typing import ColumnBinaryOperand, DatetimeLikeScalar, Dtype from cudf.api.types import is_scalar, is_timedelta64_dtype from cudf.core.buffer import Buffer, acquire_spill_lock from cudf.core.column import ColumnBase, column, string from cudf.utils.dtypes import np_to_pa_dtype from cudf.utils.utils import _all_bools_with_nulls +if TYPE_CHECKING: + from cudf._typing import ColumnBinaryOperand, DatetimeLikeScalar, Dtype + _unit_to_nanoseconds_conversion = { "ns": 1, "us": 1_000, diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 7438b0237d5..70820fa8e00 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -15,6 +15,7 @@ from collections import abc, defaultdict from collections.abc import Iterator from typing import ( + TYPE_CHECKING, Any, Callable, Dict, @@ -41,7 +42,6 @@ import cudf import cudf.core.common from cudf import _lib as libcudf -from cudf._typing import ColumnLike, Dtype, NotImplementedType from cudf.api.extensions import no_default from cudf.api.types import ( _is_scalar_or_zero_d_array, @@ -99,6 +99,9 @@ from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate from cudf.utils.utils import GetAttrGetItemMixin, _external_only_api +if TYPE_CHECKING: + from cudf._typing import ColumnLike, Dtype, NotImplementedType + _cupy_nan_methods_map = { "min": "nanmin", "max": "nanmax", diff --git a/python/cudf/cudf/core/dtypes.py 
b/python/cudf/cudf/core/dtypes.py index 4729233ee6e..b1282040e60 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -6,7 +6,7 @@ import textwrap import warnings from functools import cached_property -from typing import Any, Callable, Dict, List, Tuple, Type, Union +from typing import TYPE_CHECKING, Any, Callable, Dict, List, Tuple, Type, Union import numpy as np import pandas as pd @@ -19,9 +19,11 @@ from cudf._typing import Dtype from cudf.core._compat import PANDAS_LT_300 from cudf.core.abc import Serializable -from cudf.core.buffer import Buffer from cudf.utils.docutils import doc_apply +if TYPE_CHECKING: + from cudf.core.buffer import Buffer + def dtype(arbitrary): """ diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 01b56f1edc4..ffaa90ef915 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -6,10 +6,10 @@ import itertools import operator import pickle -import types import warnings from collections import abc from typing import ( + TYPE_CHECKING, Any, Callable, Dict, @@ -31,7 +31,6 @@ import cudf from cudf import _lib as libcudf -from cudf._typing import Dtype from cudf.api.types import is_dtype_equal, is_scalar from cudf.core.buffer import acquire_spill_lock from cudf.core.column import ( @@ -48,6 +47,11 @@ from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate from cudf.utils.utils import _array_ufunc, _warn_no_dask_cudf +if TYPE_CHECKING: + from types import ModuleType + + from cudf._typing import Dtype + # TODO: It looks like Frame is missing a declaration of `copy`, need to add class Frame(BinaryOperand, Scannable): @@ -410,7 +414,7 @@ def __arrow_array__(self, type=None): def _to_array( self, get_array: Callable, - module: types.ModuleType, + module: ModuleType, copy: bool, dtype: Union[Dtype, None] = None, na_value=None, diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 732e5cdb01a..655f7607b37 100644 --- 
a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -5,10 +5,10 @@ import operator import pickle import warnings -from collections.abc import Generator from functools import cache, cached_property from numbers import Number from typing import ( + TYPE_CHECKING, Any, List, Literal, @@ -71,6 +71,9 @@ from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate from cudf.utils.utils import _warn_no_dask_cudf, search_range +if TYPE_CHECKING: + from collections.abc import Generator + class IndexMeta(type): """Custom metaclass for Index that overrides instance/subclass tests.""" diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index fdc78005996..75614fa46c7 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -9,6 +9,7 @@ import warnings from collections import Counter, abc from typing import ( + TYPE_CHECKING, Any, Callable, Dict, @@ -31,12 +32,6 @@ import cudf import cudf._lib as libcudf -from cudf._typing import ( - ColumnLike, - DataFrameOrSeries, - Dtype, - NotImplementedType, -) from cudf.api.extensions import no_default from cudf.api.types import ( _is_non_decimal_numeric_dtype, @@ -70,6 +65,14 @@ from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate from cudf.utils.utils import _warn_no_dask_cudf +if TYPE_CHECKING: + from cudf._typing import ( + ColumnLike, + DataFrameOrSeries, + Dtype, + NotImplementedType, + ) + doc_reset_index_template = """ Reset the index of the {klass}, or a level of it. 
diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 6d3520e33cf..865d9660b1d 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -8,10 +8,9 @@ import pickle import warnings from collections import abc -from collections.abc import Generator from functools import cached_property from numbers import Integral -from typing import Any, List, MutableMapping, Tuple, Union +from typing import TYPE_CHECKING, Any, List, MutableMapping, Tuple, Union import cupy as cp import numpy as np @@ -20,7 +19,6 @@ import cudf import cudf._lib as libcudf from cudf._lib.types import size_type_dtype -from cudf._typing import DataFrameOrSeries from cudf.api.extensions import no_default from cudf.api.types import is_integer, is_list_like, is_object_dtype from cudf.core import column @@ -36,6 +34,11 @@ from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate from cudf.utils.utils import NotIterable, _external_only_api, _is_same_name +if TYPE_CHECKING: + from collections.abc import Generator + + from cudf._typing import DataFrameOrSeries + def _maybe_indices_to_slice(indices: cp.ndarray) -> Union[slice, cp.ndarray]: """Makes best effort to convert an array of indices into a python slice. 
diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index a52b583d3b4..1b1e82333cf 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -10,6 +10,7 @@ from collections import abc from shutil import get_terminal_size from typing import ( + TYPE_CHECKING, Any, Dict, Literal, @@ -27,12 +28,6 @@ import cudf from cudf import _lib as libcudf -from cudf._typing import ( - ColumnLike, - DataFrameOrSeries, - NotImplementedType, - ScalarLike, -) from cudf.api.extensions import no_default from cudf.api.types import ( _is_non_decimal_numeric_dtype, @@ -85,6 +80,14 @@ ) from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate +if TYPE_CHECKING: + from cudf._typing import ( + ColumnLike, + DataFrameOrSeries, + NotImplementedType, + ScalarLike, + ) + def _format_percentile_names(percentiles): return [f"{int(x * 100)}%" for x in percentiles] diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py index acc74129a29..6fd4e857e02 100644 --- a/python/cudf/cudf/core/single_column_frame.py +++ b/python/cudf/cudf/core/single_column_frame.py @@ -3,15 +3,11 @@ from __future__ import annotations -from typing import Any, Dict, Optional, Tuple, Union +from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple, Union -import cupy -import numpy -import pyarrow as pa from typing_extensions import Self import cudf -from cudf._typing import NotImplementedType, ScalarLike from cudf.api.extensions import no_default from cudf.api.types import ( _is_scalar_or_zero_d_array, @@ -25,6 +21,13 @@ from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate from cudf.utils.utils import NotIterable +if TYPE_CHECKING: + import cupy + import numpy + import pyarrow as pa + + from cudf._typing import NotImplementedType, ScalarLike + class SingleColumnFrame(Frame, NotIterable): """A one-dimensional frame. 
From 9225633e83ca09592c5a144c523f46e95c6e9d75 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 14 Jun 2024 07:13:00 -1000 Subject: [PATCH 17/25] Avoid redefining Frame._get_columns_by_label in subclasses (#15912) `Frame._get_columns_by_label` was redefined in `Series` and `DataFrame` to handle some special edge cases in `DataFrame.__getitem__` and empty `Series` By making `_from_data_like_self` more consistent in preserving external properties and moving special casing, we can only define `Frame._get_columns_by_label` once Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Charles Blackmon-Luca (https://github.com/charlesbluca) URL: https://github.com/rapidsai/cudf/pull/15912 --- python/cudf/cudf/core/dataframe.py | 36 +++++++------------------- python/cudf/cudf/core/frame.py | 28 +++++++++++--------- python/cudf/cudf/core/indexed_frame.py | 4 +-- python/cudf/cudf/core/series.py | 20 +++++--------- 4 files changed, 34 insertions(+), 54 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 70820fa8e00..80260c7699b 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -1348,7 +1348,16 @@ def __getitem__(self, arg): 8 8 8 8 """ if _is_scalar_or_zero_d_array(arg) or isinstance(arg, tuple): - return self._get_columns_by_label(arg, downcast=True) + out = self._get_columns_by_label(arg) + if is_scalar(arg): + nlevels = 1 + elif isinstance(arg, tuple): + nlevels = len(arg) + if self._data.multiindex is False or nlevels == self._data.nlevels: + out = self._constructor_sliced._from_data(out._data) + out.index = self.index + out.name = arg + return out elif isinstance(arg, slice): return self._slice(arg) @@ -1993,31 +2002,6 @@ def _repr_html_(self): def _repr_latex_(self): return self._get_renderable_dataframe().to_pandas()._repr_latex_() - @_cudf_nvtx_annotate - def _get_columns_by_label( - self, labels, *, 
downcast=False - ) -> Self | Series: - """ - Return columns of dataframe by `labels` - - If downcast is True, try and downcast from a DataFrame to a Series - """ - ca = self._data.select_by_label(labels) - if downcast: - if is_scalar(labels): - nlevels = 1 - elif isinstance(labels, tuple): - nlevels = len(labels) - if self._data.multiindex is False or nlevels == self._data.nlevels: - out = self._constructor_sliced._from_data( - ca, index=self.index, name=labels - ) - return out - out = self.__class__._from_data( - ca, index=self.index, columns=ca.to_pandas_index() - ) - return out - def _make_operands_and_index_for_binop( self, other: Any, diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index ffaa90ef915..ee310cfcb58 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -136,12 +136,19 @@ def deserialize(cls, header, frames): @classmethod @_cudf_nvtx_annotate def _from_data(cls, data: MutableMapping) -> Self: + """ + Construct cls from a ColumnAccessor-like mapping. + """ obj = cls.__new__(cls) Frame.__init__(obj, data) return obj @_cudf_nvtx_annotate def _from_data_like_self(self, data: MutableMapping) -> Self: + """ + Return type(self) from a ColumnAccessor-like mapping but + with the external properties, e.g. .index, .name, of self. + """ return self._from_data(data) @_cudf_nvtx_annotate @@ -355,12 +362,13 @@ def equals(self, other) -> bool: ) @_cudf_nvtx_annotate - def _get_columns_by_label(self, labels, *, downcast=False) -> Self: + def _get_columns_by_label(self, labels) -> Self: """ - Returns columns of the Frame specified by `labels` + Returns columns of the Frame specified by `labels`. 
+ Akin to cudf.DataFrame(...).loc[:, labels] """ - return self.__class__._from_data(self._data.select_by_label(labels)) + return self._from_data_like_self(self._data.select_by_label(labels)) @property @_cudf_nvtx_annotate @@ -1438,14 +1446,10 @@ def _get_sorted_inds( Get the indices required to sort self according to the columns specified in by. """ - - to_sort = [ - *( - self - if by is None - else self._get_columns_by_label(list(by), downcast=False) - )._columns - ] + if by is None: + to_sort = self._columns + else: + to_sort = self._get_columns_by_label(list(by))._columns if is_scalar(ascending): ascending_lst = [ascending] * len(to_sort) @@ -1453,7 +1457,7 @@ def _get_sorted_inds( ascending_lst = list(ascending) return libcudf.sort.order_by( - to_sort, + list(to_sort), ascending_lst, na_position, stable=True, diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 75614fa46c7..3a4f4874e35 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -309,8 +309,8 @@ def _from_data( @_cudf_nvtx_annotate def _from_data_like_self(self, data: MutableMapping): - out = self._from_data(data, self.index) - out._data._level_names = self._data._level_names + out = super()._from_data_like_self(data) + out.index = self.index return out @_cudf_nvtx_annotate diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 1b1e82333cf..ebf6910ca5f 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -685,6 +685,12 @@ def _from_data( out.name = name return out + @_cudf_nvtx_annotate + def _from_data_like_self(self, data: MutableMapping): + out = super()._from_data_like_self(data) + out.name = self.name + return out + @_cudf_nvtx_annotate def __contains__(self, item): return item in self.index @@ -859,20 +865,6 @@ def deserialize(cls, header, frames): return obj - def _get_columns_by_label(self, labels, *, downcast=False) -> Self: - """Return the 
column specified by `labels` - - For cudf.Series, either the column, or an empty series is returned. - Parameter `downcast` does not have effects. - """ - ca = self._data.select_by_label(labels) - - return ( - self.__class__._from_data(data=ca, index=self.index) - if len(ca) > 0 - else self.__class__(dtype=self.dtype, name=self.name) - ) - @_cudf_nvtx_annotate def drop( self, From 9dc5e8c2836fa2e54831d25b7f051e031bf553b9 Mon Sep 17 00:00:00 2001 From: Ben Jarmak <104460670+jarmak-nv@users.noreply.github.com> Date: Fri, 14 Jun 2024 13:31:29 -0400 Subject: [PATCH 18/25] Project automation update: skip if not in project (#16035) This PR adds another condition to when we should run the automation work. PRs aren't always in the cuDF Python project so when this is the case we should skip the job rather than attempting to run it and have it throw an error. Authors: - Ben Jarmak (https://github.com/jarmak-nv) Approvers: - James Lamb (https://github.com/jameslamb) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/16035 --- .github/workflows/pr_issue_status_automation.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/pr_issue_status_automation.yml b/.github/workflows/pr_issue_status_automation.yml index 837963c3286..8ca971dc28d 100644 --- a/.github/workflows/pr_issue_status_automation.yml +++ b/.github/workflows/pr_issue_status_automation.yml @@ -35,7 +35,7 @@ jobs: update-status: # This job sets the PR and its linked issues to "In Progress" status uses: rapidsai/shared-workflows/.github/workflows/project-get-set-single-select-field.yaml@branch-24.08 - if: github.event.pull_request.state == 'open' + if: ${{ github.event.pull_request.state == 'open' && needs.get-project-id.outputs.ITEM_PROJECT_ID != '' }} needs: get-project-id with: PROJECT_ID: "PVT_kwDOAp2shc4AiNzl" @@ -51,7 +51,7 @@ jobs: update-sprint: # This job sets the PR and its linked issues to the current "Weekly Sprint" uses: 
rapidsai/shared-workflows/.github/workflows/project-get-set-iteration-field.yaml@branch-24.08 - if: github.event.pull_request.state == 'open' + if: ${{ github.event.pull_request.state == 'open' && needs.get-project-id.outputs.ITEM_PROJECT_ID != '' }} needs: get-project-id with: PROJECT_ID: "PVT_kwDOAp2shc4AiNzl" From f89cc07b50d3f89e7da8f98afb5fe8f9d9cf33c6 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Fri, 14 Jun 2024 13:22:49 -0500 Subject: [PATCH 19/25] Add `codecov` coverage for `pandas_tests` (#14513) Fixes: #14496 This PR enables code-coverage for `pandas` tests that are run in cudf CI in pandas accelerator mode. Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) - Lawrence Mitchell (https://github.com/wence-) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Bradley Dice (https://github.com/bdice) - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/14513 --- ci/cudf_pandas_scripts/run_tests.sh | 11 ++++++++++- python/cudf/cudf_pandas_tests/test_cudf_pandas.py | 3 +++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/ci/cudf_pandas_scripts/run_tests.sh b/ci/cudf_pandas_scripts/run_tests.sh index 78945d37f22..1c3b99953fb 100755 --- a/ci/cudf_pandas_scripts/run_tests.sh +++ b/ci/cudf_pandas_scripts/run_tests.sh @@ -5,6 +5,10 @@ set -eoxu pipefail +RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR:-"${PWD}/test-results"} +RAPIDS_COVERAGE_DIR=${RAPIDS_COVERAGE_DIR:-"${PWD}/coverage-results"} +mkdir -p "${RAPIDS_TESTS_DIR}" "${RAPIDS_COVERAGE_DIR}" + # Function to display script usage function display_usage { echo "Usage: $0 [--no-cudf]" @@ -36,4 +40,9 @@ else python -m pip install $(ls ./local-cudf-dep/cudf*.whl)[test,cudf-pandas-tests] fi -python -m pytest -p cudf.pandas ./python/cudf/cudf_pandas_tests/ +python -m pytest -p cudf.pandas \ + --cov-config=./python/cudf/.coveragerc \ + --cov=cudf \ + --cov-report=xml:"${RAPIDS_COVERAGE_DIR}/cudf-pandas-coverage.xml" \ + --cov-report=term \ + 
./python/cudf/cudf_pandas_tests/ diff --git a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py index c251e4a197e..5be4d350c0b 100644 --- a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py +++ b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py @@ -464,6 +464,9 @@ def test_options_mode(): assert xpd.options.mode.copy_on_write == pd.options.mode.copy_on_write +# Codecov and Profiler interfere with each-other, +# hence we don't want to run code-cov on this test. +@pytest.mark.no_cover def test_profiler(): pytest.importorskip("cudf") From 2ad502efe5f9c927b5bc0e5a80820b99f6630e1b Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 14 Jun 2024 10:50:41 -1000 Subject: [PATCH 20/25] Fix nunique for `MultiIndex`, `DataFrame`, and all NA case with `dropna=False` (#15962) Fixes 3 bugs with `nunique` * `MultiIndex.nunique` returning a `dict` instead of an `int` * `.nunique(dropna=False)` with all `NA`s returning 0 instead of 1 * `DataFrame.nunique` preserving column class and type in the resulting `Series.index` Authors: - Matthew Roeschke (https://github.com/mroeschke) - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) - David Wendt (https://github.com/davidwendt) - Bradley Dice (https://github.com/bdice) - Shruti Shivakumar (https://github.com/shrshi) URL: https://github.com/rapidsai/cudf/pull/15962 --- cpp/src/stream_compaction/distinct_count.cu | 6 +++++- python/cudf/cudf/core/dataframe.py | 8 +++++--- python/cudf/cudf/core/frame.py | 7 +++---- python/cudf/cudf/core/index.py | 2 +- python/cudf/cudf/core/multiindex.py | 5 +++++ python/cudf/cudf/core/single_column_frame.py | 2 -- python/cudf/cudf/tests/test_dataframe.py | 14 ++++++++++++++ python/cudf/cudf/tests/test_multiindex.py | 11 +++++++++++ python/cudf/cudf/tests/test_series.py | 10 ++++++++++ 9 files changed, 54 insertions(+), 11 deletions(-) diff 
--git a/cpp/src/stream_compaction/distinct_count.cu b/cpp/src/stream_compaction/distinct_count.cu index b7aadbe14fa..99ca89cc021 100644 --- a/cpp/src/stream_compaction/distinct_count.cu +++ b/cpp/src/stream_compaction/distinct_count.cu @@ -187,7 +187,11 @@ cudf::size_type distinct_count(column_view const& input, nan_policy nan_handling, rmm::cuda_stream_view stream) { - if (0 == input.size() or input.null_count() == input.size()) { return 0; } + if (0 == input.size()) { return 0; } + + if (input.null_count() == input.size()) { + return static_cast(null_handling == null_policy::INCLUDE); + } auto count = detail::distinct_count(table_view{{input}}, null_equality::EQUAL, stream); diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 80260c7699b..d8d46a6df73 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -7462,7 +7462,7 @@ def __dataframe__( self, nan_as_null=nan_as_null, allow_copy=allow_copy ) - def nunique(self, axis=0, dropna=True): + def nunique(self, axis=0, dropna: bool = True) -> Series: """ Count number of distinct elements in specified axis. Return Series with number of distinct elements. Can ignore NaN values. @@ -7490,8 +7490,10 @@ def nunique(self, axis=0, dropna=True): """ if axis != 0: raise NotImplementedError("axis parameter is not supported yet.") - - return cudf.Series(super().nunique(dropna=dropna)) + counts = [col.distinct_count(dropna=dropna) for col in self._columns] + return self._constructor_sliced( + counts, index=self._data.to_pandas_index() + ) def _sample_axis_1( self, diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index ee310cfcb58..6a1ef05b1f9 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -1903,10 +1903,9 @@ def nunique(self, dropna: bool = True): dict Name and unique value counts of each column in frame. 
""" - return { - name: col.distinct_count(dropna=dropna) - for name, col in self._data.items() - } + raise NotImplementedError( + f"{type(self).__name__} does not implement nunique" + ) @staticmethod @_cudf_nvtx_annotate diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 655f7607b37..11d09e470ff 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -898,7 +898,7 @@ def __array__(self, dtype=None): ) @_cudf_nvtx_annotate - def nunique(self) -> int: + def nunique(self, dropna: bool = True) -> int: return len(self) @_cudf_nvtx_annotate diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 865d9660b1d..91488e06f4e 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -1749,6 +1749,11 @@ def fillna(self, value): def unique(self): return self.drop_duplicates(keep="first") + @_cudf_nvtx_annotate + def nunique(self, dropna: bool = True) -> int: + mi = self.dropna(how="all") if dropna else self + return len(mi.unique()) + def _clean_nulls_from_index(self): """ Convert all na values(if any) in MultiIndex object diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py index 6fd4e857e02..43b5dc76f13 100644 --- a/python/cudf/cudf/core/single_column_frame.py +++ b/python/cudf/cudf/core/single_column_frame.py @@ -338,8 +338,6 @@ def nunique(self, dropna: bool = True) -> int: int Number of unique values in the column. 
""" - if self._column.null_count == len(self): - return 0 return self._column.distinct_count(dropna=dropna) def _get_elements_from_column(self, arg) -> Union[ScalarLike, ColumnBase]: diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 98e9f9881c7..649821b9b7c 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -9966,6 +9966,20 @@ def test_dataframe_nunique(data): assert_eq(expected, actual) +@pytest.mark.parametrize( + "columns", + [ + pd.RangeIndex(2, name="foo"), + pd.MultiIndex.from_arrays([[1, 2], [2, 3]], names=["foo", 1]), + pd.Index([3, 5], dtype=np.int8, name="foo"), + ], +) +def test_nunique_preserve_column_in_index(columns): + df = cudf.DataFrame([[1, 2]], columns=columns) + result = df.nunique().index.to_pandas() + assert_eq(result, columns, exact=True) + + @pytest.mark.parametrize( "data", [{"key": [0, 1, 1, 0, 0, 1], "val": [1, 8, 3, 9, -3, 8]}], diff --git a/python/cudf/cudf/tests/test_multiindex.py b/python/cudf/cudf/tests/test_multiindex.py index f143112a45f..7b95e4f9a44 100644 --- a/python/cudf/cudf/tests/test_multiindex.py +++ b/python/cudf/cudf/tests/test_multiindex.py @@ -2162,3 +2162,14 @@ def test_multi_index_contains_hashable(): lfunc_args_and_kwargs=((),), rfunc_args_and_kwargs=((),), ) + + +@pytest.mark.parametrize("array", [[1, 2], [1, None], [None, None]]) +@pytest.mark.parametrize("dropna", [True, False]) +def test_nunique(array, dropna): + arrays = [array, [3, 4]] + gidx = cudf.MultiIndex.from_arrays(arrays) + pidx = pd.MultiIndex.from_arrays(arrays) + result = gidx.nunique(dropna=dropna) + expected = pidx.nunique(dropna=dropna) + assert result == expected diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index 30189e1ac8a..52956c230ba 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -2851,3 +2851,13 @@ def 
test_nans_to_nulls_noop_copies_column(value): ser1 = cudf.Series([value]) ser2 = ser1.nans_to_nulls() assert ser1._column is not ser2._column + + +@pytest.mark.parametrize("dropna", [False, True]) +def test_nunique_all_null(dropna): + data = [None, None] + pd_ser = pd.Series(data) + cudf_ser = cudf.Series(data) + result = pd_ser.nunique(dropna=dropna) + expected = cudf_ser.nunique(dropna=dropna) + assert result == expected From 74b382637e69d39df292c59938b5911d9ca3bdf9 Mon Sep 17 00:00:00 2001 From: Paul Mattione <156858817+pmattione-nvidia@users.noreply.github.com> Date: Fri, 14 Jun 2024 17:01:35 -0500 Subject: [PATCH 21/25] Fix decimal -> float cast in ast code (#16038) Fix decimal -> float cast in ast code that was missed during the earlier code refactoring for making the cast explicit. This closes [issue 16023](https://github.com/rapidsai/cudf/issues/16023) Authors: - Paul Mattione (https://github.com/pmattione-nvidia) Approvers: - Muhammad Haseeb (https://github.com/mhaseeb123) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/16038 --- cpp/include/cudf/ast/detail/operators.hpp | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/cpp/include/cudf/ast/detail/operators.hpp b/cpp/include/cudf/ast/detail/operators.hpp index b618f33a6e5..c483d459833 100644 --- a/cpp/include/cudf/ast/detail/operators.hpp +++ b/cpp/include/cudf/ast/detail/operators.hpp @@ -17,6 +17,7 @@ #include #include +#include #include #include @@ -819,7 +820,17 @@ struct operator_functor { template struct cast { static constexpr auto arity{1}; - template + template ()>* = nullptr> + __device__ inline auto operator()(From f) -> To + { + if constexpr (cuda::std::is_floating_point_v) { + return convert_fixed_to_floating(f); + } else { + return static_cast(f); + } + } + + template ()>* = nullptr> __device__ inline auto operator()(From f) -> decltype(static_cast(f)) { return static_cast(f); From e9ebdea49d24f645a6ca5ff6d79e0525a114f5fc Mon 
Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Mon, 17 Jun 2024 12:29:54 +0100 Subject: [PATCH 22/25] Delete unused code from stringfunction evaluator (#16032) When introducing the handling of regex contains, we replicated the handlers for some other supported string functions. This means we can delete some code. Additionally, migrate the contains tests to live with the other string function tests, and add coverage of exceptional cases. Authors: - Lawrence Mitchell (https://github.com/wence-) Approvers: - https://github.com/brandon-b-miller URL: https://github.com/rapidsai/cudf/pull/16032 --- python/cudf_polars/cudf_polars/dsl/expr.py | 36 ++----- python/cudf_polars/tests/conftest.py | 10 ++ .../cudf_polars/tests/expressions/test_agg.py | 5 - .../tests/expressions/test_distinct.py | 9 +- .../tests/expressions/test_numeric_binops.py | 5 - .../tests/expressions/test_stringfunction.py | 97 ++++++++++++++++--- python/cudf_polars/tests/test_string.py | 61 ------------ 7 files changed, 102 insertions(+), 121 deletions(-) create mode 100644 python/cudf_polars/tests/conftest.py delete mode 100644 python/cudf_polars/tests/test_string.py diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py index 03c1db68dbd..0605bba6642 100644 --- a/python/cudf_polars/cudf_polars/dsl/expr.py +++ b/python/cudf_polars/cudf_polars/dsl/expr.py @@ -688,13 +688,12 @@ def do_evaluate( else pat.obj ) return Column(plc.strings.find.contains(column.obj, pattern)) - else: - assert isinstance(arg, Literal) - prog = plc.strings.regex_program.RegexProgram.create( - arg.value.as_py(), - flags=plc.strings.regex_flags.RegexFlags.DEFAULT, - ) - return Column(plc.strings.contains.contains_re(column.obj, prog)) + assert isinstance(arg, Literal) + prog = plc.strings.regex_program.RegexProgram.create( + arg.value.as_py(), + flags=plc.strings.regex_flags.RegexFlags.DEFAULT, + ) + return Column(plc.strings.contains.contains_re(column.obj, prog)) columns = [ 
child.evaluate(df, context=context, mapping=mapping) for child in self.children @@ -725,26 +724,9 @@ def do_evaluate( else prefix.obj, ) ) - else: - columns = [ - child.evaluate(df, context=context, mapping=mapping) - for child in self.children - ] - if self.name == pl_expr.StringFunction.Lowercase: - (column,) = columns - return Column(plc.strings.case.to_lower(column.obj)) - elif self.name == pl_expr.StringFunction.Uppercase: - (column,) = columns - return Column(plc.strings.case.to_upper(column.obj)) - elif self.name == pl_expr.StringFunction.EndsWith: - column, suffix = columns - return Column(plc.strings.find.ends_with(column.obj, suffix.obj)) - elif self.name == pl_expr.StringFunction.StartsWith: - column, suffix = columns - return Column(plc.strings.find.starts_with(column.obj, suffix.obj)) - raise NotImplementedError( - f"StringFunction {self.name}" - ) # pragma: no cover; handled by init raising + raise NotImplementedError( + f"StringFunction {self.name}" + ) # pragma: no cover; handled by init raising class Sort(Expr): diff --git a/python/cudf_polars/tests/conftest.py b/python/cudf_polars/tests/conftest.py new file mode 100644 index 00000000000..9bbce6bc080 --- /dev/null +++ b/python/cudf_polars/tests/conftest.py @@ -0,0 +1,10 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + +import pytest + + +@pytest.fixture(params=[False, True], ids=["no_nulls", "nulls"], scope="session") +def with_nulls(request): + return request.param diff --git a/python/cudf_polars/tests/expressions/test_agg.py b/python/cudf_polars/tests/expressions/test_agg.py index 79018c80bf3..b044bbb2885 100644 --- a/python/cudf_polars/tests/expressions/test_agg.py +++ b/python/cudf_polars/tests/expressions/test_agg.py @@ -20,11 +20,6 @@ def dtype(request): return request.param -@pytest.fixture(params=[False, True], ids=["no-nulls", "with-nulls"]) -def with_nulls(request): - return request.param - - @pytest.fixture( params=[ False, diff --git a/python/cudf_polars/tests/expressions/test_distinct.py b/python/cudf_polars/tests/expressions/test_distinct.py index 22865a7ce22..143dd7e9f0f 100644 --- a/python/cudf_polars/tests/expressions/test_distinct.py +++ b/python/cudf_polars/tests/expressions/test_distinct.py @@ -9,11 +9,6 @@ from cudf_polars.testing.asserts import assert_gpu_result_equal -@pytest.fixture(params=[False, True], ids=["no-nulls", "nulls"]) -def nullable(request): - return request.param - - @pytest.fixture( params=["is_first_distinct", "is_last_distinct", "is_unique", "is_duplicated"] ) @@ -22,9 +17,9 @@ def op(request): @pytest.fixture -def df(nullable): +def df(with_nulls): values: list[int | None] = [1, 2, 3, 1, 1, 7, 3, 2, 7, 8, 1] - if nullable: + if with_nulls: values[1] = None values[4] = None return pl.LazyFrame({"a": values}) diff --git a/python/cudf_polars/tests/expressions/test_numeric_binops.py b/python/cudf_polars/tests/expressions/test_numeric_binops.py index 548aebf0875..7eefc59d927 100644 --- a/python/cudf_polars/tests/expressions/test_numeric_binops.py +++ b/python/cudf_polars/tests/expressions/test_numeric_binops.py @@ -29,11 +29,6 @@ def rtype(request): return request.param -@pytest.fixture(params=[False, True], ids=["no_nulls", "nulls"]) -def with_nulls(request): - 
return request.param - - @pytest.fixture( params=[ pl.Expr.eq, diff --git a/python/cudf_polars/tests/expressions/test_stringfunction.py b/python/cudf_polars/tests/expressions/test_stringfunction.py index 198f35d376b..3c498fe7286 100644 --- a/python/cudf_polars/tests/expressions/test_stringfunction.py +++ b/python/cudf_polars/tests/expressions/test_stringfunction.py @@ -2,22 +2,39 @@ # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations +from functools import partial + import pytest import polars as pl -from cudf_polars import translate_ir +from cudf_polars import execute_with_cudf, translate_ir from cudf_polars.testing.asserts import assert_gpu_result_equal -def test_supported_stringfunction_expression(): - ldf = pl.LazyFrame( - { - "a": ["a", "b", "cdefg", "h", "Wıth ünιcοde"], # noqa: RUF001 - "b": [0, 3, 1, -1, None], - } - ) +@pytest.fixture +def ldf(with_nulls): + a = [ + "AbC", + "de", + "FGHI", + "j", + "kLm", + "nOPq", + "", + "RsT", + "sada", + "uVw", + "h", + "Wıth ünιcοde", # noqa: RUF001 + ] + if with_nulls: + a[4] = None + a[-3] = None + return pl.LazyFrame({"a": a, "b": range(len(a))}) + +def test_supported_stringfunction_expression(ldf): query = ldf.select( pl.col("a").str.starts_with("Z"), pl.col("a").str.ends_with("h").alias("endswith_h"), @@ -27,15 +44,63 @@ def test_supported_stringfunction_expression(): assert_gpu_result_equal(query) -def test_unsupported_stringfunction(): - ldf = pl.LazyFrame( - { - "a": ["a", "b", "cdefg", "h", "Wıth ünιcοde"], # noqa: RUF001 - "b": [0, 3, 1, -1, None], - } - ) - +def test_unsupported_stringfunction(ldf): q = ldf.select(pl.col("a").str.count_matches("e", literal=True)) with pytest.raises(NotImplementedError): _ = translate_ir(q._ldf.visit()) + + +def test_contains_re_non_strict_raises(ldf): + q = ldf.select(pl.col("a").str.contains(".", strict=False)) + + with pytest.raises(NotImplementedError): + _ = translate_ir(q._ldf.visit()) + + +def test_contains_re_non_literal_raises(ldf): + q = 
ldf.select(pl.col("a").str.contains(pl.col("b"), literal=False)) + + with pytest.raises(NotImplementedError): + _ = translate_ir(q._ldf.visit()) + + +@pytest.mark.parametrize( + "substr", + [ + "A", + "de", + ".*", + "^a", + "^A", + "[^a-z]", + "[a-z]{3,}", + "^[A-Z]{2,}", + "j|u", + ], +) +def test_contains_regex(ldf, substr): + query = ldf.select(pl.col("a").str.contains(substr)) + assert_gpu_result_equal(query) + + +@pytest.mark.parametrize( + "literal", ["A", "de", "FGHI", "j", "kLm", "nOPq", "RsT", "uVw"] +) +def test_contains_literal(ldf, literal): + query = ldf.select(pl.col("a").str.contains(pl.lit(literal), literal=True)) + assert_gpu_result_equal(query) + + +def test_contains_column(ldf): + query = ldf.select(pl.col("a").str.contains(pl.col("a"), literal=True)) + assert_gpu_result_equal(query) + + +def test_contains_invalid(ldf): + query = ldf.select(pl.col("a").str.contains("[")) + + with pytest.raises(pl.exceptions.ComputeError): + query.collect() + with pytest.raises(pl.exceptions.ComputeError): + query.collect(post_opt_callback=partial(execute_with_cudf, raise_on_fail=True)) diff --git a/python/cudf_polars/tests/test_string.py b/python/cudf_polars/tests/test_string.py deleted file mode 100644 index f1a080d040f..00000000000 --- a/python/cudf_polars/tests/test_string.py +++ /dev/null @@ -1,61 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
-# SPDX-License-Identifier: Apache-2.0 -from __future__ import annotations - -from functools import partial - -import pytest - -import polars as pl - -from cudf_polars.callback import execute_with_cudf -from cudf_polars.testing.asserts import assert_gpu_result_equal - - -@pytest.fixture -def ldf(): - return pl.DataFrame( - {"a": ["AbC", "de", "FGHI", "j", "kLm", "nOPq", None, "RsT", None, "uVw"]} - ).lazy() - - -@pytest.mark.parametrize( - "substr", - [ - "A", - "de", - ".*", - "^a", - "^A", - "[^a-z]", - "[a-z]{3,}", - "^[A-Z]{2,}", - "j|u", - ], -) -def test_contains_regex(ldf, substr): - query = ldf.select(pl.col("a").str.contains(substr)) - assert_gpu_result_equal(query) - - -@pytest.mark.parametrize( - "literal", ["A", "de", "FGHI", "j", "kLm", "nOPq", "RsT", "uVw"] -) -def test_contains_literal(ldf, literal): - query = ldf.select(pl.col("a").str.contains(pl.lit(literal), literal=True)) - assert_gpu_result_equal(query) - - -def test_contains_column(ldf): - query = ldf.select(pl.col("a").str.contains(pl.col("a"), literal=True)) - assert_gpu_result_equal(query) - - -@pytest.mark.parametrize("pat", ["["]) -def test_contains_invalid(ldf, pat): - query = ldf.select(pl.col("a").str.contains(pat)) - - with pytest.raises(pl.exceptions.ComputeError): - query.collect() - with pytest.raises(pl.exceptions.ComputeError): - query.collect(post_opt_callback=partial(execute_with_cudf, raise_on_fail=True)) From a023d5fd189b52996c00a4b3132171bb3f41a02d Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Mon, 17 Jun 2024 09:31:01 -0500 Subject: [PATCH 23/25] Return `FrozenList` for `Index.names` (#16047) Fixes: #16046 This PR returns `FrozenList` for `Index.names` instead of `tuple`. 
Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Bradley Dice (https://github.com/bdice) - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/16047 --- python/cudf/cudf/core/_base_index.py | 4 ++-- python/dask_cudf/dask_cudf/io/parquet.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index b29fc475b29..e5945f8860e 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -342,9 +342,9 @@ def deserialize(cls, header, frames): @property def names(self): """ - Returns a tuple containing the name of the Index. + Returns a FrozenList containing the name of the Index. """ - return (self.name,) + return pd.core.indexes.frozen.FrozenList([self.name]) @names.setter def names(self, values): diff --git a/python/dask_cudf/dask_cudf/io/parquet.py b/python/dask_cudf/dask_cudf/io/parquet.py index ba8b1e89721..810a804e428 100644 --- a/python/dask_cudf/dask_cudf/io/parquet.py +++ b/python/dask_cudf/dask_cudf/io/parquet.py @@ -316,7 +316,7 @@ def read_partition( if index and (index[0] in df.columns): df = df.set_index(index[0]) - elif index is False and df.index.names != (None,): + elif index is False and df.index.names != [None]: # If index=False, we shouldn't have a named index df.reset_index(inplace=True) From 107753ccaacdb62287c4dd4351e5caf3bf8bc62a Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Mon, 17 Jun 2024 15:43:13 +0100 Subject: [PATCH 24/25] Remove mapfunction nodes that don't exist/aren't supported (#15991) We can't correctly implemented merge_sorted to match polars because libcudf's implementation is not stable wrt input order. drop_nulls is no longer implemented as a MapFunction, but instead a boolean filter. Finally, add coverage of the mapfunctions we do handle. 
Authors: - Lawrence Mitchell (https://github.com/wence-) Approvers: - Thomas Li (https://github.com/lithomas1) URL: https://github.com/rapidsai/cudf/pull/15991 --- python/cudf_polars/cudf_polars/dsl/ir.py | 56 ++++++-------------- python/cudf_polars/tests/test_mapfunction.py | 43 +++++++++++++++ 2 files changed, 58 insertions(+), 41 deletions(-) create mode 100644 python/cudf_polars/tests/test_mapfunction.py diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index 9fb2468e4e9..7f0920e1b57 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -286,13 +286,18 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: pdf = pl.DataFrame._from_pydf(self.df) if self.projection is not None: pdf = pdf.select(self.projection) - # TODO: goes away when libcudf supports large strings table = pdf.to_arrow() schema = table.schema for i, field in enumerate(schema): + # TODO: Nested types if field.type == pa.large_string(): - # TODO: Nested types + # TODO: goes away when libcudf supports large strings schema = schema.set(i, pa.field(field.name, pa.string())) + elif isinstance(field.type, pa.LargeListType): + # TODO: goes away when libcudf supports large lists + schema = schema.set( + i, pa.field(field.name, pa.list_(field.type.field(0))) + ) table = table.cast(schema) df = DataFrame.from_table( plc.interop.from_arrow(table), list(self.schema.keys()) @@ -850,9 +855,11 @@ class MapFunction(IR): _NAMES: ClassVar[frozenset[str]] = frozenset( [ - "drop_nulls", "rechunk", - "merge_sorted", + # libcudf merge is not stable wrt order of inputs, since + # it uses a priority queue to manage the tables it produces. 
+ # See: https://github.com/rapidsai/cudf/issues/16010 + # "merge_sorted", "rename", "explode", ] @@ -869,46 +876,13 @@ def __post_init__(self) -> None: # polars requires that all to-explode columns have the # same sub-shapes raise NotImplementedError("Explode with more than one column") - elif self.name == "merge_sorted": - assert isinstance(self.df, Union) - (key_column,) = self.options - if key_column not in self.df.dfs[0].schema: - raise ValueError(f"Key column {key_column} not found") def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" - if self.name == "merge_sorted": - # merge_sorted operates on Union inputs - # but if we evaluate the Union then we can't unpick the - # pieces, so we dive inside and evaluate the pieces by hand - assert isinstance(self.df, Union) - first, *rest = (c.evaluate(cache=cache) for c in self.df.dfs) - (key_column,) = self.options - if not all(first.column_names == r.column_names for r in rest): - raise ValueError("DataFrame shapes/column names don't match") - # Already validated that key_column is in column names - index = first.column_names.index(key_column) - return DataFrame.from_table( - plc.merge.merge_sorted( - [first.table, *(df.table for df in rest)], - [index], - [plc.types.Order.ASCENDING], - [plc.types.NullOrder.BEFORE], - ), - first.column_names, - ).sorted_like(first, subset={key_column}) - elif self.name == "rechunk": + if self.name == "rechunk": # No-op in our data model - return self.df.evaluate(cache=cache) - elif self.name == "drop_nulls": - df = self.df.evaluate(cache=cache) - (subset,) = self.options - subset = set(subset) - indices = [i for i, name in enumerate(df.column_names) if name in subset] - return DataFrame.from_table( - plc.stream_compaction.drop_nulls(df.table, indices, len(indices)), - df.column_names, - ).sorted_like(df) + # Don't think this appears in a plan tree from python + return self.df.evaluate(cache=cache) # pragma: no cover elif 
self.name == "rename": df = self.df.evaluate(cache=cache) # final tag is "swapping" which is useful for the @@ -924,7 +898,7 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: plc.lists.explode_outer(df.table, index), df.column_names ).sorted_like(df, subset=subset) else: - raise AssertionError("Should never be reached") + raise AssertionError("Should never be reached") # pragma: no cover @dataclasses.dataclass(slots=True) diff --git a/python/cudf_polars/tests/test_mapfunction.py b/python/cudf_polars/tests/test_mapfunction.py new file mode 100644 index 00000000000..ec6b3f3fc0a --- /dev/null +++ b/python/cudf_polars/tests/test_mapfunction.py @@ -0,0 +1,43 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + +import pytest + +import polars as pl + +from cudf_polars import translate_ir +from cudf_polars.testing.asserts import assert_gpu_result_equal + + +def test_merge_sorted_raises(): + df1 = pl.LazyFrame({"a": [1, 6, 9], "b": [1, -10, 4]}) + df2 = pl.LazyFrame({"a": [-1, 5, 11, 20], "b": [2, 7, -4, None]}) + df3 = pl.LazyFrame({"a": [-10, 20, 21], "b": [1, 2, 3]}) + + q = df1.merge_sorted(df2, key="a").merge_sorted(df3, key="a") + + with pytest.raises(NotImplementedError): + _ = translate_ir(q._ldf.visit()) + + +def test_explode_multiple_raises(): + df = pl.LazyFrame({"a": [[1, 2], [3, 4]], "b": [[5, 6], [7, 8]]}) + q = df.explode("a", "b") + + with pytest.raises(NotImplementedError): + _ = translate_ir(q._ldf.visit()) + + +@pytest.mark.parametrize("column", ["a", "b"]) +def test_explode_single(column): + df = pl.LazyFrame( + { + "a": [[1, 2], [3, 4], None], + "b": [[5, 6], [7, 8], [9, 10]], + "c": [None, 11, 12], + } + ) + q = df.explode(column) + + assert_gpu_result_equal(q) From 87f6a7e15bb7d8dc0d8733392567fb647074b2fd Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 17 Jun 
2024 06:21:10 -1000 Subject: [PATCH 25/25] Add ruff rules to avoid importing from typing (#16040) Enabled the following ruff rules to update typing annotations according to PEP585 and PEP604 https://docs.astral.sh/ruff/rules/future-rewritable-type-annotation/ https://docs.astral.sh/ruff/rules/non-pep604-annotation/ https://docs.astral.sh/ruff/rules/non-pep585-annotation/ The changes were made by running `pre-commit run ruff --all-files` with `fix = True` and `unsafe-fixes = True` locally Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Lawrence Mitchell (https://github.com/wence-) - Mike Sarahan (https://github.com/msarahan) URL: https://github.com/rapidsai/cudf/pull/16040 --- pyproject.toml | 2 +- python/cudf/cudf/_lib/column.pyi | 46 ++++---- python/cudf/cudf/api/types.py | 4 +- python/cudf/cudf/core/_base_index.py | 6 +- .../cudf/cudf/core/_internals/expressions.py | 12 +- python/cudf/cudf/core/_internals/timezones.py | 19 +-- python/cudf/cudf/core/_internals/where.py | 15 ++- python/cudf/cudf/core/buffer/buffer.py | 14 +-- .../core/buffer/exposure_tracked_buffer.py | 4 +- python/cudf/cudf/core/buffer/spill_manager.py | 20 ++-- .../cudf/cudf/core/buffer/spillable_buffer.py | 18 +-- python/cudf/cudf/core/buffer/utils.py | 20 ++-- python/cudf/cudf/core/column/categorical.py | 46 ++++---- python/cudf/cudf/core/column/column.py | 94 +++++++-------- python/cudf/cudf/core/column/datetime.py | 22 ++-- python/cudf/cudf/core/column/decimal.py | 8 +- python/cudf/cudf/core/column/lists.py | 10 +- python/cudf/cudf/core/column/methods.py | 4 +- python/cudf/cudf/core/column/numerical.py | 35 +++--- .../cudf/cudf/core/column/numerical_base.py | 16 +-- python/cudf/cudf/core/column/string.py | 109 ++++++++---------- python/cudf/cudf/core/column/timedelta.py | 18 +-- python/cudf/cudf/core/column_accessor.py | 25 ++-- python/cudf/cudf/core/dataframe.py | 51 +++----- python/cudf/cudf/core/df_protocol.py | 44 +++---- python/cudf/cudf/core/dtypes.py | 27 ++--- 
python/cudf/cudf/core/frame.py | 49 +++----- python/cudf/cudf/core/groupby/groupby.py | 19 +-- python/cudf/cudf/core/index.py | 28 ++--- python/cudf/cudf/core/indexed_frame.py | 58 ++++------ python/cudf/cudf/core/indexing_utils.py | 8 +- python/cudf/cudf/core/join/_join_helpers.py | 6 +- python/cudf/cudf/core/join/join.py | 6 +- python/cudf/cudf/core/mixins/binops.pyi | 6 +- python/cudf/cudf/core/mixins/reductions.pyi | 4 +- python/cudf/cudf/core/mixins/scans.pyi | 4 +- python/cudf/cudf/core/multiindex.py | 18 +-- python/cudf/cudf/core/reshape.py | 15 ++- python/cudf/cudf/core/series.py | 30 ++--- python/cudf/cudf/core/single_column_frame.py | 12 +- python/cudf/cudf/core/subword_tokenizer.py | 3 +- python/cudf/cudf/core/tools/datetimes.py | 13 ++- python/cudf/cudf/core/udf/groupby_typing.py | 8 +- python/cudf/cudf/core/udf/utils.py | 5 +- python/cudf/cudf/io/parquet.py | 18 +-- python/cudf/cudf/options.py | 11 +- python/cudf/cudf/pandas/fast_slow_proxy.py | 40 +++---- python/cudf/cudf/pandas/module_accelerator.py | 6 +- python/cudf/cudf/pandas/profiler.py | 12 +- .../cudf/cudf/pylibcudf_tests/common/utils.py | 7 +- .../test_avro_reader_fastavro_integration.py | 5 +- python/cudf/cudf/tests/test_df_protocol.py | 5 +- python/cudf/cudf/tests/test_spilling.py | 8 +- python/cudf/cudf/utils/applyutils.py | 5 +- python/cudf/cudf/utils/queryutils.py | 7 +- python/cudf/cudf/utils/utils.py | 4 +- .../cudf_pandas_tests/test_fast_slow_proxy.py | 1 + python/dask_cudf/dask_cudf/groupby.py | 4 +- 58 files changed, 504 insertions(+), 610 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index c602240a0b7..2f59864894b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,7 +26,7 @@ quiet-level = 3 line-length = 79 [tool.ruff.lint] -select = ["E", "F", "W", "D201", "D204", "D206", "D207", "D208", "D209", "D210", "D211", "D214", "D215", "D300", "D301", "D403", "D405", "D406", "D407", "D408", "D409", "D410", "D411", "D412", "D414", "D418", "TCH"] +select = ["E", "F", "W", 
"D201", "D204", "D206", "D207", "D208", "D209", "D210", "D211", "D214", "D215", "D300", "D301", "D403", "D405", "D406", "D407", "D408", "D409", "D410", "D411", "D412", "D414", "D418", "TCH", "FA", "UP006", "UP007"] ignore = [ # whitespace before : "E203", diff --git a/python/cudf/cudf/_lib/column.pyi b/python/cudf/cudf/_lib/column.pyi index c667286fc16..bcab009c102 100644 --- a/python/cudf/cudf/_lib/column.pyi +++ b/python/cudf/cudf/_lib/column.pyi @@ -2,8 +2,6 @@ from __future__ import annotations -from typing import Dict, Optional, Tuple - from typing_extensions import Self from cudf._typing import Dtype, DtypeObj, ScalarLike @@ -11,27 +9,27 @@ from cudf.core.buffer import Buffer from cudf.core.column import ColumnBase class Column: - _data: Optional[Buffer] - _mask: Optional[Buffer] - _base_data: Optional[Buffer] - _base_mask: Optional[Buffer] + _data: Buffer | None + _mask: Buffer | None + _base_data: Buffer | None + _base_mask: Buffer | None _dtype: DtypeObj _size: int _offset: int _null_count: int - _children: Tuple[ColumnBase, ...] - _base_children: Tuple[ColumnBase, ...] - _distinct_count: Dict[bool, int] + _children: tuple[ColumnBase, ...] + _base_children: tuple[ColumnBase, ...] + _distinct_count: dict[bool, int] def __init__( self, - data: Optional[Buffer], + data: Buffer | None, size: int, dtype: Dtype, - mask: Optional[Buffer] = None, - offset: Optional[int] = None, - null_count: Optional[int] = None, - children: Tuple[ColumnBase, ...] = (), + mask: Buffer | None = None, + offset: int | None = None, + null_count: int | None = None, + children: tuple[ColumnBase, ...] = (), ) -> None: ... @property def base_size(self) -> int: ... @@ -40,9 +38,9 @@ class Column: @property def size(self) -> int: ... @property - def base_data(self) -> Optional[Buffer]: ... + def base_data(self) -> Buffer | None: ... @property - def data(self) -> Optional[Buffer]: ... + def data(self) -> Buffer | None: ... @property def data_ptr(self) -> int: ... 
def set_base_data(self, value: Buffer) -> None: ... @@ -50,25 +48,25 @@ class Column: def nullable(self) -> bool: ... def has_nulls(self, include_nan: bool = False) -> bool: ... @property - def base_mask(self) -> Optional[Buffer]: ... + def base_mask(self) -> Buffer | None: ... @property - def mask(self) -> Optional[Buffer]: ... + def mask(self) -> Buffer | None: ... @property def mask_ptr(self) -> int: ... - def set_base_mask(self, value: Optional[Buffer]) -> None: ... - def set_mask(self, value: Optional[Buffer]) -> Self: ... + def set_base_mask(self, value: Buffer | None) -> None: ... + def set_mask(self, value: Buffer | None) -> Self: ... @property def null_count(self) -> int: ... @property def offset(self) -> int: ... @property - def base_children(self) -> Tuple[ColumnBase, ...]: ... + def base_children(self) -> tuple[ColumnBase, ...]: ... @property - def children(self) -> Tuple[ColumnBase, ...]: ... - def set_base_children(self, value: Tuple[ColumnBase, ...]) -> None: ... + def children(self) -> tuple[ColumnBase, ...]: ... + def set_base_children(self, value: tuple[ColumnBase, ...]) -> None: ... def _mimic_inplace( self, other_col: ColumnBase, inplace=False - ) -> Optional[Self]: ... + ) -> Self | None: ... 
# TODO: The val parameter should be Scalar, not ScalarLike @staticmethod diff --git a/python/cudf/cudf/api/types.py b/python/cudf/cudf/api/types.py index 42b1524bd76..d97e9c815b6 100644 --- a/python/cudf/cudf/api/types.py +++ b/python/cudf/cudf/api/types.py @@ -8,7 +8,7 @@ from collections import abc from functools import wraps from inspect import isclass -from typing import List, Union, cast +from typing import cast import cupy as cp import numpy as np @@ -219,7 +219,7 @@ def wrapped_func(obj): def _union_categoricals( - to_union: List[Union[cudf.Series, cudf.CategoricalIndex]], + to_union: list[cudf.Series | cudf.CategoricalIndex], sort_categories: bool = False, ignore_order: bool = False, ): diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index e5945f8860e..e71e45e410e 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -5,7 +5,7 @@ import pickle import warnings from functools import cached_property -from typing import TYPE_CHECKING, Any, Literal, Set, Tuple +from typing import TYPE_CHECKING, Any, Literal import pandas as pd from typing_extensions import Self @@ -44,11 +44,11 @@ class BaseIndex(Serializable): """Base class for all cudf Index types.""" - _accessors: Set[Any] = set() + _accessors: set[Any] = set() _data: ColumnAccessor @property - def _columns(self) -> Tuple[Any, ...]: + def _columns(self) -> tuple[Any, ...]: raise NotImplementedError @cached_property diff --git a/python/cudf/cudf/core/_internals/expressions.py b/python/cudf/cudf/core/_internals/expressions.py index 5cb9f0363e0..393a68dd844 100644 --- a/python/cudf/cudf/core/_internals/expressions.py +++ b/python/cudf/cudf/core/_internals/expressions.py @@ -1,8 +1,8 @@ -# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. 
+from __future__ import annotations import ast import functools -from typing import List, Tuple from cudf._lib.expressions import ( ASTOperator, @@ -98,9 +98,9 @@ class libcudfASTVisitor(ast.NodeVisitor): The column names used to map the names in an expression. """ - def __init__(self, col_names: Tuple[str]): - self.stack: List[Expression] = [] - self.nodes: List[Expression] = [] + def __init__(self, col_names: tuple[str]): + self.stack: list[Expression] = [] + self.nodes: list[Expression] = [] self.col_names = col_names @property @@ -218,7 +218,7 @@ def visit_Call(self, node): @functools.lru_cache(256) -def parse_expression(expr: str, col_names: Tuple[str]): +def parse_expression(expr: str, col_names: tuple[str]): visitor = libcudfASTVisitor(col_names) visitor.visit(ast.parse(expr)) return visitor diff --git a/python/cudf/cudf/core/_internals/timezones.py b/python/cudf/cudf/core/_internals/timezones.py index f04cae719c2..269fcf3e37f 100644 --- a/python/cudf/cudf/core/_internals/timezones.py +++ b/python/cudf/cudf/core/_internals/timezones.py @@ -1,20 +1,23 @@ # Copyright (c) 2023-2024, NVIDIA CORPORATION. +from __future__ import annotations import os import zoneinfo from functools import lru_cache -from typing import Literal, Tuple +from typing import TYPE_CHECKING, Literal import numpy as np from cudf._lib.timezone import make_timezone_transition_table from cudf.core.column.column import as_column -from cudf.core.column.datetime import DatetimeColumn -from cudf.core.column.timedelta import TimeDeltaColumn + +if TYPE_CHECKING: + from cudf.core.column.datetime import DatetimeColumn + from cudf.core.column.timedelta import TimeDeltaColumn @lru_cache(maxsize=20) -def get_tz_data(zone_name: str) -> Tuple[DatetimeColumn, TimeDeltaColumn]: +def get_tz_data(zone_name: str) -> tuple[DatetimeColumn, TimeDeltaColumn]: """ Return timezone data (transition times and UTC offsets) for the given IANA time zone. 
@@ -40,7 +43,7 @@ def get_tz_data(zone_name: str) -> Tuple[DatetimeColumn, TimeDeltaColumn]: def _find_and_read_tzfile_tzpath( zone_name: str, -) -> Tuple[DatetimeColumn, TimeDeltaColumn]: +) -> tuple[DatetimeColumn, TimeDeltaColumn]: for search_path in zoneinfo.TZPATH: if os.path.isfile(os.path.join(search_path, zone_name)): return _read_tzfile_as_columns(search_path, zone_name) @@ -49,7 +52,7 @@ def _find_and_read_tzfile_tzpath( def _find_and_read_tzfile_tzdata( zone_name: str, -) -> Tuple[DatetimeColumn, TimeDeltaColumn]: +) -> tuple[DatetimeColumn, TimeDeltaColumn]: import importlib.resources package_base = "tzdata.zoneinfo" @@ -78,7 +81,7 @@ def _find_and_read_tzfile_tzdata( def _read_tzfile_as_columns( tzdir, zone_name: str -) -> Tuple[DatetimeColumn, TimeDeltaColumn]: +) -> tuple[DatetimeColumn, TimeDeltaColumn]: transition_times_and_offsets = make_timezone_transition_table( tzdir, zone_name ) @@ -92,7 +95,7 @@ def _read_tzfile_as_columns( def check_ambiguous_and_nonexistent( ambiguous: Literal["NaT"], nonexistent: Literal["NaT"] -) -> Tuple[Literal["NaT"], Literal["NaT"]]: +) -> tuple[Literal["NaT"], Literal["NaT"]]: if ambiguous != "NaT": raise NotImplementedError( "Only ambiguous='NaT' is currently supported" diff --git a/python/cudf/cudf/core/_internals/where.py b/python/cudf/cudf/core/_internals/where.py index ef6b10f66c1..44ce0ddef25 100644 --- a/python/cudf/cudf/core/_internals/where.py +++ b/python/cudf/cudf/core/_internals/where.py @@ -1,18 +1,17 @@ -# Copyright (c) 2021-2023, NVIDIA CORPORATION. +# Copyright (c) 2021-2024, NVIDIA CORPORATION. 
+from __future__ import annotations import warnings -from typing import Tuple, Union +from typing import TYPE_CHECKING import numpy as np import cudf -from cudf._typing import ScalarLike from cudf.api.types import ( _is_non_decimal_numeric_dtype, is_bool_dtype, is_scalar, ) -from cudf.core.column import ColumnBase from cudf.core.dtypes import CategoricalDtype from cudf.utils.dtypes import ( _can_cast, @@ -21,6 +20,10 @@ is_mixed_with_object_dtype, ) +if TYPE_CHECKING: + from cudf._typing import ScalarLike + from cudf.core.column import ColumnBase + def _normalize_categorical(input_col, other): if isinstance(input_col, cudf.core.column.CategoricalColumn): @@ -41,9 +44,9 @@ def _normalize_categorical(input_col, other): def _check_and_cast_columns_with_other( source_col: ColumnBase, - other: Union[ScalarLike, ColumnBase], + other: ScalarLike | ColumnBase, inplace: bool, -) -> Tuple[ColumnBase, Union[ScalarLike, ColumnBase]]: +) -> tuple[ColumnBase, ScalarLike | ColumnBase]: # Returns type-casted `source_col` & `other` based on `inplace`. 
source_dtype = source_col.dtype if isinstance(source_dtype, CategoricalDtype): diff --git a/python/cudf/cudf/core/buffer/buffer.py b/python/cudf/cudf/core/buffer/buffer.py index bf6f9f1a3c1..80dbbe4c048 100644 --- a/python/cudf/cudf/core/buffer/buffer.py +++ b/python/cudf/cudf/core/buffer/buffer.py @@ -6,7 +6,7 @@ import pickle import weakref from types import SimpleNamespace -from typing import Any, Dict, Literal, Mapping, Optional, Tuple +from typing import Any, Literal, Mapping import numpy from typing_extensions import Self @@ -42,7 +42,7 @@ def host_memory_allocation(nbytes: int) -> memoryview: def cuda_array_interface_wrapper( ptr: int, size: int, - owner: Optional[object] = None, + owner: object | None = None, readonly=False, typestr="|u1", version=0, @@ -278,7 +278,7 @@ def get_ptr(self, *, mode: Literal["read", "write"]) -> int: return self._ptr def memoryview( - self, *, offset: int = 0, size: Optional[int] = None + self, *, offset: int = 0, size: int | None = None ) -> memoryview: """Read-only access to the buffer through host memory.""" size = self._size if size is None else size @@ -319,7 +319,7 @@ def __init__( *, owner: BufferOwner, offset: int = 0, - size: Optional[int] = None, + size: int | None = None, ) -> None: size = owner.size if size is None else size if size < 0: @@ -414,7 +414,7 @@ def __cuda_array_interface__(self) -> Mapping: "version": 0, } - def serialize(self) -> Tuple[dict, list]: + def serialize(self) -> tuple[dict, list]: """Serialize the buffer into header and frames. The frames can be a mixture of memoryview, Buffer, and BufferOwner @@ -427,7 +427,7 @@ def serialize(self) -> Tuple[dict, list]: serializable metadata required to reconstruct the object. The second element is a list containing single frame. 
""" - header: Dict[str, Any] = {} + header: dict[str, Any] = {} header["type-serialized"] = pickle.dumps(type(self)) header["owner-type-serialized"] = pickle.dumps(type(self._owner)) header["frame_count"] = 1 @@ -480,7 +480,7 @@ def __str__(self) -> str: ) -def get_ptr_and_size(array_interface: Mapping) -> Tuple[int, int]: +def get_ptr_and_size(array_interface: Mapping) -> tuple[int, int]: """Retrieve the pointer and size from an array interface. Raises ValueError if array isn't C-contiguous. diff --git a/python/cudf/cudf/core/buffer/exposure_tracked_buffer.py b/python/cudf/cudf/core/buffer/exposure_tracked_buffer.py index 15f00fc670d..0bd8d6054b3 100644 --- a/python/cudf/cudf/core/buffer/exposure_tracked_buffer.py +++ b/python/cudf/cudf/core/buffer/exposure_tracked_buffer.py @@ -2,7 +2,7 @@ from __future__ import annotations -from typing import Literal, Mapping, Optional +from typing import Literal, Mapping from typing_extensions import Self @@ -27,7 +27,7 @@ def __init__( self, owner: BufferOwner, offset: int = 0, - size: Optional[int] = None, + size: int | None = None, ) -> None: super().__init__(owner=owner, offset=offset, size=size) self.owner._slices.add(self) diff --git a/python/cudf/cudf/core/buffer/spill_manager.py b/python/cudf/cudf/core/buffer/spill_manager.py index 7bcf97302aa..762cd7f9e86 100644 --- a/python/cudf/cudf/core/buffer/spill_manager.py +++ b/python/cudf/cudf/core/buffer/spill_manager.py @@ -13,7 +13,7 @@ from contextlib import contextmanager from dataclasses import dataclass from functools import partial -from typing import TYPE_CHECKING, Dict, List, Optional, Tuple +from typing import TYPE_CHECKING import rmm.mr @@ -39,7 +39,7 @@ def get_traceback() -> str: def get_rmm_memory_resource_stack( mr: rmm.mr.DeviceMemoryResource, -) -> List[rmm.mr.DeviceMemoryResource]: +) -> list[rmm.mr.DeviceMemoryResource]: """Get the RMM resource stack Parameters @@ -99,14 +99,14 @@ class Expose: total_nbytes: int = 0 spilled_nbytes: int = 0 - spill_totals: 
Dict[Tuple[str, str], Tuple[int, float]] + spill_totals: dict[tuple[str, str], tuple[int, float]] def __init__(self, level) -> None: self.lock = threading.Lock() self.level = level self.spill_totals = defaultdict(lambda: (0, 0)) # Maps each traceback to a Expose - self.exposes: Dict[str, SpillStatistics.Expose] = {} + self.exposes: dict[str, SpillStatistics.Expose] = {} def log_spill(self, src: str, dst: str, nbytes: int, time: float) -> None: """Log a (un-)spilling event @@ -227,7 +227,7 @@ class SpillManager: def __init__( self, *, - device_memory_limit: Optional[int] = None, + device_memory_limit: int | None = None, statistic_level: int = 0, ) -> None: self._lock = threading.Lock() @@ -298,7 +298,7 @@ def add(self, buffer: SpillableBufferOwner) -> None: def buffers( self, order_by_access_time: bool = False - ) -> Tuple[SpillableBufferOwner, ...]: + ) -> tuple[SpillableBufferOwner, ...]: """Get all managed buffers Parameters @@ -347,7 +347,7 @@ def spill_device_memory(self, nbytes: int) -> int: buf.lock.release() return spilled - def spill_to_device_limit(self, device_limit: Optional[int] = None) -> int: + def spill_to_device_limit(self, device_limit: int | None = None) -> int: """Try to spill device memory until device limit Notice, by default this is a no-op. 
@@ -402,10 +402,10 @@ def __repr__(self) -> str: # - Initialized to None (spilling disabled) # - Initialized to a SpillManager instance (spilling enabled) _global_manager_uninitialized: bool = True -_global_manager: Optional[SpillManager] = None +_global_manager: SpillManager | None = None -def set_global_manager(manager: Optional[SpillManager]) -> None: +def set_global_manager(manager: SpillManager | None) -> None: """Set the global manager, which if None disables spilling""" global _global_manager, _global_manager_uninitialized @@ -419,7 +419,7 @@ def set_global_manager(manager: Optional[SpillManager]) -> None: _global_manager_uninitialized = False -def get_global_manager() -> Optional[SpillManager]: +def get_global_manager() -> SpillManager | None: """Get the global manager or None if spilling is disabled""" global _global_manager_uninitialized if _global_manager_uninitialized: diff --git a/python/cudf/cudf/core/buffer/spillable_buffer.py b/python/cudf/cudf/core/buffer/spillable_buffer.py index 49258fea9ab..eb57a371965 100644 --- a/python/cudf/cudf/core/buffer/spillable_buffer.py +++ b/python/cudf/cudf/core/buffer/spillable_buffer.py @@ -7,7 +7,7 @@ import time import weakref from threading import RLock -from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional, Tuple +from typing import TYPE_CHECKING, Any, Literal import numpy from typing_extensions import Self @@ -88,10 +88,10 @@ class SpillableBufferOwner(BufferOwner): lock: RLock _spill_locks: weakref.WeakSet _last_accessed: float - _ptr_desc: Dict[str, Any] + _ptr_desc: dict[str, Any] _manager: SpillManager - def _finalize_init(self, ptr_desc: Dict[str, Any]) -> None: + def _finalize_init(self, ptr_desc: dict[str, Any]) -> None: """Finish initialization of the spillable buffer This implements the common initialization that `from_device_memory` @@ -297,7 +297,7 @@ def get_ptr(self, *, mode: Literal["read", "write"]) -> int: self._last_accessed = time.monotonic() return self._ptr - def 
memory_info(self) -> Tuple[int, int, str]: + def memory_info(self) -> tuple[int, int, str]: """Get pointer, size, and device type of this buffer. Warning, it is not safe to access the pointer value without @@ -341,7 +341,7 @@ def __cuda_array_interface__(self) -> dict: } def memoryview( - self, *, offset: int = 0, size: Optional[int] = None + self, *, offset: int = 0, size: int | None = None ) -> memoryview: size = self._size if size is None else size with self.lock: @@ -388,11 +388,11 @@ def spillable(self) -> bool: def spill_lock(self, spill_lock: SpillLock) -> None: self._owner.spill_lock(spill_lock=spill_lock) - def memory_info(self) -> Tuple[int, int, str]: + def memory_info(self) -> tuple[int, int, str]: (ptr, _, device_type) = self._owner.memory_info() return (ptr + self._offset, self.nbytes, device_type) - def serialize(self) -> Tuple[dict, list]: + def serialize(self) -> tuple[dict, list]: """Serialize the Buffer Normally, we would use `[self]` as the frames. This would work but @@ -411,8 +411,8 @@ def serialize(self) -> Tuple[dict, list]: given to `.deserialize()`, otherwise we would have a `Buffer` pointing to memory already owned by an existing `SpillableBufferOwner`. 
""" - header: Dict[str, Any] = {} - frames: List[Buffer | memoryview] + header: dict[str, Any] = {} + frames: list[Buffer | memoryview] with self._owner.lock: header["type-serialized"] = pickle.dumps(self.__class__) header["owner-type-serialized"] = pickle.dumps(type(self._owner)) diff --git a/python/cudf/cudf/core/buffer/utils.py b/python/cudf/cudf/core/buffer/utils.py index 3346d05ed4a..42a1501c914 100644 --- a/python/cudf/cudf/core/buffer/utils.py +++ b/python/cudf/cudf/core/buffer/utils.py @@ -4,7 +4,7 @@ import threading from contextlib import ContextDecorator -from typing import Any, Dict, Optional, Tuple, Type, Union +from typing import Any from cudf.core.buffer.buffer import ( Buffer, @@ -22,7 +22,7 @@ from cudf.options import get_option -def get_buffer_owner(data: Any) -> Optional[BufferOwner]: +def get_buffer_owner(data: Any) -> BufferOwner | None: """Get the owner of `data`, if one exists Search through the stack of data owners in order to find an @@ -47,10 +47,10 @@ def get_buffer_owner(data: Any) -> Optional[BufferOwner]: def as_buffer( - data: Union[int, Any], + data: int | Any, *, - size: Optional[int] = None, - owner: Optional[object] = None, + size: int | None = None, + owner: object | None = None, exposed: bool = False, ) -> Buffer: """Factory function to wrap `data` in a Buffer object. 
@@ -117,8 +117,8 @@ def as_buffer( ) # Find the buffer types to return based on the current config - owner_class: Type[BufferOwner] - buffer_class: Type[Buffer] + owner_class: type[BufferOwner] + buffer_class: type[Buffer] if get_global_manager() is not None: owner_class = SpillableBufferOwner buffer_class = SpillableBuffer @@ -161,7 +161,7 @@ def as_buffer( return buffer_class(owner=owner, offset=ptr - base_ptr, size=size) -_thread_spill_locks: Dict[int, Tuple[Optional[SpillLock], int]] = {} +_thread_spill_locks: dict[int, tuple[SpillLock | None, int]] = {} def _push_thread_spill_lock() -> None: @@ -193,7 +193,7 @@ class acquire_spill_lock(ContextDecorator): pushing and popping from `_thread_spill_locks` using its thread ID. """ - def __enter__(self) -> Optional[SpillLock]: + def __enter__(self) -> SpillLock | None: _push_thread_spill_lock() return get_spill_lock() @@ -201,7 +201,7 @@ def __exit__(self, *exc): _pop_thread_spill_lock() -def get_spill_lock() -> Union[SpillLock, None]: +def get_spill_lock() -> SpillLock | None: """Return a spill lock within the context of `acquire_spill_lock` or None Returns None, if spilling is disabled. diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index 97c2ce5cf1f..f538180805b 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -4,7 +4,7 @@ import warnings from functools import cached_property -from typing import TYPE_CHECKING, Any, Mapping, Optional, Sequence, Tuple, cast +from typing import TYPE_CHECKING, Any, Mapping, Sequence, cast import numpy as np import pandas as pd @@ -139,7 +139,7 @@ def ordered(self) -> bool: """ return self._column.ordered - def as_ordered(self) -> Optional[SeriesOrIndex]: + def as_ordered(self) -> SeriesOrIndex | None: """ Set the Categorical to be ordered. 
@@ -175,7 +175,7 @@ def as_ordered(self) -> Optional[SeriesOrIndex]: """ return self._return_or_inplace(self._column.as_ordered(ordered=True)) - def as_unordered(self) -> Optional[SeriesOrIndex]: + def as_unordered(self) -> SeriesOrIndex | None: """ Set the Categorical to be unordered. @@ -222,7 +222,7 @@ def as_unordered(self) -> Optional[SeriesOrIndex]: """ return self._return_or_inplace(self._column.as_ordered(ordered=False)) - def add_categories(self, new_categories: Any) -> Optional[SeriesOrIndex]: + def add_categories(self, new_categories: Any) -> SeriesOrIndex | None: """ Add new categories. @@ -294,7 +294,7 @@ def add_categories(self, new_categories: Any) -> Optional[SeriesOrIndex]: def remove_categories( self, removals: Any, - ) -> Optional[SeriesOrIndex]: + ) -> SeriesOrIndex | None: """ Remove the specified categories. @@ -370,7 +370,7 @@ def set_categories( new_categories: Any, ordered: bool = False, rename: bool = False, - ) -> Optional[SeriesOrIndex]: + ) -> SeriesOrIndex | None: """ Set the categories to the specified new_categories. @@ -443,7 +443,7 @@ def reorder_categories( self, new_categories: Any, ordered: bool = False, - ) -> Optional[SeriesOrIndex]: + ) -> SeriesOrIndex | None: """ Reorder categories as specified in new_categories. @@ -521,8 +521,8 @@ class CategoricalColumn(column.ColumnBase): """ dtype: cudf.core.dtypes.CategoricalDtype - _codes: Optional[NumericalColumn] - _children: Tuple[NumericalColumn] + _codes: NumericalColumn | None + _children: tuple[NumericalColumn] _VALID_REDUCTIONS = { "max", "min", @@ -539,11 +539,11 @@ class CategoricalColumn(column.ColumnBase): def __init__( self, dtype: CategoricalDtype, - mask: Optional[Buffer] = None, - size: Optional[int] = None, + mask: Buffer | None = None, + size: int | None = None, offset: int = 0, - null_count: Optional[int] = None, - children: Tuple["column.ColumnBase", ...] = (), + null_count: int | None = None, + children: tuple["column.ColumnBase", ...] 
= (), ): if size is None: for child in children: @@ -590,23 +590,23 @@ def set_base_data(self, value): def _process_values_for_isin( self, values: Sequence - ) -> Tuple[ColumnBase, ColumnBase]: + ) -> tuple[ColumnBase, ColumnBase]: lhs = self # We need to convert values to same type as self, # hence passing dtype=self.dtype rhs = cudf.core.column.as_column(values, dtype=self.dtype) return lhs, rhs - def set_base_mask(self, value: Optional[Buffer]): + def set_base_mask(self, value: Buffer | None): super().set_base_mask(value) self._codes = None - def set_base_children(self, value: Tuple[ColumnBase, ...]): + def set_base_children(self, value: tuple[ColumnBase, ...]): super().set_base_children(value) self._codes = None @property - def children(self) -> Tuple[NumericalColumn]: + def children(self) -> tuple[NumericalColumn]: if self._children is None: codes_column = self.base_children[0] start = self.offset * codes_column.dtype.itemsize @@ -693,9 +693,7 @@ def _fill( libcudf.filling.fill_in_place(result.codes, begin, end, fill_scalar) return result - def slice( - self, start: int, stop: int, stride: Optional[int] = None - ) -> Self: + def slice(self, start: int, stop: int, stride: int | None = None) -> Self: codes = self.codes.slice(start, stop, stride) return cast( Self, @@ -714,7 +712,7 @@ def slice( def _reduce( self, op: str, - skipna: Optional[bool] = None, + skipna: bool | None = None, min_count: int = 0, *args, **kwargs, @@ -1073,7 +1071,7 @@ def notnull(self) -> ColumnBase: def fillna( self, fill_value: Any = None, - method: Optional[str] = None, + method: str | None = None, ) -> Self: """ Fill null values with *fill_value* @@ -1207,7 +1205,7 @@ def memory_usage(self) -> int: def _mimic_inplace( self, other_col: ColumnBase, inplace: bool = False - ) -> Optional[Self]: + ) -> Self | None: out = super()._mimic_inplace(other_col, inplace=inplace) if inplace and isinstance(other_col, CategoricalColumn): self._codes = other_col._codes @@ -1468,7 +1466,7 @@ def 
_create_empty_categorical_column( def pandas_categorical_as_column( - categorical: ColumnLike, codes: Optional[ColumnLike] = None + categorical: ColumnLike, codes: ColumnLike | None = None ) -> CategoricalColumn: """Creates a CategoricalColumn from a pandas.Categorical diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index dc937dc0469..c4e715aeb45 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -7,19 +7,7 @@ from functools import cached_property from itertools import chain from types import SimpleNamespace -from typing import ( - TYPE_CHECKING, - Any, - Dict, - List, - Literal, - MutableSequence, - Optional, - Sequence, - Tuple, - Union, - cast, -) +from typing import TYPE_CHECKING, Any, Literal, MutableSequence, Sequence, cast import cupy import numpy as np @@ -394,7 +382,7 @@ def _fill( begin: int, end: int, inplace: bool = False, - ) -> Optional[Self]: + ) -> Self | None: if end <= begin or begin >= self.size: return self if inplace else self.copy() @@ -532,9 +520,7 @@ def element_indexing(self, index: int): raise IndexError("single positional indexer is out-of-bounds") return libcudf.copying.get_element(self, idx).value - def slice( - self, start: int, stop: int, stride: Optional[int] = None - ) -> Self: + def slice(self, start: int, stop: int, stride: int | None = None) -> Self: stride = 1 if stride is None else stride if start < 0: start = start + len(self) @@ -570,7 +556,7 @@ def __setitem__(self, key: Any, value: Any): else as_column(value, dtype=self.dtype) ) - out: Optional[ColumnBase] # If None, no need to perform mimic inplace. + out: ColumnBase | None # If None, no need to perform mimic inplace. 
if isinstance(key, slice): out = self._scatter_by_slice(key, value_normalized) else: @@ -593,8 +579,8 @@ def _wrap_binop_normalization(self, other): def _scatter_by_slice( self, key: builtins.slice, - value: Union[cudf.core.scalar.Scalar, ColumnBase], - ) -> Optional[Self]: + value: cudf.core.scalar.Scalar | ColumnBase, + ) -> Self | None: """If this function returns None, it's either a no-op (slice is empty), or the inplace replacement is already performed (fill-in-place). """ @@ -630,7 +616,7 @@ def _scatter_by_slice( def _scatter_by_column( self, key: cudf.core.column.NumericalColumn, - value: Union[cudf.core.scalar.Scalar, ColumnBase], + value: cudf.core.scalar.Scalar | ColumnBase, ) -> Self: if is_bool_dtype(key.dtype): # `key` is boolean mask @@ -667,7 +653,7 @@ def _scatter_by_column( ]._with_type_metadata(self.dtype) def _check_scatter_key_length( - self, num_keys: int, value: Union[cudf.core.scalar.Scalar, ColumnBase] + self, num_keys: int, value: cudf.core.scalar.Scalar | ColumnBase ) -> None: """`num_keys` is the number of keys to scatter. Should equal to the number of rows in ``value`` if ``value`` is a column. @@ -682,7 +668,7 @@ def _check_scatter_key_length( def fillna( self, fill_value: Any = None, - method: Optional[str] = None, + method: str | None = None, ) -> Self: """Fill null values with ``value``. @@ -740,7 +726,7 @@ def indices_of( [as_column(range(0, len(self)), dtype=size_type_dtype)], mask )[0] - def _find_first_and_last(self, value: ScalarLike) -> Tuple[int, int]: + def _find_first_and_last(self, value: ScalarLike) -> tuple[int, int]: indices = self.indices_of(value) if n := len(indices): return ( @@ -856,7 +842,7 @@ def isin(self, values: Sequence) -> ColumnBase: def _process_values_for_isin( self, values: Sequence - ) -> Tuple[ColumnBase, ColumnBase]: + ) -> tuple[ColumnBase, ColumnBase]: """ Helper function for `isin` which pre-process `values` based on `self`. 
""" @@ -868,7 +854,7 @@ def _process_values_for_isin( rhs = rhs.astype(lhs.dtype) return lhs, rhs - def _isin_earlystop(self, rhs: ColumnBase) -> Union[ColumnBase, None]: + def _isin_earlystop(self, rhs: ColumnBase) -> ColumnBase | None: """ Helper function for `isin` which determines possibility of early-stopping or not. @@ -1070,7 +1056,7 @@ def as_string_column( def as_decimal_column( self, dtype: Dtype - ) -> Union["cudf.core.column.decimal.DecimalBaseColumn"]: + ) -> "cudf.core.column.decimal.DecimalBaseColumn": raise NotImplementedError def apply_boolean_mask(self, mask) -> ColumnBase: @@ -1154,7 +1140,7 @@ def unique(self) -> ColumnBase: self.dtype ) - def serialize(self) -> Tuple[dict, list]: + def serialize(self) -> tuple[dict, list]: # data model: # Serialization produces a nested metadata "header" and a flattened @@ -1167,7 +1153,7 @@ def serialize(self) -> Tuple[dict, list]: # cudf native or foreign some special-casing is required here for # serialization. - header: Dict[Any, Any] = {} + header: dict[Any, Any] = {} frames = [] header["type-serialized"] = pickle.dumps(type(self)) try: @@ -1200,7 +1186,7 @@ def serialize(self) -> Tuple[dict, list]: @classmethod def deserialize(cls, header: dict, frames: list) -> ColumnBase: - def unpack(header, frames) -> Tuple[Any, list]: + def unpack(header, frames) -> tuple[Any, list]: count = header["frame_count"] klass = pickle.loads(header["type-serialized"]) obj = klass.deserialize(header, frames[:count]) @@ -1247,13 +1233,13 @@ def nans_to_nulls(self: Self) -> Self: def normalize_binop_value( self, other: ScalarLike - ) -> Union[ColumnBase, ScalarLike]: + ) -> ColumnBase | ScalarLike: raise NotImplementedError def _reduce( self, op: str, - skipna: Optional[bool] = None, + skipna: bool | None = None, min_count: int = 0, *args, **kwargs, @@ -1274,8 +1260,8 @@ def _reduce( return preprocessed def _process_for_reduction( - self, skipna: Optional[bool] = None, min_count: int = 0 - ) -> Union[ColumnBase, ScalarLike]: + 
self, skipna: bool | None = None, min_count: int = 0 + ) -> ColumnBase | ScalarLike: if skipna is None: skipna = True @@ -1315,8 +1301,8 @@ def _with_type_metadata(self: ColumnBase, dtype: Dtype) -> ColumnBase: def _label_encoding( self, cats: ColumnBase, - dtype: Optional[Dtype] = None, - na_sentinel: Optional[ScalarLike] = None, + dtype: Dtype | None = None, + na_sentinel: ScalarLike | None = None, ): """ Convert each value in `self` into an integer code, with `cats` @@ -1389,9 +1375,9 @@ def _return_sentinel_column(): def column_empty_like( column: ColumnBase, - dtype: Optional[Dtype] = None, + dtype: Dtype | None = None, masked: bool = False, - newsize: Optional[int] = None, + newsize: int | None = None, ) -> ColumnBase: """Allocate a new column like the given *column*""" if dtype is None: @@ -1446,7 +1432,7 @@ def column_empty( ) -> ColumnBase: """Allocate a new column like the given row_count and dtype.""" dtype = cudf.dtype(dtype) - children = () # type: Tuple[ColumnBase, ...] + children: tuple[ColumnBase, ...] = () if isinstance(dtype, StructDtype): data = None @@ -1496,14 +1482,14 @@ def column_empty( def build_column( - data: Union[Buffer, None], + data: Buffer | None, dtype: Dtype, *, - size: Optional[int] = None, - mask: Optional[Buffer] = None, + size: int | None = None, + mask: Buffer | None = None, offset: int = 0, - null_count: Optional[int] = None, - children: Tuple[ColumnBase, ...] = (), + null_count: int | None = None, + children: tuple[ColumnBase, ...] 
= (), ) -> ColumnBase: """ Build a Column of the appropriate type from the given parameters @@ -1665,10 +1651,10 @@ def build_column( def build_categorical_column( categories: ColumnBase, codes: ColumnBase, - mask: Optional[Buffer] = None, - size: Optional[int] = None, + mask: Buffer | None = None, + size: int | None = None, offset: int = 0, - null_count: Optional[int] = None, + null_count: int | None = None, ordered: bool = False, ) -> "cudf.core.column.CategoricalColumn": """ @@ -1715,7 +1701,7 @@ def check_invalid_array(shape: tuple, dtype): raise TypeError("Unsupported type float16") -def as_memoryview(arbitrary: Any) -> Optional[memoryview]: +def as_memoryview(arbitrary: Any) -> memoryview | None: try: return memoryview(arbitrary) except TypeError: @@ -1724,9 +1710,9 @@ def as_memoryview(arbitrary: Any) -> Optional[memoryview]: def as_column( arbitrary: Any, - nan_as_null: Optional[bool] = None, - dtype: Optional[Dtype] = None, - length: Optional[int] = None, + nan_as_null: bool | None = None, + dtype: Dtype | None = None, + length: int | None = None, ): """Create a Column from an arbitrary object @@ -2199,7 +2185,7 @@ def _mask_from_cuda_array_interface_desc(obj, cai_mask) -> Buffer: raise NotImplementedError(f"Cannot infer mask from typestr {typestr}") -def serialize_columns(columns: list[ColumnBase]) -> Tuple[List[dict], List]: +def serialize_columns(columns: list[ColumnBase]) -> tuple[list[dict], list]: """ Return the headers and frames resulting from serializing a list of Column @@ -2216,7 +2202,7 @@ def serialize_columns(columns: list[ColumnBase]) -> Tuple[List[dict], List]: frames : list list of frames """ - headers: List[Dict[Any, Any]] = [] + headers: list[dict[Any, Any]] = [] frames = [] if len(columns) > 0: @@ -2228,7 +2214,7 @@ def serialize_columns(columns: list[ColumnBase]) -> Tuple[List[dict], List]: return headers, frames -def deserialize_columns(headers: List[dict], frames: List) -> List[ColumnBase]: +def deserialize_columns(headers: 
list[dict], frames: list) -> list[ColumnBase]: """ Construct a list of Columns from a list of headers and frames. diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index e24d85bfedf..7fdebda7d76 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -8,7 +8,7 @@ import locale import re from locale import nl_langinfo -from typing import TYPE_CHECKING, Any, Literal, Optional, Sequence, Tuple, cast +from typing import TYPE_CHECKING, Any, Literal, Sequence, cast import numpy as np import pandas as pd @@ -242,10 +242,10 @@ def __init__( self, data: Buffer, dtype: DtypeObj, - mask: Optional[Buffer] = None, - size: Optional[int] = None, # TODO: make non-optional + mask: Buffer | None = None, + size: int | None = None, # TODO: make non-optional offset: int = 0, - null_count: Optional[int] = None, + null_count: int | None = None, ): dtype = cudf.dtype(dtype) if dtype.kind != "M": @@ -499,7 +499,7 @@ def mean( def std( self, - skipna: Optional[bool] = None, + skipna: bool | None = None, min_count: int = 0, dtype: Dtype = np.float64, ddof: int = 1, @@ -511,7 +511,7 @@ def std( * _unit_to_nanoseconds_conversion[self.time_unit], ).as_unit(self.time_unit) - def median(self, skipna: Optional[bool] = None) -> pd.Timestamp: + def median(self, skipna: bool | None = None) -> pd.Timestamp: return pd.Timestamp( self.as_numerical_column("int64").median(skipna=skipna), unit=self.time_unit, @@ -631,7 +631,7 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: def fillna( self, fill_value: Any = None, - method: Optional[str] = None, + method: str | None = None, ) -> Self: if fill_value is not None: if cudf.utils.utils._isnat(fill_value): @@ -703,7 +703,7 @@ def _with_type_metadata(self, dtype): def _find_ambiguous_and_nonexistent( self, zone_name: str - ) -> Tuple[NumericalColumn, NumericalColumn] | Tuple[bool, bool]: + ) -> tuple[NumericalColumn, NumericalColumn] | 
tuple[bool, bool]: """ Recognize ambiguous and nonexistent timestamps for the given timezone. @@ -822,10 +822,10 @@ def __init__( self, data: Buffer, dtype: pd.DatetimeTZDtype, - mask: Optional[Buffer] = None, - size: Optional[int] = None, + mask: Buffer | None = None, + size: int | None = None, offset: int = 0, - null_count: Optional[int] = None, + null_count: int | None = None, ): super().__init__( data=data, diff --git a/python/cudf/cudf/core/column/decimal.py b/python/cudf/cudf/core/column/decimal.py index 9c1bedc9926..e9d9b4933e5 100644 --- a/python/cudf/cudf/core/column/decimal.py +++ b/python/cudf/cudf/core/column/decimal.py @@ -4,7 +4,7 @@ import warnings from decimal import Decimal -from typing import TYPE_CHECKING, Any, Optional, Sequence, Union, cast +from typing import TYPE_CHECKING, Any, Sequence, cast import cupy as cp import numpy as np @@ -49,7 +49,7 @@ def __cuda_array_interface__(self): def as_decimal_column( self, dtype: Dtype, - ) -> Union["DecimalBaseColumn"]: + ) -> "DecimalBaseColumn": if ( isinstance(dtype, cudf.core.dtypes.DecimalDtype) and dtype.scale < self.dtype.scale @@ -138,7 +138,7 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str): def fillna( self, fill_value: Any = None, - method: Optional[str] = None, + method: str | None = None, ) -> Self: """Fill null values with ``value``. 
@@ -199,7 +199,7 @@ def normalize_binop_value(self, other): return NotImplemented def _decimal_quantile( - self, q: Union[float, Sequence[float]], interpolation: str, exact: bool + self, q: float | Sequence[float], interpolation: str, exact: bool ) -> ColumnBase: quant = [float(q)] if not isinstance(q, (Sequence, np.ndarray)) else q # get sorted indices and exclude nulls diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index 080ba949d62..c548db67344 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -3,7 +3,7 @@ from __future__ import annotations from functools import cached_property -from typing import TYPE_CHECKING, List, Optional, Sequence, Tuple, Union +from typing import TYPE_CHECKING, Sequence import numpy as np import pandas as pd @@ -167,7 +167,7 @@ def set_base_data(self, value): else: super().set_base_data(value) - def set_base_children(self, value: Tuple[ColumnBase, ...]): + def set_base_children(self, value: tuple[ColumnBase, ...]): super().set_base_children(value) _, values = value self._dtype = cudf.ListDtype(element_type=values.dtype) @@ -269,7 +269,7 @@ def _transform_leaves(self, func, *args, **kwargs) -> Self: # as ``self``, but with the leaf column transformed # by applying ``func`` to it - cc: List[ListColumn] = [] + cc: list[ListColumn] = [] c: ColumnBase = self while isinstance(c, ListColumn): @@ -320,7 +320,7 @@ def __init__(self, parent: ParentType): def get( self, index: int, - default: Optional[Union[ScalarLike, ColumnLike]] = None, + default: ScalarLike | ColumnLike | None = None, ) -> ParentType: """ Extract element at the given index from each list in a Series of lists. 
@@ -424,7 +424,7 @@ def contains(self, search_key: ScalarLike) -> ParentType: contains_scalar(self._column, cudf.Scalar(search_key)) ) - def index(self, search_key: Union[ScalarLike, ColumnLike]) -> ParentType: + def index(self, search_key: ScalarLike | ColumnLike) -> ParentType: """ Returns integers representing the index of the search key for each row. diff --git a/python/cudf/cudf/core/column/methods.py b/python/cudf/cudf/core/column/methods.py index 7f7355c571a..7c6f4e05577 100644 --- a/python/cudf/cudf/core/column/methods.py +++ b/python/cudf/cudf/core/column/methods.py @@ -2,7 +2,7 @@ from __future__ import annotations -from typing import Optional, Union, overload +from typing import Union, overload from typing_extensions import Literal @@ -52,7 +52,7 @@ def _return_or_inplace( inplace: bool = False, expand: bool = False, retain_index: bool = True, - ) -> Optional[ParentType]: ... + ) -> ParentType | None: ... def _return_or_inplace( self, new_col, inplace=False, expand=False, retain_index=True diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index 6af67e02bb4..098cf43421b 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -3,16 +3,7 @@ from __future__ import annotations import functools -from typing import ( - TYPE_CHECKING, - Any, - Callable, - Optional, - Sequence, - Tuple, - Union, - cast, -) +from typing import TYPE_CHECKING, Any, Callable, Sequence, cast import cupy as cp import numpy as np @@ -85,10 +76,10 @@ def __init__( self, data: Buffer, dtype: DtypeObj, - mask: Optional[Buffer] = None, - size: Optional[int] = None, # TODO: make this non-optional + mask: Buffer | None = None, + size: int | None = None, # TODO: make this non-optional offset: int = 0, - null_count: Optional[int] = None, + null_count: int | None = None, ): dtype = cudf.dtype(dtype) @@ -179,7 +170,7 @@ def __setitem__(self, key: Any, value: Any): else: device_value = 
device_value.astype(self.dtype) - out: Optional[ColumnBase] # If None, no need to perform mimic inplace. + out: ColumnBase | None # If None, no need to perform mimic inplace. if isinstance(key, slice): out = self._scatter_by_slice(key, device_value) else: @@ -196,7 +187,7 @@ def __setitem__(self, key: Any, value: Any): if out: self._mimic_inplace(out, inplace=True) - def unary_operator(self, unaryop: Union[str, Callable]) -> ColumnBase: + def unary_operator(self, unaryop: str | Callable) -> ColumnBase: if callable(unaryop): return libcudf.transform.transform(self, unaryop) @@ -302,7 +293,7 @@ def nans_to_nulls(self: Self) -> Self: def normalize_binop_value( self, other: ScalarLike - ) -> Union[ColumnBase, cudf.Scalar]: + ) -> ColumnBase | cudf.Scalar: if isinstance(other, ColumnBase): if not isinstance(other, NumericalColumn): return NotImplemented @@ -422,7 +413,7 @@ def nan_count(self) -> int: def _process_values_for_isin( self, values: Sequence - ) -> Tuple[ColumnBase, ColumnBase]: + ) -> tuple[ColumnBase, ColumnBase]: lhs = cast("cudf.core.column.ColumnBase", self) try: rhs = as_column(values, nan_as_null=False) @@ -456,12 +447,12 @@ def _process_values_for_isin( return lhs, rhs - def _can_return_nan(self, skipna: Optional[bool] = None) -> bool: + def _can_return_nan(self, skipna: bool | None = None) -> bool: return not skipna and self.has_nulls(include_nan=True) def _process_for_reduction( - self, skipna: Optional[bool] = None, min_count: int = 0 - ) -> Union[NumericalColumn, ScalarLike]: + self, skipna: bool | None = None, min_count: int = 0 + ) -> NumericalColumn | ScalarLike: skipna = True if skipna is None else skipna if self._can_return_nan(skipna=skipna): @@ -544,7 +535,7 @@ def find_and_replace( def fillna( self, fill_value: Any = None, - method: Optional[str] = None, + method: str | None = None, ) -> Self: """ Fill null values with *fill_value* @@ -730,7 +721,7 @@ def _reduction_result_dtype(self, reduction_op: str) -> Dtype: def 
_normalize_find_and_replace_input( - input_column_dtype: DtypeObj, col_to_normalize: Union[ColumnBase, list] + input_column_dtype: DtypeObj, col_to_normalize: ColumnBase | list ) -> ColumnBase: normalized_column = column.as_column( col_to_normalize, diff --git a/python/cudf/cudf/core/column/numerical_base.py b/python/cudf/cudf/core/column/numerical_base.py index bd48054a951..95c78c5efcb 100644 --- a/python/cudf/cudf/core/column/numerical_base.py +++ b/python/cudf/cudf/core/column/numerical_base.py @@ -3,7 +3,7 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Optional, cast +from typing import TYPE_CHECKING, cast import numpy as np @@ -42,10 +42,10 @@ class NumericalBaseColumn(ColumnBase, Scannable): "cummax", } - def _can_return_nan(self, skipna: Optional[bool] = None) -> bool: + def _can_return_nan(self, skipna: bool | None = None) -> bool: return not skipna and self.has_nulls() - def kurtosis(self, skipna: Optional[bool] = None) -> float: + def kurtosis(self, skipna: bool | None = None) -> float: skipna = True if skipna is None else skipna if len(self) == 0 or self._can_return_nan(skipna=skipna): @@ -70,7 +70,7 @@ def kurtosis(self, skipna: Optional[bool] = None) -> float: kurt = term_one_section_one * term_one_section_two - 3 * term_two return kurt - def skew(self, skipna: Optional[bool] = None) -> ScalarLike: + def skew(self, skipna: bool | None = None) -> ScalarLike: skipna = True if skipna is None else skipna if len(self) == 0 or self._can_return_nan(skipna=skipna): @@ -142,7 +142,7 @@ def quantile( def mean( self, - skipna: Optional[bool] = None, + skipna: bool | None = None, min_count: int = 0, dtype=np.float64, ): @@ -152,7 +152,7 @@ def mean( def var( self, - skipna: Optional[bool] = None, + skipna: bool | None = None, min_count: int = 0, dtype=np.float64, ddof=1, @@ -163,7 +163,7 @@ def var( def std( self, - skipna: Optional[bool] = None, + skipna: bool | None = None, min_count: int = 0, dtype=np.float64, ddof=1, @@ -172,7 +172,7 
@@ def std( "std", skipna=skipna, min_count=min_count, dtype=dtype, ddof=ddof ) - def median(self, skipna: Optional[bool] = None) -> NumericalBaseColumn: + def median(self, skipna: bool | None = None) -> NumericalBaseColumn: skipna = True if skipna is None else skipna if self._can_return_nan(skipna=skipna): diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 87df2d2f1f1..2451a9cc0af 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -5,16 +5,7 @@ import re import warnings from functools import cached_property -from typing import ( - TYPE_CHECKING, - Any, - Optional, - Sequence, - Tuple, - Union, - cast, - overload, -) +from typing import TYPE_CHECKING, Any, Sequence, cast, overload import numpy as np import pandas as pd @@ -257,13 +248,13 @@ def byte_count(self) -> SeriesOrIndex: @overload def cat( - self, sep: Optional[str] = None, na_rep: Optional[str] = None + self, sep: str | None = None, na_rep: str | None = None ) -> str: ... @overload def cat( - self, others, sep: Optional[str] = None, na_rep: Optional[str] = None - ) -> Union[SeriesOrIndex, "cudf.core.column.string.StringColumn"]: ... + self, others, sep: str | None = None, na_rep: str | None = None + ) -> SeriesOrIndex | "cudf.core.column.string.StringColumn": ... def cat(self, others=None, sep=None, na_rep=None): """ @@ -641,7 +632,7 @@ def extract( def contains( self, - pat: Union[str, Sequence], + pat: str | Sequence, case: bool = True, flags: int = 0, na=np.nan, @@ -792,7 +783,7 @@ def contains( result_col = libstrings.contains_multiple(input_column, pat) return self._return_or_inplace(result_col) - def like(self, pat: str, esc: Optional[str] = None) -> SeriesOrIndex: + def like(self, pat: str, esc: str | None = None) -> SeriesOrIndex: """ Test if a like pattern matches a string of a Series or Index. 
@@ -863,7 +854,7 @@ def like(self, pat: str, esc: Optional[str] = None) -> SeriesOrIndex: def repeat( self, - repeats: Union[int, Sequence], + repeats: int | Sequence, ) -> SeriesOrIndex: """ Duplicate each string in the Series or Index. @@ -920,8 +911,8 @@ def repeat( def replace( self, - pat: Union[str, Sequence], - repl: Union[str, Sequence], + pat: str | Sequence, + repl: str | Sequence, n: int = -1, case=None, flags: int = 0, @@ -1074,9 +1065,9 @@ def replace_with_backrefs(self, pat: str, repl: str) -> SeriesOrIndex: def slice( self, - start: Optional[int] = None, - stop: Optional[int] = None, - step: Optional[int] = None, + start: int | None = None, + stop: int | None = None, + step: int | None = None, ) -> SeriesOrIndex: """ Slice substrings from each element in the Series or Index. @@ -2051,7 +2042,7 @@ def istitle(self) -> SeriesOrIndex: return self._return_or_inplace(libstrings.is_title(self._column)) def filter_alphanum( - self, repl: Optional[str] = None, keep: bool = True + self, repl: str | None = None, keep: bool = True ) -> SeriesOrIndex: """ Remove non-alphanumeric characters from strings in this column. @@ -2138,9 +2129,9 @@ def slice_from( def slice_replace( self, - start: Optional[int] = None, - stop: Optional[int] = None, - repl: Optional[str] = None, + start: int | None = None, + stop: int | None = None, + repl: str | None = None, ) -> SeriesOrIndex: """ Replace the specified section of each string with a new string. @@ -2228,9 +2219,7 @@ def slice_replace( ), ) - def insert( - self, start: int = 0, repl: Optional[str] = None - ) -> SeriesOrIndex: + def insert(self, start: int = 0, repl: str | None = None) -> SeriesOrIndex: """ Insert the specified string into each string in the specified position. 
@@ -2410,10 +2399,10 @@ def get_json_object( def split( self, - pat: Optional[str] = None, + pat: str | None = None, n: int = -1, expand: bool = False, - regex: Optional[bool] = None, + regex: bool | None = None, ) -> SeriesOrIndex: """ Split strings around given separator/delimiter. @@ -2578,10 +2567,10 @@ def split( def rsplit( self, - pat: Optional[str] = None, + pat: str | None = None, n: int = -1, expand: bool = False, - regex: Optional[bool] = None, + regex: bool | None = None, ) -> SeriesOrIndex: """ Split strings around given separator/delimiter. @@ -3233,7 +3222,7 @@ def rjust(self, width: int, fillchar: str = " ") -> SeriesOrIndex: libstrings.rjust(self._column, width, fillchar) ) - def strip(self, to_strip: Optional[str] = None) -> SeriesOrIndex: + def strip(self, to_strip: str | None = None) -> SeriesOrIndex: r""" Remove leading and trailing characters. @@ -3292,7 +3281,7 @@ def strip(self, to_strip: Optional[str] = None) -> SeriesOrIndex: libstrings.strip(self._column, cudf.Scalar(to_strip, "str")) ) - def lstrip(self, to_strip: Optional[str] = None) -> SeriesOrIndex: + def lstrip(self, to_strip: str | None = None) -> SeriesOrIndex: r""" Remove leading and trailing characters. @@ -3339,7 +3328,7 @@ def lstrip(self, to_strip: Optional[str] = None) -> SeriesOrIndex: libstrings.lstrip(self._column, cudf.Scalar(to_strip, "str")) ) - def rstrip(self, to_strip: Optional[str] = None) -> SeriesOrIndex: + def rstrip(self, to_strip: str | None = None) -> SeriesOrIndex: r""" Remove leading and trailing characters. @@ -3844,7 +3833,7 @@ def endswith(self, pat: str) -> SeriesOrIndex: return self._return_or_inplace(result_col) - def startswith(self, pat: Union[str, Sequence]) -> SeriesOrIndex: + def startswith(self, pat: str | Sequence) -> SeriesOrIndex: """ Test if the start of each string element matches a pattern. 
@@ -3996,7 +3985,7 @@ def removeprefix(self, prefix: str) -> SeriesOrIndex: return self._return_or_inplace(result) def find( - self, sub: str, start: int = 0, end: Optional[int] = None + self, sub: str, start: int = 0, end: int | None = None ) -> SeriesOrIndex: """ Return lowest indexes in each strings in the Series/Index @@ -4053,7 +4042,7 @@ def find( return self._return_or_inplace(result_col) def rfind( - self, sub: str, start: int = 0, end: Optional[int] = None + self, sub: str, start: int = 0, end: int | None = None ) -> SeriesOrIndex: """ Return highest indexes in each strings in the Series/Index @@ -4114,7 +4103,7 @@ def rfind( return self._return_or_inplace(result_col) def index( - self, sub: str, start: int = 0, end: Optional[int] = None + self, sub: str, start: int = 0, end: int | None = None ) -> SeriesOrIndex: """ Return lowest indexes in each strings where the substring @@ -4176,7 +4165,7 @@ def index( return result def rindex( - self, sub: str, start: int = 0, end: Optional[int] = None + self, sub: str, start: int = 0, end: int | None = None ) -> SeriesOrIndex: """ Return highest indexes in each strings where the substring @@ -4443,7 +4432,7 @@ def translate(self, table: dict) -> SeriesOrIndex: ) def filter_characters( - self, table: dict, keep: bool = True, repl: Optional[str] = None + self, table: dict, keep: bool = True, repl: str | None = None ) -> SeriesOrIndex: """ Remove characters from each string using the character ranges @@ -4924,7 +4913,7 @@ def ngrams_tokenize( ) def replace_tokens( - self, targets, replacements, delimiter: Optional[str] = None + self, targets, replacements, delimiter: str | None = None ) -> SeriesOrIndex: """ The targets tokens are searched for within each string in the series @@ -5009,8 +4998,8 @@ def replace_tokens( def filter_tokens( self, min_token_length: int, - replacement: Optional[str] = None, - delimiter: Optional[str] = None, + replacement: str | None = None, + delimiter: str | None = None, ) -> SeriesOrIndex: 
""" Remove tokens from within each string in the series that are @@ -5279,7 +5268,7 @@ def edit_distance_matrix(self) -> SeriesOrIndex: ) def minhash( - self, seeds: Optional[ColumnLike] = None, width: int = 4 + self, seeds: ColumnLike | None = None, width: int = 4 ) -> SeriesOrIndex: """ Compute the minhash of a strings column. @@ -5322,7 +5311,7 @@ def minhash( ) def minhash64( - self, seeds: Optional[ColumnLike] = None, width: int = 4 + self, seeds: ColumnLike | None = None, width: int = 4 ) -> SeriesOrIndex: """ Compute the minhash of a strings column. @@ -5436,8 +5425,8 @@ class StringColumn(column.ColumnBase): respectively """ - _start_offset: Optional[int] - _end_offset: Optional[int] + _start_offset: int | None + _end_offset: int | None _VALID_BINARY_OPERATIONS = { "__eq__", @@ -5461,12 +5450,12 @@ class StringColumn(column.ColumnBase): def __init__( self, - data: Optional[Buffer] = None, - mask: Optional[Buffer] = None, - size: Optional[int] = None, # TODO: make non-optional + data: Buffer | None = None, + mask: Buffer | None = None, + size: int | None = None, # TODO: make non-optional offset: int = 0, - null_count: Optional[int] = None, - children: Tuple["column.ColumnBase", ...] = (), + null_count: int | None = None, + children: tuple["column.ColumnBase", ...] 
= (), ): dtype = cudf.api.types.dtype("object") @@ -5634,8 +5623,8 @@ def to_arrow(self) -> pa.Array: def sum( self, - skipna: Optional[bool] = None, - dtype: Optional[Dtype] = None, + skipna: bool | None = None, + dtype: Dtype | None = None, min_count: int = 0, ): result_col = self._process_for_reduction( @@ -5852,7 +5841,7 @@ def find_and_replace( def fillna( self, fill_value: Any = None, - method: Optional[str] = None, + method: str | None = None, ) -> Self: if fill_value is not None: if not is_scalar(fill_value): @@ -5864,9 +5853,7 @@ def fillna( fill_value = cudf.Scalar(fill_value, dtype=self.dtype) return super().fillna(fill_value, method=method) - def normalize_binop_value( - self, other - ) -> Union[column.ColumnBase, cudf.Scalar]: + def normalize_binop_value(self, other) -> column.ColumnBase | cudf.Scalar: if ( isinstance(other, (column.ColumnBase, cudf.Scalar)) and other.dtype == "object" @@ -5930,8 +5917,8 @@ def _binaryop( # Explicit types are necessary because mypy infers ColumnBase # rather than StringColumn and sometimes forgets Scalar. 
- lhs: Union[cudf.Scalar, StringColumn] - rhs: Union[cudf.Scalar, StringColumn] + lhs: cudf.Scalar | StringColumn + rhs: cudf.Scalar | StringColumn lhs, rhs = (other, self) if reflect else (self, other) return cast( diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index 0af847f38af..8eec84b64f7 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -4,7 +4,7 @@ import datetime import functools -from typing import TYPE_CHECKING, Any, Optional, Sequence, cast +from typing import TYPE_CHECKING, Any, Sequence, cast import numpy as np import pandas as pd @@ -77,10 +77,10 @@ def __init__( self, data: Buffer, dtype: Dtype, - size: Optional[int] = None, # TODO: make non-optional - mask: Optional[Buffer] = None, + size: int | None = None, # TODO: make non-optional + mask: Buffer | None = None, offset: int = 0, - null_count: Optional[int] = None, + null_count: int | None = None, ): dtype = cudf.dtype(dtype) if dtype.kind != "m": @@ -255,7 +255,7 @@ def time_unit(self) -> str: def fillna( self, fill_value: Any = None, - method: Optional[str] = None, + method: str | None = None, ) -> Self: if fill_value is not None: if cudf.utils.utils._isnat(fill_value): @@ -316,7 +316,7 @@ def mean(self, skipna=None, dtype: Dtype = np.float64) -> pd.Timedelta: unit=self.time_unit, ).as_unit(self.time_unit) - def median(self, skipna: Optional[bool] = None) -> pd.Timedelta: + def median(self, skipna: bool | None = None) -> pd.Timedelta: return pd.Timedelta( self.as_numerical_column("int64").median(skipna=skipna), unit=self.time_unit, @@ -346,9 +346,9 @@ def quantile( def sum( self, - skipna: Optional[bool] = None, + skipna: bool | None = None, min_count: int = 0, - dtype: Optional[Dtype] = None, + dtype: Dtype | None = None, ) -> pd.Timedelta: return pd.Timedelta( # Since sum isn't overridden in Numerical[Base]Column, mypy only @@ -362,7 +362,7 @@ def sum( def std( self, - skipna: 
Optional[bool] = None, + skipna: bool | None = None, min_count: int = 0, dtype: Dtype = np.float64, ddof: int = 1, diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py index 9f3de061ee8..1bf9a393566 100644 --- a/python/cudf/cudf/core/column_accessor.py +++ b/python/cudf/cudf/core/column_accessor.py @@ -6,16 +6,7 @@ import sys from collections import abc from functools import cached_property, reduce -from typing import ( - TYPE_CHECKING, - Any, - Callable, - Dict, - Mapping, - Optional, - Tuple, - Union, -) +from typing import TYPE_CHECKING, Any, Callable, Mapping import numpy as np import pandas as pd @@ -98,13 +89,13 @@ class ColumnAccessor(abc.MutableMapping): column length and type """ - _data: "Dict[Any, ColumnBase]" + _data: "dict[Any, ColumnBase]" multiindex: bool - _level_names: Tuple[Any, ...] + _level_names: tuple[Any, ...] def __init__( self, - data: Union[abc.MutableMapping, ColumnAccessor, None] = None, + data: abc.MutableMapping | ColumnAccessor | None = None, multiindex: bool = False, level_names=None, rangeindex: bool = False, @@ -210,7 +201,7 @@ def _from_columns_like_self( ) @property - def level_names(self) -> Tuple[Any, ...]: + def level_names(self) -> tuple[Any, ...]: if self._level_names is None or len(self._level_names) == 0: return tuple((None,) * max(1, self.nlevels)) else: @@ -237,11 +228,11 @@ def nrows(self) -> int: return len(next(iter(self.values()))) @cached_property - def names(self) -> Tuple[Any, ...]: + def names(self) -> tuple[Any, ...]: return tuple(self.keys()) @cached_property - def columns(self) -> Tuple[ColumnBase, ...]: + def columns(self) -> tuple[ColumnBase, ...]: return tuple(self.values()) @cached_property @@ -610,7 +601,7 @@ def _pad_key(self, key: Any, pad_value="") -> Any: return key + (pad_value,) * (self.nlevels - len(key)) def rename_levels( - self, mapper: Union[Mapping[Any, Any], Callable], level: Optional[int] + self, mapper: Mapping[Any, Any] | Callable, level: int | 
None ) -> ColumnAccessor: """ Rename the specified levels of the given ColumnAccessor diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index d8d46a6df73..065b13561ab 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -14,20 +14,7 @@ import warnings from collections import abc, defaultdict from collections.abc import Iterator -from typing import ( - TYPE_CHECKING, - Any, - Callable, - Dict, - List, - Literal, - MutableMapping, - Optional, - Set, - Tuple, - Union, - cast, -) +from typing import TYPE_CHECKING, Any, Callable, Literal, MutableMapping, cast import cupy import numba @@ -684,7 +671,7 @@ class DataFrame(IndexedFrame, Serializable, GetAttrGetItemMixin): """ _PROTECTED_KEYS = frozenset(("_data", "_index")) - _accessors: Set[Any] = set() + _accessors: set[Any] = set() _loc_indexer_type = _DataFrameLocIndexer _iloc_indexer_type = _DataFrameIlocIndexer _groupby = DataFrameGroupBy @@ -1123,7 +1110,7 @@ def _init_from_dict_like( def _from_data( cls, data: MutableMapping, - index: Optional[BaseIndex] = None, + index: BaseIndex | None = None, columns: Any = None, ) -> DataFrame: out = super()._from_data(data=data, index=index) @@ -1553,7 +1540,7 @@ def _get_numeric_data(self): return self[columns] @_cudf_nvtx_annotate - def assign(self, **kwargs: Union[Callable[[Self], Any], Any]): + def assign(self, **kwargs: Callable[[Self], Any] | Any): """ Assign columns to DataFrame from keyword arguments. 
@@ -2009,12 +1996,10 @@ def _make_operands_and_index_for_binop( fill_value: Any = None, reflect: bool = False, can_reindex: bool = False, - ) -> Tuple[ - Union[ - Dict[Optional[str], Tuple[ColumnBase, Any, bool, Any]], - NotImplementedType, - ], - Optional[BaseIndex], + ) -> tuple[ + dict[str | None, tuple[ColumnBase, Any, bool, Any]] + | NotImplementedType, + BaseIndex | None, bool, ]: lhs, rhs = self._data, other @@ -2119,8 +2104,8 @@ def from_dict( cls, data: dict, orient: str = "columns", - dtype: Optional[Dtype] = None, - columns: Optional[list] = None, + dtype: Dtype | None = None, + columns: list | None = None, ) -> DataFrame: """ Construct DataFrame from dict of array-like or dicts. @@ -4584,7 +4569,7 @@ def apply( def applymap( self, func: Callable[[Any], Any], - na_action: Union[str, None] = None, + na_action: str | None = None, **kwargs, ) -> DataFrame: """ @@ -4617,7 +4602,7 @@ def applymap( def map( self, func: Callable[[Any], Any], - na_action: Union[str, None] = None, + na_action: str | None = None, **kwargs, ) -> DataFrame: """ @@ -7498,7 +7483,7 @@ def nunique(self, axis=0, dropna: bool = True) -> Series: def _sample_axis_1( self, n: int, - weights: Optional[ColumnLike], + weights: ColumnLike | None, replace: bool, random_state: np.random.RandomState, ignore_index: bool, @@ -7523,11 +7508,11 @@ def _sample_axis_1( def _from_columns_like_self( self, - columns: List[ColumnBase], - column_names: Optional[abc.Iterable[str]] = None, - index_names: Optional[List[str]] = None, + columns: list[ColumnBase], + column_names: abc.Iterable[str] | None = None, + index_names: list[str] | None = None, *, - override_dtypes: Optional[abc.Iterable[Optional[Dtype]]] = None, + override_dtypes: abc.Iterable[Dtype | None] | None = None, ) -> DataFrame: result = super()._from_columns_like_self( columns, @@ -8128,7 +8113,7 @@ def _setitem_with_dataframe( input_df: DataFrame, replace_df: DataFrame, input_cols: Any = None, - mask: Optional[ColumnBase] = None, + mask: 
ColumnBase | None = None, ignore_index: bool = False, ): """ diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py index 62ded8ac6f1..9cd573aceb9 100644 --- a/python/cudf/cudf/core/df_protocol.py +++ b/python/cudf/cudf/core/df_protocol.py @@ -1,17 +1,9 @@ # Copyright (c) 2021-2024, NVIDIA CORPORATION. +from __future__ import annotations import enum from collections import abc -from typing import ( - Any, - Dict, - Iterable, - Mapping, - Optional, - Sequence, - Tuple, - cast, -) +from typing import Any, Iterable, Mapping, Sequence, Tuple, cast import cupy as cp import numpy as np @@ -109,7 +101,7 @@ def __dlpack__(self): except ValueError: raise TypeError(f"dtype {self._dtype} unsupported by `dlpack`") - def __dlpack_device__(self) -> Tuple[_Device, int]: + def __dlpack_device__(self) -> tuple[_Device, int]: """ _Device type and _Device ID for where the data in the buffer resides. """ @@ -265,7 +257,7 @@ def _dtype_from_cudfdtype(self, dtype) -> ProtoDtype: return (kind, bitwidth, format_str, endianness) @property - def describe_categorical(self) -> Tuple[bool, bool, Dict[int, Any]]: + def describe_categorical(self) -> tuple[bool, bool, dict[int, Any]]: """ If the dtype is categorical, there are two options: @@ -298,7 +290,7 @@ def describe_categorical(self) -> Tuple[bool, bool, Dict[int, Any]]: return ordered, is_dictionary, mapping @property - def describe_null(self) -> Tuple[int, Any]: + def describe_null(self) -> tuple[int, Any]: """ Return the missing value (or "null") representation the column dtype uses, as a tuple ``(kind, value)``. @@ -338,7 +330,7 @@ def null_count(self) -> int: return self._col.null_count @property - def metadata(self) -> Dict[str, Any]: + def metadata(self) -> dict[str, Any]: """ Store specific metadata of the column. 
""" @@ -351,7 +343,7 @@ def num_chunks(self) -> int: return 1 def get_chunks( - self, n_chunks: Optional[int] = None + self, n_chunks: int | None = None ) -> Iterable["_CuDFColumn"]: """ Return an iterable yielding the chunks. @@ -362,7 +354,7 @@ def get_chunks( def get_buffers( self, - ) -> Mapping[str, Optional[Tuple[_CuDFBuffer, ProtoDtype]]]: + ) -> Mapping[str, tuple[_CuDFBuffer, ProtoDtype] | None]: """ Return a dictionary containing the underlying buffers. @@ -400,7 +392,7 @@ def get_buffers( def _get_validity_buffer( self, - ) -> Optional[Tuple[_CuDFBuffer, ProtoDtype]]: + ) -> tuple[_CuDFBuffer, ProtoDtype] | None: """ Return the buffer containing the mask values indicating missing data and the buffer's associated dtype. @@ -433,7 +425,7 @@ def _get_validity_buffer( def _get_offsets_buffer( self, - ) -> Optional[Tuple[_CuDFBuffer, ProtoDtype]]: + ) -> tuple[_CuDFBuffer, ProtoDtype] | None: """ Return the buffer containing the offset values for variable-size binary data (e.g., variable-length strings) @@ -461,7 +453,7 @@ def _get_offsets_buffer( def _get_data_buffer( self, - ) -> Tuple[_CuDFBuffer, ProtoDtype]: + ) -> tuple[_CuDFBuffer, ProtoDtype]: """ Return the buffer containing the data and the buffer's associated dtype. @@ -588,7 +580,7 @@ def select_columns_by_name(self, names: Sequence[str]) -> "_CuDFDataFrame": ) def get_chunks( - self, n_chunks: Optional[int] = None + self, n_chunks: int | None = None ) -> Iterable["_CuDFDataFrame"]: """ Return an iterator yielding the chunks. 
@@ -745,9 +737,9 @@ def from_dataframe( def _protocol_to_cudf_column_numeric( col, allow_copy: bool -) -> Tuple[ +) -> tuple[ cudf.core.column.ColumnBase, - Mapping[str, Optional[Tuple[_CuDFBuffer, ProtoDtype]]], + Mapping[str, tuple[_CuDFBuffer, ProtoDtype] | None], ]: """ Convert an int, uint, float or bool protocol column @@ -822,9 +814,9 @@ def protocol_dtype_to_cupy_dtype(_dtype: ProtoDtype) -> cp.dtype: def _protocol_to_cudf_column_categorical( col, allow_copy: bool -) -> Tuple[ +) -> tuple[ cudf.core.column.ColumnBase, - Mapping[str, Optional[Tuple[_CuDFBuffer, ProtoDtype]]], + Mapping[str, tuple[_CuDFBuffer, ProtoDtype] | None], ]: """ Convert a categorical column to a Series instance @@ -857,9 +849,9 @@ def _protocol_to_cudf_column_categorical( def _protocol_to_cudf_column_string( col, allow_copy: bool -) -> Tuple[ +) -> tuple[ cudf.core.column.ColumnBase, - Mapping[str, Optional[Tuple[_CuDFBuffer, ProtoDtype]]], + Mapping[str, tuple[_CuDFBuffer, ProtoDtype] | None], ]: """ Convert a string ColumnObject to cudf Column object. diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index b1282040e60..034849d0e71 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -1,4 +1,5 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. 
+from __future__ import annotations import decimal import operator @@ -6,7 +7,7 @@ import textwrap import warnings from functools import cached_property -from typing import TYPE_CHECKING, Any, Callable, Dict, List, Tuple, Type, Union +from typing import TYPE_CHECKING, Any, Callable import numpy as np import pandas as pd @@ -16,12 +17,12 @@ from pandas.core.arrays.arrow.extension_types import ArrowIntervalType import cudf -from cudf._typing import Dtype from cudf.core._compat import PANDAS_LT_300 from cudf.core.abc import Serializable from cudf.utils.docutils import doc_apply if TYPE_CHECKING: + from cudf._typing import Dtype from cudf.core.buffer import Buffer @@ -84,11 +85,11 @@ def dtype(arbitrary): def _decode_type( - cls: Type, + cls: type, header: dict, frames: list, - is_valid_class: Callable[[Type, Type], bool] = operator.is_, -) -> Tuple[dict, list, Type]: + is_valid_class: Callable[[type, type], bool] = operator.is_, +) -> tuple[dict, list, type]: """Decode metadata-encoded type and check validity Parameters @@ -481,8 +482,8 @@ def __repr__(self): def __hash__(self): return hash(self._typ) - def serialize(self) -> Tuple[dict, list]: - header: Dict[str, Dtype] = {} + def serialize(self) -> tuple[dict, list]: + header: dict[str, Dtype] = {} header["type-serialized"] = pickle.dumps(type(self)) frames = [] @@ -627,13 +628,13 @@ def __repr__(self): def __hash__(self): return hash(self._typ) - def serialize(self) -> Tuple[dict, list]: - header: Dict[str, Any] = {} + def serialize(self) -> tuple[dict, list]: + header: dict[str, Any] = {} header["type-serialized"] = pickle.dumps(type(self)) - frames: List[Buffer] = [] + frames: list[Buffer] = [] - fields: Dict[str, Union[bytes, Tuple[Any, Tuple[int, int]]]] = {} + fields: dict[str, bytes | tuple[Any, tuple[int, int]]] = {} for k, dtype in self.fields.items(): if isinstance(dtype, _BaseDtype): @@ -823,7 +824,7 @@ def _from_decimal(cls, decimal): precision = max(len(metadata.digits), -metadata.exponent) return 
cls(precision, -metadata.exponent) - def serialize(self) -> Tuple[dict, list]: + def serialize(self) -> tuple[dict, list]: return ( { "type-serialized": pickle.dumps(type(self)), @@ -946,7 +947,7 @@ def __eq__(self, other): def __hash__(self): return hash((self.subtype, self.closed)) - def serialize(self) -> Tuple[dict, list]: + def serialize(self) -> tuple[dict, list]: header = { "type-serialized": pickle.dumps(type(self)), "fields": pickle.dumps((self.subtype, self.closed)), diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 6a1ef05b1f9..c58a0161ee0 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -8,18 +8,7 @@ import pickle import warnings from collections import abc -from typing import ( - TYPE_CHECKING, - Any, - Callable, - Dict, - List, - Literal, - MutableMapping, - Optional, - Tuple, - Union, -) +from typing import TYPE_CHECKING, Any, Callable, Literal, MutableMapping # TODO: The `numpy` import is needed for typing purposes during doc builds # only, need to figure out why the `np` alias is insufficient then remove. @@ -83,11 +72,11 @@ def _num_rows(self) -> int: return self._data.nrows @property - def _column_names(self) -> Tuple[Any, ...]: + def _column_names(self) -> tuple[Any, ...]: return self._data.names @property - def _columns(self) -> Tuple[ColumnBase, ...]: + def _columns(self) -> tuple[ColumnBase, ...]: return self._data.columns @property @@ -154,10 +143,10 @@ def _from_data_like_self(self, data: MutableMapping) -> Self: @_cudf_nvtx_annotate def _from_columns_like_self( self, - columns: List[ColumnBase], - column_names: Optional[abc.Iterable[str]] = None, + columns: list[ColumnBase], + column_names: abc.Iterable[str] | None = None, *, - override_dtypes: Optional[abc.Iterable[Optional[Dtype]]] = None, + override_dtypes: abc.Iterable[Dtype | None] | None = None, ): """Construct a Frame from a list of columns with metadata from self. 
@@ -172,7 +161,7 @@ def _from_columns_like_self( @_cudf_nvtx_annotate def _mimic_inplace( self, result: Self, inplace: bool = False - ) -> Optional[Self]: + ) -> Self | None: if inplace: for col in self._data: if col in result._data: @@ -424,15 +413,15 @@ def _to_array( get_array: Callable, module: ModuleType, copy: bool, - dtype: Union[Dtype, None] = None, + dtype: Dtype | None = None, na_value=None, - ) -> Union[cupy.ndarray, numpy.ndarray]: + ) -> cupy.ndarray | numpy.ndarray: # Internal function to implement to_cupy and to_numpy, which are nearly # identical except for the attribute they access to generate values. def to_array( col: ColumnBase, dtype: np.dtype - ) -> Union[cupy.ndarray, numpy.ndarray]: + ) -> cupy.ndarray | numpy.ndarray: if na_value is not None: col = col.fillna(na_value) array = get_array(col) @@ -485,7 +474,7 @@ def to_array( @_cudf_nvtx_annotate def to_cupy( self, - dtype: Union[Dtype, None] = None, + dtype: Dtype | None = None, copy: bool = False, na_value=None, ) -> cupy.ndarray: @@ -519,7 +508,7 @@ def to_cupy( @_cudf_nvtx_annotate def to_numpy( self, - dtype: Union[Dtype, None] = None, + dtype: Dtype | None = None, copy: bool = True, na_value=None, ) -> numpy.ndarray: @@ -552,7 +541,7 @@ def to_numpy( ) @_cudf_nvtx_annotate - def where(self, cond, other=None, inplace: bool = False) -> Optional[Self]: + def where(self, cond, other=None, inplace: bool = False) -> Self | None: """ Replace values where the condition is False. @@ -628,11 +617,11 @@ def where(self, cond, other=None, inplace: bool = False) -> Optional[Self]: def fillna( self, value=None, - method: Optional[Literal["ffill", "bfill", "pad", "backfill"]] = None, + method: Literal["ffill", "bfill", "pad", "backfill"] | None = None, axis=None, inplace: bool = False, limit=None, - ) -> Optional[Self]: + ) -> Self | None: """Fill null values with ``value`` or specified ``method``. 
Parameters @@ -1047,7 +1036,7 @@ def _copy_type_metadata( self, other: Self, *, - override_dtypes: Optional[abc.Iterable[Optional[Dtype]]] = None, + override_dtypes: abc.Iterable[Dtype | None] | None = None, ) -> Self: """ Copy type metadata from each column of `other` to the corresponding @@ -1495,7 +1484,7 @@ def _unaryop(self, op): @_cudf_nvtx_annotate def _colwise_binop( cls, - operands: Dict[Optional[str], Tuple[ColumnBase, Any, bool, Any]], + operands: dict[str | None, tuple[ColumnBase, Any, bool, Any]], fn: str, ): """Implement binary ops between two frame-like objects. @@ -1910,8 +1899,8 @@ def nunique(self, dropna: bool = True): @staticmethod @_cudf_nvtx_annotate def _repeat( - columns: List[ColumnBase], repeats, axis=None - ) -> List[ColumnBase]: + columns: list[ColumnBase], repeats, axis=None + ) -> list[ColumnBase]: if axis is not None: raise NotImplementedError( "Only axis=`None` supported at this time." diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index aa96051ea51..d08268eea3a 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -1,4 +1,5 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. 
+from __future__ import annotations import copy import itertools @@ -7,7 +8,7 @@ import warnings from collections import abc from functools import cached_property -from typing import Any, Iterable, List, Optional, Tuple, Union +from typing import TYPE_CHECKING, Any, Iterable import cupy as cp import numpy as np @@ -20,7 +21,6 @@ from cudf._lib.reshape import interleave_columns from cudf._lib.sort import segmented_sort_by_key from cudf._lib.types import size_type_dtype -from cudf._typing import AggType, DataFrameOrSeries, MultiColumnAggType from cudf.api.extensions import no_default from cudf.api.types import is_bool_dtype, is_list_like, is_numeric_dtype from cudf.core._compat import PANDAS_LT_300 @@ -34,6 +34,9 @@ from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate from cudf.utils.utils import GetAttrGetItemMixin +if TYPE_CHECKING: + from cudf._typing import AggType, DataFrameOrSeries, MultiColumnAggType + def _deprecate_collect(): warnings.warn( @@ -1033,11 +1036,11 @@ def ngroup(self, ascending=True): def sample( self, - n: Optional[int] = None, - frac: Optional[float] = None, + n: int | None = None, + frac: float | None = None, replace: bool = False, - weights: Union[abc.Sequence, "cudf.Series", None] = None, - random_state: Union[np.random.RandomState, int, None] = None, + weights: abc.Sequence | "cudf.Series" | None = None, + random_state: np.random.RandomState | int | None = None, ): """Return a random sample of items in each group. @@ -1222,7 +1225,7 @@ def _grouped(self, *, include_groups: bool = True): def _normalize_aggs( self, aggs: MultiColumnAggType - ) -> Tuple[Iterable[Any], Tuple[ColumnBase, ...], List[List[AggType]]]: + ) -> tuple[Iterable[Any], tuple[ColumnBase, ...], list[list[AggType]]]: """ Normalize aggs to a list of list of aggregations, where `out[i]` is a list of aggregations for column `self.obj[i]`. We support three @@ -1237,7 +1240,7 @@ def _normalize_aggs( Each agg can be string or lambda functions. 
""" - aggs_per_column: Iterable[Union[AggType, Iterable[AggType]]] + aggs_per_column: Iterable[AggType | Iterable[AggType]] if isinstance(aggs, dict): column_names, aggs_per_column = aggs.keys(), aggs.values() columns = tuple(self.obj._data[col] for col in column_names) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 11d09e470ff..13fa187842d 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -7,17 +7,7 @@ import warnings from functools import cache, cached_property from numbers import Number -from typing import ( - TYPE_CHECKING, - Any, - List, - Literal, - MutableMapping, - Optional, - Tuple, - Union, - cast, -) +from typing import TYPE_CHECKING, Any, Literal, MutableMapping, cast import cupy import numpy as np @@ -101,10 +91,10 @@ def __subclasscheck__(self, subclass): def _lexsorted_equal_range( - idx: Union[Index, cudf.MultiIndex], + idx: Index | cudf.MultiIndex, key_as_table: Frame, is_sorted: bool, -) -> Tuple[int, int, Optional[ColumnBase]]: +) -> tuple[int, int, ColumnBase | None]: """Get equal range for key in lexicographically sorted index. If index is not sorted when called, a sort will take place and `sort_inds` is returned. Otherwise `None` is returned in that position. 
@@ -2858,7 +2848,7 @@ class IntervalIndex(Index): def __init__( self, data, - closed: Optional[Literal["left", "right", "neither", "both"]] = None, + closed: Literal["left", "right", "neither", "both"] | None = None, dtype=None, copy: bool = False, name=None, @@ -2917,9 +2907,7 @@ def closed(self): def from_breaks( cls, breaks, - closed: Optional[ - Literal["left", "right", "neither", "both"] - ] = "right", + closed: Literal["left", "right", "neither", "both"] | None = "right", name=None, copy: bool = False, dtype=None, @@ -3106,7 +3094,7 @@ def _getdefault_name(values, name): @_cudf_nvtx_annotate -def _concat_range_index(indexes: List[RangeIndex]) -> BaseIndex: +def _concat_range_index(indexes: list[RangeIndex]) -> BaseIndex: """ An internal Utility function to concat RangeIndex objects. """ @@ -3147,7 +3135,7 @@ def _concat_range_index(indexes: List[RangeIndex]) -> BaseIndex: @_cudf_nvtx_annotate -def _extended_gcd(a: int, b: int) -> Tuple[int, int, int]: +def _extended_gcd(a: int, b: int) -> tuple[int, int, int]: """ Extended Euclidean algorithms to solve Bezout's identity: a*x + b*y = gcd(x, y) @@ -3197,7 +3185,7 @@ def _get_nearest_indexer( index: Index, positions: cudf.Series, target_col: cudf.core.column.ColumnBase, - tolerance: Union[int, float], + tolerance: int | float, ): """ Get the indexer for the nearest index labels; requires an index with diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 3a4f4874e35..06da62306e8 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -12,15 +12,9 @@ TYPE_CHECKING, Any, Callable, - Dict, - List, Literal, MutableMapping, - Optional, - Tuple, - Type, TypeVar, - Union, cast, ) from uuid import uuid4 @@ -258,8 +252,8 @@ class IndexedFrame(Frame): """ # mypy can't handle bound type variables as class members - _loc_indexer_type: Type[_LocIndexerClass] # type: ignore - _iloc_indexer_type: Type[_IlocIndexerClass] # type: ignore + 
_loc_indexer_type: type[_LocIndexerClass] # type: ignore + _iloc_indexer_type: type[_IlocIndexerClass] # type: ignore _index: cudf.core.index.BaseIndex _groupby = GroupBy _resampler = _Resampler @@ -294,14 +288,14 @@ def _num_rows(self) -> int: return len(self.index) @property - def _index_names(self) -> Tuple[Any, ...]: # TODO: Tuple[str]? + def _index_names(self) -> tuple[Any, ...]: # TODO: Tuple[str]? return self.index._data.names @classmethod def _from_data( cls, data: MutableMapping, - index: Optional[BaseIndex] = None, + index: BaseIndex | None = None, ): out = super()._from_data(data) out._index = RangeIndex(out._data.nrows) if index is None else index @@ -316,11 +310,11 @@ def _from_data_like_self(self, data: MutableMapping): @_cudf_nvtx_annotate def _from_columns_like_self( self, - columns: List[ColumnBase], - column_names: Optional[abc.Iterable[str]] = None, - index_names: Optional[List[str]] = None, + columns: list[ColumnBase], + column_names: abc.Iterable[str] | None = None, + index_names: list[str] | None = None, *, - override_dtypes: Optional[abc.Iterable[Optional[Dtype]]] = None, + override_dtypes: abc.Iterable[Dtype | None] | None = None, ) -> Self: """Construct a `Frame` from a list of columns with metadata from self. @@ -368,7 +362,7 @@ def __round__(self, digits=0): def _mimic_inplace( self, result: Self, inplace: bool = False - ) -> Optional[Self]: + ) -> Self | None: if inplace: self._index = result.index return super()._mimic_inplace(result, inplace) @@ -1788,7 +1782,7 @@ def skew(self, axis=0, skipna=True, numeric_only=False, **kwargs): ) @_cudf_nvtx_annotate - def mask(self, cond, other=None, inplace: bool = False) -> Optional[Self]: + def mask(self, cond, other=None, inplace: bool = False) -> Self | None: """ Replace values where the condition is True. 
@@ -1924,7 +1918,7 @@ def _copy_type_metadata( other: Self, include_index: bool = True, *, - override_dtypes: Optional[abc.Iterable[Optional[Dtype]]] = None, + override_dtypes: abc.Iterable[Dtype | None] | None = None, ) -> Self: """ Copy type metadata from each column of `other` to the corresponding @@ -4670,9 +4664,9 @@ def sample( def _sample_axis_0( self, n: int, - weights: Optional[ColumnLike], + weights: ColumnLike | None, replace: bool, - random_state: Union[np.random.RandomState, cp.random.RandomState], + random_state: np.random.RandomState | cp.random.RandomState, ignore_index: bool, ): try: @@ -4695,7 +4689,7 @@ def _sample_axis_0( def _sample_axis_1( self, n: int, - weights: Optional[ColumnLike], + weights: ColumnLike | None, replace: bool, random_state: np.random.RandomState, ignore_index: bool, @@ -4742,12 +4736,10 @@ def _make_operands_and_index_for_binop( fill_value: Any = None, reflect: bool = False, can_reindex: bool = False, - ) -> Tuple[ - Union[ - Dict[Optional[str], Tuple[ColumnBase, Any, bool, Any]], - NotImplementedType, - ], - Optional[cudf.BaseIndex], + ) -> tuple[ + dict[str | None, tuple[ColumnBase, Any, bool, Any]] + | NotImplementedType, + cudf.BaseIndex | None, bool, ]: raise NotImplementedError( @@ -6328,8 +6320,8 @@ def _check_duplicate_level_names(specified, level_names): @_cudf_nvtx_annotate def _get_replacement_values_for_columns( - to_replace: Any, value: Any, columns_dtype_map: Dict[Any, Any] -) -> Tuple[Dict[Any, bool], Dict[Any, Any], Dict[Any, Any]]: + to_replace: Any, value: Any, columns_dtype_map: dict[Any, Any] +) -> tuple[dict[Any, bool], dict[Any, Any], dict[Any, Any]]: """ Returns a per column mapping for the values to be replaced, new values to be replaced with and if all the values are empty. @@ -6354,9 +6346,9 @@ def _get_replacement_values_for_columns( A dict mapping of all columns and the corresponding values to be replaced with. 
""" - to_replace_columns: Dict[Any, Any] = {} - values_columns: Dict[Any, Any] = {} - all_na_columns: Dict[Any, Any] = {} + to_replace_columns: dict[Any, Any] = {} + values_columns: dict[Any, Any] = {} + all_na_columns: dict[Any, Any] = {} if is_scalar(to_replace) and is_scalar(value): to_replace_columns = {col: [to_replace] for col in columns_dtype_map} @@ -6496,8 +6488,8 @@ def _is_series(obj): @_cudf_nvtx_annotate def _drop_rows_by_labels( obj: DataFrameOrSeries, - labels: Union[ColumnLike, abc.Iterable, str], - level: Union[int, str], + labels: ColumnLike | abc.Iterable | str, + level: int | str, errors: str, ) -> DataFrameOrSeries: """Remove rows specified by `labels`. diff --git a/python/cudf/cudf/core/indexing_utils.py b/python/cudf/cudf/core/indexing_utils.py index 7242de9964f..73a1cd26367 100644 --- a/python/cudf/cudf/core/indexing_utils.py +++ b/python/cudf/cudf/core/indexing_utils.py @@ -1,9 +1,9 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. from __future__ import annotations from dataclasses import dataclass -from typing import Any, List, Tuple, Union +from typing import Any, List, Union from typing_extensions import TypeAlias @@ -59,7 +59,7 @@ class ScalarIndexer: def destructure_iloc_key( - key: Any, frame: Union[cudf.Series, cudf.DataFrame] + key: Any, frame: cudf.Series | cudf.DataFrame ) -> tuple[Any, ...]: """ Destructure a potentially tuple-typed key into row and column indexers. @@ -124,7 +124,7 @@ def destructure_iloc_key( def destructure_dataframe_iloc_indexer( key: Any, frame: cudf.DataFrame -) -> Tuple[Any, Tuple[bool, ColumnLabels]]: +) -> tuple[Any, tuple[bool, ColumnLabels]]: """Destructure an index key for DataFrame iloc getitem. 
Parameters diff --git a/python/cudf/cudf/core/join/_join_helpers.py b/python/cudf/cudf/core/join/_join_helpers.py index 05cbb4429b9..dd0a4f666a1 100644 --- a/python/cudf/cudf/core/join/_join_helpers.py +++ b/python/cudf/cudf/core/join/_join_helpers.py @@ -4,7 +4,7 @@ import warnings from collections import abc -from typing import TYPE_CHECKING, Any, Tuple, cast +from typing import TYPE_CHECKING, Any, cast import numpy as np @@ -51,7 +51,7 @@ def set(self, obj: cudf.DataFrame, value: ColumnBase, validate=False): def _match_join_keys( lcol: ColumnBase, rcol: ColumnBase, how: str -) -> Tuple[ColumnBase, ColumnBase]: +) -> tuple[ColumnBase, ColumnBase]: # Casts lcol and rcol to a common dtype for use as join keys. If no casting # is necessary, they are returned as is. @@ -133,7 +133,7 @@ def _match_join_keys( def _match_categorical_dtypes_both( lcol: CategoricalColumn, rcol: CategoricalColumn, how: str -) -> Tuple[ColumnBase, ColumnBase]: +) -> tuple[ColumnBase, ColumnBase]: ltype, rtype = lcol.dtype, rcol.dtype # when both are ordered and both have the same categories, diff --git a/python/cudf/cudf/core/join/join.py b/python/cudf/cudf/core/join/join.py index da999441ca3..ce81c1fc5b1 100644 --- a/python/cudf/cudf/core/join/join.py +++ b/python/cudf/cudf/core/join/join.py @@ -2,7 +2,7 @@ from __future__ import annotations import itertools -from typing import Any, ClassVar, List, Optional +from typing import Any, ClassVar import cudf from cudf import _lib as libcudf @@ -370,7 +370,7 @@ def _merge_results( else: multiindex_columns = False - index: Optional[cudf.BaseIndex] + index: cudf.BaseIndex | None if self._using_right_index: # right_index and left_on index = left_result.index @@ -398,7 +398,7 @@ def _sort_result(self, result: cudf.DataFrame) -> cudf.DataFrame: # This is taken care of by using a stable sort here, and (in # pandas-compat mode) reordering the gather maps before # producing the input result. 
- by: List[Any] = [] + by: list[Any] = [] if self._using_left_index and self._using_right_index: by.extend(result.index._data.columns) if not self._using_left_index: diff --git a/python/cudf/cudf/core/mixins/binops.pyi b/python/cudf/cudf/core/mixins/binops.pyi index 8587b2dea48..6be73e25332 100644 --- a/python/cudf/cudf/core/mixins/binops.pyi +++ b/python/cudf/cudf/core/mixins/binops.pyi @@ -1,12 +1,12 @@ # Copyright (c) 2022, NVIDIA CORPORATION. -from typing import Any, Set, Tuple, TypeVar +from typing import Any, TypeVar # Note: It may be possible to define a narrower bound here eventually. BinaryOperandType = TypeVar("BinaryOperandType", bound="Any") class BinaryOperand: - _SUPPORTED_BINARY_OPERATIONS: Set + _SUPPORTED_BINARY_OPERATIONS: set def _binaryop(self, other: BinaryOperandType, op: str): ... def __add__(self, other): ... @@ -36,4 +36,4 @@ class BinaryOperand: def __gt__(self, other): ... def __ge__(self, other): ... @staticmethod - def _check_reflected_op(op) -> Tuple[bool, str]: ... + def _check_reflected_op(op) -> tuple[bool, str]: ... diff --git a/python/cudf/cudf/core/mixins/reductions.pyi b/python/cudf/cudf/core/mixins/reductions.pyi index dbaafdb5cd2..1c2126002ad 100644 --- a/python/cudf/cudf/core/mixins/reductions.pyi +++ b/python/cudf/cudf/core/mixins/reductions.pyi @@ -1,9 +1,7 @@ # Copyright (c) 2022, NVIDIA CORPORATION. -from typing import Set - class Reducible: - _SUPPORTED_REDUCTIONS: Set + _SUPPORTED_REDUCTIONS: set def sum(self): ... def product(self): ... diff --git a/python/cudf/cudf/core/mixins/scans.pyi b/python/cudf/cudf/core/mixins/scans.pyi index 37995241b1f..5190750c698 100644 --- a/python/cudf/cudf/core/mixins/scans.pyi +++ b/python/cudf/cudf/core/mixins/scans.pyi @@ -1,9 +1,7 @@ # Copyright (c) 2022, NVIDIA CORPORATION. -from typing import Set - class Scannable: - _SUPPORTED_SCANS: Set + _SUPPORTED_SCANS: set def cumsum(self): ... def cumprod(self): ... 
diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 91488e06f4e..832cc003d2e 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -10,7 +10,7 @@ from collections import abc from functools import cached_property from numbers import Integral -from typing import TYPE_CHECKING, Any, List, MutableMapping, Tuple, Union +from typing import TYPE_CHECKING, Any, MutableMapping import cupy as cp import numpy as np @@ -40,7 +40,7 @@ from cudf._typing import DataFrameOrSeries -def _maybe_indices_to_slice(indices: cp.ndarray) -> Union[slice, cp.ndarray]: +def _maybe_indices_to_slice(indices: cp.ndarray) -> slice | cp.ndarray: """Makes best effort to convert an array of indices into a python slice. If the conversion is not possible, return input. `indices` are expected to be valid. @@ -849,9 +849,10 @@ def _index_and_downcast(self, result, index, index_key): def _get_row_major( self, df: DataFrameOrSeries, - row_tuple: Union[ - numbers.Number, slice, Tuple[Any, ...], List[Tuple[Any, ...]] - ], + row_tuple: numbers.Number + | slice + | tuple[Any, ...] + | list[tuple[Any, ...]], ) -> DataFrameOrSeries: if pd.api.types.is_bool_dtype( list(row_tuple) if isinstance(row_tuple, tuple) else row_tuple @@ -874,9 +875,10 @@ def _get_row_major( @_cudf_nvtx_annotate def _validate_indexer( self, - indexer: Union[ - numbers.Number, slice, Tuple[Any, ...], List[Tuple[Any, ...]] - ], + indexer: numbers.Number + | slice + | tuple[Any, ...] + | list[tuple[Any, ...]], ): if isinstance(indexer, numbers.Number): return diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index 53239cb7ea0..903c4fe7df5 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -1,8 +1,9 @@ # Copyright (c) 2018-2024, NVIDIA CORPORATION. 
+from __future__ import annotations import itertools import warnings -from typing import Dict, Optional +from typing import TYPE_CHECKING import numpy as np import pandas as pd @@ -10,13 +11,15 @@ import cudf from cudf._lib.transform import one_hot_encode from cudf._lib.types import size_type_dtype -from cudf._typing import Dtype from cudf.api.extensions import no_default from cudf.core._compat import PANDAS_LT_300 from cudf.core.column import ColumnBase, as_column, column_empty_like from cudf.core.column.categorical import CategoricalColumn from cudf.utils.dtypes import min_unsigned_type +if TYPE_CHECKING: + from cudf._typing import Dtype + _AXIS_MAP = {0: 0, 1: 1, "index": 0, "columns": 1} @@ -1217,10 +1220,10 @@ def _get_unique(column, dummy_na): def _one_hot_encode_column( column: ColumnBase, categories: ColumnBase, - prefix: Optional[str], - prefix_sep: Optional[str], - dtype: Optional[Dtype], -) -> Dict[str, ColumnBase]: + prefix: str | None, + prefix_sep: str | None, + dtype: Dtype | None, +) -> dict[str, ColumnBase]: """Encode a single column with one hot encoding. The return dictionary contains pairs of (category, encodings). The keys may be prefixed with `prefix`, separated with category name with `prefix_sep`. 
The encoding diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index ebf6910ca5f..e532948fd11 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -9,17 +9,7 @@ import warnings from collections import abc from shutil import get_terminal_size -from typing import ( - TYPE_CHECKING, - Any, - Dict, - Literal, - MutableMapping, - Optional, - Set, - Tuple, - Union, -) +from typing import TYPE_CHECKING, Any, Literal, MutableMapping import cupy import numpy as np @@ -285,7 +275,7 @@ class _SeriesLocIndexer(_FrameIndexer): """ @_cudf_nvtx_annotate - def __getitem__(self, arg: Any) -> Union[ScalarLike, DataFrameOrSeries]: + def __getitem__(self, arg: Any) -> ScalarLike | DataFrameOrSeries: if isinstance(arg, pd.MultiIndex): arg = cudf.from_pandas(arg) @@ -464,7 +454,7 @@ class Series(SingleColumnFrame, IndexedFrame, Serializable): If ``False``, leaves ``np.nan`` values as is. """ - _accessors: Set[Any] = set() + _accessors: set[Any] = set() _loc_indexer_type = _SeriesLocIndexer _iloc_indexer_type = _SeriesIlocIndexer _groupby = SeriesGroupBy @@ -677,7 +667,7 @@ def __init__( def _from_data( cls, data: MutableMapping, - index: Optional[BaseIndex] = None, + index: BaseIndex | None = None, name: Any = no_default, ) -> Series: out = super()._from_data(data=data, index=index) @@ -1311,7 +1301,7 @@ def map(self, arg, na_action=None) -> "Series": def _getitem_preprocessed( self, spec: indexing_utils.IndexingSpec, - ) -> Union[Self, ScalarLike]: + ) -> Self | ScalarLike: """Get subset of entries given structured data Parameters @@ -1473,12 +1463,10 @@ def _make_operands_and_index_for_binop( fill_value: Any = None, reflect: bool = False, can_reindex: bool = False, - ) -> Tuple[ - Union[ - Dict[Optional[str], Tuple[ColumnBase, Any, bool, Any]], - NotImplementedType, - ], - Optional[BaseIndex], + ) -> tuple[ + dict[str | None, tuple[ColumnBase, Any, bool, Any]] + | NotImplementedType, + BaseIndex | None, bool, ]: # 
Specialize binops to align indices. diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py index 43b5dc76f13..23a2c828a04 100644 --- a/python/cudf/cudf/core/single_column_frame.py +++ b/python/cudf/cudf/core/single_column_frame.py @@ -3,7 +3,7 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple, Union +from typing import TYPE_CHECKING, Any from typing_extensions import Self @@ -274,10 +274,10 @@ def _make_operands_for_binop( other: Any, fill_value: Any = None, reflect: bool = False, - ) -> Union[ - Dict[Optional[str], Tuple[ColumnBase, Any, bool, Any]], - NotImplementedType, - ]: + ) -> ( + dict[str | None, tuple[ColumnBase, Any, bool, Any]] + | NotImplementedType + ): """Generate the dictionary of operands used for a binary operation. Parameters @@ -340,7 +340,7 @@ def nunique(self, dropna: bool = True) -> int: """ return self._column.distinct_count(dropna=dropna) - def _get_elements_from_column(self, arg) -> Union[ScalarLike, ColumnBase]: + def _get_elements_from_column(self, arg) -> ScalarLike | ColumnBase: # A generic method for getting elements from a column that supports a # wide range of different inputs. 
This method should only used where # _absolutely_ necessary, since in almost all cases a more specific diff --git a/python/cudf/cudf/core/subword_tokenizer.py b/python/cudf/cudf/core/subword_tokenizer.py index 24c49e3662a..9e59b134b73 100644 --- a/python/cudf/cudf/core/subword_tokenizer.py +++ b/python/cudf/cudf/core/subword_tokenizer.py @@ -3,7 +3,6 @@ from __future__ import annotations import warnings -from typing import Union import cupy as cp @@ -60,7 +59,7 @@ def __call__( max_num_rows: int, add_special_tokens: bool = True, padding: str = "max_length", - truncation: Union[bool, str] = False, + truncation: bool | str = False, stride: int = 0, return_tensors: str = "cp", return_token_type_ids: bool = False, diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py index f002a838fa9..29130130732 100644 --- a/python/cudf/cudf/core/tools/datetimes.py +++ b/python/cudf/cudf/core/tools/datetimes.py @@ -1,9 +1,10 @@ # Copyright (c) 2019-2024, NVIDIA CORPORATION. 
+from __future__ import annotations import math import re import warnings -from typing import Literal, Optional, Sequence, Union +from typing import Literal, Sequence import cupy as cp import numpy as np @@ -61,7 +62,7 @@ def to_datetime( dayfirst: bool = False, yearfirst: bool = False, utc: bool = False, - format: Optional[str] = None, + format: str | None = None, exact: bool = True, unit: str = "ns", infer_datetime_format: bool = True, @@ -313,7 +314,7 @@ def _process_col( unit: str, dayfirst: bool, infer_datetime_format: bool, - format: Optional[str], + format: str | None, utc: bool, ): if col.dtype.kind == "f": @@ -707,7 +708,7 @@ def _from_freqstr(cls, freqstr: str) -> Self: @classmethod def _from_pandas_ticks_or_weeks( cls, - tick: Union[pd.tseries.offsets.Tick, pd.tseries.offsets.Week], + tick: pd.tseries.offsets.Tick | pd.tseries.offsets.Week, ) -> Self: return cls(**{cls._TICK_OR_WEEK_TO_UNITS[type(tick)]: tick.n}) @@ -725,7 +726,7 @@ def _maybe_as_fast_pandas_offset(self): def _isin_datetimelike( - lhs: Union[column.TimeDeltaColumn, column.DatetimeColumn], values: Sequence + lhs: column.TimeDeltaColumn | column.DatetimeColumn, values: Sequence ) -> column.ColumnBase: """ Check whether values are contained in the @@ -784,7 +785,7 @@ def date_range( name=None, closed: Literal["left", "right", "both", "neither"] = "both", *, - unit: Optional[str] = None, + unit: str | None = None, ): """Return a fixed frequency DatetimeIndex. diff --git a/python/cudf/cudf/core/udf/groupby_typing.py b/python/cudf/cudf/core/udf/groupby_typing.py index 72088493074..dffd7db2f71 100644 --- a/python/cudf/cudf/core/udf/groupby_typing.py +++ b/python/cudf/cudf/core/udf/groupby_typing.py @@ -1,5 +1,7 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. -from typing import Any, Dict +# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
+from __future__ import annotations + +from typing import Any import numba from numba import cuda, types @@ -124,7 +126,7 @@ def __init__(self, dmm, fe_type): super().__init__(dmm, fe_type, members) -call_cuda_functions: Dict[Any, Any] = {} +call_cuda_functions: dict[Any, Any] = {} def _register_cuda_binary_reduction_caller(funcname, lty, rty, retty): diff --git a/python/cudf/cudf/core/udf/utils.py b/python/cudf/cudf/core/udf/utils.py index bc1f4f2557e..f1704e4ea78 100644 --- a/python/cudf/cudf/core/udf/utils.py +++ b/python/cudf/cudf/core/udf/utils.py @@ -1,8 +1,9 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. +from __future__ import annotations import functools import os -from typing import Any, Callable, Dict +from typing import Any, Callable import cachetools import cupy as cp @@ -57,7 +58,7 @@ MASK_BITSIZE = np.dtype("int32").itemsize * 8 precompiled: cachetools.LRUCache = cachetools.LRUCache(maxsize=32) -launch_arg_getters: Dict[Any, Any] = {} +launch_arg_getters: dict[Any, Any] = {} @functools.cache diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py index dbdb2093b72..58b104b84e9 100644 --- a/python/cudf/cudf/io/parquet.py +++ b/python/cudf/cudf/io/parquet.py @@ -10,7 +10,7 @@ from collections import defaultdict from contextlib import ExitStack from functools import partial, reduce -from typing import Callable, Dict, List, Optional, Tuple +from typing import Callable from uuid import uuid4 import numpy as np @@ -679,7 +679,7 @@ def read_parquet( return df -def _normalize_filters(filters: list | None) -> List[List[tuple]] | None: +def _normalize_filters(filters: list | None) -> list[list[tuple]] | None: # Utility to normalize and validate the `filters` # argument to `read_parquet` if not filters: @@ -709,7 +709,7 @@ def _validate_predicate(item): def _apply_post_filters( - df: cudf.DataFrame, filters: List[List[tuple]] | None + df: cudf.DataFrame, filters: list[list[tuple]] | None ) -> cudf.DataFrame: """Apply DNF filters to an 
in-memory DataFrame @@ -738,7 +738,7 @@ def _handle_is(column: cudf.Series, value, *, negate) -> cudf.Series: ) return ~column.isna() if negate else column.isna() - handlers: Dict[str, Callable] = { + handlers: dict[str, Callable] = { "==": operator.eq, "!=": operator.ne, "<": operator.lt, @@ -1311,7 +1311,7 @@ def __init__( ) -> None: if isinstance(path, str) and path.startswith("s3://"): self.fs_meta = {"is_s3": True, "actual_path": path} - self.dir_: Optional[tempfile.TemporaryDirectory] = ( + self.dir_: tempfile.TemporaryDirectory | None = ( tempfile.TemporaryDirectory() ) self.path = self.dir_.name @@ -1328,12 +1328,12 @@ def __init__( self.partition_cols = partition_cols # Collection of `ParquetWriter`s, and the corresponding # partition_col values they're responsible for - self._chunked_writers: List[ - Tuple[libparquet.ParquetWriter, List[str], str] + self._chunked_writers: list[ + tuple[libparquet.ParquetWriter, list[str], str] ] = [] # Map of partition_col values to their ParquetWriter's index # in self._chunked_writers for reverse lookup - self.path_cw_map: Dict[str, int] = {} + self.path_cw_map: dict[str, int] = {} self.storage_options = storage_options self.filename = file_name_prefix self.max_file_size = max_file_size @@ -1345,7 +1345,7 @@ def __init__( ) self.max_file_size = _parse_bytes(max_file_size) - self._file_sizes: Dict[str, int] = {} + self._file_sizes: dict[str, int] = {} @_cudf_nvtx_annotate def write_table(self, df): diff --git a/python/cudf/cudf/options.py b/python/cudf/cudf/options.py index efa8eabd8b8..fb5a963f008 100644 --- a/python/cudf/cudf/options.py +++ b/python/cudf/cudf/options.py @@ -1,11 +1,14 @@ # Copyright (c) 2022-2024, NVIDIA CORPORATION. 
+from __future__ import annotations import os import textwrap -from collections.abc import Container from contextlib import ContextDecorator from dataclasses import dataclass -from typing import Any, Callable, Dict, Optional +from typing import TYPE_CHECKING, Any, Callable + +if TYPE_CHECKING: + from collections.abc import Container @dataclass @@ -16,7 +19,7 @@ class Option: validator: Callable -_OPTIONS: Dict[str, Option] = {} +_OPTIONS: dict[str, Option] = {} def _env_get_int(name, default): @@ -123,7 +126,7 @@ def _build_option_description(name, opt): ) -def describe_option(name: Optional[str] = None): +def describe_option(name: str | None = None): """Prints the description of an option. If `name` is unspecified, prints the description of all available options. diff --git a/python/cudf/cudf/pandas/fast_slow_proxy.py b/python/cudf/cudf/pandas/fast_slow_proxy.py index 128913e5746..1540c6850e7 100644 --- a/python/cudf/cudf/pandas/fast_slow_proxy.py +++ b/python/cudf/cudf/pandas/fast_slow_proxy.py @@ -12,17 +12,7 @@ import warnings from collections.abc import Iterator from enum import IntEnum -from typing import ( - Any, - Callable, - Dict, - Literal, - Mapping, - Optional, - Set, - Tuple, - Type, -) +from typing import Any, Callable, Literal, Mapping import numpy as np @@ -118,12 +108,12 @@ def make_final_proxy_type( *, fast_to_slow: Callable, slow_to_fast: Callable, - module: Optional[str] = None, + module: str | None = None, additional_attributes: Mapping[str, Any] | None = None, postprocess: Callable[[_FinalProxy, Any, Any], Any] | None = None, - bases: Tuple = (), - metaclasses: Tuple = (), -) -> Type[_FinalProxy]: + bases: tuple = (), + metaclasses: tuple = (), +) -> type[_FinalProxy]: """ Defines a fast-slow proxy type for a pair of "final" fast and slow types. 
Final types are types for which known operations exist for @@ -270,8 +260,8 @@ def make_intermediate_proxy_type( fast_type: type, slow_type: type, *, - module: Optional[str] = None, -) -> Type[_IntermediateProxy]: + module: str | None = None, +) -> type[_IntermediateProxy]: """ Defines a proxy type for a pair of "intermediate" fast and slow types. Intermediate types are the types of the results of @@ -613,13 +603,13 @@ class _IntermediateProxy(_FastSlowProxy): `make_intermediate_proxy_type` to create subtypes. """ - _method_chain: Tuple[Callable, Tuple, Dict] + _method_chain: tuple[Callable, tuple, dict] @classmethod def _fsproxy_wrap( cls, obj: Any, - method_chain: Tuple[Callable, Tuple, Dict], + method_chain: tuple[Callable, tuple, dict], ): """ Parameters @@ -955,7 +945,7 @@ def _fast_slow_function_call( def _transform_arg( arg: Any, attribute_name: Literal["_fsproxy_slow", "_fsproxy_fast"], - seen: Set[int], + seen: set[int], ) -> Any: """ Transform "arg" into its corresponding slow (or fast) type. @@ -1052,7 +1042,7 @@ def _fast_arg(arg: Any) -> Any: """ Transform "arg" into its corresponding fast type. """ - seen: Set[int] = set() + seen: set[int] = set() return _transform_arg(arg, "_fsproxy_fast", seen) @@ -1060,7 +1050,7 @@ def _slow_arg(arg: Any) -> Any: """ Transform "arg" into its corresponding slow type. 
""" - seen: Set[int] = set() + seen: set[int] = set() return _transform_arg(arg, "_fsproxy_slow", seen) @@ -1137,7 +1127,7 @@ def _is_function_or_method(obj: Any) -> bool: def _replace_closurevars( f: types.FunctionType, attribute_name: Literal["_fsproxy_slow", "_fsproxy_fast"], - seen: Set[int], + seen: set[int], ) -> Callable[..., Any]: """ Return a copy of `f` with its closure variables replaced with @@ -1199,10 +1189,10 @@ def is_proxy_object(obj: Any) -> bool: return False -NUMPY_TYPES: Set[str] = set(np.sctypeDict.values()) +NUMPY_TYPES: set[str] = set(np.sctypeDict.values()) -_SPECIAL_METHODS: Set[str] = { +_SPECIAL_METHODS: set[str] = { "__abs__", "__add__", "__and__", diff --git a/python/cudf/cudf/pandas/module_accelerator.py b/python/cudf/cudf/pandas/module_accelerator.py index 1d431c6d882..f82e300e83d 100644 --- a/python/cudf/cudf/pandas/module_accelerator.py +++ b/python/cudf/cudf/pandas/module_accelerator.py @@ -17,7 +17,7 @@ from abc import abstractmethod from importlib._bootstrap import _ImportLockContext as ImportLock from types import ModuleType -from typing import Any, ContextManager, Dict, NamedTuple, Tuple +from typing import Any, ContextManager, NamedTuple from typing_extensions import Self @@ -377,7 +377,7 @@ class ModuleAccelerator(ModuleAcceleratorBase): attempts to call the fast version first). """ - _denylist: Tuple[str] + _denylist: tuple[str] _use_fast_lib: bool _use_fast_lib_lock: threading.RLock _module_cache_prefix: str = "_slow_lib_" @@ -519,7 +519,7 @@ def disabled(self): def getattr_real_or_wrapped( name: str, *, - real: Dict[str, Any], + real: dict[str, Any], wrapped_objs, loader: ModuleAccelerator, ) -> Any: diff --git a/python/cudf/cudf/pandas/profiler.py b/python/cudf/cudf/pandas/profiler.py index 0dbd333ce4f..0fb41fc0b26 100644 --- a/python/cudf/cudf/pandas/profiler.py +++ b/python/cudf/cudf/pandas/profiler.py @@ -1,6 +1,7 @@ # SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES. 
# All rights reserved. # SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations import inspect import operator @@ -8,7 +9,6 @@ import sys import time from collections import defaultdict -from typing import Union from rich.console import Console from rich.syntax import Syntax @@ -119,12 +119,10 @@ def __exit__(self, *args, **kwargs): @staticmethod def get_namespaced_function_name( - func_obj: Union[ - _FunctionProxy, - _MethodProxy, - type[_FinalProxy], - type[_IntermediateProxy], - ], + func_obj: _FunctionProxy + | _MethodProxy + | type[_FinalProxy] + | type[_IntermediateProxy], ): if isinstance(func_obj, _MethodProxy): return func_obj._fsproxy_slow.__qualname__ diff --git a/python/cudf/cudf/pylibcudf_tests/common/utils.py b/python/cudf/cudf/pylibcudf_tests/common/utils.py index 54d38f1a8cf..bf927e661fe 100644 --- a/python/cudf/cudf/pylibcudf_tests/common/utils.py +++ b/python/cudf/cudf/pylibcudf_tests/common/utils.py @@ -1,6 +1,5 @@ # Copyright (c) 2024, NVIDIA CORPORATION. - -from typing import Optional, Union +from __future__ import annotations import pyarrow as pa import pytest @@ -10,7 +9,7 @@ def metadata_from_arrow_array( pa_array: pa.Array, -) -> Optional[plc.interop.ColumnMetadata]: +) -> plc.interop.ColumnMetadata | None: metadata = None if pa.types.is_list(dtype := pa_array.type) or pa.types.is_struct(dtype): metadata = plc.interop.ColumnMetadata( @@ -25,7 +24,7 @@ def metadata_from_arrow_array( def assert_column_eq( - lhs: Union[pa.Array, plc.Column], rhs: Union[pa.Array, plc.Column] + lhs: pa.Array | plc.Column, rhs: pa.Array | plc.Column ) -> None: """Verify that a pylibcudf array and PyArrow array are equal.""" # Nested types require children metadata to be passed to the conversion function. 
diff --git a/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py b/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py index 0e38b10ed52..238e8d990cc 100644 --- a/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py +++ b/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py @@ -11,10 +11,11 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import annotations + import datetime import io import pathlib -from typing import Optional import fastavro import numpy as np @@ -292,7 +293,7 @@ def test_can_detect_dtypes_from_avro_logical_type( assert_eq(expected, actual) -def get_days_from_epoch(date: Optional[datetime.date]) -> Optional[int]: +def get_days_from_epoch(date: datetime.date | None) -> int | None: if date is None: return None return (date - datetime.date(1970, 1, 1)).days diff --git a/python/cudf/cudf/tests/test_df_protocol.py b/python/cudf/cudf/tests/test_df_protocol.py index a22b678ebe6..8ce4da792a4 100644 --- a/python/cudf/cudf/tests/test_df_protocol.py +++ b/python/cudf/cudf/tests/test_df_protocol.py @@ -1,6 +1,7 @@ # Copyright (c) 2021-2024, NVIDIA CORPORATION. 
+from __future__ import annotations -from typing import Any, Tuple +from typing import Any import cupy as cp import pandas as pd @@ -64,7 +65,7 @@ def assert_validity_equal(protocol_buffer, cudf_buffer, size, null, valid): raise NotImplementedError() -def assert_buffer_equal(buffer_and_dtype: Tuple[_CuDFBuffer, Any], cudfcol): +def assert_buffer_equal(buffer_and_dtype: tuple[_CuDFBuffer, Any], cudfcol): buf, dtype = buffer_and_dtype device_id = cp.asarray(cudfcol.data).device.id assert buf.__dlpack_device__() == (2, device_id) diff --git a/python/cudf/cudf/tests/test_spilling.py b/python/cudf/cudf/tests/test_spilling.py index 913a958b4c2..59b8e6d2e70 100644 --- a/python/cudf/cudf/tests/test_spilling.py +++ b/python/cudf/cudf/tests/test_spilling.py @@ -1,4 +1,5 @@ # Copyright (c) 2022-2024, NVIDIA CORPORATION. +from __future__ import annotations import contextlib import importlib @@ -7,7 +8,6 @@ import warnings import weakref from concurrent.futures import ThreadPoolExecutor -from typing import List, Tuple import cupy import numpy as np @@ -107,7 +107,7 @@ def single_column_df_base_data(df: cudf.DataFrame) -> SpillableBuffer: gen_df_data_nbytes = single_column_df()._data._data["a"].data.nbytes -def spilled_and_unspilled(manager: SpillManager) -> Tuple[int, int]: +def spilled_and_unspilled(manager: SpillManager) -> tuple[int, int]: """Get bytes spilled and unspilled known by the manager""" spilled = sum(buf.size for buf in manager.buffers() if buf.is_spilled) unspilled = sum( @@ -661,7 +661,7 @@ def test_statistics(manager: SpillManager): def test_statistics_expose(manager: SpillManager): assert len(manager.statistics.spill_totals) == 0 - buffers: List[SpillableBuffer] = [ + buffers: list[SpillableBuffer] = [ as_buffer(data=rmm.DeviceBuffer(size=10), exposed=False) for _ in range(10) ] @@ -687,7 +687,7 @@ def test_statistics_expose(manager: SpillManager): assert stat.spilled_nbytes == 0 # Create and spill 10 new buffers - buffers: List[SpillableBuffer] = [ + buffers: 
list[SpillableBuffer] = [ as_buffer(data=rmm.DeviceBuffer(size=10), exposed=False) for _ in range(10) ] diff --git a/python/cudf/cudf/utils/applyutils.py b/python/cudf/cudf/utils/applyutils.py index d57303ca122..cd7fe5ee023 100644 --- a/python/cudf/cudf/utils/applyutils.py +++ b/python/cudf/cudf/utils/applyutils.py @@ -1,7 +1,8 @@ # Copyright (c) 2018-2024, NVIDIA CORPORATION. +from __future__ import annotations import functools -from typing import Any, Dict +from typing import Any import cupy as cp from numba import cuda @@ -339,7 +340,7 @@ def chunk_wise_kernel(nrows, chunks, {args}): return kernel -_cache: Dict[Any, Any] = dict() +_cache: dict[Any, Any] = dict() @functools.wraps(_make_row_wise_kernel) diff --git a/python/cudf/cudf/utils/queryutils.py b/python/cudf/cudf/utils/queryutils.py index 239438afd24..78aeac425f7 100644 --- a/python/cudf/cudf/utils/queryutils.py +++ b/python/cudf/cudf/utils/queryutils.py @@ -1,8 +1,9 @@ -# Copyright (c) 2018-2023, NVIDIA CORPORATION. +# Copyright (c) 2018-2024, NVIDIA CORPORATION. +from __future__ import annotations import ast import datetime -from typing import Any, Dict +from typing import Any import numpy as np from numba import cuda @@ -114,7 +115,7 @@ def _check_error(tree): raise QuerySyntaxError("too many expressions") -_cache: Dict[Any, Any] = {} +_cache: dict[Any, Any] = {} def query_compile(expr): diff --git a/python/cudf/cudf/utils/utils.py b/python/cudf/cudf/utils/utils.py index 95621cf9519..2e4dfc4bb14 100644 --- a/python/cudf/cudf/utils/utils.py +++ b/python/cudf/cudf/utils/utils.py @@ -1,11 +1,11 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. +from __future__ import annotations import decimal import functools import os import traceback import warnings -from typing import FrozenSet, Set, Union import numpy as np import pandas as pd @@ -218,7 +218,7 @@ class GetAttrGetItemMixin: # `__setstate__`, but this class may be used in complex multiple # inheritance hierarchies that might also override serialization. 
The # solution here is a minimally invasive change that avoids such conflicts. - _PROTECTED_KEYS: Union[FrozenSet[str], Set[str]] = frozenset() + _PROTECTED_KEYS: frozenset[str] | set[str] = frozenset() def __getattr__(self, key): if key in self._PROTECTED_KEYS: diff --git a/python/cudf/cudf_pandas_tests/test_fast_slow_proxy.py b/python/cudf/cudf_pandas_tests/test_fast_slow_proxy.py index 39bf07c49de..a75a20a4681 100644 --- a/python/cudf/cudf_pandas_tests/test_fast_slow_proxy.py +++ b/python/cudf/cudf_pandas_tests/test_fast_slow_proxy.py @@ -1,6 +1,7 @@ # SPDX-FileCopyrightText: Copyright (c) 2023-2024 NVIDIA CORPORATION & AFFILIATES. # All rights reserved. # SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations import inspect from functools import partial diff --git a/python/dask_cudf/dask_cudf/groupby.py b/python/dask_cudf/dask_cudf/groupby.py index ef47ea436c7..2e72461b43d 100644 --- a/python/dask_cudf/dask_cudf/groupby.py +++ b/python/dask_cudf/dask_cudf/groupby.py @@ -1,7 +1,7 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. +from __future__ import annotations from functools import wraps -from typing import Set import numpy as np import pandas as pd @@ -695,7 +695,7 @@ def _aggs_optimized(arg, supported: set): """Check that aggregations in `arg` are a subset of `supported`""" if isinstance(arg, (list, dict)): if isinstance(arg, dict): - _global_set: Set[str] = set() + _global_set: set[str] = set() for col in arg: if isinstance(arg[col], list): _global_set = _global_set.union(set(arg[col]))