Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement replace in pylibcudf #15005

Merged
merged 7 commits into from
Feb 8, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ This page provides API documentation for pylibcudf.
reduce
rolling
scalar
replace
table
types
unary
6 changes: 6 additions & 0 deletions docs/cudf/source/user_guide/api_docs/pylibcudf/replace.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
=======
replace
=======

.. automodule:: cudf._lib.pylibcudf.replace
:members:
4 changes: 3 additions & 1 deletion python/cudf/cudf/_lib/cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,9 @@
# the License.
# =============================================================================

set(cython_sources aggregation.pyx binaryop.pyx copying.pyx reduce.pxd types.pyx unary.pyx)
set(cython_sources aggregation.pyx binaryop.pyx copying.pyx replace.pyx reduce.pxd types.pyx
unary.pyx
)

set(linked_libraries cudf::cudf)

Expand Down
3 changes: 1 addition & 2 deletions python/cudf/cudf/_lib/cpp/replace.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ from cudf._lib.cpp.scalar.scalar cimport scalar

cdef extern from "cudf/replace.hpp" namespace "cudf" nogil:

cdef enum class replace_policy(bool):
cpdef enum class replace_policy(bool):
PRECEDING
FOLLOWING

Expand Down Expand Up @@ -42,7 +42,6 @@ cdef extern from "cudf/replace.hpp" namespace "cudf" nogil:
column_view source_column,
scalar lo, scalar hi) except +

cdef extern from "cudf/replace.hpp" namespace "cudf" nogil:
cdef unique_ptr[column] normalize_nans_and_zeros(
column_view source_column) except +

Expand Down
Empty file.
2 changes: 1 addition & 1 deletion python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@

set(cython_sources
aggregation.pyx binaryop.pyx column.pyx copying.pyx gpumemoryview.pyx groupby.pyx interop.pyx
join.pyx reduce.pyx rolling.pyx scalar.pyx table.pyx types.pyx unary.pyx utils.pyx
join.pyx reduce.pyx replace.pyx rolling.pyx scalar.pyx table.pyx types.pyx unary.pyx utils.pyx
)
set(linked_libraries cudf::cudf)
rapids_cython_create_modules(
Expand Down
2 changes: 2 additions & 0 deletions python/cudf/cudf/_lib/pylibcudf/__init__.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ from . cimport (
interop,
join,
reduce,
replace,
rolling,
types,
unary,
Expand All @@ -35,6 +36,7 @@ __all__ = [
"join",
"unary",
"reduce",
"replace",
"rolling",
"types",
]
2 changes: 2 additions & 0 deletions python/cudf/cudf/_lib/pylibcudf/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
interop,
join,
reduce,
replace,
rolling,
types,
unary,
Expand All @@ -33,6 +34,7 @@
"join",
"unary",
"reduce",
"replace",
"rolling",
"types",
]
36 changes: 36 additions & 0 deletions python/cudf/cudf/_lib/pylibcudf/replace.pxd
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# Copyright (c) 2023-2024, NVIDIA CORPORATION.

from libcpp cimport bool

from cudf._lib.cpp.replace cimport replace_policy

from .column cimport Column
from .scalar cimport Scalar

ctypedef fused ReplacementType:
Column
Scalar
replace_policy
# Allowing object is a workaround for
# https://github.com/cython/cython/issues/5984. See the implementation of
# replace_nulls for details.
object


cpdef Column replace_nulls(Column source_column, ReplacementType replacement)

cpdef Column find_and_replace_all(
Column source_column,
Column values_to_replace,
Column replacement_values,
)

cpdef Column clamp(
Column source_column,
Scalar lo,
Scalar hi,
Scalar lo_replace=*,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What does this =* syntax mean? This doesn't seem like anything I've seen in Python, C++, or Cython before.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is the standard Cython syntax for default arguments in pxd files. The actual default value must be specified in the implementation (the pyx file), while the declaration just indicates that a default exists so that callers know what valid invocations look like. We already use this in a couple of places in our existing Cython, such as column.pxd and scalar.pxd.

Scalar hi_replace=*,
)

cpdef Column normalize_nans_and_zeros(Column source_column, bool inplace=*)
208 changes: 208 additions & 0 deletions python/cudf/cudf/_lib/pylibcudf/replace.pyx
Original file line number Diff line number Diff line change
@@ -0,0 +1,208 @@
# Copyright (c) 2023-2024, NVIDIA CORPORATION.


from cython.operator import dereference

from libcpp cimport bool
from libcpp.memory cimport unique_ptr
from libcpp.utility cimport move

from cudf._lib.cpp cimport replace as cpp_replace
from cudf._lib.cpp.column.column cimport column

from cudf._lib.cpp.replace import \
replace_policy as ReplacePolicy # no-cython-lint

from .column cimport Column
from .scalar cimport Scalar


cpdef Column replace_nulls(Column source_column, ReplacementType replacement):
    """Replace nulls in source_column.

    The values used to replace nulls depend on the type of replacement:
        - If replacement is a Column, the corresponding value from replacement
          is used.
        - If replacement is a Scalar, the same value is used for all nulls.
        - If replacement is a replace_policy, the policy is used to determine
          the replacement value:

            - PRECEDING: The first non-null value that precedes the null is used.
            - FOLLOWING: The first non-null value that follows the null is used.

    For more details, see :cpp:func:`replace_nulls`.

    Parameters
    ----------
    source_column : Column
        The column in which to replace nulls.
    replacement : Union[Column, Scalar, replace_policy]
        If a Column, the values to use as replacements. If a Scalar, the value
        to use as a replacement. If a replace_policy, the policy to use to
        determine the replacement value.

    Returns
    -------
    Column
        A copy of source_column with nulls replaced as determined by
        replacement.

    Raises
    ------
    TypeError
        If replacement is not a Column, Scalar, or replace_policy.
    """
    cdef unique_ptr[column] c_result
    cdef replace_policy policy
    # Due to https://github.com/cython/cython/issues/5984, if this function is
    # called as a Python function (i.e. without typed inputs, which is always
    # true in pure Python files), the type of `replacement` will be `object`
    # instead of `replace_policy`. This is a workaround to handle that case.
    if ReplacementType is object:
        if not isinstance(replacement, ReplacePolicy):
            raise TypeError("replacement must be a Column, Scalar, or replace_policy")
        # Convert the Python-level enum member to the C enum before releasing
        # the GIL; the Python object cannot be touched inside the nogil block.
        policy = replacement
        with nogil:
            c_result = move(
                cpp_replace.replace_nulls(source_column.view(), policy)
            )
        return Column.from_libcudf(move(c_result))

    # The branches below are selected per fused-type specialization, so each
    # compiled variant only contains the libcudf overload matching its type.
    with nogil:
        if ReplacementType is Column:
            c_result = move(
                cpp_replace.replace_nulls(source_column.view(), replacement.view())
            )
        elif ReplacementType is Scalar:
            c_result = move(
                cpp_replace.replace_nulls(
                    source_column.view(), dereference(replacement.c_obj)
                )
            )
        elif ReplacementType is replace_policy:
            c_result = move(
                cpp_replace.replace_nulls(source_column.view(), replacement)
            )
        else:
            assert False, "Internal error. Please contact pylibcudf developers"
    return Column.from_libcudf(move(c_result))


cpdef Column find_and_replace_all(
    Column source_column,
    Column values_to_replace,
    Column replacement_values,
):
    """Substitute every occurrence of the given values in source_column.

    Each value found in values_to_replace is swapped for a value taken from
    replacement_values. For details, see :cpp:func:`find_and_replace_all`.

    Parameters
    ----------
    source_column : Column
        The column whose values are searched and replaced.
    values_to_replace : Column
        The values to look for in source_column.
    replacement_values : Column
        The values to substitute in place of the matches.

    Returns
    -------
    Column
        A new column: a copy of source_column in which all occurrences of
        values_to_replace have been replaced by replacement_values.
    """
    cdef unique_ptr[column] c_replaced

    # The libcudf call needs no Python objects, so run it without the GIL.
    with nogil:
        c_replaced = move(
            cpp_replace.find_and_replace_all(
                source_column.view(),
                values_to_replace.view(),
                replacement_values.view(),
            )
        )

    return Column.from_libcudf(move(c_replaced))


cpdef Column clamp(
    Column source_column,
    Scalar lo,
    Scalar hi,
    Scalar lo_replace=None,
    Scalar hi_replace=None,
):
    """Restrict the values of source_column to the range [lo, hi].

    For details, see :cpp:func:`clamp`.

    Parameters
    ----------
    source_column : Column
        The column to clamp.
    lo : Scalar
        The lower bound of the clamp range.
    hi : Scalar
        The upper bound of the clamp range.
    lo_replace : Scalar, optional
        The value substituted for elements that are less than lo. When
        omitted, lo itself is used.
    hi_replace : Scalar, optional
        The value substituted for elements that are greater than hi. When
        omitted, hi itself is used.

    Returns
    -------
    Column
        A copy of source_column with values clamped to the range [lo, hi].

    Raises
    ------
    ValueError
        If only one of lo_replace and hi_replace is provided.
    """
    cdef unique_ptr[column] c_clamped
    cdef bint lo_replace_missing = lo_replace is None
    cdef bint hi_replace_missing = hi_replace is None

    # The replacement scalars are all-or-nothing: libcudf offers a 3-argument
    # and a 5-argument overload, with nothing in between.
    if lo_replace_missing != hi_replace_missing:
        raise ValueError("lo_replace and hi_replace must be specified together")

    if lo_replace_missing:
        # No explicit replacements: out-of-range values take the bounds.
        with nogil:
            c_clamped = move(
                cpp_replace.clamp(
                    source_column.view(),
                    dereference(lo.c_obj),
                    dereference(hi.c_obj),
                )
            )
    else:
        # Explicit replacements were supplied for both ends of the range.
        with nogil:
            c_clamped = move(
                cpp_replace.clamp(
                    source_column.view(),
                    dereference(lo.c_obj),
                    dereference(hi.c_obj),
                    dereference(lo_replace.c_obj),
                    dereference(hi_replace.c_obj),
                )
            )

    return Column.from_libcudf(move(c_clamped))


cpdef Column normalize_nans_and_zeros(Column source_column, bool inplace=False):
    """Normalize NaNs and zeros in source_column.

    For details, see :cpp:func:`normalize_nans_and_zeros`.

    Parameters
    ----------
    source_column : Column
        The column to normalize.
    inplace : bool, optional
        If True, normalize source_column in place. If False (the default),
        return a new column with the normalized values.

    Returns
    -------
    Column or None
        If inplace is False, a copy of source_column with NaNs and zeros
        normalized. If inplace is True, source_column is modified directly and
        None is returned.
    """
    cdef unique_ptr[column] c_result

    if inplace:
        # The in-place libcudf overload mutates the column through a mutable
        # view and produces no result column, so there is nothing to wrap.
        with nogil:
            cpp_replace.normalize_nans_and_zeros(source_column.mutable_view())
        return None

    with nogil:
        c_result = move(
            cpp_replace.normalize_nans_and_zeros(source_column.view())
        )
    return Column.from_libcudf(move(c_result))
Loading
Loading